Add FOR610 tool/workflow knowledge base and data pipeline
Build comprehensive malware analysis knowledge base from 3 sources:

- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR
pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
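For orientation, the merge step described above can be pictured roughly as below. This is a minimal sketch, not the committed pipeline code: the merge_inventories and help_tier helpers, the source file paths (other than remnux-docs.yaml, which this commit's scraper writes), and the exact tier rule are illustrative assumptions; only the rich/standard/basic tier names come from the commit message.

import yaml

def load_tools(path):
    # Assumption: each source file holds {"tools": [{"id": ..., ...}, ...]}
    with open(path) as f:
        return yaml.safe_load(f)["tools"]

def help_tier(tool):
    # Assumed tier rule: "rich" needs a description plus an external reference,
    # "standard" a description only, "basic" anything else.
    if tool.get("description") and (tool.get("website") or tool.get("docs_url")):
        return "rich"
    if tool.get("description"):
        return "standard"
    return "basic"

def merge_inventories(paths):
    master = {}
    for path in paths:
        for tool in load_tools(path):
            # Later sources fill gaps but never overwrite earlier fields
            merged = master.setdefault(tool["id"], {})
            for key, value in tool.items():
                merged.setdefault(key, value)
    for tool in master.values():
        tool["help_tier"] = help_tier(tool)
    return master

master = merge_inventories([
    "data/for610/sources/for610.yaml",        # assumed path, for illustration
    "data/remnux/sources/salt-states.yaml",   # assumed path, for illustration
    "data/remnux/sources/remnux-docs.yaml",   # written by the scraper below
])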
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""Scrape REMnux documentation to extract all documented tools.

Fetches docs.remnux.org tool listing pages and extracts tool names,
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
"""

import os
import re
import time
import urllib.request

import yaml

BASE_URL = "https://docs.remnux.org/discover-the-tools"
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")

# All known category pages from docs.remnux.org
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]


def fetch_page(url):
    """Fetch a page and return its text content, or None on failure."""
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f"  Warning: could not fetch {url}: {e}")
        return None


def normalize_id(name):
    """Convert a tool name to a normalized kebab-case ID."""
    # Fold script suffixes (.py, .pl, .bat) into the ID; the display name keeps them
    n = name.lower().strip()
    n = re.sub(r'\.py$', '-py', n)
    n = re.sub(r'\.pl$', '-pl', n)
    n = re.sub(r'\.bat$', '-bat', n)
    n = re.sub(r'[^a-z0-9]+', '-', n)
    return n.strip('-')


def extract_tools_from_html(html, category, category_path):
    """Extract tool entries from a docs page's HTML."""
    tools = []

    # GitBook renders each tool either as an <h2>/<h3> heading followed by a
    # description, or as bold text ("**Tool Name** - description"). Try the
    # heading pattern first and fall back to the bold pattern if it finds nothing.
    heading_pattern = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<a[^>]*>.*?</a>\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE
    )
    bold_pattern = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE
    )

    for match in heading_pattern.finditer(html):
        anchor_id = match.group(1)
        heading_text = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        if heading_text and len(heading_text) < 80:
            # Use the first sentence after the heading as the description
            pos = match.end()
            desc_chunk = re.sub(r'<[^>]+>', ' ', html[pos:pos + 500])
            desc_chunk = re.sub(r'\s+', ' ', desc_chunk).strip()
            desc = desc_chunk.split('.')[0].strip() + '.' if desc_chunk else ""
            if len(desc) > 200:
                desc = desc[:197] + "..."

            # Take the first external link after the heading as the tool's website
            website_chunk = html[pos:pos + 2000]
            website_match = re.search(r'href="(https?://(?!docs\.remnux)[^"]+)"', website_chunk)
            website = website_match.group(1) if website_match else ""

            tool = {
                "name": heading_text,
                "id": normalize_id(heading_text),
                "category": category,
                "category_path": category_path,
                "description": desc,
                "docs_url": f"{BASE_URL}/{category_path}",
                "anchor": anchor_id,
            }
            if website:
                tool["website"] = website
            tools.append(tool)

    # If the headings yielded nothing, try the bold pattern
    if not tools:
        for match in bold_pattern.finditer(html):
            name = re.sub(r'<[^>]+>', '', match.group(1)).strip()
            desc = re.sub(r'<[^>]+>', ' ', match.group(2)).strip()
            desc = re.sub(r'\s+', ' ', desc).strip()
            if name and 1 < len(name) < 80:
                if len(desc) > 200:
                    desc = desc[:197] + "..."
                tools.append({
                    "name": name,
                    "id": normalize_id(name),
                    "category": category,
                    "category_path": category_path,
                    "description": desc,
                    "docs_url": f"{BASE_URL}/{category_path}",
                })

    return tools


def main():
    print("Scraping REMnux documentation...")
    all_tools = []

    for category, path in CATEGORY_PAGES:
        url = f"{BASE_URL}/{path}"
        print(f"  Fetching: {category}")
        html = fetch_page(url)
        if not html:
            print("    Skipped (fetch failed)")
            continue

        tools = extract_tools_from_html(html, category, path)
        print(f"    Found {len(tools)} tools")
        all_tools.extend(tools)

        time.sleep(0.3)  # Be polite to the docs server

    # Deduplicate by ID (the same tool can appear in multiple categories)
    seen = {}
    for t in all_tools:
        tid = t["id"]
        if tid not in seen:
            seen[tid] = t
        else:
            # Tool appears in multiple categories - record the extras too
            seen[tid].setdefault("additional_categories", []).append(t["category"])

    unique_tools = sorted(seen.values(), key=lambda t: t["id"])

    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": {},
        },
        "tools": unique_tools,
    }

    # Count extracted entries per category (before deduplication)
    for t in all_tools:
        cat = t["category"]
        output["metadata"]["category_counts"][cat] = \
            output["metadata"]["category_counts"].get(cat, 0) + 1

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
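After the scraper above has run, the emitted remnux-docs.yaml can be sanity-checked before it feeds the master inventory. A minimal sketch, assuming only the schema the script itself writes (a metadata block plus a tools list with id and category fields); the checks and the report format are illustrative, not part of the committed pipeline:

import yaml
from collections import Counter

with open("data/remnux/sources/remnux-docs.yaml") as f:
    doc = yaml.safe_load(f)

tools = doc["tools"]
# The script records the post-deduplication count, so these should hold
assert doc["metadata"]["total_tools_extracted"] == len(tools)
assert len({t["id"] for t in tools}) == len(tools)  # IDs unique after dedup

# Rough coverage report: tools per top-level category
top_level = Counter(t["category"].split(" > ")[0] for t in tools)
for category, count in top_level.most_common():
    print(f"{category}: {count}")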