Add FOR610 tool/workflow knowledge base and data pipeline

Build comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224
TLDR pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tobias
2026-03-28 17:38:15 +01:00
parent 06ebb09ab0
commit f3ccc09c3d
663 changed files with 36339 additions and 1 deletions
+226
View File
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""Scrape REMnux documentation to extract all documented tools.
Fetches docs.remnux.org tool listing pages and extracts tool names,
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
"""
import re
import urllib.request
import yaml
import os
import time
# Root of the tool-listing section on docs.remnux.org; category paths
# are appended to this to form each page URL.
BASE_URL = "https://docs.remnux.org/discover-the-tools"
# Destination YAML file, resolved relative to this script's location.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")
# All known category pages from docs.remnux.org, as
# (human-readable category label, URL path) pairs. The path is joined
# onto BASE_URL when fetching; the label is stored on each extracted tool.
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]
def fetch_page(url):
    """Download *url* and return the decoded HTML text, or None on failure.

    Sends a browser-like User-Agent so the docs site serves normal HTML.
    Any fetch/read error is reported as a warning instead of raised, so
    the caller can simply skip an unavailable category page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            # errors="replace" keeps going on the odd bad byte.
            return response.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f" Warning: could not fetch {url}: {e}")
        return None
def normalize_id(name):
    """Return *name* as a lowercase kebab-case identifier.

    Script suffixes (.py/.pl/.bat) are rewritten to "-py"/"-pl"/"-bat"
    so the extension survives in the ID; every other run of
    non-alphanumeric characters collapses to a single hyphen, and
    leading/trailing hyphens are trimmed.
    """
    ident = name.strip().lower()
    # Preserve well-known script extensions before the generic cleanup.
    for ext in ("py", "pl", "bat"):
        ident = re.sub(rf'\.{ext}$', f'-{ext}', ident)
    ident = re.sub(r'[^a-z0-9]+', '-', ident)
    return ident.strip('-')
def extract_tools_from_html(html, category, category_path):
    """Parse one docs.remnux.org category page and return a list of tool dicts.

    Two extraction strategies, tried in order:
      1. <h2>/<h3> headings carrying an id anchor (GitBook's usual layout).
         The description is the first sentence of the text following the
         heading, and the first non-docs.remnux.org link within the next
         2000 characters is recorded as the tool's website.
      2. Fallback, used only when strategy 1 found nothing:
         "<strong>Name</strong> - description" pairs.

    Descriptions are tag-stripped, whitespace-collapsed, and truncated
    to 200 characters with a "..." suffix.
    """
    heading_re = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<a[^>]*>.*?</a>\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE,
    )
    bold_re = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE,
    )
    page_url = f"{BASE_URL}/{category_path}"
    found = []
    for m in heading_re.finditer(html):
        title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
        if not title or len(title) >= 80:
            continue
        tail = m.end()
        # First sentence of the text immediately after the heading.
        snippet = re.sub(r'<[^>]+>', ' ', html[tail:tail + 500])
        snippet = re.sub(r'\s+', ' ', snippet).strip()
        desc = snippet.split('.')[0].strip() + '.' if snippet else ""
        if len(desc) > 200:
            desc = desc[:197] + "..."
        entry = {
            "name": title,
            "id": normalize_id(title),
            "category": category,
            "category_path": category_path,
            "description": desc,
            "docs_url": page_url,
            "anchor": m.group(1),
        }
        # First external (non-docs) link near the heading = tool homepage.
        site = re.search(r'href="(https?://(?!docs\.remnux)[^"]+)"',
                         html[tail:tail + 2000])
        if site:
            entry["website"] = site.group(1)
        found.append(entry)
    if found:
        return found
    # Fallback: bold-name / description pairs.
    for m in bold_re.finditer(html):
        title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
        desc = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', m.group(2))).strip()
        if not title or not (1 < len(title) < 80):
            continue
        if len(desc) > 200:
            desc = desc[:197] + "..."
        found.append({
            "name": title,
            "id": normalize_id(title),
            "category": category,
            "category_path": category_path,
            "description": desc,
            "docs_url": page_url,
        })
    return found
def main():
    """Scrape every known category page and write the merged YAML inventory."""
    print("Scraping REMnux documentation...")
    collected = []
    for category, path in CATEGORY_PAGES:
        print(f" Fetching: {category}")
        page = fetch_page(f"{BASE_URL}/{path}")
        if not page:
            print(f" Skipped (fetch failed)")
            continue
        found = extract_tools_from_html(page, category, path)
        print(f" Found {len(found)} tools")
        collected.extend(found)
        time.sleep(0.3)  # Be polite
    # Collapse duplicates: the first occurrence of an id wins; later
    # categories are recorded on it under "additional_categories".
    by_id = {}
    for tool in collected:
        key = tool["id"]
        if key in by_id:
            by_id[key].setdefault("additional_categories", []).append(tool["category"])
        else:
            by_id[key] = tool
    unique_tools = sorted(by_id.values(), key=lambda t: t["id"])
    # Per-category totals count every occurrence, including duplicates.
    counts = {}
    for tool in collected:
        counts[tool["category"]] = counts.get(tool["category"], 0) + 1
    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": counts,
        },
        "tools": unique_tools,
    }
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")
# Run the scrape only when executed directly, not on import.
if __name__ == "__main__":
    main()