Add FOR610 tool/workflow knowledge base and data pipeline
Build comprehensive malware analysis knowledge base from 3 sources:

- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR
pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
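For orientation, the merge step described above can be pictured roughly as below. This is a minimal sketch, not the committed pipeline code: the merge_inventories and help_tier helpers, the source file paths (other than remnux-docs.yaml, which this commit's scraper writes), and the exact tier rule are illustrative assumptions; only the rich/standard/basic tier names come from the commit message.

import yaml

def load_tools(path):
    # Assumption: each source file holds {"tools": [{"id": ..., ...}, ...]}
    with open(path) as f:
        return yaml.safe_load(f)["tools"]

def help_tier(tool):
    # Assumed tier rule: "rich" needs a description plus an external reference,
    # "standard" a description only, "basic" anything else.
    if tool.get("description") and (tool.get("website") or tool.get("docs_url")):
        return "rich"
    if tool.get("description"):
        return "standard"
    return "basic"

def merge_inventories(paths):
    master = {}
    for path in paths:
        for tool in load_tools(path):
            # Later sources fill gaps but never overwrite earlier fields
            merged = master.setdefault(tool["id"], {})
            for key, value in tool.items():
                merged.setdefault(key, value)
    for tool in master.values():
        tool["help_tier"] = help_tier(tool)
    return master

master = merge_inventories([
    "data/for610/sources/for610.yaml",        # assumed path, for illustration
    "data/remnux/sources/salt-states.yaml",   # assumed path, for illustration
    "data/remnux/sources/remnux-docs.yaml",   # written by the scraper below
])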
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""Scrape REMnux documentation to extract all documented tools.

Fetches docs.remnux.org tool listing pages and extracts tool names,
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
"""

import os
import re
import time
import urllib.request

import yaml

BASE_URL = "https://docs.remnux.org/discover-the-tools"
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")

# All known category pages from docs.remnux.org
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]


def fetch_page(url):
    """Fetch a page and return its text content, or None on failure."""
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f"  Warning: could not fetch {url}: {e}")
        return None


def normalize_id(name):
    """Convert a tool name to a normalized kebab-case ID."""
    # Fold script suffixes (.py, .pl, .bat) into the ID; the display name keeps them
    n = name.lower().strip()
    n = re.sub(r'\.py$', '-py', n)
    n = re.sub(r'\.pl$', '-pl', n)
    n = re.sub(r'\.bat$', '-bat', n)
    n = re.sub(r'[^a-z0-9]+', '-', n)
    return n.strip('-')


def extract_tools_from_html(html, category, category_path):
    """Extract tool entries from a docs page's HTML."""
    tools = []

    # GitBook renders each tool either as an <h2>/<h3> heading followed by a
    # description, or as bold text ("**Tool Name** - description"). Try the
    # heading pattern first and fall back to the bold pattern if it finds nothing.
    heading_pattern = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<a[^>]*>.*?</a>\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE
    )
    bold_pattern = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE
    )

    for match in heading_pattern.finditer(html):
        anchor_id = match.group(1)
        heading_text = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        if heading_text and len(heading_text) < 80:
            # Use the first sentence after the heading as the description
            pos = match.end()
            desc_chunk = re.sub(r'<[^>]+>', ' ', html[pos:pos + 500])
            desc_chunk = re.sub(r'\s+', ' ', desc_chunk).strip()
            desc = desc_chunk.split('.')[0].strip() + '.' if desc_chunk else ""
            if len(desc) > 200:
                desc = desc[:197] + "..."

            # Take the first external link after the heading as the tool's website
            website_chunk = html[pos:pos + 2000]
            website_match = re.search(r'href="(https?://(?!docs\.remnux)[^"]+)"', website_chunk)
            website = website_match.group(1) if website_match else ""

            tool = {
                "name": heading_text,
                "id": normalize_id(heading_text),
                "category": category,
                "category_path": category_path,
                "description": desc,
                "docs_url": f"{BASE_URL}/{category_path}",
                "anchor": anchor_id,
            }
            if website:
                tool["website"] = website
            tools.append(tool)

    # If the headings yielded nothing, try the bold pattern
    if not tools:
        for match in bold_pattern.finditer(html):
            name = re.sub(r'<[^>]+>', '', match.group(1)).strip()
            desc = re.sub(r'<[^>]+>', ' ', match.group(2)).strip()
            desc = re.sub(r'\s+', ' ', desc).strip()
            if name and 1 < len(name) < 80:
                if len(desc) > 200:
                    desc = desc[:197] + "..."
                tools.append({
                    "name": name,
                    "id": normalize_id(name),
                    "category": category,
                    "category_path": category_path,
                    "description": desc,
                    "docs_url": f"{BASE_URL}/{category_path}",
                })

    return tools


def main():
    print("Scraping REMnux documentation...")
    all_tools = []

    for category, path in CATEGORY_PAGES:
        url = f"{BASE_URL}/{path}"
        print(f"  Fetching: {category}")
        html = fetch_page(url)
        if not html:
            print("    Skipped (fetch failed)")
            continue

        tools = extract_tools_from_html(html, category, path)
        print(f"    Found {len(tools)} tools")
        all_tools.extend(tools)

        time.sleep(0.3)  # Be polite to the docs server

    # Deduplicate by ID (the same tool can appear in multiple categories)
    seen = {}
    for t in all_tools:
        tid = t["id"]
        if tid not in seen:
            seen[tid] = t
        else:
            # Tool appears in multiple categories - record the extras too
            seen[tid].setdefault("additional_categories", []).append(t["category"])

    unique_tools = sorted(seen.values(), key=lambda t: t["id"])

    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": {},
        },
        "tools": unique_tools,
    }

    # Count extracted entries per category (before deduplication)
    for t in all_tools:
        cat = t["category"]
        output["metadata"]["category_counts"][cat] = \
            output["metadata"]["category_counts"].get(cat, 0) + 1

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
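After the scraper above has run, the emitted remnux-docs.yaml can be sanity-checked before it feeds the master inventory. A minimal sketch, assuming only the schema the script itself writes (a metadata block plus a tools list with id and category fields); the checks and the report format are illustrative, not part of the committed pipeline:

import yaml
from collections import Counter

with open("data/remnux/sources/remnux-docs.yaml") as f:
    doc = yaml.safe_load(f)

tools = doc["tools"]
# The script records the post-deduplication count, so these should hold
assert doc["metadata"]["total_tools_extracted"] == len(tools)
assert len({t["id"] for t in tools}) == len(tools)  # IDs unique after dedup

# Rough coverage report: tools per top-level category
top_level = Counter(t["category"].split(" > ")[0] for t in tools)
for category, count in top_level.most_common():
    print(f"{category}: {count}")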