f3ccc09c3d
Build comprehensive malware analysis knowledge base from 3 sources: - SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes - REMnux salt-states: 340 packages parsed from GitHub - REMnux docs: 280+ tools scraped from docs.remnux.org Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic). Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
227 lines
8.8 KiB
Python
227 lines
8.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Scrape REMnux documentation to extract all documented tools.
|
|
|
|
Fetches docs.remnux.org tool listing pages and extracts tool names,
|
|
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
|
|
"""
|
|
|
|
import re
|
|
import urllib.request
|
|
import yaml
|
|
import os
|
|
import time
|
|
|
|
# Root of the REMnux "Discover the Tools" documentation tree; every category
# page below is fetched relative to this URL.
BASE_URL = "https://docs.remnux.org/discover-the-tools"
# Destination YAML inventory, resolved relative to this script's location.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")

# All known category pages from docs.remnux.org, as
# (human-readable category label, URL path appended to BASE_URL) pairs.
# The labels use " > " to show the section > subsection hierarchy.
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]
|
|
|
|
|
|
def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8 text.

    Sends a browser-like User-Agent (some doc hosts reject bare clients).
    Any failure during the request or read is swallowed deliberately: a
    warning is printed and None is returned, so the caller can skip the
    page and keep scraping the rest.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            # errors="replace": never let a stray byte abort the scrape.
            return response.read().decode("utf-8", errors="replace")
    except Exception as exc:
        print(f" Warning: could not fetch {url}: {exc}")
        return None
|
|
|
|
|
|
def normalize_id(name):
    """Derive a stable kebab-case identifier from a tool's display name.

    Script-style extensions (.py, .pl, .bat) are folded into the id as a
    trailing segment (e.g. "pdf-parser.py" -> "pdf-parser-py") rather than
    dropped, since REMnux ships tools whose names differ only by extension.
    Every other run of non-alphanumeric characters collapses to a single
    hyphen, and leading/trailing hyphens are trimmed.
    """
    slug = name.strip().lower()
    # Turn a recognized extension into a "-ext" suffix before slugifying.
    for ext in ("py", "pl", "bat"):
        slug = re.sub(rf"\.{ext}$", f"-{ext}", slug)
    slug = re.sub(r"[^a-z0-9]+", "-", slug)
    return slug.strip("-")
|
|
|
|
|
|
def extract_tools_from_html(html, category, category_path):
    """Parse one docs category page and return a list of tool dicts.

    Two heuristics are tried in order:
      1. GitBook-style <h2>/<h3> headings carrying an id= anchor; the text
         right after the heading supplies a one-sentence description, and
         the first non-docs.remnux.org link nearby (if any) becomes the
         tool's "website".
      2. Only if the heading pass finds nothing: "<strong>name</strong> -
         description" runs of bold text.

    Each dict carries name, id, category, category_path, description and
    docs_url; heading-derived entries add "anchor" and optionally "website".
    """
    page_url = f"{BASE_URL}/{category_path}"
    found = []

    heading_re = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<a[^>]*>.*?</a>\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE,
    )
    for m in heading_re.finditer(html):
        title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
        # Skip empty headings and anything too long to be a tool name.
        if not title or len(title) >= 80:
            continue

        start = m.end()
        # Description: first sentence of the tag-stripped text that follows.
        raw = re.sub(r'<[^>]+>', ' ', html[start:start + 500])
        raw = re.sub(r'\s+', ' ', raw).strip()
        summary = raw.split('.')[0].strip() + '.' if raw else ""
        if len(summary) > 200:
            summary = summary[:197] + "..."

        entry = {
            "name": title,
            "id": normalize_id(title),
            "category": category,
            "category_path": category_path,
            "description": summary,
            "docs_url": page_url,
            "anchor": m.group(1),
        }
        # Website: first external (non docs.remnux) link within 2000 chars.
        link = re.search(
            r'href="(https?://(?!docs\.remnux)[^"]+)"',
            html[start:start + 2000],
        )
        if link:
            entry["website"] = link.group(1)
        found.append(entry)

    if found:
        return found

    # Fallback: pages that list tools as "<strong>name</strong> - desc".
    bold_re = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE,
    )
    for m in bold_re.finditer(html):
        title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
        if not title or not (1 < len(title) < 80):
            continue
        text = re.sub(r'<[^>]+>', ' ', m.group(2)).strip()
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) > 200:
            text = text[:197] + "..."
        found.append({
            "name": title,
            "id": normalize_id(title),
            "category": category,
            "category_path": category_path,
            "description": text,
            "docs_url": page_url,
        })

    return found
|
|
|
|
|
|
def main():
    """Scrape every known docs.remnux.org category page into a YAML inventory.

    For each entry in CATEGORY_PAGES the page is fetched with fetch_page()
    and parsed with extract_tools_from_html(). Tools appearing on multiple
    pages are deduplicated by id — the first record wins and the extra
    category labels are collected under "additional_categories". The result,
    plus scrape metadata, is written to OUTPUT_PATH as YAML.
    """
    print("Scraping REMnux documentation...")
    all_tools = []

    for category, path in CATEGORY_PAGES:
        url = f"{BASE_URL}/{path}"
        print(f" Fetching: {category}")
        html = fetch_page(url)

        if not html:
            print(" Skipped (fetch failed)")
            continue

        tools = extract_tools_from_html(html, category, path)
        print(f" Found {len(tools)} tools")
        all_tools.extend(tools)

        time.sleep(0.3)  # Be polite

    # Deduplicate by id (same tool can appear in multiple categories).
    seen = {}
    for t in all_tools:
        tid = t["id"]
        if tid not in seen:
            seen[tid] = t
        else:
            # Keep the first record; remember every additional category.
            seen[tid].setdefault("additional_categories", []).append(t["category"])

    unique_tools = sorted(seen.values(), key=lambda t: t["id"])

    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": {},
        },
        "tools": unique_tools,
    }

    # Per-category counts use the pre-dedup list on purpose: they reflect how
    # many tools each docs page lists, not the deduplicated inventory.
    counts = output["metadata"]["category_counts"]
    for t in all_tools:
        counts[t["category"]] = counts.get(t["category"], 0) + 1

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    # Explicit UTF-8: allow_unicode=True makes yaml.dump emit raw non-ASCII
    # characters, which would fail (or mojibake) under a non-UTF-8 locale's
    # default file encoding.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|