#!/usr/bin/env python3
"""Scrape REMnux documentation to extract all documented tools.

Fetches docs.remnux.org tool listing pages and extracts tool names,
descriptions, categories, and URLs. Outputs
data/remnux/sources/remnux-docs.yaml.
"""
import os
import re
import time
import urllib.request

import yaml

# Root of the "Discover the Tools" section of the REMnux docs site.
BASE_URL = "https://docs.remnux.org/discover-the-tools"
# Destination YAML file, resolved relative to this script's directory.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")

# All known category pages from docs.remnux.org, as
# (human-readable category label, URL path fragment under BASE_URL) pairs.
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]


def fetch_page(url):
    """Fetch *url* and return its decoded text content, or None on failure.

    Decodes with the charset declared in the response's Content-Type header
    when present, falling back to UTF-8 with replacement of undecodable
    bytes. Fetch errors are reported as warnings rather than raised, so one
    unreachable page does not abort the whole scrape.
    """
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            # Honor the server-declared charset; the docs pages are UTF-8
            # in practice, but this avoids mojibake if that ever changes.
            charset = resp.headers.get_content_charset() or "utf-8"
            return resp.read().decode(charset, errors="replace")
    except Exception as e:  # best-effort scraper: warn and continue
        print(f" Warning: could not fetch {url}: {e}")
        return None


def normalize_id(name):
    """Convert a tool's display name to a normalized kebab-case ID.

    Lowercases the name and collapses every run of non-alphanumeric
    characters into a single hyphen, so extension suffixes like ".py",
    ".pl" or ".bat" naturally become "-py", "-pl", "-bat"
    (e.g. "oledump.py" -> "oledump-py"). Leading and trailing hyphens
    are stripped.
    """
    n = name.lower().strip()
    # A single catch-all substitution handles the extension suffixes too:
    # "." maps to "-", so no explicit ".py"/".pl"/".bat" rewrites are needed.
    n = re.sub(r'[^a-z0-9]+', '-', n)
    return n.strip('-')


def extract_tools_from_html(html, category, category_path):
    """Extract tool entries from a docs page HTML."""
    tools = []
    # GitBook pages use specific patterns for tool headings
    # Pattern 1: