#!/usr/bin/env python3
"""Scrape REMnux documentation to extract all documented tools.

Fetches docs.remnux.org tool listing pages and extracts tool names,
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
"""

import os
import re
import time
import urllib.request

import yaml

BASE_URL = "https://docs.remnux.org/discover-the-tools"
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux",
                           "sources", "remnux-docs.yaml")

# All known category pages from docs.remnux.org
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]


def fetch_page(url):
    """Fetch a page and return its text content."""
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f"  Warning: could not fetch {url}: {e}")
        return None
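# Illustrative call (the path is one of the CATEGORY_PAGES entries above):
#   html = fetch_page(f"{BASE_URL}/general+utilities")
#   # -> page HTML as a string, or None if the request failed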
def normalize_id(name):
    """Convert tool name to a normalized kebab-case ID."""
    # Turn .py/.pl/.bat suffixes into "-py"/"-pl"/"-bat" so they survive
    # kebab-casing; the display name keeps the original suffix.
    n = name.lower().strip()
    n = re.sub(r'\.py$', '-py', n)
    n = re.sub(r'\.pl$', '-pl', n)
    n = re.sub(r'\.bat$', '-bat', n)
    n = re.sub(r'[^a-z0-9]+', '-', n)
    n = n.strip('-')
    return n
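# Illustrative examples (tool names assumed, not read from the docs):
#   normalize_id("pdf-parser.py")   -> "pdf-parser-py"
#   normalize_id("Detect It Easy")  -> "detect-it-easy"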
def extract_tools_from_html(html, category, category_path):
    """Extract tool entries from a docs page HTML."""
    tools = []

    # GitBook pages use specific patterns for tool headings
    # Pattern 1: <h2> or <h3> headings with tool names
    # Pattern 2: Bold text followed by description
    # The docs use a pattern like: **Tool Name** description text

    # Try to find tool sections - GitBook uses specific div/section patterns
    # Look for heading patterns with tool names
    heading_pattern = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<span[^>]*>.*?\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE
    )

    # Also try plain text patterns
    # GitBook often renders as: <strong>tool-name</strong> followed by description
    bold_pattern = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE
    )
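    # Sketch of the markup these patterns target (assumed GitBook rendering,
    # not captured from a live page):
    #   heading_pattern: <h2 id="capa"><span>...</span> capa</h2>
    #                    -> group(1) = anchor id, group(2) = heading text
    #   bold_pattern:    <strong>capa</strong> - Identify capabilities ...
    #                    -> group(1) = tool name, group(2) = description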
    # Find headings first
    for match in heading_pattern.finditer(html):
        anchor_id = match.group(1)
        heading_text = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        if heading_text and len(heading_text) < 80:
            # Get description from content after heading
            pos = match.end()
            desc_chunk = html[pos:pos + 500]
            desc_chunk = re.sub(r'<[^>]+>', ' ', desc_chunk)
            desc_chunk = re.sub(r'\s+', ' ', desc_chunk).strip()
            # Take first sentence
            desc = (desc_chunk.split('.')[0].strip() + '.') if desc_chunk else ""
            if len(desc) > 200:
                desc = desc[:197] + "..."

            # Try to find website URL near this section
            website_chunk = html[pos:pos + 2000]
            website_match = re.search(r'href="(https?://(?!docs\.remnux)[^"]+)"',
                                      website_chunk)
            website = website_match.group(1) if website_match else ""

            tool = {
                "name": heading_text,
                "id": normalize_id(heading_text),
                "category": category,
                "category_path": category_path,
                "description": desc,
                "docs_url": f"{BASE_URL}/{category_path}",
                "anchor": anchor_id,
            }
            if website:
                tool["website"] = website
            tools.append(tool)

    # If we got nothing from headings, try the bold pattern
    if not tools:
        for match in bold_pattern.finditer(html):
            name = re.sub(r'<[^>]+>', '', match.group(1)).strip()
            desc = re.sub(r'<[^>]+>', ' ', match.group(2)).strip()
            desc = re.sub(r'\s+', ' ', desc).strip()
            if name and 1 < len(name) < 80:
                if len(desc) > 200:
                    desc = desc[:197] + "..."
                tools.append({
                    "name": name,
                    "id": normalize_id(name),
                    "category": category,
                    "category_path": category_path,
                    "description": desc,
                    "docs_url": f"{BASE_URL}/{category_path}",
                })

    return tools


def main():
    print("Scraping REMnux documentation...")
    all_tools = []

    for category, path in CATEGORY_PAGES:
        url = f"{BASE_URL}/{path}"
        print(f"  Fetching: {category}")
        html = fetch_page(url)
        if not html:
            print("    Skipped (fetch failed)")
            continue
        tools = extract_tools_from_html(html, category, path)
        print(f"    Found {len(tools)} tools")
        all_tools.extend(tools)
        time.sleep(0.3)  # Be polite

    # Deduplicate by id (same tool can appear in multiple categories)
    seen = {}
    for t in all_tools:
        tid = t["id"]
        if tid not in seen:
            seen[tid] = t
        else:
            # Tool appears in multiple categories - track both
            existing = seen[tid]
            if "additional_categories" not in existing:
                existing["additional_categories"] = []
            existing["additional_categories"].append(t["category"])

    unique_tools = sorted(seen.values(), key=lambda t: t["id"])

    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": {},
        },
        "tools": unique_tools,
    }

    # Count per category
    for t in all_tools:
        cat = t["category"]
        output["metadata"]["category_counts"][cat] = \
            output["metadata"]["category_counts"].get(cat, 0) + 1

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
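    # Block-style YAML (default_flow_style=False), keys kept in insertion order
    # (sort_keys=False), and non-ASCII characters from scraped descriptions
    # written as-is rather than escaped (allow_unicode=True).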
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False,
                  allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
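# The emitted YAML has two top-level keys, "metadata" and "tools", following
# the `output` dict in main(). A rough sketch (the tool entry is illustrative,
# not actual scraped output):
#
#   metadata:
#     source: https://docs.remnux.org/discover-the-tools
#     categories_scraped: 31
#     total_tools_extracted: ...
#     category_counts:
#       General Utilities: ...
#   tools:
#     - name: capa
#       id: capa
#       category: Examine Static Properties > General
#       ...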