f3ccc09c3d
Build comprehensive malware analysis knowledge base from 3 sources: - SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes - REMnux salt-states: 340 packages parsed from GitHub - REMnux docs: 280+ tools scraped from docs.remnux.org Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic). Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
227 lines
8.8 KiB
Python
227 lines
8.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Scrape REMnux documentation to extract all documented tools.
|
|
|
|
Fetches docs.remnux.org tool listing pages and extracts tool names,
|
|
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
|
|
"""
|
|
|
|
import re
|
|
import urllib.request
|
|
import yaml
|
|
import os
|
|
import time
|
|
|
|
# Root of the REMnux "Discover the Tools" documentation tree; every category
# page below is fetched relative to this URL.
BASE_URL = "https://docs.remnux.org/discover-the-tools"
# Destination YAML inventory, resolved relative to this script's location.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")

# All known category pages from docs.remnux.org, as
# (human-readable category label, URL path appended to BASE_URL) pairs.
# The labels use " > " to show the section > subsection hierarchy.
CATEGORY_PAGES = [
    # Examine Static Properties
    ("Examine Static Properties > General", "examine+static+properties/general"),
    ("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
    ("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
    ("Examine Static Properties > .NET", "examine+static+properties/.net"),
    ("Examine Static Properties > Go", "examine+static+properties/go"),
    ("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
    # Statically Analyze Code
    ("Statically Analyze Code > General", "statically+analyze+code/general"),
    ("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
    ("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
    ("Statically Analyze Code > Python", "statically+analyze+code/python"),
    ("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
    ("Statically Analyze Code > Java", "statically+analyze+code/java"),
    ("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
    ("Statically Analyze Code > Android", "statically+analyze+code/android"),
    # Dynamically Reverse-Engineer Code
    ("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
    ("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
    ("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
    ("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
    # Memory Forensics
    ("Perform Memory Forensics", "perform+memory+forensics"),
    # Network Interactions
    ("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
    ("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
    ("Explore Network Interactions > Services", "explore+network+interactions/services"),
    # System Interactions
    ("Investigate System Interactions", "investigate+system+interactions"),
    # Documents
    ("Analyze Documents > General", "analyze+documents/general"),
    ("Analyze Documents > PDF", "analyze+documents/pdf"),
    ("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
    ("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
    # AI
    ("Use Artificial Intelligence", "use+artificial+intelligence"),
    # Data
    ("Gather and Analyze Data", "gather+and+analyze+data"),
    # View/Edit
    ("View or Edit Files", "view+or+edit+files"),
    # Utilities
    ("General Utilities", "general+utilities"),
]
|
|
|
|
|
|
def fetch_page(url):
    """Download *url* and return its body decoded as UTF-8 text.

    Sends a browser-like User-Agent (some doc hosts reject bare clients).
    Any failure during the request or read is swallowed deliberately: a
    warning is printed and None is returned, so the caller can skip the
    page and keep scraping the rest.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    }
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            # errors="replace": never let a stray byte abort the scrape.
            return response.read().decode("utf-8", errors="replace")
    except Exception as exc:
        print(f" Warning: could not fetch {url}: {exc}")
        return None
|
|
|
|
|
|
def normalize_id(name):
    """Derive a stable kebab-case identifier from a tool's display name.

    Script-style extensions (.py, .pl, .bat) are folded into the id as a
    trailing segment (e.g. "pdf-parser.py" -> "pdf-parser-py") rather than
    dropped, since REMnux ships tools whose names differ only by extension.
    Every other run of non-alphanumeric characters collapses to a single
    hyphen, and leading/trailing hyphens are trimmed.
    """
    slug = name.strip().lower()
    # Turn a recognized extension into a "-ext" suffix before slugifying.
    for ext in ("py", "pl", "bat"):
        slug = re.sub(rf"\.{ext}$", f"-{ext}", slug)
    slug = re.sub(r"[^a-z0-9]+", "-", slug)
    return slug.strip("-")
|
|
|
|
|
|
def extract_tools_from_html(html, category, category_path):
    """Parse one docs category page and return a list of tool dicts.

    Two heuristics are tried in order:
      1. GitBook-style <h2>/<h3> headings carrying an id= anchor; the text
         right after the heading supplies a one-sentence description, and
         the first non-docs.remnux.org link nearby (if any) becomes the
         tool's "website".
      2. Only if the heading pass finds nothing: "<strong>name</strong> -
         description" runs of bold text.

    Each dict carries name, id, category, category_path, description and
    docs_url; heading-derived entries add "anchor" and optionally "website".
    """
    page_url = f"{BASE_URL}/{category_path}"
    found = []

    heading_re = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<a[^>]*>.*?</a>\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE,
    )
    for m in heading_re.finditer(html):
        title = re.sub(r'<[^>]+>', '', m.group(2)).strip()
        # Skip empty headings and anything too long to be a tool name.
        if not title or len(title) >= 80:
            continue

        start = m.end()
        # Description: first sentence of the tag-stripped text that follows.
        raw = re.sub(r'<[^>]+>', ' ', html[start:start + 500])
        raw = re.sub(r'\s+', ' ', raw).strip()
        summary = raw.split('.')[0].strip() + '.' if raw else ""
        if len(summary) > 200:
            summary = summary[:197] + "..."

        entry = {
            "name": title,
            "id": normalize_id(title),
            "category": category,
            "category_path": category_path,
            "description": summary,
            "docs_url": page_url,
            "anchor": m.group(1),
        }
        # Website: first external (non docs.remnux) link within 2000 chars.
        link = re.search(
            r'href="(https?://(?!docs\.remnux)[^"]+)"',
            html[start:start + 2000],
        )
        if link:
            entry["website"] = link.group(1)
        found.append(entry)

    if found:
        return found

    # Fallback: pages that list tools as "<strong>name</strong> - desc".
    bold_re = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE,
    )
    for m in bold_re.finditer(html):
        title = re.sub(r'<[^>]+>', '', m.group(1)).strip()
        if not title or not (1 < len(title) < 80):
            continue
        text = re.sub(r'<[^>]+>', ' ', m.group(2)).strip()
        text = re.sub(r'\s+', ' ', text).strip()
        if len(text) > 200:
            text = text[:197] + "..."
        found.append({
            "name": title,
            "id": normalize_id(title),
            "category": category,
            "category_path": category_path,
            "description": text,
            "docs_url": page_url,
        })

    return found
|
|
|
|
|
|
def main():
    """Scrape every known docs.remnux.org category page into a YAML inventory.

    For each entry in CATEGORY_PAGES the page is fetched with fetch_page()
    and parsed with extract_tools_from_html(). Tools appearing on multiple
    pages are deduplicated by id — the first record wins and the extra
    category labels are collected under "additional_categories". The result,
    plus scrape metadata, is written to OUTPUT_PATH as YAML.
    """
    print("Scraping REMnux documentation...")
    all_tools = []

    for category, path in CATEGORY_PAGES:
        url = f"{BASE_URL}/{path}"
        print(f" Fetching: {category}")
        html = fetch_page(url)

        if not html:
            print(" Skipped (fetch failed)")
            continue

        tools = extract_tools_from_html(html, category, path)
        print(f" Found {len(tools)} tools")
        all_tools.extend(tools)

        time.sleep(0.3)  # Be polite

    # Deduplicate by id (same tool can appear in multiple categories).
    seen = {}
    for t in all_tools:
        tid = t["id"]
        if tid not in seen:
            seen[tid] = t
        else:
            # Keep the first record; remember every additional category.
            seen[tid].setdefault("additional_categories", []).append(t["category"])

    unique_tools = sorted(seen.values(), key=lambda t: t["id"])

    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": {},
        },
        "tools": unique_tools,
    }

    # Per-category counts use the pre-dedup list on purpose: they reflect how
    # many tools each docs page lists, not the deduplicated inventory.
    counts = output["metadata"]["category_counts"]
    for t in all_tools:
        counts[t["category"]] = counts.get(t["category"], 0) + 1

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    # Explicit UTF-8: allow_unicode=True makes yaml.dump emit raw non-ASCII
    # characters, which would fail (or mojibake) under a non-UTF-8 locale's
    # default file encoding.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|