Files
docker_file_analysis/scripts/build-master-inventory.py
tobias f3ccc09c3d Add FOR610 tool/workflow knowledge base and data pipeline
Build comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224
TLDR pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 17:38:15 +01:00

467 lines
17 KiB
Python

#!/usr/bin/env python3
"""Build the master tool inventory by merging three sources.
Merges:
1. FOR610 course data (data/for610/tools.yaml)
2. Salt-states installation data (data/remnux/sources/salt-states.yaml)
3. REMnux docs (data/remnux/sources/remnux-docs.yaml)
Output: data/remnux/tools-master.yaml
"""
import os
import re
import yaml
# Repository root, expressed relative to this script's own directory.
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")

# Shared parent directories of the REMnux data files.
_REMNUX_DIR = os.path.join(BASE_DIR, "data", "remnux")
_REMNUX_SOURCES_DIR = os.path.join(_REMNUX_DIR, "sources")

# Inputs.
FOR610_TOOLS = os.path.join(BASE_DIR, "data", "for610", "tools.yaml")
SALT_STATES = os.path.join(_REMNUX_SOURCES_DIR, "salt-states.yaml")
REMNUX_DOCS = os.path.join(_REMNUX_SOURCES_DIR, "remnux-docs.yaml")
ENRICHMENTS = os.path.join(_REMNUX_DIR, "tool-enrichments.yaml")

# Output.
OUTPUT = os.path.join(_REMNUX_DIR, "tools-master.yaml")

# Manual overrides for tools whose names differ between sources.
# Format: normalized_key -> canonical_id
#
# find_match() consults this table before any other matching rule, so an
# entry that maps a key to itself pins that name to exactly that ID.
NAME_OVERRIDES = {
    # Detect It Easy
    "die": "diec",
    "detect-it-easy": "diec",
    # NOTE(review): lookups use normalize_name() output, which never
    # contains spaces, so this key looks unreachable -- confirm before
    # removing it.
    "detect it easy": "diec",
    # JavaScript engines
    "js": "spidermonkey",
    "js-patched": "spidermonkey",
    "spidermonkey-patched": "spidermonkey",
    "mozilla-spidermonkey": "spidermonkey",
    # Volatility
    "vol": "volatility3",
    "vol-py": "volatility3",
    "volatility-framework": "volatility3",
    "volatility": "volatility3",
    "process-hacker": "system-informer",
    # YARA
    "yara-rules": "yara",
    "yara-forge": "yara",
    "yara-x": "yara-x",
    "jsbeautifier": "js-beautify",
    "js-beautifier": "js-beautify",
    "ilspycmd": "ilspycmd",
    "ilspy": "ilspy",
    # Distribution package names
    "upx-ucl": "upx",
    "unrar-free": "rar",
    "netcat-openbsd": "netcat",
    "net-tools": "net-tools",
    "oletools": "olevba",
    "pev": "readpe",
    "scdbg": "scdbgc",
    "origamindee": "origami",
    "pdftk-java": "pdftk",
    "fakenet-ng": "fakenet-ng",
    "accept-all-ips": "httpd",
    # Archivers
    "7zip": "7zip",
    "7z": "7zip",
    "p7zip": "7zip",
    "info-zip": "unzip",
    "cutter": "cutter",
    "r2pipe": "radare2",
    "r2": "radare2",
    "stpyv8": "spidermonkey",
    "rhino-debugger": "spidermonkey",
    "powershell-core": "powershell",
    "powershell": "powershell",
    "didier-stevens-scripts": "didier-stevens-suite",
    "docker-compose": "docker",
    "docker": "docker",
    "ghidrassist-mcp": "ghidra",
    "remnux-mcp-server": "remnux-mcp-server",
}
def normalize_name(name):
    """Return the canonical matching key for a tool name.

    The name is lower-cased and trimmed, a trailing ``.py``, ``.pl`` or
    ``.bat`` extension is dropped (each checked once, in that order), and
    every run of non-alphanumeric characters collapses to a single hyphen
    with no hyphen left at either end.
    """
    key = name.lower().strip()
    # Applied one after another, so "x.bat.py" loses both extensions.
    for suffix_pattern in (r'\.py$', r'\.pl$', r'\.bat$'):
        key = re.sub(suffix_pattern, '', key)
    return re.sub(r'[^a-z0-9]+', '-', key).strip('-')
def make_id(name):
    """Derive a kebab-case tool ID from a display name.

    Unlike normalize_name(), a trailing script extension is kept as part of
    the ID: ``.py``, ``.pl`` and ``.bat`` become ``-py``, ``-pl`` and
    ``-bat``. Every other run of non-alphanumeric characters collapses to a
    single hyphen, and hyphens are trimmed from both ends.
    """
    ident = name.lower().strip()
    extension_rewrites = (
        (r'\.py$', '-py'),
        (r'\.pl$', '-pl'),
        (r'\.bat$', '-bat'),
    )
    for pattern, replacement in extension_rewrites:
        ident = re.sub(pattern, replacement, ident)
    return re.sub(r'[^a-z0-9]+', '-', ident).strip('-')
def load_for610():
    """Load the FOR610 course tool records that seed the inventory.

    Returns:
        The list stored under the top-level ``tools`` key of FOR610_TOOLS,
        or an empty list when the document is empty or lists no tools.

    Raises:
        OSError: if FOR610_TOOLS cannot be opened. Unlike the salt-states
            and REMnux docs loaders, there is no missing-file fallback.
    """
    # YAML is UTF-8 text; do not depend on the platform's default encoding.
    with open(FOR610_TOOLS, encoding="utf-8") as f:
        # safe_load() returns None for an empty document.
        data = yaml.safe_load(f) or {}
    # A bare "tools:" key parses to None; normalize that to an empty list.
    return data.get("tools") or []
def load_salt_states():
    """Load the parsed salt-states installation data.

    Returns:
        The list stored under the top-level ``tools`` key of SALT_STATES.
        An empty list is returned (after printing a warning) when the file
        does not exist, and also when the document is empty or lists no
        tools.
    """
    if not os.path.exists(SALT_STATES):
        print(f" Warning: {SALT_STATES} not found, skipping")
        return []
    # YAML is UTF-8 text; do not depend on the platform's default encoding.
    with open(SALT_STATES, encoding="utf-8") as f:
        # safe_load() returns None for an empty document.
        data = yaml.safe_load(f) or {}
    # A bare "tools:" key parses to None; normalize that to an empty list.
    return data.get("tools") or []
def load_remnux_docs():
    """Load the scraped REMnux documentation data.

    Returns:
        The list stored under the top-level ``tools`` key of REMNUX_DOCS.
        An empty list is returned (after printing a warning) when the file
        does not exist, and also when the document is empty or lists no
        tools.
    """
    if not os.path.exists(REMNUX_DOCS):
        print(f" Warning: {REMNUX_DOCS} not found, skipping")
        return []
    # YAML is UTF-8 text; do not depend on the platform's default encoding.
    with open(REMNUX_DOCS, encoding="utf-8") as f:
        # safe_load() returns None for an empty document.
        data = yaml.safe_load(f) or {}
    # A bare "tools:" key parses to None; normalize that to an empty list.
    return data.get("tools") or []
def build_lookup_index(master_tools):
    """Build a lookup table from every known spelling of a tool to its ID.

    Each tool is reachable through its ID, its normalized name and each of
    its normalized aliases. When two tools share a normalized name or
    alias, the later tool in ``master_tools`` wins that key. A key equal to
    some tool's ID always resolves to that tool.

    Args:
        master_tools: iterable of tool dicts with ``id`` and ``name`` keys
            and an optional ``aliases`` list.

    Returns:
        dict mapping lookup key -> canonical tool ID.
    """
    tools = list(master_tools)
    index = {}
    for tool in tools:
        tid = tool["id"]
        # "aliases" may be absent or null in hand-edited YAML.
        for spelling in [tool["name"], *(tool.get("aliases") or ())]:
            key = normalize_name(spelling)
            # An empty key would match every unnamed lookup; skip it.
            if key:
                index[key] = tid
    # IDs are registered last so that another tool's name or alias can
    # never shadow a tool's own ID.
    for tool in tools:
        index[tool["id"]] = tool["id"]
    return index
def find_match(name, index):
    """Resolve a tool name to a canonical tool ID.

    The name is normalized and the following rules are tried in order:

    1. NAME_OVERRIDES: the override ID is returned, translated through
       ``index`` when the index knows it. The override is returned even
       when ``index`` does not contain it, so callers must still check that
       the result exists in their inventory.
    2. The normalized name itself.
    3. The normalized name with a ``-py`` suffix (IDs produced by make_id()
       keep the extension that normalize_name() strips).
    4. The normalized name without trailing digits and an optional
       preceding hyphen.

    Args:
        name: tool name, package name or ID to look up.
        index: mapping as produced by build_lookup_index().

    Returns:
        The matching tool ID, or None when nothing matches or when ``name``
        is empty or normalizes to an empty string.
    """
    # A missing name cannot identify a tool; without this guard it would
    # crash on None or match a stray empty key in the index.
    if not name:
        return None
    normalized = normalize_name(name)
    if not normalized:
        return None

    # Manual overrides take precedence over every other rule.
    override_id = NAME_OVERRIDES.get(normalized)
    if override_id is not None:
        return index.get(override_id, override_id)

    candidates = (
        normalized,
        normalized + "-py",
        re.sub(r'-?\d+$', '', normalized),
    )
    for candidate in candidates:
        # Stripping digits from an all-digit name leaves an empty string.
        if candidate and candidate in index:
            return index[candidate]
    return None
def compute_help_tier(tool):
    """Return the help tier implied by which sources cover ``tool``.

    The best available source decides: FOR610 coverage gives ``"rich"``,
    otherwise REMnux docs coverage gives ``"standard"``, otherwise
    salt-states coverage gives ``"basic"``. A tool covered by no source is
    a ``"stub"``.
    """
    tier_by_source = (
        ("for610", "rich"),
        ("remnux_docs", "standard"),
        ("salt_states", "basic"),
    )
    # Read all three flags before deciding anything.
    covered = {
        source: tool.get("sources", {}).get(source, {}).get("covered", False)
        for source, _ in tier_by_source
    }
    for source, tier in tier_by_source:
        if covered[source]:
            return tier
    return "stub"
def _for610_entry(t):
    """Build the master entry seeded from one FOR610 tool record."""
    for610_source = {
        "covered": True,
        "description": t.get("description", ""),
        "category": t.get("category", ""),
        "labs": t.get("labs", []),
        "sections": t.get("for610_sections", []),
        "typical_usage": t.get("typical_usage", []),
        "tags": t.get("tags", []),
    }
    if t.get("author"):
        for610_source["author"] = t["author"]
    return {
        "id": t["id"],
        "name": t["name"],
        "aliases": t.get("aliases", []),
        "description": t.get("description", ""),
        "in_remnux": t.get("in_remnux", False),
        "platform": t.get("platform", "linux"),
        "sources": {
            "for610": for610_source,
            "salt_states": {"covered": False},
            "remnux_docs": {"covered": False},
        },
    }


def _remnux_entry(tool_id, name, aliases, description, source_key, source):
    """Build a master entry for a tool first seen in a REMnux source."""
    sources = {
        "for610": {"covered": False},
        "salt_states": {"covered": False},
        "remnux_docs": {"covered": False},
    }
    sources[source_key] = source
    return {
        "id": tool_id,
        "name": name,
        "aliases": aliases,
        "description": description,
        "in_remnux": True,
        "platform": "linux",
        "sources": sources,
    }


def _salt_source(st, st_id, st_names):
    """Build the ``salt_states`` source record for one salt-states entry."""
    return {
        "covered": True,
        "install_method": st.get("install_method", "unknown"),
        "package_name": st_names[0] if st_names else st_id,
        "salt_state_path": st.get("salt_state_path", ""),
    }


def _docs_source(dt):
    """Build the ``remnux_docs`` source record, keeping optional links."""
    source = {
        "covered": True,
        "category": dt.get("category", ""),
        "description": dt.get("description", ""),
        "docs_url": dt.get("docs_url", ""),
    }
    for optional_key in ("website", "anchor"):
        if dt.get(optional_key):
            source[optional_key] = dt[optional_key]
    return source


def _register(index, tool_id, names):
    """Point the ID and every non-empty normalized name at ``tool_id``."""
    index[tool_id] = tool_id
    for name in names:
        key = normalize_name(name)
        if key:
            index[key] = tool_id


def _merge_salt_states(master, salt_tools):
    """Merge salt-states entries into ``master``; return (matched, new)."""
    index = build_lookup_index(list(master.values()))
    matched = created = 0
    for st in salt_tools:
        st_id = st["id"]
        # "package_names" may be absent, empty or null.
        st_names = st.get("package_names") or [st_id]

        # Try the state ID first, then each package name.
        matched_id = None
        for name in [st_id] + st_names:
            matched_id = find_match(name, index)
            if matched_id:
                break

        if matched_id in master:
            target_id = matched_id
        else:
            # No usable match: use the override for the state ID if there
            # is one, otherwise derive an ID from the state ID.
            target_id = NAME_OVERRIDES.get(
                normalize_name(st_id), make_id(st_id)
            )

        source = _salt_source(st, st_id, st_names)
        if target_id in master:
            master[target_id]["sources"]["salt_states"] = source
            # Anything installed by salt-states is part of REMnux, however
            # the existing entry was located.
            master[target_id]["in_remnux"] = True
            matched += 1
        else:
            master[target_id] = _remnux_entry(
                target_id,
                st_id,
                [n for n in st_names if n != st_id][:3],
                "",
                "salt_states",
                source,
            )
            # Index every package name so later entries can match it.
            _register(index, target_id, [st_id] + st_names)
            created += 1
    return matched, created


def _merge_remnux_docs(master, doc_tools):
    """Merge REMnux docs entries into ``master``; return (matched, new)."""
    # Fresh index so tools added from salt-states are matchable.
    index = build_lookup_index(list(master.values()))
    matched = created = 0
    for dt in doc_tools:
        dt_name = dt.get("name") or ""
        dt_id = dt.get("id") or make_id(dt_name)

        # Match on the display name first, then on the ID.
        matched_id = None
        for candidate in (dt_name, dt_id):
            if candidate:
                matched_id = find_match(candidate, index)
                if matched_id:
                    break

        if matched_id in master:
            target_id = matched_id
        else:
            target_id = make_id(dt_name) if dt_name else dt_id
        if not target_id:
            # Neither a name nor an ID: nothing to key the entry on.
            print(" Warning: skipping REMnux docs entry without name or id")
            continue

        source = _docs_source(dt)
        if target_id in master:
            tool = master[target_id]
            tool["sources"]["remnux_docs"] = source
            # Use the REMnux docs description if we don't have one.
            if not tool.get("description") and dt.get("description"):
                tool["description"] = dt["description"]
            matched += 1
        else:
            master[target_id] = _remnux_entry(
                target_id,
                dt_name,
                [],
                dt.get("description", ""),
                "remnux_docs",
                source,
            )
            _register(index, target_id, [dt_name])
            created += 1
    return matched, created


def _apply_enrichments(master):
    """Apply ENRICHMENTS to ``master``; return the number of tools enriched."""
    with open(ENRICHMENTS, encoding="utf-8") as f:
        # safe_load() returns None for an empty document.
        enrich_data = yaml.safe_load(f) or {}
    enrichments = enrich_data.get("enrichments") or {}

    index = build_lookup_index(list(master.values()))
    enriched = 0
    for tool_key, enrich in enrichments.items():
        enrich = enrich or {}
        # Find the tool by key or normalized name; fall back to the raw key.
        target_id = find_match(tool_key, index) or tool_key
        if target_id not in master:
            print(f" Warning: enrichment key '{tool_key}' not found in master")
            continue
        tool = master[target_id]

        # A manual description always replaces the merged one.
        if enrich.get("description"):
            tool["description"] = enrich["description"]

        usage = enrich.get("typical_usage")
        if usage:
            for610_source = tool["sources"]["for610"]
            if not for610_source.get("covered"):
                # Usage examples are stored in the for610 source, which
                # marks the tool as covered (and so as "rich" help tier).
                # Lists are copied so YAML anchors in the enrichment file
                # do not become aliases in the output.
                for610_source["covered"] = True
                for610_source["typical_usage"] = list(usage)
                for610_source["tags"] = list(enrich.get("tags") or [])
                for610_source["description"] = enrich.get("description", "")
            else:
                # Merge usage examples, skipping ones already present.
                existing = for610_source.get("typical_usage") or []
                for example in usage:
                    if example not in existing:
                        existing.append(example)
                for610_source["typical_usage"] = existing
        enriched += 1
    return enriched


def _source_coverage(tools_list):
    """Count tools per source combination from the derived has_* flags."""
    def count(predicate):
        return sum(
            1
            for t in tools_list
            if predicate(
                t["has_for610_coverage"],
                t["has_remnux_docs"],
                t["has_salt_state"],
            )
        )

    # The pairwise counts deliberately ignore the third source, so they
    # include tools covered by all three.
    return {
        "for610_only": count(lambda f, d, s: f and not d and not s),
        "remnux_docs_only": count(lambda f, d, s: d and not f and not s),
        "salt_states_only": count(lambda f, d, s: s and not f and not d),
        "all_three": count(lambda f, d, s: f and d and s),
        "for610_and_docs": count(lambda f, d, s: f and d),
        "for610_and_salt": count(lambda f, d, s: f and s),
        "docs_and_salt": count(lambda f, d, s: d and s),
        "no_coverage": count(lambda f, d, s: not f and not d and not s),
    }


def main():
    """Merge all sources into the master inventory and write OUTPUT.

    Steps:
        1. Seed the inventory with the FOR610 tools.
        2. Merge salt-states entries (matching existing tools or adding
           new ones).
        3. Merge REMnux docs entries the same way.
        4. Apply manual enrichments when ENRICHMENTS exists.
        5. Compute the has_* flags and the help tier of every tool.
        6. Sort by ID, write the YAML file and print a summary.
    """
    print("Building master tool inventory...")

    # --- Step 1: Load FOR610 tools as base ---
    print("\n1. Loading FOR610 tools...")
    for610_tools = load_for610()
    print(f" Loaded {len(for610_tools)} tools")
    master = {t["id"]: _for610_entry(t) for t in for610_tools}

    # --- Step 2: Merge salt-states ---
    print("\n2. Loading salt-states...")
    salt_tools = load_salt_states()
    print(f" Loaded {len(salt_tools)} entries")
    salt_matched, salt_new = _merge_salt_states(master, salt_tools)
    print(f" Matched: {salt_matched}, New: {salt_new}")

    # --- Step 3: Merge REMnux docs ---
    print("\n3. Loading REMnux docs...")
    doc_tools = load_remnux_docs()
    print(f" Loaded {len(doc_tools)} entries")
    docs_matched, docs_new = _merge_remnux_docs(master, doc_tools)
    print(f" Matched: {docs_matched}, New: {docs_new}")

    # --- Step 4: Apply manual enrichments ---
    print("\n4. Applying manual enrichments...")
    if os.path.exists(ENRICHMENTS):
        enriched = _apply_enrichments(master)
        print(f" Enriched: {enriched} tools")
    else:
        print(" No enrichments file found, skipping")

    # --- Step 5: Compute derived fields ---
    print("\n5. Computing derived fields...")
    for tool in master.values():
        sources = tool["sources"]
        tool["has_for610_coverage"] = sources["for610"].get("covered", False)
        tool["has_remnux_docs"] = sources["remnux_docs"].get("covered", False)
        tool["has_salt_state"] = sources["salt_states"].get("covered", False)
        tool["help_tier"] = compute_help_tier(tool)

    # --- Step 6: Sort and output ---
    # Every tool is kept, including ones that are not part of REMnux; the
    # in_remnux field tells them apart.
    tools_list = sorted(master.values(), key=lambda t: t["id"])

    # Plain dict (not Counter) so yaml.dump emits an ordinary mapping.
    tiers = {}
    for t in tools_list:
        tiers[t["help_tier"]] = tiers.get(t["help_tier"], 0) + 1

    output = {
        "metadata": {
            "total_tools": len(tools_list),
            "in_remnux_count": sum(1 for t in tools_list if t["in_remnux"]),
            "help_tier_counts": tiers,
            "source_coverage": _source_coverage(tools_list),
        },
        "tools": tools_list,
    }

    # allow_unicode=True writes non-ASCII text verbatim, so the file must
    # be opened as UTF-8 rather than with the platform default encoding.
    with open(OUTPUT, "w", encoding="utf-8") as f:
        yaml.dump(
            output,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )

    print(f"\n{'='*50}")
    print(f"MASTER INVENTORY BUILT: {len(tools_list)} tools")
    print(f" In REMnux: {output['metadata']['in_remnux_count']}")
    print("\nHelp Tiers:")
    for tier, count in sorted(tiers.items()):
        print(f" {tier}: {count}")
    print("\nSource Coverage:")
    for key, val in output["metadata"]["source_coverage"].items():
        print(f" {key}: {val}")
    print(f"\nOutput: {OUTPUT}")
# Build the inventory only when run as a script, not when imported.
if __name__ == "__main__":
    main()