f3ccc09c3d
Build comprehensive malware analysis knowledge base from 3 sources: - SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes - REMnux salt-states: 340 packages parsed from GitHub - REMnux docs: 280+ tools scraped from docs.remnux.org Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic). Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
467 lines
17 KiB
Python
467 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""Build the master tool inventory by merging three sources.
|
|
|
|
Merges:
|
|
1. FOR610 course data (data/for610/tools.yaml)
|
|
2. Salt-states installation data (data/remnux/sources/salt-states.yaml)
|
|
3. REMnux docs (data/remnux/sources/remnux-docs.yaml)
|
|
|
|
Output: data/remnux/tools-master.yaml
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
|
|
# All paths are resolved relative to this script's parent directory
# (the repository root), so the script works from any CWD.
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
# Source 1: FOR610 course tool inventory (required base — loaded unconditionally).
FOR610_TOOLS = os.path.join(BASE_DIR, "data", "for610", "tools.yaml")
# Source 2: packages parsed from the REMnux salt-states repo (optional).
SALT_STATES = os.path.join(BASE_DIR, "data", "remnux", "sources", "salt-states.yaml")
# Source 3: tools scraped from docs.remnux.org (optional).
REMNUX_DOCS = os.path.join(BASE_DIR, "data", "remnux", "sources", "remnux-docs.yaml")
# Optional hand-written descriptions / usage examples applied after merging.
ENRICHMENTS = os.path.join(BASE_DIR, "data", "remnux", "tool-enrichments.yaml")
# Destination for the merged master inventory.
OUTPUT = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")
|
|
|
|
# Manual override mapping for tools that have different names across sources.
# Format: normalized_key -> canonical_id
#
# NOTE: identity entries (e.g. "docker": "docker") are NOT redundant:
# find_match() returns an override target even when it is absent from the
# lookup index, so these entries pin the canonical id for tools that may
# not exist in `master` yet at match time.
NAME_OVERRIDES = {
    # Detect It Easy — CLI binary is `diec`.
    # NOTE(review): the "detect it easy" key appears unreachable, since
    # normalize_name() collapses spaces to dashes — confirm before removing.
    "die": "diec",
    "detect-it-easy": "diec",
    "detect it easy": "diec",
    # SpiderMonkey JS engine and its patched/alternative front-ends.
    "js": "spidermonkey",
    "js-patched": "spidermonkey",
    "spidermonkey-patched": "spidermonkey",
    "mozilla-spidermonkey": "spidermonkey",
    # Volatility memory-forensics framework — canonicalized to v3.
    "vol": "volatility3",
    "vol-py": "volatility3",
    "volatility-framework": "volatility3",
    "volatility": "volatility3",
    # Process Hacker was renamed System Informer.
    "process-hacker": "system-informer",
    # YARA rule collections fold into the main yara tool; yara-x stays separate.
    "yara-rules": "yara",
    "yara-forge": "yara",
    "yara-x": "yara-x",
    # JS beautifier package-name variants.
    "jsbeautifier": "js-beautify",
    "js-beautifier": "js-beautify",
    # .NET decompilers (CLI vs GUI kept distinct).
    "ilspycmd": "ilspycmd",
    "ilspy": "ilspy",
    # Debian/Ubuntu package names mapped to the tool users invoke.
    "upx-ucl": "upx",
    "unrar-free": "rar",
    "netcat-openbsd": "netcat",
    "net-tools": "net-tools",
    # The oletools suite is keyed under its best-known script.
    "oletools": "olevba",
    "pev": "readpe",
    "scdbg": "scdbgc",
    "origamindee": "origami",
    "pdftk-java": "pdftk",
    "fakenet-ng": "fakenet-ng",
    # Salt state that configures Apache to accept any IP.
    "accept-all-ips": "httpd",
    # Archive tool aliases.
    "7zip": "7zip",
    "7z": "7zip",
    "p7zip": "7zip",
    "info-zip": "unzip",
    # radare2 ecosystem.
    "cutter": "cutter",
    "r2pipe": "radare2",
    "r2": "radare2",
    # Other JS engines folded under spidermonkey.
    "stpyv8": "spidermonkey",
    "rhino-debugger": "spidermonkey",
    "powershell-core": "powershell",
    "powershell": "powershell",
    "didier-stevens-scripts": "didier-stevens-suite",
    "docker-compose": "docker",
    "docker": "docker",
    "ghidrassist-mcp": "ghidra",
    "remnux-mcp-server": "remnux-mcp-server",
}
|
|
|
|
|
|
def normalize_name(name):
    """Collapse *name* into a lowercase, dash-separated matching key.

    Known script suffixes (.py, .pl, .bat) are stripped, every run of
    non-alphanumeric characters becomes a single dash, and leading or
    trailing dashes are trimmed.
    """
    key = name.lower().strip()
    # Strip suffixes in this fixed order; each may apply once, and a
    # later strip can expose an earlier suffix removed before it.
    for suffix in (".py", ".pl", ".bat"):
        if key.endswith(suffix):
            key = key[: -len(suffix)]
    key = re.sub(r'[^a-z0-9]+', '-', key)
    return key.strip('-')
|
|
|
|
|
|
def make_id(name):
    """Build a kebab-case identifier from *name*.

    Unlike normalize_name(), script suffixes are preserved in the id as
    "-py" / "-pl" / "-bat" so distinct scripts keep distinct ids.
    """
    ident = name.lower().strip()
    for ext in ("py", "pl", "bat"):
        ident = re.sub(r'\.' + ext + '$', '-' + ext, ident)
    ident = re.sub(r'[^a-z0-9]+', '-', ident)
    return ident.strip('-')
|
|
|
|
|
|
def load_for610():
    """Return the FOR610 course tool list (the required base inventory)."""
    with open(FOR610_TOOLS) as fh:
        parsed = yaml.safe_load(fh)
    return parsed.get("tools", [])
|
|
|
|
|
|
def load_salt_states():
    """Return salt-states tool entries, or [] when the file is absent."""
    if not os.path.exists(SALT_STATES):
        print(f" Warning: {SALT_STATES} not found, skipping")
        return []
    with open(SALT_STATES) as fh:
        parsed = yaml.safe_load(fh)
    return parsed.get("tools", [])
|
|
|
|
|
|
def load_remnux_docs():
    """Return REMnux docs scraped entries, or [] when the file is absent."""
    if not os.path.exists(REMNUX_DOCS):
        print(f" Warning: {REMNUX_DOCS} not found, skipping")
        return []
    with open(REMNUX_DOCS) as fh:
        parsed = yaml.safe_load(fh)
    return parsed.get("tools", [])
|
|
|
|
|
|
def build_lookup_index(master_tools):
    """Map every known key (id, normalized name, aliases) to a canonical id.

    Later entries overwrite earlier ones on key collisions, mirroring
    plain dict-assignment semantics.
    """
    lookup = {}
    for entry in master_tools:
        canonical = entry["id"]
        # The id maps to itself so override targets resolve directly.
        lookup[canonical] = canonical
        lookup[normalize_name(entry["name"])] = canonical
        for alt in entry.get("aliases", []):
            lookup[normalize_name(alt)] = canonical
    return lookup
|
|
|
|
|
|
def find_match(name, index):
    """Resolve *name* to a canonical tool id via the lookup *index*.

    Resolution order: manual NAME_OVERRIDES, exact normalized match,
    the key with a "-py" suffix appended, then the key with trailing
    digits stripped.  Returns None when nothing matches.  An override
    target is returned even when it is absent from *index*.
    """
    key = normalize_name(name)

    # Manual overrides win over every heuristic.
    override = NAME_OVERRIDES.get(key)
    if override is not None:
        return index.get(override, override)

    if key in index:
        return index[key]

    # Many REMnux scripts are indexed with a "-py" id suffix.
    suffixed = key + "-py"
    if suffixed in index:
        return index[suffixed]

    # Version-numbered names ("foo2", "foo-3") match the bare tool.
    digitless = re.sub(r'-?\d+$', '', key)
    if digitless and digitless in index:
        return index[digitless]

    return None
|
|
|
|
|
|
def compute_help_tier(tool):
    """Rank a tool's documentation coverage.

    Tiers, best first: "rich" (FOR610 course coverage), "standard"
    (REMnux docs), "basic" (salt-states install data only), "stub"
    (no source covers it).
    """
    sources = tool.get("sources", {})

    def covered(source_key):
        return sources.get(source_key, {}).get("covered", False)

    if covered("for610"):
        return "rich"
    if covered("remnux_docs"):
        return "standard"
    if covered("salt_states"):
        return "basic"
    return "stub"
|
|
|
|
|
|
def _salt_source_entry(st, st_names, st_id):
    """Build the 'salt_states' source record for one salt entry.

    Shared by all three merge branches in main(), which previously
    constructed this identical dict in three places.
    """
    return {
        "covered": True,
        "install_method": st.get("install_method", "unknown"),
        "package_name": st_names[0] if st_names else st_id,
        "salt_state_path": st.get("salt_state_path", ""),
    }


def _docs_source_entry(dt):
    """Build the base 'remnux_docs' source record for one docs entry.

    Optional keys ("website", "anchor") are added by the caller, since
    each merge branch historically attaches a different subset.
    """
    return {
        "covered": True,
        "category": dt.get("category", ""),
        "description": dt.get("description", ""),
        "docs_url": dt.get("docs_url", ""),
    }


def main():
    """Merge the three source inventories into the master tool YAML.

    Pipeline:
      1. FOR610 course tools form the base entries.
      2. Salt-states packages enrich matches or create new entries.
      3. REMnux docs enrich matches or create new entries.
      4. Manual enrichments (descriptions, usage examples) are applied.
      5. Derived coverage flags and help tiers are computed.
      6. The sorted inventory plus coverage metadata is written to OUTPUT.
    """
    print("Building master tool inventory...")

    # --- Step 1: Load FOR610 tools as base ---
    print("\n1. Loading FOR610 tools...")
    for610_tools = load_for610()
    print(f" Loaded {len(for610_tools)} tools")

    # master: canonical id -> tool entry. FOR610 tools seed it.
    master = {}
    for t in for610_tools:
        tid = t["id"]
        entry = {
            "id": tid,
            "name": t["name"],
            "aliases": t.get("aliases", []),
            "description": t.get("description", ""),
            "in_remnux": t.get("in_remnux", False),
            "platform": t.get("platform", "linux"),
            "sources": {
                "for610": {
                    "covered": True,
                    "description": t.get("description", ""),
                    "category": t.get("category", ""),
                    "labs": t.get("labs", []),
                    "sections": t.get("for610_sections", []),
                    "typical_usage": t.get("typical_usage", []),
                    "tags": t.get("tags", []),
                },
                "salt_states": {"covered": False},
                "remnux_docs": {"covered": False},
            },
        }
        if t.get("author"):
            entry["sources"]["for610"]["author"] = t["author"]
        master[tid] = entry

    # --- Step 2: Merge salt-states ---
    print("\n2. Loading salt-states...")
    salt_tools = load_salt_states()
    print(f" Loaded {len(salt_tools)} entries")

    index = build_lookup_index(list(master.values()))
    salt_matched = 0
    salt_new = 0

    for st in salt_tools:
        st_id = st["id"]
        st_names = st.get("package_names", [st_id])

        # Try the salt id first, then every package name.
        matched_id = None
        for name in [st_id] + st_names:
            matched_id = find_match(name, index)
            if matched_id:
                break

        if matched_id and matched_id in master:
            # Enrich an existing tool with install metadata.
            master[matched_id]["sources"]["salt_states"] = _salt_source_entry(st, st_names, st_id)
            master[matched_id]["in_remnux"] = True
            salt_matched += 1
        else:
            new_id = make_id(st_id)
            # An override may map this salt id onto a canonical id that
            # does not exist yet; honor it so later sources merge there.
            if normalize_name(st_id) in NAME_OVERRIDES:
                new_id = NAME_OVERRIDES[normalize_name(st_id)]

            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": st_id,
                    "aliases": [n for n in st_names if n != st_id][:3],
                    "description": "",
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": _salt_source_entry(st, st_names, st_id),
                        "remnux_docs": {"covered": False},
                    },
                }
                # Keep the index current so later salt entries can match
                # against tools created in this loop.
                index[new_id] = new_id
                index[normalize_name(st_id)] = new_id
                for n in st_names:
                    index[normalize_name(n)] = new_id
                salt_new += 1
            else:
                # Already exists under the override ID.
                master[new_id]["sources"]["salt_states"] = _salt_source_entry(st, st_names, st_id)
                salt_matched += 1

    print(f" Matched: {salt_matched}, New: {salt_new}")

    # --- Step 3: Merge REMnux docs ---
    print("\n3. Loading REMnux docs...")
    doc_tools = load_remnux_docs()
    print(f" Loaded {len(doc_tools)} entries")

    # Rebuild index after salt-states additions.
    index = build_lookup_index(list(master.values()))
    docs_matched = 0
    docs_new = 0

    for dt in doc_tools:
        dt_name = dt.get("name", "")
        dt_id = dt.get("id", make_id(dt_name))

        # Match by display name first, then by the docs id.
        matched_id = find_match(dt_name, index) or find_match(dt_id, index)

        if matched_id and matched_id in master:
            # Enrich existing tool; website/anchor only when present.
            doc_entry = _docs_source_entry(dt)
            if dt.get("website"):
                doc_entry["website"] = dt["website"]
            if dt.get("anchor"):
                doc_entry["anchor"] = dt["anchor"]
            master[matched_id]["sources"]["remnux_docs"] = doc_entry

            # Use REMnux docs description if we don't have one.
            if not master[matched_id]["description"] and dt.get("description"):
                master[matched_id]["description"] = dt["description"]

            docs_matched += 1
        else:
            # Create a new docs-only entry.
            new_id = make_id(dt_name) if dt_name else dt_id
            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": dt_name,
                    "aliases": [],
                    "description": dt.get("description", ""),
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": {"covered": False},
                        "remnux_docs": _docs_source_entry(dt),
                    },
                }
                if dt.get("website"):
                    master[new_id]["sources"]["remnux_docs"]["website"] = dt["website"]
                index[new_id] = new_id
                index[normalize_name(dt_name)] = new_id
                docs_new += 1
            else:
                # Duplicate docs entry for an id created earlier this loop.
                master[new_id]["sources"]["remnux_docs"] = _docs_source_entry(dt)
                docs_matched += 1

    print(f" Matched: {docs_matched}, New: {docs_new}")

    # --- Step 4: Apply manual enrichments ---
    print("\n4. Applying manual enrichments...")
    if os.path.exists(ENRICHMENTS):
        with open(ENRICHMENTS) as f:
            enrich_data = yaml.safe_load(f)
        enrichments = enrich_data.get("enrichments", {})
        enriched = 0
        for tool_key, enrich in enrichments.items():
            # Find the tool in master by normalized name, else take the
            # key as a literal id.
            matched_id = find_match(tool_key, index) or tool_key
            if matched_id in master:
                tool = master[matched_id]
                # An enrichment description always replaces existing text.
                # (The original branched on whether a description already
                # existed, but both branches performed this assignment.)
                if enrich.get("description"):
                    tool["description"] = enrich["description"]
                # Usage examples live under the for610 source record.
                if enrich.get("typical_usage"):
                    for610 = tool["sources"]["for610"]
                    if not for610.get("covered"):
                        # Promote to a covered record built from the enrichment.
                        for610["covered"] = True
                        for610["typical_usage"] = enrich["typical_usage"]
                        for610["tags"] = enrich.get("tags", [])
                        for610["description"] = enrich.get("description", "")
                    else:
                        # Merge usage examples, preserving order, no dupes.
                        existing = for610.get("typical_usage", [])
                        for u in enrich["typical_usage"]:
                            if u not in existing:
                                existing.append(u)
                        for610["typical_usage"] = existing
                enriched += 1
            else:
                print(f" Warning: enrichment key '{tool_key}' not found in master")
        print(f" Enriched: {enriched} tools")
    else:
        print(" No enrichments file found, skipping")

    # --- Step 5: Compute derived fields ---
    print("\n5. Computing derived fields...")
    for tool in master.values():
        tool["has_for610_coverage"] = tool["sources"]["for610"].get("covered", False)
        tool["has_remnux_docs"] = tool["sources"]["remnux_docs"].get("covered", False)
        tool["has_salt_state"] = tool["sources"]["salt_states"].get("covered", False)
        tool["help_tier"] = compute_help_tier(tool)

    # --- Step 6: Sort and output ---
    tools_list = sorted(master.values(), key=lambda t: t["id"])

    # Plain dict (not Counter) so yaml.dump emits a normal mapping.
    tiers = {}
    for t in tools_list:
        tier = t["help_tier"]
        tiers[tier] = tiers.get(tier, 0) + 1

    def _count(pred):
        """Count tools in tools_list satisfying *pred*."""
        return sum(1 for t in tools_list if pred(t))

    output = {
        "metadata": {
            "total_tools": len(tools_list),
            "in_remnux_count": _count(lambda t: t["in_remnux"]),
            "help_tier_counts": tiers,
            "source_coverage": {
                "for610_only": _count(lambda t: t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
                "remnux_docs_only": _count(lambda t: t["has_remnux_docs"] and not t["has_for610_coverage"] and not t["has_salt_state"]),
                "salt_states_only": _count(lambda t: t["has_salt_state"] and not t["has_for610_coverage"] and not t["has_remnux_docs"]),
                "all_three": _count(lambda t: t["has_for610_coverage"] and t["has_remnux_docs"] and t["has_salt_state"]),
                "for610_and_docs": _count(lambda t: t["has_for610_coverage"] and t["has_remnux_docs"]),
                "for610_and_salt": _count(lambda t: t["has_for610_coverage"] and t["has_salt_state"]),
                "docs_and_salt": _count(lambda t: t["has_remnux_docs"] and t["has_salt_state"]),
                "no_coverage": _count(lambda t: not t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
            },
        },
        "tools": tools_list,
    }

    with open(OUTPUT, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\n{'='*50}")
    print(f"MASTER INVENTORY BUILT: {len(tools_list)} tools")
    print(f" In REMnux: {output['metadata']['in_remnux_count']}")
    print("\nHelp Tiers:")
    for tier, count in sorted(tiers.items()):
        print(f" {tier}: {count}")
    print("\nSource Coverage:")
    for key, val in output["metadata"]["source_coverage"].items():
        print(f" {key}: {val}")
    print(f"\nOutput: {OUTPUT}")


if __name__ == "__main__":
    main()
|