Add FOR610 tool/workflow knowledge base and data pipeline
Build a comprehensive malware-analysis knowledge base from 3 sources:

- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

The master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). The pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR
pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,466 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build the master tool inventory by merging three sources.
|
||||
|
||||
Merges:
|
||||
1. FOR610 course data (data/for610/tools.yaml)
|
||||
2. Salt-states installation data (data/remnux/sources/salt-states.yaml)
|
||||
3. REMnux docs (data/remnux/sources/remnux-docs.yaml)
|
||||
|
||||
Output: data/remnux/tools-master.yaml
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import yaml
|
||||
|
||||
# All data paths are resolved relative to this script's parent directory, so
# the pipeline works regardless of the current working directory.
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
# Input: FOR610 course tool data (the required base inventory; no
# existence check is done for it, unlike the optional sources below).
FOR610_TOOLS = os.path.join(BASE_DIR, "data", "for610", "tools.yaml")
# Input: parsed REMnux salt-states installation data (optional).
SALT_STATES = os.path.join(BASE_DIR, "data", "remnux", "sources", "salt-states.yaml")
# Input: tools scraped from docs.remnux.org (optional).
REMNUX_DOCS = os.path.join(BASE_DIR, "data", "remnux", "sources", "remnux-docs.yaml")
# Input: manual per-tool enrichments (optional).
ENRICHMENTS = os.path.join(BASE_DIR, "data", "remnux", "tool-enrichments.yaml")
# Output: the merged master inventory produced by main().
OUTPUT = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")
|
||||
|
||||
# Manual override mapping for tools that have different names across sources
|
||||
# Format: normalized_key -> canonical_id
|
||||
# Manual override mapping for tools that have different names across sources
# Format: normalized_key -> canonical_id
#
# NOTE: identity entries (e.g. "cutter": "cutter") are NOT redundant — in
# find_match() an override short-circuits the fallback heuristics (the "-py"
# suffix probe and trailing-digit stripping) and, when the id is absent from
# the index, forces that exact id to be used for a new entry.
NAME_OVERRIDES = {
    # Detect It Easy name variants.
    "die": "diec",
    "detect-it-easy": "diec",
    "detect it easy": "diec",
    # SpiderMonkey / JS-engine name variants.
    "js": "spidermonkey",
    "js-patched": "spidermonkey",
    "spidermonkey-patched": "spidermonkey",
    "mozilla-spidermonkey": "spidermonkey",
    # Volatility name variants — all map to the v3 canonical id.
    "vol": "volatility3",
    "vol-py": "volatility3",
    "volatility-framework": "volatility3",
    "volatility": "volatility3",
    "process-hacker": "system-informer",
    # YARA rule collections map to the yara tool itself; yara-x stays distinct.
    "yara-rules": "yara",
    "yara-forge": "yara",
    "yara-x": "yara-x",
    "jsbeautifier": "js-beautify",
    "js-beautifier": "js-beautify",
    "ilspycmd": "ilspycmd",
    "ilspy": "ilspy",
    # Package name vs tool name mismatches.
    "upx-ucl": "upx",
    "unrar-free": "rar",
    "netcat-openbsd": "netcat",
    "net-tools": "net-tools",
    "oletools": "olevba",
    "pev": "readpe",
    "scdbg": "scdbgc",
    "origamindee": "origami",
    "pdftk-java": "pdftk",
    "fakenet-ng": "fakenet-ng",
    "accept-all-ips": "httpd",
    # Archive tooling name variants.
    "7zip": "7zip",
    "7z": "7zip",
    "p7zip": "7zip",
    "info-zip": "unzip",
    # Radare2 ecosystem.
    "cutter": "cutter",
    "r2pipe": "radare2",
    "r2": "radare2",
    "stpyv8": "spidermonkey",
    "rhino-debugger": "spidermonkey",
    "powershell-core": "powershell",
    "powershell": "powershell",
    "didier-stevens-scripts": "didier-stevens-suite",
    "docker-compose": "docker",
    "docker": "docker",
    "ghidrassist-mcp": "ghidra",
    "remnux-mcp-server": "remnux-mcp-server",
}
|
||||
|
||||
|
||||
def normalize_name(name):
    """Normalize a tool name into a lowercase kebab-case matching key.

    Script extensions (.py/.pl/.bat) are dropped, and every run of
    non-alphanumeric characters collapses to a single hyphen.
    """
    key = name.strip().lower()
    # Strip script extensions; same order as the original regex chain.
    for ext in (".py", ".pl", ".bat"):
        if key.endswith(ext):
            key = key[: -len(ext)]
    key = re.sub(r"[^a-z0-9]+", "-", key)
    return key.strip("-")
|
||||
|
||||
|
||||
def make_id(name):
    """Create a kebab-case ID from a name.

    Unlike normalize_name(), script extensions are preserved as suffixes
    (.py -> -py, .pl -> -pl, .bat -> -bat) so script IDs stay distinct.
    """
    ident = name.strip().lower()
    # Keep .py/.pl/.bat as -py/-pl/-bat in the ID
    for ext, suffix in ((".py", "-py"), (".pl", "-pl"), (".bat", "-bat")):
        if ident.endswith(ext):
            ident = ident[: -len(ext)] + suffix
    ident = re.sub(r"[^a-z0-9]+", "-", ident)
    return ident.strip("-")
|
||||
|
||||
|
||||
def load_for610():
    """Load FOR610 tools.

    Returns the list under the top-level "tools" key, or [] when the key
    is absent. Unlike the other loaders, a missing file raises
    FileNotFoundError — FOR610 data is the required base inventory.
    """
    with open(FOR610_TOOLS) as f:
        # safe_load returns None for an empty document; guard so .get works.
        data = yaml.safe_load(f) or {}
    return data.get("tools", [])
|
||||
|
||||
|
||||
def load_salt_states():
    """Load salt-states parsed data.

    Returns the list under the top-level "tools" key. This source is
    optional: a missing file logs a warning and yields [].
    """
    if not os.path.exists(SALT_STATES):
        print(f" Warning: {SALT_STATES} not found, skipping")
        return []
    with open(SALT_STATES) as f:
        # safe_load returns None for an empty document; guard so .get works.
        data = yaml.safe_load(f) or {}
    return data.get("tools", [])
|
||||
|
||||
|
||||
def load_remnux_docs():
    """Load REMnux docs scraped data.

    Returns the list under the top-level "tools" key. This source is
    optional: a missing file logs a warning and yields [].
    """
    if not os.path.exists(REMNUX_DOCS):
        print(f" Warning: {REMNUX_DOCS} not found, skipping")
        return []
    with open(REMNUX_DOCS) as f:
        # safe_load returns None for an empty document; guard so .get works.
        data = yaml.safe_load(f) or {}
    return data.get("tools", [])
|
||||
|
||||
|
||||
def build_lookup_index(master_tools):
    """Build a multi-key lookup index for matching.

    Every tool contributes three kinds of keys — its id, its normalized
    name, and each normalized alias — all mapping to the canonical id.
    Later tools win on key collisions.
    """
    index = {}
    for tool in master_tools:
        canonical = tool["id"]
        keys = [canonical, normalize_name(tool["name"])]
        keys.extend(normalize_name(alias) for alias in tool.get("aliases", []))
        for key in keys:
            index[key] = canonical
    return index
|
||||
|
||||
|
||||
def find_match(name, index):
    """Try to find a matching tool in the index.

    Resolution order: manual NAME_OVERRIDES, exact normalized-name hit,
    the name with a "-py" suffix, then the name with trailing digits
    stripped. Returns the canonical id, or None when nothing matches.
    Note: an override is returned even when absent from the index.
    """
    key = normalize_name(name)

    # Manual overrides win over every heuristic.
    override = NAME_OVERRIDES.get(key)
    if override is not None:
        # Prefer the indexed canonical id; otherwise trust the override.
        return index.get(override, override)

    # Direct match on the normalized key.
    if key in index:
        return index[key]

    # Script variant, e.g. "oledump" -> "oledump-py".
    suffixed = key + "-py"
    if suffixed in index:
        return index[suffixed]

    # Versioned variant, e.g. "volatility3" -> "volatility".
    trimmed = re.sub(r"-?\d+$", "", key)
    if trimmed and trimmed in index:
        return index[trimmed]

    return None
|
||||
|
||||
|
||||
def compute_help_tier(tool):
    """Determine the help tier based on coverage.

    Priority: FOR610 coverage -> "rich", REMnux docs -> "standard",
    salt-states -> "basic", otherwise "stub".
    """
    sources = tool.get("sources", {})

    def covered(source_key):
        # Missing source entries count as not covered.
        return sources.get(source_key, {}).get("covered", False)

    if covered("for610"):
        return "rich"
    if covered("remnux_docs"):
        return "standard"
    if covered("salt_states"):
        return "basic"
    return "stub"
|
||||
|
||||
|
||||
def main():
    """Merge all sources into the master inventory and write it to OUTPUT.

    Steps: load FOR610 tools as the base; merge salt-states and REMnux
    docs (enriching matched tools, creating new entries otherwise);
    apply manual enrichments; compute derived coverage fields; sort and
    dump the result as YAML with summary statistics printed to stdout.
    """
    print("Building master tool inventory...")

    # --- Step 1: Load FOR610 tools as base ---
    print("\n1. Loading FOR610 tools...")
    for610_tools = load_for610()
    print(f" Loaded {len(for610_tools)} tools")

    master = {}
    for t in for610_tools:
        tid = t["id"]
        entry = {
            "id": tid,
            "name": t["name"],
            "aliases": t.get("aliases", []),
            "description": t.get("description", ""),
            "in_remnux": t.get("in_remnux", False),
            "platform": t.get("platform", "linux"),
            "sources": {
                "for610": {
                    "covered": True,
                    "description": t.get("description", ""),
                    "category": t.get("category", ""),
                    "labs": t.get("labs", []),
                    "sections": t.get("for610_sections", []),
                    "typical_usage": t.get("typical_usage", []),
                    "tags": t.get("tags", []),
                },
                "salt_states": {"covered": False},
                "remnux_docs": {"covered": False},
            },
        }
        if t.get("author"):
            entry["sources"]["for610"]["author"] = t["author"]
        master[tid] = entry

    # --- Step 2: Merge salt-states ---
    print("\n2. Loading salt-states...")
    salt_tools = load_salt_states()
    print(f" Loaded {len(salt_tools)} entries")

    index = build_lookup_index(list(master.values()))
    salt_matched = 0
    salt_new = 0

    for st in salt_tools:
        st_id = st["id"]
        st_names = st.get("package_names", [st_id])

        # Try to match against existing tools
        matched_id = None
        for name in [st_id] + st_names:
            matched_id = find_match(name, index)
            if matched_id:
                break

        if matched_id and matched_id in master:
            # Enrich existing tool
            master[matched_id]["sources"]["salt_states"] = {
                "covered": True,
                "install_method": st.get("install_method", "unknown"),
                "package_name": st_names[0] if st_names else st_id,
                "salt_state_path": st.get("salt_state_path", ""),
            }
            master[matched_id]["in_remnux"] = True
            salt_matched += 1
        else:
            # Create new tool entry
            new_id = make_id(st_id)
            # Check if override maps to something we don't have yet
            if normalize_name(st_id) in NAME_OVERRIDES:
                new_id = NAME_OVERRIDES[normalize_name(st_id)]

            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": st_id,
                    "aliases": [n for n in st_names if n != st_id][:3],
                    "description": "",
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": {
                            "covered": True,
                            "install_method": st.get("install_method", "unknown"),
                            "package_name": st_names[0] if st_names else st_id,
                            "salt_state_path": st.get("salt_state_path", ""),
                        },
                        "remnux_docs": {"covered": False},
                    },
                }
                # Update index so later entries can match this tool
                index[new_id] = new_id
                index[normalize_name(st_id)] = new_id
                for n in st_names:
                    index[normalize_name(n)] = new_id
                salt_new += 1
            else:
                # Already exists under the override ID
                master[new_id]["sources"]["salt_states"] = {
                    "covered": True,
                    "install_method": st.get("install_method", "unknown"),
                    "package_name": st_names[0] if st_names else st_id,
                    "salt_state_path": st.get("salt_state_path", ""),
                }
                # BUGFIX: a salt-state installs this tool, so mark it as
                # present in REMnux (mirrors the matched branch above).
                master[new_id]["in_remnux"] = True
                salt_matched += 1

    print(f" Matched: {salt_matched}, New: {salt_new}")

    # --- Step 3: Merge REMnux docs ---
    print("\n3. Loading REMnux docs...")
    doc_tools = load_remnux_docs()
    print(f" Loaded {len(doc_tools)} entries")

    # Rebuild index after salt-states additions
    index = build_lookup_index(list(master.values()))
    docs_matched = 0
    docs_new = 0

    for dt in doc_tools:
        dt_name = dt.get("name", "")
        dt_id = dt.get("id", make_id(dt_name))

        matched_id = find_match(dt_name, index)
        if not matched_id:
            matched_id = find_match(dt_id, index)

        if matched_id and matched_id in master:
            # Enrich existing tool
            doc_entry = {
                "covered": True,
                "category": dt.get("category", ""),
                "description": dt.get("description", ""),
                "docs_url": dt.get("docs_url", ""),
            }
            if dt.get("website"):
                doc_entry["website"] = dt["website"]
            if dt.get("anchor"):
                doc_entry["anchor"] = dt["anchor"]

            master[matched_id]["sources"]["remnux_docs"] = doc_entry

            # Use REMnux docs description if we don't have one
            if not master[matched_id]["description"] and dt.get("description"):
                master[matched_id]["description"] = dt["description"]

            docs_matched += 1
        else:
            # Create new entry
            new_id = make_id(dt_name) if dt_name else dt_id
            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": dt_name,
                    "aliases": [],
                    "description": dt.get("description", ""),
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": {"covered": False},
                        "remnux_docs": {
                            "covered": True,
                            "category": dt.get("category", ""),
                            "description": dt.get("description", ""),
                            "docs_url": dt.get("docs_url", ""),
                        },
                    },
                }
                if dt.get("website"):
                    master[new_id]["sources"]["remnux_docs"]["website"] = dt["website"]
                index[new_id] = new_id
                index[normalize_name(dt_name)] = new_id
                docs_new += 1
            else:
                master[new_id]["sources"]["remnux_docs"] = {
                    "covered": True,
                    "category": dt.get("category", ""),
                    "description": dt.get("description", ""),
                    "docs_url": dt.get("docs_url", ""),
                }
                docs_matched += 1

    print(f" Matched: {docs_matched}, New: {docs_new}")

    # --- Step 4: Apply manual enrichments ---
    print("\n4. Applying manual enrichments...")
    if os.path.exists(ENRICHMENTS):
        with open(ENRICHMENTS) as f:
            # BUGFIX: guard against an empty YAML file (safe_load -> None).
            enrich_data = yaml.safe_load(f) or {}
        enrichments = enrich_data.get("enrichments", {})
        enriched = 0
        for tool_key, enrich in enrichments.items():
            # Find the tool in master by key or normalized name
            matched_id = find_match(tool_key, index)
            if not matched_id:
                matched_id = tool_key
            if matched_id in master:
                tool = master[matched_id]
                # Manual descriptions always win. (The original if/elif
                # branches here were identical — collapsed to one check.)
                if enrich.get("description"):
                    tool["description"] = enrich["description"]
                # Add usage examples to for610 source (or create enrichment source)
                if enrich.get("typical_usage"):
                    if not tool["sources"]["for610"].get("covered"):
                        tool["sources"]["for610"]["covered"] = True
                        tool["sources"]["for610"]["typical_usage"] = enrich["typical_usage"]
                        tool["sources"]["for610"]["tags"] = enrich.get("tags", [])
                        tool["sources"]["for610"]["description"] = enrich.get("description", "")
                    else:
                        # Merge usage examples, preserving order, no dupes
                        existing = tool["sources"]["for610"].get("typical_usage", [])
                        for u in enrich["typical_usage"]:
                            if u not in existing:
                                existing.append(u)
                        tool["sources"]["for610"]["typical_usage"] = existing
                enriched += 1
            else:
                print(f" Warning: enrichment key '{tool_key}' not found in master")
        print(f" Enriched: {enriched} tools")
    else:
        print(" No enrichments file found, skipping")

    # Rebuild index after enrichments
    index = build_lookup_index(list(master.values()))

    # --- Step 5: Compute derived fields ---
    print("\n5. Computing derived fields...")
    for tool in master.values():
        tool["has_for610_coverage"] = tool["sources"]["for610"].get("covered", False)
        tool["has_remnux_docs"] = tool["sources"]["remnux_docs"].get("covered", False)
        tool["has_salt_state"] = tool["sources"]["salt_states"].get("covered", False)
        tool["help_tier"] = compute_help_tier(tool)

    # --- Step 6: Sort and output ---
    tools_list = sorted(master.values(), key=lambda t: t["id"])

    # Remove windows-only/online tools that aren't in remnux
    # (keep them for reference but flag appropriately)

    tiers = {}
    for t in tools_list:
        tier = t["help_tier"]
        tiers[tier] = tiers.get(tier, 0) + 1

    output = {
        "metadata": {
            "total_tools": len(tools_list),
            "in_remnux_count": sum(1 for t in tools_list if t["in_remnux"]),
            "help_tier_counts": tiers,
            "source_coverage": {
                "for610_only": sum(1 for t in tools_list if t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
                "remnux_docs_only": sum(1 for t in tools_list if t["has_remnux_docs"] and not t["has_for610_coverage"] and not t["has_salt_state"]),
                "salt_states_only": sum(1 for t in tools_list if t["has_salt_state"] and not t["has_for610_coverage"] and not t["has_remnux_docs"]),
                "all_three": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_remnux_docs"] and t["has_salt_state"]),
                "for610_and_docs": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_remnux_docs"]),
                "for610_and_salt": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_salt_state"]),
                "docs_and_salt": sum(1 for t in tools_list if t["has_remnux_docs"] and t["has_salt_state"]),
                "no_coverage": sum(1 for t in tools_list if not t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
            },
        },
        "tools": tools_list,
    }

    with open(OUTPUT, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\n{'='*50}")
    print(f"MASTER INVENTORY BUILT: {len(tools_list)} tools")
    print(f" In REMnux: {output['metadata']['in_remnux_count']}")
    print(f"\nHelp Tiers:")
    for tier, count in sorted(tiers.items()):
        print(f" {tier}: {count}")
    print(f"\nSource Coverage:")
    for key, val in output["metadata"]["source_coverage"].items():
        print(f" {key}: {val}")
    print(f"\nOutput: {OUTPUT}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user