#!/usr/bin/env python3
"""Build the master tool inventory by merging three sources.

Merges:
1. FOR610 course data (data/for610/tools.yaml)
2. Salt-states installation data (data/remnux/sources/salt-states.yaml)
3. REMnux docs (data/remnux/sources/remnux-docs.yaml)

Output: data/remnux/tools-master.yaml
"""

import os
import re

import yaml

BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
FOR610_TOOLS = os.path.join(BASE_DIR, "data", "for610", "tools.yaml")
SALT_STATES = os.path.join(BASE_DIR, "data", "remnux", "sources", "salt-states.yaml")
REMNUX_DOCS = os.path.join(BASE_DIR, "data", "remnux", "sources", "remnux-docs.yaml")
ENRICHMENTS = os.path.join(BASE_DIR, "data", "remnux", "tool-enrichments.yaml")
OUTPUT = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")

# Manual override mapping for tools that have different names across sources
# Format: normalized_key -> canonical_id
NAME_OVERRIDES = {
    "die": "diec",
    "detect-it-easy": "diec",
    "detect it easy": "diec",
    "js": "spidermonkey",
    "js-patched": "spidermonkey",
    "spidermonkey-patched": "spidermonkey",
    "mozilla-spidermonkey": "spidermonkey",
    "vol": "volatility3",
    "vol-py": "volatility3",
    "volatility-framework": "volatility3",
    "volatility": "volatility3",
    "process-hacker": "system-informer",
    "yara-rules": "yara",
    "yara-forge": "yara",
    "yara-x": "yara-x",
    "jsbeautifier": "js-beautify",
    "js-beautifier": "js-beautify",
    "ilspycmd": "ilspycmd",
    "ilspy": "ilspy",
    "upx-ucl": "upx",
    "unrar-free": "rar",
    "netcat-openbsd": "netcat",
    "net-tools": "net-tools",
    "oletools": "olevba",
    "pev": "readpe",
    "scdbg": "scdbgc",
    "origamindee": "origami",
    "pdftk-java": "pdftk",
    "fakenet-ng": "fakenet-ng",
    "accept-all-ips": "httpd",
    "7zip": "7zip",
    "7z": "7zip",
    "p7zip": "7zip",
    "info-zip": "unzip",
    "cutter": "cutter",
    "r2pipe": "radare2",
    "r2": "radare2",
    "stpyv8": "spidermonkey",
    "rhino-debugger": "spidermonkey",
    "powershell-core": "powershell",
    "powershell": "powershell",
    "didier-stevens-scripts": "didier-stevens-suite",
    "docker-compose": "docker",
    "docker": "docker",
    "ghidrassist-mcp": "ghidra",
    "remnux-mcp-server": "remnux-mcp-server",
}


def normalize_name(name):
    """Normalize a tool name for cross-source matching.

    Lowercases, drops a trailing .py/.pl/.bat script extension, and
    collapses every run of non-alphanumeric characters to a single '-'.
    Extensions are stripped sequentially (py, then pl, then bat) so a
    stacked suffix like "x.bat.pl" loses both, matching prior behavior.
    """
    n = name.lower().strip()
    for ext_pattern in (r'\.py$', r'\.pl$', r'\.bat$'):
        n = re.sub(ext_pattern, '', n)
    n = re.sub(r'[^a-z0-9]+', '-', n)
    return n.strip('-')


def make_id(name):
    """Create a kebab-case ID from a name.

    Unlike normalize_name(), script extensions are preserved in the ID
    as suffixes (.py -> -py, .pl -> -pl, .bat -> -bat) so scripts with
    the same stem keep distinct IDs.
    """
    n = name.lower().strip()
    # Keep .py/.pl/.bat as -py/-pl/-bat in the ID
    for ext_pattern, suffix in ((r'\.py$', '-py'), (r'\.pl$', '-pl'), (r'\.bat$', '-bat')):
        n = re.sub(ext_pattern, suffix, n)
    n = re.sub(r'[^a-z0-9]+', '-', n)
    return n.strip('-')


def _load_tools_yaml(path, required):
    """Load the 'tools' list from a YAML file.

    If the file is optional (required=False) and missing, print a
    warning and return []. A required file that is missing raises
    (FileNotFoundError), same as opening it directly would.
    """
    if not required and not os.path.exists(path):
        print(f" Warning: {path} not found, skipping")
        return []
    with open(path) as f:
        data = yaml.safe_load(f)
    return data.get("tools", [])


def load_for610():
    """Load FOR610 tools."""
    return _load_tools_yaml(FOR610_TOOLS, required=True)


def load_salt_states():
    """Load salt-states parsed data."""
    return _load_tools_yaml(SALT_STATES, required=False)


def load_remnux_docs():
    """Load REMnux docs scraped data."""
    return _load_tools_yaml(REMNUX_DOCS, required=False)


def build_lookup_index(master_tools):
    """Build a multi-key lookup index for matching.

    Maps canonical id, normalized display name, and every normalized
    alias to the tool's canonical id. Later writes win on key collision.
    """
    index = {}
    for tool in master_tools:
        tid = tool["id"]
        index[tid] = tid
        index[normalize_name(tool["name"])] = tid
        for alias in tool.get("aliases", []):
            index[normalize_name(alias)] = tid
    return index


def find_match(name, index):
    """Try to find a matching canonical tool id for *name*.

    Match order: manual NAME_OVERRIDES, exact normalized name, name with
    a '-py' suffix, then name with trailing digits stripped (e.g.
    'tool2' -> 'tool'). Returns None when nothing matches. An override
    target not yet present in the index is returned as-is so the caller
    can create it under the canonical id.
    """
    normalized = normalize_name(name)

    # Check overrides first
    if normalized in NAME_OVERRIDES:
        override_id = NAME_OVERRIDES[normalized]
        if override_id in index:
            return index[override_id]
        return override_id

    # Direct match
    if normalized in index:
        return index[normalized]

    # Try with -py suffix
    if normalized + "-py" in index:
        return index[normalized + "-py"]

    # Try without trailing digits
    stripped = re.sub(r'-?\d+$', '', normalized)
    if stripped and stripped in index:
        return index[stripped]

    return None


def compute_help_tier(tool):
    """Determine the help tier based on coverage.

    Priority: FOR610 coverage -> 'rich', REMnux docs -> 'standard',
    salt-states only -> 'basic', otherwise 'stub'.
    """
    sources = tool.get("sources", {})
    if sources.get("for610", {}).get("covered", False):
        return "rich"
    if sources.get("remnux_docs", {}).get("covered", False):
        return "standard"
    if sources.get("salt_states", {}).get("covered", False):
        return "basic"
    return "stub"


def _salt_source(st, st_names, st_id):
    """Build the 'salt_states' source record for one salt entry."""
    return {
        "covered": True,
        "install_method": st.get("install_method", "unknown"),
        "package_name": st_names[0] if st_names else st_id,
        "salt_state_path": st.get("salt_state_path", ""),
    }


def _docs_source(dt):
    """Build the base 'remnux_docs' source record for one docs entry."""
    return {
        "covered": True,
        "category": dt.get("category", ""),
        "description": dt.get("description", ""),
        "docs_url": dt.get("docs_url", ""),
    }


def _build_base(for610_tools):
    """Step 1: seed the master dict (id -> entry) from FOR610 tools."""
    master = {}
    for t in for610_tools:
        tid = t["id"]
        for610_src = {
            "covered": True,
            "description": t.get("description", ""),
            "category": t.get("category", ""),
            "labs": t.get("labs", []),
            "sections": t.get("for610_sections", []),
            "typical_usage": t.get("typical_usage", []),
            "tags": t.get("tags", []),
        }
        if t.get("author"):
            for610_src["author"] = t["author"]
        master[tid] = {
            "id": tid,
            "name": t["name"],
            "aliases": t.get("aliases", []),
            "description": t.get("description", ""),
            "in_remnux": t.get("in_remnux", False),
            "platform": t.get("platform", "linux"),
            "sources": {
                "for610": for610_src,
                "salt_states": {"covered": False},
                "remnux_docs": {"covered": False},
            },
        }
    return master


def _merge_salt_states(master, salt_tools, index):
    """Step 2: merge salt-states entries into master.

    Mutates both *master* and *index* (new entries are indexed so later
    salt entries can match them). Returns (matched_count, new_count).
    """
    matched = 0
    new = 0
    for st in salt_tools:
        st_id = st["id"]
        st_names = st.get("package_names", [st_id])

        # Try to match against existing tools under any known name
        matched_id = None
        for name in [st_id] + st_names:
            matched_id = find_match(name, index)
            if matched_id:
                break

        if matched_id and matched_id in master:
            # Enrich existing tool; being in salt-states implies it ships in REMnux
            master[matched_id]["sources"]["salt_states"] = _salt_source(st, st_names, st_id)
            master[matched_id]["in_remnux"] = True
            matched += 1
            continue

        # Create new tool entry; honor an override that maps to a
        # canonical id we may not have seen yet
        new_id = make_id(st_id)
        if normalize_name(st_id) in NAME_OVERRIDES:
            new_id = NAME_OVERRIDES[normalize_name(st_id)]

        if new_id not in master:
            master[new_id] = {
                "id": new_id,
                "name": st_id,
                "aliases": [n for n in st_names if n != st_id][:3],
                "description": "",
                "in_remnux": True,
                "platform": "linux",
                "sources": {
                    "for610": {"covered": False},
                    "salt_states": _salt_source(st, st_names, st_id),
                    "remnux_docs": {"covered": False},
                },
            }
            # Update index so subsequent entries can match this tool
            index[new_id] = new_id
            index[normalize_name(st_id)] = new_id
            for n in st_names:
                index[normalize_name(n)] = new_id
            new += 1
        else:
            # Already exists under the override ID
            master[new_id]["sources"]["salt_states"] = _salt_source(st, st_names, st_id)
            matched += 1
    return matched, new


def _merge_remnux_docs(master, doc_tools, index):
    """Step 3: merge REMnux docs entries into master.

    Mutates both *master* and *index*. Returns (matched_count, new_count).
    """
    matched = 0
    new = 0
    for dt in doc_tools:
        dt_name = dt.get("name", "")
        dt_id = dt.get("id", make_id(dt_name))

        matched_id = find_match(dt_name, index)
        if not matched_id:
            matched_id = find_match(dt_id, index)

        if matched_id and matched_id in master:
            # Enrich existing tool
            doc_entry = _docs_source(dt)
            if dt.get("website"):
                doc_entry["website"] = dt["website"]
            if dt.get("anchor"):
                doc_entry["anchor"] = dt["anchor"]
            master[matched_id]["sources"]["remnux_docs"] = doc_entry
            # Use REMnux docs description if we don't have one
            if not master[matched_id]["description"] and dt.get("description"):
                master[matched_id]["description"] = dt["description"]
            matched += 1
            continue

        # Create new entry
        new_id = make_id(dt_name) if dt_name else dt_id
        if new_id not in master:
            master[new_id] = {
                "id": new_id,
                "name": dt_name,
                "aliases": [],
                "description": dt.get("description", ""),
                "in_remnux": True,
                "platform": "linux",
                "sources": {
                    "for610": {"covered": False},
                    "salt_states": {"covered": False},
                    "remnux_docs": _docs_source(dt),
                },
            }
            if dt.get("website"):
                master[new_id]["sources"]["remnux_docs"]["website"] = dt["website"]
            index[new_id] = new_id
            index[normalize_name(dt_name)] = new_id
            new += 1
        else:
            master[new_id]["sources"]["remnux_docs"] = _docs_source(dt)
            matched += 1
    return matched, new


def _apply_enrichments(master, index):
    """Step 4: apply manual enrichments from ENRICHMENTS, if present.

    Enrichment descriptions always win over existing ones; usage
    examples are merged into the for610 source (creating synthetic
    for610 coverage when there was none).
    """
    if not os.path.exists(ENRICHMENTS):
        print(" No enrichments file found, skipping")
        return

    with open(ENRICHMENTS) as f:
        enrich_data = yaml.safe_load(f)
    enrichments = enrich_data.get("enrichments", {})

    enriched = 0
    for tool_key, enrich in enrichments.items():
        # Find the tool in master by key or normalized name
        matched_id = find_match(tool_key, index)
        if not matched_id:
            matched_id = tool_key
        if matched_id not in master:
            print(f" Warning: enrichment key '{tool_key}' not found in master")
            continue

        tool = master[matched_id]
        # Enrichment description overrides any existing one.
        # (The original if/elif both assigned the same value; collapsed.)
        if enrich.get("description"):
            tool["description"] = enrich["description"]

        # Add usage examples to for610 source (or create enrichment source)
        if enrich.get("typical_usage"):
            for610_src = tool["sources"]["for610"]
            if not for610_src.get("covered"):
                for610_src["covered"] = True
                for610_src["typical_usage"] = enrich["typical_usage"]
                for610_src["tags"] = enrich.get("tags", [])
                for610_src["description"] = enrich.get("description", "")
            else:
                # Merge usage examples, preserving order, skipping duplicates
                existing = for610_src.get("typical_usage", [])
                for u in enrich["typical_usage"]:
                    if u not in existing:
                        existing.append(u)
                for610_src["typical_usage"] = existing
        enriched += 1
    print(f" Enriched: {enriched} tools")


def _compute_derived(master):
    """Step 5: compute per-tool coverage flags and help tier."""
    for tool in master.values():
        tool["has_for610_coverage"] = tool["sources"]["for610"].get("covered", False)
        tool["has_remnux_docs"] = tool["sources"]["remnux_docs"].get("covered", False)
        tool["has_salt_state"] = tool["sources"]["salt_states"].get("covered", False)
        tool["help_tier"] = compute_help_tier(tool)


def _build_output(tools_list):
    """Step 6: assemble the output document with summary metadata."""
    tiers = {}
    for t in tools_list:
        tier = t["help_tier"]
        tiers[tier] = tiers.get(tier, 0) + 1

    return {
        "metadata": {
            "total_tools": len(tools_list),
            "in_remnux_count": sum(1 for t in tools_list if t["in_remnux"]),
            "help_tier_counts": tiers,
            "source_coverage": {
                "for610_only": sum(1 for t in tools_list if t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
                "remnux_docs_only": sum(1 for t in tools_list if t["has_remnux_docs"] and not t["has_for610_coverage"] and not t["has_salt_state"]),
                "salt_states_only": sum(1 for t in tools_list if t["has_salt_state"] and not t["has_for610_coverage"] and not t["has_remnux_docs"]),
                "all_three": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_remnux_docs"] and t["has_salt_state"]),
                "for610_and_docs": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_remnux_docs"]),
                "for610_and_salt": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_salt_state"]),
                "docs_and_salt": sum(1 for t in tools_list if t["has_remnux_docs"] and t["has_salt_state"]),
                "no_coverage": sum(1 for t in tools_list if not t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
            },
        },
        "tools": tools_list,
    }


def main():
    """Build data/remnux/tools-master.yaml from the three sources."""
    print("Building master tool inventory...")

    # --- Step 1: Load FOR610 tools as base ---
    print("\n1. Loading FOR610 tools...")
    for610_tools = load_for610()
    print(f" Loaded {len(for610_tools)} tools")
    master = _build_base(for610_tools)

    # --- Step 2: Merge salt-states ---
    print("\n2. Loading salt-states...")
    salt_tools = load_salt_states()
    print(f" Loaded {len(salt_tools)} entries")
    index = build_lookup_index(list(master.values()))
    salt_matched, salt_new = _merge_salt_states(master, salt_tools, index)
    print(f" Matched: {salt_matched}, New: {salt_new}")

    # --- Step 3: Merge REMnux docs (rebuild index after salt additions) ---
    print("\n3. Loading REMnux docs...")
    doc_tools = load_remnux_docs()
    print(f" Loaded {len(doc_tools)} entries")
    index = build_lookup_index(list(master.values()))
    docs_matched, docs_new = _merge_remnux_docs(master, doc_tools, index)
    print(f" Matched: {docs_matched}, New: {docs_new}")

    # --- Step 4: Apply manual enrichments ---
    print("\n4. Applying manual enrichments...")
    _apply_enrichments(master, index)
    # (An unused index rebuild that followed enrichments was removed.)

    # --- Step 5: Compute derived fields ---
    print("\n5. Computing derived fields...")
    _compute_derived(master)

    # --- Step 6: Sort and output ---
    tools_list = sorted(master.values(), key=lambda t: t["id"])
    output = _build_output(tools_list)

    with open(OUTPUT, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\n{'='*50}")
    print(f"MASTER INVENTORY BUILT: {len(tools_list)} tools")
    print(f" In REMnux: {output['metadata']['in_remnux_count']}")
    print("\nHelp Tiers:")
    for tier, count in sorted(output["metadata"]["help_tier_counts"].items()):
        print(f" {tier}: {count}")
    print("\nSource Coverage:")
    for key, val in output["metadata"]["source_coverage"].items():
        print(f" {key}: {val}")
    print(f"\nOutput: {OUTPUT}")


if __name__ == "__main__":
    main()