Add FOR610 tool/workflow knowledge base and data pipeline

Build comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224
TLDR pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tobias
2026-03-28 17:38:15 +01:00
parent 06ebb09ab0
commit f3ccc09c3d
663 changed files with 36339 additions and 1 deletions
+466
View File
@@ -0,0 +1,466 @@
#!/usr/bin/env python3
"""Build the master tool inventory by merging three sources.
Merges:
1. FOR610 course data (data/for610/tools.yaml)
2. Salt-states installation data (data/remnux/sources/salt-states.yaml)
3. REMnux docs (data/remnux/sources/remnux-docs.yaml)
Output: data/remnux/tools-master.yaml
"""
import os
import re
import yaml
# All paths are resolved relative to the repository root (one level above
# this script's directory).
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
FOR610_TOOLS = os.path.join(BASE_DIR, "data", "for610", "tools.yaml")
SALT_STATES = os.path.join(BASE_DIR, "data", "remnux", "sources", "salt-states.yaml")
REMNUX_DOCS = os.path.join(BASE_DIR, "data", "remnux", "sources", "remnux-docs.yaml")
ENRICHMENTS = os.path.join(BASE_DIR, "data", "remnux", "tool-enrichments.yaml")
OUTPUT = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")
# Manual override mapping for tools that have different names across sources
# Format: normalized_key -> canonical_id
# Consulted first by find_match(); an override wins over exact-name matching.
NAME_OVERRIDES = {
    "die": "diec",
    "detect-it-easy": "diec",
    "detect it easy": "diec",
    "js": "spidermonkey",
    "js-patched": "spidermonkey",
    "spidermonkey-patched": "spidermonkey",
    "mozilla-spidermonkey": "spidermonkey",
    "vol": "volatility3",
    "vol-py": "volatility3",
    "volatility-framework": "volatility3",
    "volatility": "volatility3",
    "process-hacker": "system-informer",
    "yara-rules": "yara",
    "yara-forge": "yara",
    "yara-x": "yara-x",
    "jsbeautifier": "js-beautify",
    "js-beautifier": "js-beautify",
    # Identity entries (key == canonical id) pin a name so that the
    # digit-stripping / suffix heuristics in find_match() cannot remap it.
    "ilspycmd": "ilspycmd",
    "ilspy": "ilspy",
    "upx-ucl": "upx",
    "unrar-free": "rar",
    "netcat-openbsd": "netcat",
    "net-tools": "net-tools",
    "oletools": "olevba",
    "pev": "readpe",
    "scdbg": "scdbgc",
    "origamindee": "origami",
    "pdftk-java": "pdftk",
    "fakenet-ng": "fakenet-ng",
    "accept-all-ips": "httpd",
    "7zip": "7zip",
    "7z": "7zip",
    "p7zip": "7zip",
    "info-zip": "unzip",
    "cutter": "cutter",
    "r2pipe": "radare2",
    "r2": "radare2",
    "stpyv8": "spidermonkey",
    "rhino-debugger": "spidermonkey",
    "powershell-core": "powershell",
    "powershell": "powershell",
    "didier-stevens-scripts": "didier-stevens-suite",
    "docker-compose": "docker",
    "docker": "docker",
    "ghidrassist-mcp": "ghidra",
    "remnux-mcp-server": "remnux-mcp-server",
}
def normalize_name(name):
    """Collapse a tool name into a lower-case dashed key used for matching."""
    key = name.lower().strip()
    # Drop a trailing script extension, then squash every non-alphanumeric
    # run into a single dash.
    for ext_pattern in (r'\.py$', r'\.pl$', r'\.bat$'):
        key = re.sub(ext_pattern, '', key)
    key = re.sub(r'[^a-z0-9]+', '-', key)
    return key.strip('-')
def make_id(name):
    """Build a kebab-case identifier; script extensions survive as suffixes."""
    ident = name.lower().strip()
    # Keep .py/.pl/.bat visible in the ID as -py/-pl/-bat.
    for ext_pattern, suffix in (
        (r'\.py$', '-py'),
        (r'\.pl$', '-pl'),
        (r'\.bat$', '-bat'),
    ):
        ident = re.sub(ext_pattern, suffix, ident)
    ident = re.sub(r'[^a-z0-9]+', '-', ident)
    return ident.strip('-')
def load_for610():
    """Read the FOR610 tools YAML and return its tool list."""
    with open(FOR610_TOOLS) as handle:
        parsed = yaml.safe_load(handle)
    return parsed.get("tools", [])
def load_salt_states():
    """Read parsed salt-states YAML; return [] when the file is absent."""
    if not os.path.exists(SALT_STATES):
        print(f" Warning: {SALT_STATES} not found, skipping")
        return []
    with open(SALT_STATES) as handle:
        parsed = yaml.safe_load(handle)
    return parsed.get("tools", [])
def load_remnux_docs():
    """Read scraped REMnux docs YAML; return [] when the file is absent."""
    if not os.path.exists(REMNUX_DOCS):
        print(f" Warning: {REMNUX_DOCS} not found, skipping")
        return []
    with open(REMNUX_DOCS) as handle:
        parsed = yaml.safe_load(handle)
    return parsed.get("tools", [])
def build_lookup_index(master_tools):
    """Map every known key (id, normalized name, aliases) to a canonical id."""
    lookup = {}
    for entry in master_tools:
        canonical = entry["id"]
        keys = [canonical, normalize_name(entry["name"])]
        keys.extend(normalize_name(alias) for alias in entry.get("aliases", []))
        for key in keys:
            lookup[key] = canonical
    return lookup
def find_match(name, index):
    """Resolve *name* to a canonical tool id, or None when nothing matches.

    Lookup order: manual NAME_OVERRIDES, exact normalized key, the key with
    a '-py' suffix, and finally the key with a trailing version number
    stripped (e.g. 'tool-2' -> 'tool').
    """
    key = normalize_name(name)
    # Overrides win; fall back to the override id itself when it is not
    # (yet) present in the index.
    override = NAME_OVERRIDES.get(key)
    if override is not None:
        return index.get(override, override)
    for candidate in (key, key + "-py"):
        if candidate in index:
            return index[candidate]
    bare = re.sub(r'-?\d+$', '', key)
    if bare and bare in index:
        return index[bare]
    return None
def compute_help_tier(tool):
    """Rank documentation coverage: FOR610 > REMnux docs > salt-states > stub."""
    sources = tool.get("sources", {})

    def covered(source_key):
        return sources.get(source_key, {}).get("covered", False)

    # Precedence order matters: the richest source wins.
    for source_key, tier in (
        ("for610", "rich"),
        ("remnux_docs", "standard"),
        ("salt_states", "basic"),
    ):
        if covered(source_key):
            return tier
    return "stub"
def main():
    """Merge FOR610, salt-states and REMnux-docs data into tools-master.yaml.

    FOR610 entries form the base inventory; salt-states and docs entries are
    matched against it (new entries are created when unmatched), manual
    enrichments are applied, derived coverage fields are computed, and the
    merged inventory plus summary metadata is written to OUTPUT.
    """
    print("Building master tool inventory...")
    # --- Step 1: Load FOR610 tools as base ---
    print("\n1. Loading FOR610 tools...")
    for610_tools = load_for610()
    print(f" Loaded {len(for610_tools)} tools")
    master = {}
    for t in for610_tools:
        tid = t["id"]
        entry = {
            "id": tid,
            "name": t["name"],
            "aliases": t.get("aliases", []),
            "description": t.get("description", ""),
            "in_remnux": t.get("in_remnux", False),
            "platform": t.get("platform", "linux"),
            "sources": {
                "for610": {
                    "covered": True,
                    "description": t.get("description", ""),
                    "category": t.get("category", ""),
                    "labs": t.get("labs", []),
                    "sections": t.get("for610_sections", []),
                    "typical_usage": t.get("typical_usage", []),
                    "tags": t.get("tags", []),
                },
                "salt_states": {"covered": False},
                "remnux_docs": {"covered": False},
            },
        }
        if t.get("author"):
            entry["sources"]["for610"]["author"] = t["author"]
        master[tid] = entry
    # --- Step 2: Merge salt-states ---
    print("\n2. Loading salt-states...")
    salt_tools = load_salt_states()
    print(f" Loaded {len(salt_tools)} entries")
    index = build_lookup_index(list(master.values()))
    salt_matched = 0
    salt_new = 0
    for st in salt_tools:
        st_id = st["id"]
        st_names = st.get("package_names", [st_id])
        # Try to match against existing tools by id first, then package names
        matched_id = None
        for name in [st_id] + st_names:
            matched_id = find_match(name, index)
            if matched_id:
                break
        if matched_id and matched_id in master:
            # Enrich existing tool; anything installed by salt is in REMnux
            master[matched_id]["sources"]["salt_states"] = {
                "covered": True,
                "install_method": st.get("install_method", "unknown"),
                "package_name": st_names[0] if st_names else st_id,
                "salt_state_path": st.get("salt_state_path", ""),
            }
            master[matched_id]["in_remnux"] = True
            salt_matched += 1
        else:
            # Create new tool entry
            new_id = make_id(st_id)
            # Check if override maps to something we don't have yet
            if normalize_name(st_id) in NAME_OVERRIDES:
                new_id = NAME_OVERRIDES[normalize_name(st_id)]
            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": st_id,
                    "aliases": [n for n in st_names if n != st_id][:3],
                    "description": "",
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": {
                            "covered": True,
                            "install_method": st.get("install_method", "unknown"),
                            "package_name": st_names[0] if st_names else st_id,
                            "salt_state_path": st.get("salt_state_path", ""),
                        },
                        "remnux_docs": {"covered": False},
                    },
                }
                # Update index so later salt entries can match this tool
                index[new_id] = new_id
                index[normalize_name(st_id)] = new_id
                for n in st_names:
                    index[normalize_name(n)] = new_id
                salt_new += 1
            else:
                # Already exists under the override ID
                master[new_id]["sources"]["salt_states"] = {
                    "covered": True,
                    "install_method": st.get("install_method", "unknown"),
                    "package_name": st_names[0] if st_names else st_id,
                    "salt_state_path": st.get("salt_state_path", ""),
                }
                salt_matched += 1
    print(f" Matched: {salt_matched}, New: {salt_new}")
    # --- Step 3: Merge REMnux docs ---
    print("\n3. Loading REMnux docs...")
    doc_tools = load_remnux_docs()
    print(f" Loaded {len(doc_tools)} entries")
    # Rebuild index after salt-states additions
    index = build_lookup_index(list(master.values()))
    docs_matched = 0
    docs_new = 0
    for dt in doc_tools:
        dt_name = dt.get("name", "")
        dt_id = dt.get("id", make_id(dt_name))
        matched_id = find_match(dt_name, index)
        if not matched_id:
            matched_id = find_match(dt_id, index)
        if matched_id and matched_id in master:
            # Enrich existing tool
            doc_entry = {
                "covered": True,
                "category": dt.get("category", ""),
                "description": dt.get("description", ""),
                "docs_url": dt.get("docs_url", ""),
            }
            if dt.get("website"):
                doc_entry["website"] = dt["website"]
            if dt.get("anchor"):
                doc_entry["anchor"] = dt["anchor"]
            master[matched_id]["sources"]["remnux_docs"] = doc_entry
            # Use REMnux docs description if we don't have one
            if not master[matched_id]["description"] and dt.get("description"):
                master[matched_id]["description"] = dt["description"]
            docs_matched += 1
        else:
            # Create new entry
            new_id = make_id(dt_name) if dt_name else dt_id
            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": dt_name,
                    "aliases": [],
                    "description": dt.get("description", ""),
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": {"covered": False},
                        "remnux_docs": {
                            "covered": True,
                            "category": dt.get("category", ""),
                            "description": dt.get("description", ""),
                            "docs_url": dt.get("docs_url", ""),
                        },
                    },
                }
                if dt.get("website"):
                    master[new_id]["sources"]["remnux_docs"]["website"] = dt["website"]
                index[new_id] = new_id
                index[normalize_name(dt_name)] = new_id
                docs_new += 1
            else:
                master[new_id]["sources"]["remnux_docs"] = {
                    "covered": True,
                    "category": dt.get("category", ""),
                    "description": dt.get("description", ""),
                    "docs_url": dt.get("docs_url", ""),
                }
                docs_matched += 1
    print(f" Matched: {docs_matched}, New: {docs_new}")
    # --- Step 4: Apply manual enrichments ---
    print("\n4. Applying manual enrichments...")
    if os.path.exists(ENRICHMENTS):
        with open(ENRICHMENTS) as f:
            enrich_data = yaml.safe_load(f)
        enrichments = enrich_data.get("enrichments", {})
        enriched = 0
        for tool_key, enrich in enrichments.items():
            # Find the tool in master by key or normalized name
            matched_id = find_match(tool_key, index)
            if not matched_id:
                matched_id = tool_key
            if matched_id in master:
                tool = master[matched_id]
                # An enrichment description always wins.  (The original
                # if/elif here had two identical branches; collapsed.)
                if enrich.get("description"):
                    tool["description"] = enrich["description"]
                # Add usage examples to for610 source (or create enrichment source)
                if enrich.get("typical_usage"):
                    if not tool["sources"]["for610"].get("covered"):
                        tool["sources"]["for610"]["covered"] = True
                        tool["sources"]["for610"]["typical_usage"] = enrich["typical_usage"]
                        tool["sources"]["for610"]["tags"] = enrich.get("tags", [])
                        tool["sources"]["for610"]["description"] = enrich.get("description", "")
                    else:
                        # Merge usage examples, keeping order and skipping dupes
                        existing = tool["sources"]["for610"].get("typical_usage", [])
                        for u in enrich["typical_usage"]:
                            if u not in existing:
                                existing.append(u)
                        tool["sources"]["for610"]["typical_usage"] = existing
                enriched += 1
            else:
                print(f" Warning: enrichment key '{tool_key}' not found in master")
        print(f" Enriched: {enriched} tools")
    else:
        print(" No enrichments file found, skipping")
    # Rebuild index after enrichments
    index = build_lookup_index(list(master.values()))
    # --- Step 5: Compute derived fields ---
    print("\n5. Computing derived fields...")
    for tool in master.values():
        tool["has_for610_coverage"] = tool["sources"]["for610"].get("covered", False)
        tool["has_remnux_docs"] = tool["sources"]["remnux_docs"].get("covered", False)
        tool["has_salt_state"] = tool["sources"]["salt_states"].get("covered", False)
        tool["help_tier"] = compute_help_tier(tool)
    # --- Step 6: Sort and output ---
    tools_list = sorted(master.values(), key=lambda t: t["id"])
    # Remove windows-only/online tools that aren't in remnux
    # (keep them for reference but flag appropriately)
    tiers = {}
    for t in tools_list:
        tier = t["help_tier"]
        tiers[tier] = tiers.get(tier, 0) + 1
    output = {
        "metadata": {
            "total_tools": len(tools_list),
            "in_remnux_count": sum(1 for t in tools_list if t["in_remnux"]),
            "help_tier_counts": tiers,
            "source_coverage": {
                "for610_only": sum(1 for t in tools_list if t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
                "remnux_docs_only": sum(1 for t in tools_list if t["has_remnux_docs"] and not t["has_for610_coverage"] and not t["has_salt_state"]),
                "salt_states_only": sum(1 for t in tools_list if t["has_salt_state"] and not t["has_for610_coverage"] and not t["has_remnux_docs"]),
                "all_three": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_remnux_docs"] and t["has_salt_state"]),
                "for610_and_docs": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_remnux_docs"]),
                "for610_and_salt": sum(1 for t in tools_list if t["has_for610_coverage"] and t["has_salt_state"]),
                "docs_and_salt": sum(1 for t in tools_list if t["has_remnux_docs"] and t["has_salt_state"]),
                "no_coverage": sum(1 for t in tools_list if not t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
            },
        },
        "tools": tools_list,
    }
    with open(OUTPUT, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
    print(f"\n{'='*50}")
    print(f"MASTER INVENTORY BUILT: {len(tools_list)} tools")
    print(f" In REMnux: {output['metadata']['in_remnux_count']}")
    print(f"\nHelp Tiers:")
    for tier, count in sorted(tiers.items()):
        print(f" {tier}: {count}")
    print(f"\nSource Coverage:")
    for key, val in output["metadata"]["source_coverage"].items():
        print(f" {key}: {val}")
    print(f"\nOutput: {OUTPUT}")
if __name__ == "__main__":
    main()
+122
View File
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
"""Generate coverage report from the master tool inventory.
Reads data/remnux/tools-master.yaml and produces:
- data/generated/coverage-report.md (human-readable)
- data/remnux/coverage-report.yaml (machine-readable)
"""
import os
import yaml
# Input/output locations, resolved relative to the repo root (one level
# above this script's directory).
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
MASTER = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")
MD_OUTPUT = os.path.join(BASE_DIR, "data", "generated", "coverage-report.md")
YAML_OUTPUT = os.path.join(BASE_DIR, "data", "remnux", "coverage-report.yaml")
def main():
    """Produce markdown and YAML coverage reports from tools-master.yaml.

    Reads MASTER, buckets tools by help tier and source coverage, then
    writes a human-readable report to MD_OUTPUT and a machine-readable
    summary to YAML_OUTPUT.
    """
    with open(MASTER) as f:
        data = yaml.safe_load(f)
    tools = data["tools"]
    meta = data["metadata"]
    # Classify tools
    remnux_tools = [t for t in tools if t.get("in_remnux")]
    rich = [t for t in tools if t["help_tier"] == "rich"]
    standard = [t for t in tools if t["help_tier"] == "standard"]
    basic = [t for t in tools if t["help_tier"] == "basic"]
    stub = [t for t in tools if t["help_tier"] == "stub"]
    # Tools in REMnux with no good help
    needs_help = [t for t in remnux_tools if t["help_tier"] in ("basic", "stub")]
    needs_help.sort(key=lambda t: t["name"])
    # Tools with FOR610 coverage (richest help)
    for610_covered = [t for t in remnux_tools if t.get("has_for610_coverage")]
    for610_covered.sort(key=lambda t: t["name"])
    # Tools with REMnux docs only (decent help)
    docs_only = [t for t in remnux_tools if t.get("has_remnux_docs") and not t.get("has_for610_coverage")]
    docs_only.sort(key=lambda t: t["name"])
    # Generate markdown report
    lines = [
        "# Tool Coverage Report",
        "",
        "## Summary",
        "",
        f"| Metric | Count |",
        f"|--------|-------|",
        f"| Total tools in master inventory | {len(tools)} |",
        f"| Tools in REMnux container | {len(remnux_tools)} |",
        f"| Rich help (FOR610 coverage) | {len(rich)} |",
        f"| Standard help (REMnux docs) | {len(standard)} |",
        f"| Basic help (salt-states only) | {len(basic)} |",
        f"| Stub (no documentation) | {len(stub)} |",
        "",
        "## Source Overlap",
        "",
        f"| Combination | Count |",
        f"|-------------|-------|",
    ]
    # One table row per source-combination counter from the master metadata.
    for key, val in meta["source_coverage"].items():
        lines.append(f"| {key.replace('_', ' ')} | {val} |")
    lines += [
        "",
        "## Priority: REMnux Tools Needing Help",
        "",
        f"These {len(needs_help)} tools are installed in the container but have minimal or no documentation:",
        "",
    ]
    for t in needs_help:
        # Uppercase STUB to make fully-undocumented tools stand out.
        tier_badge = "basic" if t["help_tier"] == "basic" else "STUB"
        lines.append(f"- `{t['name']}` [{tier_badge}]")
    lines += [
        "",
        f"## Rich Help Tools ({len(for610_covered)} tools with FOR610 coverage)",
        "",
    ]
    for t in for610_covered:
        labs = t["sources"]["for610"].get("labs", [])
        lab_str = f" (Labs: {', '.join(labs)})" if labs else ""
        lines.append(f"- `{t['name']}`{lab_str}")
    lines += [
        "",
        f"## Standard Help Tools ({len(docs_only)} tools with REMnux docs only)",
        "",
    ]
    for t in docs_only:
        cat = t["sources"]["remnux_docs"].get("category", "")
        lines.append(f"- `{t['name']}` — {cat}")
    md_content = "\n".join(lines) + "\n"
    os.makedirs(os.path.dirname(MD_OUTPUT), exist_ok=True)
    with open(MD_OUTPUT, "w") as f:
        f.write(md_content)
    # Machine-readable YAML
    yaml_data = {
        "summary": meta,
        "needs_help": [{"id": t["id"], "name": t["name"], "tier": t["help_tier"]} for t in needs_help],
        "rich_tools": [{"id": t["id"], "name": t["name"]} for t in for610_covered],
        "standard_tools": [{"id": t["id"], "name": t["name"]} for t in docs_only],
    }
    with open(YAML_OUTPUT, "w") as f:
        yaml.dump(yaml_data, f, default_flow_style=False, sort_keys=False)
    print(f"Coverage report generated:")
    print(f" Markdown: {MD_OUTPUT}")
    print(f" YAML: {YAML_OUTPUT}")
    print(f"\n {len(remnux_tools)} REMnux tools:")
    print(f" {len(rich)} rich, {len(standard)} standard, {len(basic)} basic, {len(stub)} stub")
    print(f" {len(needs_help)} need better documentation")
if __name__ == "__main__":
    main()
+534
View File
@@ -0,0 +1,534 @@
#!/usr/bin/env python3
"""Generate all help artifacts from the master tool inventory.
Reads data/remnux/tools-master.yaml and data/for610/workflows.yaml to produce:
- data/generated/tools.db (pipe-delimited for find-tool)
- data/generated/cheatsheets/*.cheat (per-tool cheat sheets)
- data/generated/workflows/*.txt (workflow help files)
- data/generated/tldr/*.md (TLDR pages)
"""
import os
import re
import yaml
import textwrap
# Inputs (master inventory + FOR610 workflow/recipe YAML) and the generated
# output directory, all resolved relative to the repo root.
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
MASTER = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")
WORKFLOWS_SRC = os.path.join(BASE_DIR, "data", "for610", "workflows.yaml")
RECIPES_SRC = os.path.join(BASE_DIR, "data", "for610", "recipes.yaml")
GEN_DIR = os.path.join(BASE_DIR, "data", "generated")
def load_master():
    """Read and return the parsed master tool inventory."""
    with open(MASTER) as handle:
        return yaml.safe_load(handle)
def load_workflows():
    """Read and return the parsed FOR610 workflows YAML."""
    with open(WORKFLOWS_SRC) as handle:
        return yaml.safe_load(handle)
def load_recipes():
    """Read the recipes YAML; return an empty recipe set when it is absent."""
    if not os.path.exists(RECIPES_SRC):
        return {"recipes": []}
    with open(RECIPES_SRC) as handle:
        return yaml.safe_load(handle)
def build_recipe_index(recipes_data):
    """Index recipes by each participating tool id (plus a squashed variant)."""
    by_tool = {}
    for recipe in recipes_data.get("recipes", []):
        for tool_id in recipe.get("tools", []):
            by_tool.setdefault(tool_id, []).append(recipe)
            # Also register a separator-free key so e.g. 'pdf-parser'
            # content can be found under 'pdfparser'.
            squashed = tool_id.lower().replace("-", "").replace("_", "")
            if squashed != tool_id:
                by_tool.setdefault(squashed, []).append(recipe)
    return by_tool
# ============================================================
# tools.db generator
# ============================================================
def generate_tools_db(tools):
    """Write the pipe-delimited tools.db consumed by find-tool.

    One sorted line per in-REMnux tool: name|description|category|usage|tier.
    Returns the number of rows written.
    """
    output_path = os.path.join(GEN_DIR, "tools.db")
    rows = []
    for tool in tools:
        if not tool.get("in_remnux"):
            continue
        name = tool["name"]
        # Pipes and newlines would corrupt the delimited format; cap length.
        summary = tool.get("description", "").replace("|", "/").replace("\n", " ").strip()[:120]
        if not summary:
            summary = "(no description available)"
        docs_src = tool["sources"]["remnux_docs"]
        course_src = tool["sources"]["for610"]
        # Prefer the REMnux-docs category, then the FOR610 one.
        if docs_src.get("covered"):
            category = docs_src.get("category", "")
        elif course_src.get("covered"):
            category = course_src.get("category", "")
        else:
            category = ""
        # Best usage example: first FOR610 typical_usage, else '--help'.
        example = ""
        if course_src.get("covered"):
            known_usages = course_src.get("typical_usage", [])
            if known_usages:
                example = known_usages[0]
        if not example:
            example = f"{name} --help"
        example = example.replace("|", " ").strip()
        tier = tool.get("help_tier", "stub")
        rows.append(f"{name}|{summary}|{category}|{example}|{tier}")
    rows.sort()
    with open(output_path, "w") as handle:
        handle.write("\n".join(rows) + "\n")
    print(f" tools.db: {len(rows)} entries")
    return len(rows)
# ============================================================
# Cheatsheet generator
# ============================================================
def sanitize_filename(name):
    """Lower-case *name* and replace filesystem-unsafe characters with dashes."""
    cleaned = re.sub(r'[^a-zA-Z0-9._-]', '-', name)
    return cleaned.strip('-').lower()
def generate_usage_comment(name, usage, index):
    """Produce a short descriptive comment for one usage example.

    The first example is always labelled "Basic usage"; later ones are
    described from their flags/operators, checked in a fixed priority order.
    (*name* is accepted for interface compatibility but not consulted.)
    """
    if index == 0:
        return "Basic usage"
    # Flag-based rules, checked in order against the raw command text.
    flag_rules = (
        (("-vv", "--verbose"), "Verbose output with details"),
        (("--no-static", "--no static"), "Skip static analysis, focus on dynamic"),
        (("-n ",), "Suppress default output"),
        (("-a ", "--all"), "Show all results"),
        (("-s ",), "Select specific item"),
        (("-d ",), "Dump/extract content"),
        (("-r ",), "Recursive/follow references"),
        (("-k ",), "Extract by keyword"),
        (("-o ",), "Output to file"),
        (("-f ",), "Process input file"),
        (("-i ",), "Case-insensitive search"),
    )
    for needles, comment in flag_rules:
        if any(needle in usage for needle in needles):
            return comment
    if "grep" in usage.lower():
        return "Filter output for specific pattern"
    if "--help" in usage:
        return "Show help"
    if "|" in usage:
        return "Pipe output for processing"
    if ">" in usage:
        return "Save output to file"
    return "Alternative usage"
def format_recipes_section(tool_id, recipe_index):
    """Render the multi-tool recipes section of a cheatsheet ('' when none)."""
    matches = recipe_index.get(tool_id, [])
    if not matches:
        # Fall back to id variants: without a '-py' suffix, then dash-free.
        for alt_id in (tool_id.replace("-py", ""), tool_id.replace("-", "")):
            matches = recipe_index.get(alt_id, [])
            if matches:
                break
    if not matches:
        return ""
    # Keep the first occurrence of each recipe id, preserving order.
    unique, seen_ids = [], set()
    for recipe in matches:
        if recipe["id"] in seen_ids:
            continue
        seen_ids.add(recipe["id"])
        unique.append(recipe)
    out = [
        "",
        "# --- Recipes (multi-tool chains) ---",
        "",
    ]
    for recipe in unique:
        out.append(f"# >> {recipe['name']}")
        out.extend(recipe.get("commands", []))
        out.append("")
    return "\n".join(out)
def generate_cheatsheet_rich(t, recipe_index=None):
    """Generate a rich cheatsheet for a tool with FOR610 coverage.

    Layout: '#' header comments (name, description, FOR610 metadata, docs
    URL), a '%' tag line for the cheatsheet browser, one commented command
    per typical_usage entry, and an optional recipes section.
    """
    f610 = t["sources"]["for610"]
    name = t["name"]
    desc = t.get("description", "")
    labs = f610.get("labs", [])
    sections = f610.get("sections", [])
    tags = f610.get("tags", [])
    usages = f610.get("typical_usage", [])
    author = f610.get("author", "")
    lines = [
        f"# {name}",
        f"# {desc}",
    ]
    # Collect FOR610 metadata into a single ' | '-joined header comment.
    meta_parts = []
    if labs:
        meta_parts.append(f"FOR610 Labs: {', '.join(labs)}")
    if sections:
        meta_parts.append(f"Sections: {', '.join(str(s) for s in sections)}")
    if author:
        meta_parts.append(f"Author: {author}")
    if meta_parts:
        lines.append(f"# {' | '.join(meta_parts)}")
    # REMnux docs URL if available
    if t["sources"]["remnux_docs"].get("covered"):
        url = t["sources"]["remnux_docs"].get("docs_url", "")
        if url:
            lines.append(f"# Docs: {url}")
    lines.append("")
    # Tags (at most 8; fall back to the lower-cased tool name)
    tag_str = ", ".join(tags[:8]) if tags else name.lower()
    lines.append(f"% {tag_str}")
    lines.append("")
    # Usage examples with descriptive comments
    for i, usage in enumerate(usages):
        comment = generate_usage_comment(name, usage, i)
        lines.append(f"# {comment}")
        lines.append(usage)
        lines.append("")
    # If no usage examples, add a basic one
    if not usages:
        lines.append(f"# Show help")
        lines.append(f"{name} --help")
        lines.append("")
    # Append recipes section if this tool participates in any recipes
    if recipe_index:
        recipes_text = format_recipes_section(t["id"], recipe_index)
        if recipes_text:
            lines.append(recipes_text)
    return "\n".join(lines)
def generate_cheatsheet_standard(t):
    """Build a cheatsheet for a tool documented only at docs.remnux.org."""
    docs = t["sources"]["remnux_docs"]
    name = t["name"]
    # Prefer the master description, falling back to the docs one.
    summary = t.get("description", "") or docs.get("description", "")
    header = [f"# {name}"]
    header.append(f"# {summary}" if summary else f"# {name} tool")
    category = docs.get("category", "")
    if category:
        header.append(f"# Category: {category}")
    docs_url = docs.get("docs_url", "")
    if docs_url:
        header.append(f"# Docs: {docs_url}")
    body = [
        "",
        f"% {sanitize_filename(name)}",
        "",
        f"# Show help for {name}",
        f"{name} --help",
        "",
    ]
    return "\n".join(header + body)
def generate_cheatsheet_basic(t):
    """Build a minimal cheatsheet for a tool known only from salt-states."""
    name = t["name"]
    salt = t["sources"]["salt_states"]
    install = salt.get("install_method", "unknown")
    pkg = salt.get("package_name", name)
    parts = [
        f"# {name}",
        f"# Installed via: {install} ({pkg})",
        "",
        f"% {sanitize_filename(name)}",
        "",
        f"# Show help for {name}",
        f"{name} --help",
        "",
    ]
    return "\n".join(parts)
def generate_cheatsheets(tools, recipe_index=None):
    """Write one .cheat file per REMnux tool; returns the number written."""
    cheat_dir = os.path.join(GEN_DIR, "cheatsheets")
    os.makedirs(cheat_dir, exist_ok=True)
    written = 0
    for tool in tools:
        if not tool.get("in_remnux"):
            continue
        # Pick the renderer matching the tool's documentation tier.
        tier = tool.get("help_tier", "stub")
        if tier == "rich":
            content = generate_cheatsheet_rich(tool, recipe_index=recipe_index)
        elif tier == "standard":
            content = generate_cheatsheet_standard(tool)
        else:
            content = generate_cheatsheet_basic(tool)
        target = os.path.join(cheat_dir, sanitize_filename(tool["name"]) + ".cheat")
        with open(target, "w") as handle:
            handle.write(content)
        written += 1
    print(f" cheatsheets: {written} .cheat files")
    return written
# ============================================================
# Workflow generator
# ============================================================
def _get_tool_examples(tool_name, master_tools_by_name):
"""Get 1-2 example commands for a tool from the master inventory."""
tool = master_tools_by_name.get(tool_name)
if not tool:
# Try kebab-case lookup
normalized = tool_name.lower().replace("_", "-")
tool = master_tools_by_name.get(normalized)
if tool and tool["sources"]["for610"].get("covered"):
usages = tool["sources"]["for610"].get("typical_usage", [])
return usages[:2]
return []
def generate_workflows(workflows_data, master_tools=None):
    """Generate readable workflow help files with inline examples.

    Writes one .txt file per workflow plus an index.txt listing all of
    them.  When *master_tools* is given, each step's tools are looked up
    (by name, id, and alias) so one example command per tool can be
    inlined into the step.
    """
    wf_dir = os.path.join(GEN_DIR, "workflows")
    os.makedirs(wf_dir, exist_ok=True)
    # Build tool name lookup for inline examples
    tools_by_name = {}
    if master_tools:
        for t in master_tools:
            tools_by_name[t["name"].lower()] = t
            tools_by_name[t["id"]] = t
            for alias in t.get("aliases", []):
                tools_by_name[alias.lower()] = t
    workflows = workflows_data.get("workflows", [])
    count = 0
    for wf in workflows:
        wf_id = wf["id"]
        name = wf["name"]
        desc = wf.get("description", "")
        steps = wf.get("steps", [])
        related_labs = wf.get("related_labs", [])
        lines = [
            f"{'='*60}",
            f" {name}",
            f"{'='*60}",
            "",
            f" {desc}",
            "",
        ]
        if related_labs:
            lines.append(f" Related FOR610 Labs: {', '.join(related_labs)}")
            lines.append("")
        # NOTE(review): ''*60 evaluates to an empty string, so this appends a
        # blank line — possibly a ruled separator (e.g. '-'*60) was intended;
        # confirm against the rendered output.
        lines.append(f"{''*60}")
        lines.append("")
        for step in steps:
            order = step.get("order", "?")
            step_name = step.get("name", "")
            step_desc = step.get("description", "")
            step_tools = step.get("tools", [])
            lines.append(f" Step {order}: {step_name}")
            if step_tools:
                lines.append(f" Tools: {', '.join(step_tools)}")
            if step_desc:
                wrapped = textwrap.fill(step_desc, width=56, initial_indent=" ", subsequent_indent=" ")
                lines.append(wrapped)
            # Add inline command examples for each tool
            if step_tools and tools_by_name:
                examples_shown = False
                for tool_name in step_tools:
                    examples = _get_tool_examples(tool_name, tools_by_name)
                    if examples:
                        if not examples_shown:
                            lines.append("")
                        for ex in examples[:1]:  # Show 1 example per tool
                            lines.append(f" $ {ex}")
                        examples_shown = True
            lines.append("")
        # NOTE(review): same empty ''*60 separator as above.
        lines.append(f"{''*60}")
        lines.append(f" Tip: 'fhelp cheat <tool>' for full examples")
        lines.append(f" 'Ctrl+G' for interactive cheatsheet browser")
        lines.append("")
        filename = wf_id.replace("_", "-") + ".txt"
        with open(os.path.join(wf_dir, filename), "w") as f:
            f.write("\n".join(lines))
        count += 1
    # Also generate an index file
    index_lines = [
        f"{'='*60}",
        f" Available Analysis Workflows",
        f"{'='*60}",
        "",
    ]
    for wf in workflows:
        wf_id = wf["id"].replace("_", "-")
        name = wf["name"]
        desc = wf.get("description", "")
        index_lines.append(f" {wf_id}")
        index_lines.append(f" {name}")
        wrapped = textwrap.fill(desc, width=56, initial_indent=" ", subsequent_indent=" ")
        index_lines.append(wrapped)
        index_lines.append("")
    index_lines += [
        f"{''*60}",
        f" Usage: fhelp workflow <name>",
        f" Example: fhelp workflow static-analysis",
        "",
    ]
    with open(os.path.join(wf_dir, "index.txt"), "w") as f:
        f.write("\n".join(index_lines))
    print(f" workflows: {count} workflow files + index")
    return count
# ============================================================
# TLDR generator
# ============================================================
def generate_tldr(tools):
    """Write TLDR-format pages for rich/standard REMnux tools; returns count."""
    tldr_dir = os.path.join(GEN_DIR, "tldr")
    os.makedirs(tldr_dir, exist_ok=True)
    written = 0
    for tool in tools:
        if not tool.get("in_remnux"):
            continue
        # Only tools with real documentation get a TLDR page.
        if tool.get("help_tier", "stub") not in ("rich", "standard"):
            continue
        name = tool["name"]
        summary = tool.get("description", "") or f"{name} tool"
        # Usage examples: FOR610 typical_usage, else a '--help' placeholder.
        examples = []
        if tool["sources"]["for610"].get("covered"):
            examples = tool["sources"]["for610"].get("typical_usage", [])
        if not examples:
            examples = [f"{name} --help"]
        page = [
            f"# {name}",
            "",
            f"> {summary}",
            "",
        ]
        for example in examples[:4]:
            page.append(f"- Run {name}:")
            page.append("")
            page.append(f"`{example}`")
            page.append("")
        target = os.path.join(tldr_dir, sanitize_filename(name) + ".md")
        with open(target, "w") as handle:
            handle.write("\n".join(page))
        written += 1
    print(f" tldr: {written} pages")
    return written
# ============================================================
# Main
# ============================================================
def main():
    """Drive generation of every help artifact from the master inventory."""
    print("Generating help artifacts from master inventory...")
    inventory = load_master()
    tool_list = inventory["tools"]
    wf_data = load_workflows()
    recipe_data = load_recipes()
    recipe_idx = build_recipe_index(recipe_data)
    print(f"\nInput: {len(tool_list)} tools, {len(wf_data.get('workflows', []))} workflows, {len(recipe_data.get('recipes', []))} recipes")
    print()
    n_db = generate_tools_db(tool_list)
    n_cheat = generate_cheatsheets(tool_list, recipe_index=recipe_idx)
    n_wf = generate_workflows(wf_data, master_tools=tool_list)
    n_tldr = generate_tldr(tool_list)
    print(f"\nAll artifacts generated in {GEN_DIR}/")
    print(f" tools.db: {n_db} entries")
    print(f" cheatsheets/: {n_cheat} files")
    print(f" workflows/: {n_wf} + index")
    print(f" tldr/: {n_tldr} pages")
if __name__ == "__main__":
    main()
+202
View File
@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""Parse REMnux salt-states repository to extract all installed tools/packages.
Fetches the salt-states repo tree from GitHub, parses .sls files to identify
what gets installed, and outputs data/remnux/sources/salt-states.yaml.
"""
import json
import re
import urllib.request
import yaml
import os
# GitHub endpoints for the REMnux/salt-states repository.
GITHUB_API = "https://api.github.com/repos/REMnux/salt-states"
RAW_BASE = "https://raw.githubusercontent.com/REMnux/salt-states/master"
# Output YAML lives under data/remnux/sources, relative to the repo root.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "salt-states.yaml")
def fetch_json(url):
    """GET *url* with a custom User-Agent and decode the body as JSON."""
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return json.loads(payload.decode())
def fetch_text(url):
    """GET *url* and return the body as text, or None on any fetch error."""
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return response.read().decode()
    except Exception as exc:
        # Best-effort fetch: report the failure and let the caller skip it.
        print(f" Warning: could not fetch {url}: {exc}")
        return None
def get_sls_files():
    """Return the paths of every .sls blob in the salt-states repository."""
    # The recursive tree listing contains every path in the repo in one call.
    listing = fetch_json(f"{GITHUB_API}/git/trees/master?recursive=1")
    return [entry["path"] for entry in listing["tree"]
            if entry["path"].endswith(".sls") and entry["type"] == "blob"]
def classify_sls_path(path):
    """Classify the install method from the directory structure of *path*."""
    lowered = path.lower()
    # First matching substring wins; more specific directory names come first
    # so that e.g. "python3-packages" maps to pip before the generic
    # "package" -> apt rule fires.
    rules = (
        ("python3-package", "pip"),
        ("python-package", "pip"),
        ("pip", "pip"),
        ("rubygem", "gem"),
        ("npm", "npm"),
        ("node", "npm"),
        ("perl-package", "perl"),
        ("package", "apt"),
        ("tools", "manual"),
        ("script", "script"),
    )
    for needle, method in rules:
        if needle in lowered:
            return method
    return "unknown"
def extract_tool_name_from_path(path):
    """Extract a human-readable tool name from the .sls file path.

    Returns None for orchestration/grouping files that do not correspond
    to a single tool.
    """
    non_tool_basenames = frozenset({
        "init", "addon", "cloud", "dedicated", "theme", "remnux-config",
        "apt-transport-https", "packages", "python3-packages", "python-packages",
        "rubygems", "perl-packages", "node-packages", "tools", "scripts",
    })
    stem = os.path.basename(path).replace(".sls", "")
    return None if stem in non_tool_basenames else stem
def parse_sls_content(content, path):
    """Parse a .sls file and extract package/tool information.

    Returns a list with at most one entry dict describing the tool the
    state installs; empty list when the file is unfetchable or not a tool.
    """
    if not content:
        return []
    tool_id = extract_tool_name_from_path(path)
    if not tool_id:
        return []
    method = classify_sls_path(path)

    # Collect candidate package names from several salt-state patterns.
    candidates = []
    # State IDs immediately followed by a pip/pkg/gem/npm .installed function.
    candidates.extend(
        m.group(1)
        for m in re.finditer(r'(\w[\w.-]+):\s*\n\s+(?:pip|pkg|gem|npm)\.installed', content)
    )
    # Explicit "- name: <pkg>" entries; skip jinja templates and absolute paths.
    for m in re.finditer(r'-\s+name:\s+([^\s#\n]+)', content):
        candidate = m.group(1).strip("'\"")
        if candidate and not candidate.startswith(('{', '/')):
            candidates.append(candidate)
    # Filenames downloaded via wget/curl (manual installs); ignore key files.
    for m in re.finditer(r'(?:wget|curl)\s+.*?/([^/\s"]+?)(?:\s|"|$)', content):
        filename = m.group(1)
        if '.' in filename and not filename.endswith('.key'):
            candidates.append(filename)
    # Scripts/binaries deployed into /usr/local/bin.
    candidates.extend(
        m.group(1) for m in re.finditer(r'/usr/local/bin/([^:\s]+)', content)
    )

    # Case-insensitive dedup preserving first-seen spelling and order.
    seen_lower = set()
    names = []
    for candidate in candidates:
        candidate = candidate.strip().strip("'\"")
        if candidate and len(candidate) > 1 and candidate.lower() not in seen_lower:
            seen_lower.add(candidate.lower())
            names.append(candidate)

    entry = {
        "id": tool_id,
        "package_names": names if names else [tool_id],
        "install_method": method,
        "salt_state_path": path,
    }
    # Heuristic: a literal False near onlyif/unless hints at a conditional install.
    if "False" in content and ("onlyif" in content.lower() or "unless" in content.lower()):
        entry["possibly_conditional"] = True
    return [entry]
def main():
    """Fetch the salt-states repo, parse every .sls file, and write the YAML inventory."""
    print("Fetching salt-states repository tree...")
    sls_files = get_sls_files()
    print(f"Found {len(sls_files)} .sls files")
    # Filter to relevant paths (skip top-level orchestration files)
    relevant = [f for f in sls_files if f.startswith("remnux/")]
    print(f" {len(relevant)} under remnux/")
    all_tools = []
    categories_seen = set()
    for i, path in enumerate(relevant):
        # Progress indicator: one status line per 20 files fetched.
        if i % 20 == 0:
            print(f" Processing {i}/{len(relevant)}...")
        # Derive category from path
        parts = path.split("/")
        if len(parts) >= 3:
            category_dir = parts[1]  # e.g., "python3-packages", "tools", "packages"
            categories_seen.add(category_dir)
        # One HTTP request per state file; fetch_text returns None on failure
        # and parse_sls_content treats None as "no tools".
        content = fetch_text(f"{RAW_BASE}/{path}")
        tools = parse_sls_content(content, path)
        all_tools.extend(tools)
    # Deduplicate by id
    seen_ids = set()
    unique_tools = []
    for t in all_tools:
        if t["id"] not in seen_ids:
            seen_ids.add(t["id"])
            unique_tools.append(t)
    # Sort by id
    unique_tools.sort(key=lambda t: t["id"])
    output = {
        "metadata": {
            "source": "https://github.com/REMnux/salt-states",
            "branch": "master",
            "total_sls_files": len(relevant),
            "total_tools_extracted": len(unique_tools),
            "install_method_counts": {},
            "salt_directories": sorted(categories_seen),
        },
        "tools": unique_tools,
    }
    # Count install methods
    for t in unique_tools:
        m = t["install_method"]
        output["metadata"]["install_method_counts"][m] = \
            output["metadata"]["install_method_counts"].get(m, 0) + 1
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
    print(f"\nDone! Extracted {len(unique_tools)} tools")
    for method, count in sorted(output["metadata"]["install_method_counts"].items()):
        print(f" {method}: {count}")
    print(f"Output: {OUTPUT_PATH}")
if __name__ == "__main__":
    main()
+226
View File
@@ -0,0 +1,226 @@
#!/usr/bin/env python3
"""Scrape REMnux documentation to extract all documented tools.
Fetches docs.remnux.org tool listing pages and extracts tool names,
descriptions, categories, and URLs. Outputs data/remnux/sources/remnux-docs.yaml.
"""
import re
import urllib.request
import yaml
import os
import time
BASE_URL = "https://docs.remnux.org/discover-the-tools"
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "remnux-docs.yaml")
# All known category pages from docs.remnux.org
CATEGORY_PAGES = [
# Examine Static Properties
("Examine Static Properties > General", "examine+static+properties/general"),
("Examine Static Properties > PE Files", "examine+static+properties/pe-files"),
("Examine Static Properties > ELF Files", "examine+static+properties/elf-files"),
("Examine Static Properties > .NET", "examine+static+properties/.net"),
("Examine Static Properties > Go", "examine+static+properties/go"),
("Examine Static Properties > Deobfuscation", "examine+static+properties/deobfuscation"),
# Statically Analyze Code
("Statically Analyze Code > General", "statically+analyze+code/general"),
("Statically Analyze Code > Unpacking", "statically+analyze+code/unpacking"),
("Statically Analyze Code > PE Files", "statically+analyze+code/pe-files"),
("Statically Analyze Code > Python", "statically+analyze+code/python"),
("Statically Analyze Code > Scripts", "statically+analyze+code/scripts"),
("Statically Analyze Code > Java", "statically+analyze+code/java"),
("Statically Analyze Code > .NET", "statically+analyze+code/.net"),
("Statically Analyze Code > Android", "statically+analyze+code/android"),
# Dynamically Reverse-Engineer Code
("Dynamically Reverse-Engineer Code > General", "dynamically+reverse-engineer+code/general"),
("Dynamically Reverse-Engineer Code > Shellcode", "dynamically+reverse-engineer+code/shellcode"),
("Dynamically Reverse-Engineer Code > Scripts", "dynamically+reverse-engineer+code/scripts"),
("Dynamically Reverse-Engineer Code > ELF Files", "dynamically+reverse-engineer+code/elf-files"),
# Memory Forensics
("Perform Memory Forensics", "perform+memory+forensics"),
# Network Interactions
("Explore Network Interactions > Monitoring", "explore+network+interactions/monitoring"),
("Explore Network Interactions > Connecting", "explore+network+interactions/connecting"),
("Explore Network Interactions > Services", "explore+network+interactions/services"),
# System Interactions
("Investigate System Interactions", "investigate+system+interactions"),
# Documents
("Analyze Documents > General", "analyze+documents/general"),
("Analyze Documents > PDF", "analyze+documents/pdf"),
("Analyze Documents > Microsoft Office", "analyze+documents/microsoft+office"),
("Analyze Documents > Email Messages", "analyze+documents/email+messages"),
# AI
("Use Artificial Intelligence", "use+artificial+intelligence"),
# Data
("Gather and Analyze Data", "gather+and+analyze+data"),
# View/Edit
("View or Edit Files", "view+or+edit+files"),
# Utilities
("General Utilities", "general+utilities"),
]
def fetch_page(url):
    """Fetch *url* and return its text content, or None if the request fails."""
    request = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (remnux-doc-scraper)",
        "Accept": "text/html,application/xhtml+xml",
    })
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            # Replace undecodable bytes rather than aborting the scrape.
            return response.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f" Warning: could not fetch {url}: {e}")
        return None
def normalize_id(name):
    """Convert a tool display name to a normalized kebab-case ID."""
    normalized = name.lower().strip()
    # Keep script-type suffixes visible in the ID (e.g. pdfid.py -> pdfid-py);
    # at most one of these anchored patterns can match.
    for suffix_pattern, replacement in (
        (r'\.py$', '-py'),
        (r'\.pl$', '-pl'),
        (r'\.bat$', '-bat'),
    ):
        normalized = re.sub(suffix_pattern, replacement, normalized)
    # Collapse every remaining non-alphanumeric run into a single hyphen.
    normalized = re.sub(r'[^a-z0-9]+', '-', normalized)
    return normalized.strip('-')
def extract_tools_from_html(html, category, category_path):
    """Extract tool entries from a docs page HTML.

    Args:
        html: raw HTML of one docs.remnux.org category page.
        category: human-readable category label (e.g. "Analyze Documents > PDF").
        category_path: URL path fragment of the page under BASE_URL.

    Returns:
        List of dicts with name/id/category/category_path/description/docs_url
        (plus anchor and optionally website when found via headings).
    """
    tools = []
    # GitBook pages use specific patterns for tool headings
    # Pattern 1: <h2> or <h3> headings with tool names
    # Pattern 2: Bold text followed by description
    # The docs use a pattern like: **Tool Name** description text
    # Try to find tool sections - GitBook uses specific div/section patterns
    # Look for heading patterns with tool names
    heading_pattern = re.compile(
        r'<h[23][^>]*id="([^"]*)"[^>]*>.*?<a[^>]*>.*?</a>\s*(.*?)\s*</h[23]>',
        re.DOTALL | re.IGNORECASE
    )
    # Also try plain text patterns
    # GitBook often renders as: tool-name followed by description
    bold_pattern = re.compile(
        r'<strong>(.*?)</strong>\s*[-:]\s*(.*?)(?=<(?:br|p|div|strong|h[23])|$)',
        re.DOTALL | re.IGNORECASE
    )
    # Find headings first
    for match in heading_pattern.finditer(html):
        anchor_id = match.group(1)
        heading_text = re.sub(r'<[^>]+>', '', match.group(2)).strip()
        # Length cap filters out page-level headings that are not tool names.
        if heading_text and len(heading_text) < 80:
            # Get description from content after heading
            # (a fixed 500-char window after the heading, tags stripped)
            pos = match.end()
            desc_chunk = html[pos:pos+500]
            desc_chunk = re.sub(r'<[^>]+>', ' ', desc_chunk)
            desc_chunk = re.sub(r'\s+', ' ', desc_chunk).strip()
            # Take first sentence
            desc = desc_chunk.split('.')[0].strip() + '.' if desc_chunk else ""
            if len(desc) > 200:
                desc = desc[:197] + "..."
            # Try to find website URL near this section
            # (first external link within 2000 chars; excludes docs.remnux links)
            website_chunk = html[pos:pos+2000]
            website_match = re.search(r'href="(https?://(?!docs\.remnux)[^"]+)"', website_chunk)
            website = website_match.group(1) if website_match else ""
            tool = {
                "name": heading_text,
                "id": normalize_id(heading_text),
                "category": category,
                "category_path": category_path,
                "description": desc,
                "docs_url": f"{BASE_URL}/{category_path}",
                "anchor": anchor_id,
            }
            if website:
                tool["website"] = website
            tools.append(tool)
    # If we got nothing from headings, try the bold pattern
    if not tools:
        for match in bold_pattern.finditer(html):
            name = re.sub(r'<[^>]+>', '', match.group(1)).strip()
            desc = re.sub(r'<[^>]+>', ' ', match.group(2)).strip()
            desc = re.sub(r'\s+', ' ', desc).strip()
            if name and len(name) < 80 and len(name) > 1:
                if len(desc) > 200:
                    desc = desc[:197] + "..."
                tools.append({
                    "name": name,
                    "id": normalize_id(name),
                    "category": category,
                    "category_path": category_path,
                    "description": desc,
                    "docs_url": f"{BASE_URL}/{category_path}",
                })
    return tools
def main():
    """Scrape every known docs.remnux.org category page and write the YAML inventory."""
    print("Scraping REMnux documentation...")
    all_tools = []
    for category, path in CATEGORY_PAGES:
        url = f"{BASE_URL}/{path}"
        print(f" Fetching: {category}")
        html = fetch_page(url)
        # fetch_page returns None on failure; skip the page rather than abort.
        if not html:
            print(f" Skipped (fetch failed)")
            continue
        tools = extract_tools_from_html(html, category, path)
        print(f" Found {len(tools)} tools")
        all_tools.extend(tools)
        time.sleep(0.3) # Be polite
    # Deduplicate by id (same tool can appear in multiple categories)
    seen = {}
    for t in all_tools:
        tid = t["id"]
        if tid not in seen:
            seen[tid] = t
        else:
            # Tool appears in multiple categories - track both
            existing = seen[tid]
            if "additional_categories" not in existing:
                existing["additional_categories"] = []
            existing["additional_categories"].append(t["category"])
    unique_tools = sorted(seen.values(), key=lambda t: t["id"])
    output = {
        "metadata": {
            "source": "https://docs.remnux.org/discover-the-tools",
            "categories_scraped": len(CATEGORY_PAGES),
            "total_tools_extracted": len(unique_tools),
            "category_counts": {},
        },
        "tools": unique_tools,
    }
    # Count per category
    # (counts pre-dedup occurrences, so a tool listed twice counts in both)
    for t in all_tools:
        cat = t["category"]
        output["metadata"]["category_counts"][cat] = \
            output["metadata"]["category_counts"].get(cat, 0) + 1
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)
    print(f"\nDone! Extracted {len(unique_tools)} unique tools from {len(CATEGORY_PAGES)} category pages")
    print(f"Output: {OUTPUT_PATH}")
if __name__ == "__main__":
    main()
+360
View File
@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""Comprehensive verification of generated help artifacts.
Tests:
1. All FOR610 tools with in_remnux=true have cheatsheets
2. All cheatsheet content matches researched data
3. All workflows are generated and contain correct tool references
4. tools.db entries match master inventory
5. No orphaned references (tools in labs but missing from master)
6. Rich-tier cheatsheets have usage examples from FOR610
7. REMnux docs tools have correct descriptions
"""
import os
import sys
import yaml
import glob
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
def load_yaml(path):
    """Read *path* and return its parsed YAML content."""
    with open(path) as handle:
        return yaml.safe_load(handle)
def test_master_inventory():
    """Verify master inventory integrity.

    Checks that every tool carries the required fields (id, name, sources,
    help_tier) and that no two tools share an id.  Returns the list of
    error strings found (empty when the inventory is clean).
    """
    from collections import Counter

    print("=" * 60)
    print("TEST 1: Master Inventory Integrity")
    print("=" * 60)
    errors = []
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    tools = master["tools"]
    # Check all tools have required fields
    for t in tools:
        tid = t.get("id", "MISSING")
        if not t.get("id"):
            errors.append(f"Tool missing id: {t}")
        if not t.get("name"):
            errors.append(f"Tool {tid} missing name")
        if "sources" not in t:
            errors.append(f"Tool {tid} missing sources")
        if "help_tier" not in t:
            errors.append(f"Tool {tid} missing help_tier")
    # Check no duplicate IDs.  Counter is O(n); the previous
    # ids.count(x)-per-element approach was accidentally O(n^2).
    id_counts = Counter(t["id"] for t in tools)
    dupes = {tid for tid, count in id_counts.items() if count > 1}
    if dupes:
        errors.append(f"Duplicate IDs: {dupes}")
    print(f" Total tools: {len(tools)}")
    print(f" Errors: {len(errors)}")
    for e in errors[:10]:
        print(f" ! {e}")
    return errors
def test_for610_coverage():
    """Verify all FOR610 in_remnux tools appear in master and have cheatsheets.

    Only tools missing from the master inventory count as errors; missing
    cheatsheets and missing usage examples are reported informationally.
    """
    print("\n" + "=" * 60)
    print("TEST 2: FOR610 Tool Coverage")
    print("=" * 60)
    errors = []
    for610 = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    master_ids = {t["id"] for t in master["tools"]}
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    cheat_files = {os.path.basename(f).replace(".cheat", "")
                   for f in glob.glob(os.path.join(cheat_dir, "*.cheat"))}
    for610_remnux = [t for t in for610["tools"] if t.get("in_remnux")]
    for610_all = for610["tools"]
    # Check all FOR610 in_remnux tools are in master
    missing_from_master = []
    for t in for610_remnux:
        if t["id"] not in master_ids:
            missing_from_master.append(t["id"])
            errors.append(f"FOR610 tool '{t['id']}' ({t['name']}) not in master inventory")
    # Check all FOR610 in_remnux tools have cheatsheets; sheets may be filed
    # under the kebab-cased display name, the id, or the lowercase name.
    missing_cheats = []
    for t in for610_remnux:
        name_variants = [
            t["name"].lower().replace(" ", "-"),
            t["id"],
            t["name"].lower(),
        ]
        if not any(v in cheat_files for v in name_variants):
            missing_cheats.append(t["name"])
    # Check rich-tier cheatsheets have usage examples
    rich_without_examples = []
    for t in for610_remnux:
        usages = t.get("typical_usage", [])
        cheat_path = os.path.join(cheat_dir, t["name"].lower().replace(" ", "-") + ".cheat")
        if not os.path.exists(cheat_path):
            cheat_path = os.path.join(cheat_dir, t["id"] + ".cheat")
        if os.path.exists(cheat_path):
            # Context manager closes the handle (the old open().read() leaked it).
            with open(cheat_path) as fh:
                content = fh.read()
            # Only the first documented usage example is required to appear.
            if usages and not any(u in content for u in usages[:1]):
                rich_without_examples.append(t["name"])
    print(f" FOR610 tools (all): {len(for610_all)}")
    print(f" FOR610 in REMnux: {len(for610_remnux)}")
    print(f" Missing from master: {len(missing_from_master)}")
    print(f" Missing cheatsheets: {len(missing_cheats)}")
    if missing_cheats:
        for m in missing_cheats[:5]:
            print(f" ! {m}")
    print(f" Rich without examples: {len(rich_without_examples)}")
    if rich_without_examples:
        for m in rich_without_examples[:5]:
            print(f" ! {m}")
    print(f" Errors: {len(errors)}")
    return errors
def test_tools_db():
    """Verify tools.db matches master inventory.

    All findings here are informational (printed); the returned error list
    stays empty so this check never fails the run on its own.
    """
    print("\n" + "=" * 60)
    print("TEST 3: tools.db Consistency")
    print("=" * 60)
    errors = []
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    remnux_tools = {t["name"]: t for t in master["tools"] if t.get("in_remnux")}
    db_path = os.path.join(BASE_DIR, "data/generated/tools.db")
    # tools.db is a pipe-delimited flat file: name|description|category|usage|tier
    db_entries = {}
    with open(db_path) as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            fields = raw.split("|")
            if len(fields) < 5:
                continue
            name, description, category, usage, tier = fields[:5]
            db_entries[name] = {
                "name": name,
                "description": description,
                "category": category,
                "usage": usage,
                "tier": tier,
            }
    # Every REMnux tool in the master inventory should have a DB row.
    missing_from_db = [name for name in remnux_tools if name not in db_entries]
    # Rows that fell back to the placeholder description.
    empty_descs = [entry["name"] for entry in db_entries.values()
                   if entry["description"] == "(no description available)"]
    # Help tier recorded in the DB must match the master inventory.
    tier_mismatches = []
    for name, entry in db_entries.items():
        master_tool = remnux_tools.get(name)
        if master_tool is None:
            continue
        expected_tier = master_tool.get("help_tier", "stub")
        if entry["tier"] != expected_tier:
            tier_mismatches.append(f"{name}: db={entry['tier']} vs master={expected_tier}")
    print(f" tools.db entries: {len(db_entries)}")
    print(f" REMnux tools in master: {len(remnux_tools)}")
    print(f" Missing from DB: {len(missing_from_db)}")
    if missing_from_db:
        for m in missing_from_db[:5]:
            print(f" ! {m}")
    print(f" Empty descriptions: {len(empty_descs)}")
    if empty_descs:
        for m in empty_descs[:5]:
            print(f" ! {m}")
    print(f" Tier mismatches: {len(tier_mismatches)}")
    return errors
def test_workflows():
    """Verify all workflow files are generated and contain valid tool references.

    Returns error strings for missing workflow files, a missing index, and
    suspiciously short workflow files.
    """
    print("\n" + "=" * 60)
    print("TEST 4: Workflow Files")
    print("=" * 60)
    errors = []
    wf_src = load_yaml(os.path.join(BASE_DIR, "data/for610/workflows.yaml"))
    wf_dir = os.path.join(BASE_DIR, "data/generated/workflows")
    expected_workflows = wf_src.get("workflows", [])
    generated = glob.glob(os.path.join(wf_dir, "*.txt"))
    generated_names = {os.path.basename(f).replace(".txt", "") for f in generated}
    # Check all workflows generated (workflow ids use underscores, files use dashes)
    for wf in expected_workflows:
        wf_id = wf["id"].replace("_", "-")
        if wf_id not in generated_names:
            errors.append(f"Missing workflow file: {wf_id}.txt")
    # Check index file exists
    if "index" not in generated_names:
        errors.append("Missing workflow index.txt")
    # Check each workflow file has content
    for f in generated:
        # Context manager closes the handle (the old open().read() leaked it).
        with open(f) as fh:
            content = fh.read()
        if len(content) < 50:
            errors.append(f"Workflow file too short: {os.path.basename(f)}")
    print(f" Expected workflows: {len(expected_workflows)}")
    print(f" Generated files: {len(generated)} (including index)")
    print(f" Errors: {len(errors)}")
    for e in errors:
        print(f" ! {e}")
    return errors
def test_lab_tool_references():
    """Verify all tools referenced in labs exist in the FOR610 and master inventories.

    A lab tool missing from FOR610 is a hard error.  A lab tool that exists
    in FOR610 but not in the merged master inventory is reported
    informationally (the merge step may have renamed or dropped it) without
    failing the run, preserving the original pass/fail behavior.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Lab-Tool Cross-References")
    print("=" * 60)
    errors = []
    labs = load_yaml(os.path.join(BASE_DIR, "data/for610/labs.yaml"))
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    master_ids = {t["id"] for t in master["tools"]}
    for610_tools = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    for610_ids = {t["id"] for t in for610_tools["tools"]}
    # Check all tool_ids in labs exist in FOR610 (hard error).
    missing = set()
    # Fix: master_ids was previously loaded but never consulted, despite the
    # docstring claiming a master-inventory check.
    not_in_master = set()
    for lab in labs["labs"]:
        for tu in lab.get("tools_used", []):
            tid = tu["tool_id"]
            if tid not in for610_ids:
                missing.add(f"Lab {lab['id']}: tool '{tid}'")
                errors.append(f"Lab {lab['id']} references unknown tool: {tid}")
            elif tid not in master_ids:
                not_in_master.add(tid)
    print(f" Labs: {len(labs['labs'])}")
    print(f" Missing tool references: {len(missing)}")
    for m in sorted(missing)[:5]:
        print(f" ! {m}")
    if not_in_master:
        print(f" Lab tools absent from master inventory: {len(not_in_master)}")
        for m in sorted(not_in_master)[:5]:
            print(f" ! {m}")
    return errors
def test_remnux_docs_coverage():
    """Check how many REMnux-documented tools have help content.

    Purely informational: prints coverage statistics and always returns an
    empty error list.
    """
    print("\n" + "=" * 60)
    print("TEST 6: REMnux Docs Coverage in Help")
    print("=" * 60)
    errors = []
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    docs_tools = [t for t in master["tools"]
                  if t["sources"]["remnux_docs"].get("covered") and t.get("in_remnux")]
    docs_with_cheat = 0
    docs_without_cheat = []
    for t in docs_tools:
        # A cheatsheet may be filed under the kebab-cased display name or the
        # tool id.  (Removed the unused `variants` local, which listed a
        # third spelling the check never actually consulted.)
        kebab_name = t["name"].lower().replace(" ", "-")
        found = any(os.path.exists(os.path.join(cheat_dir, v + ".cheat"))
                    for v in (kebab_name, t["id"]))
        if found:
            docs_with_cheat += 1
        else:
            docs_without_cheat.append(t["name"])
    print(f" REMnux-documented tools: {len(docs_tools)}")
    print(f" With cheatsheets: {docs_with_cheat}")
    print(f" Without cheatsheets: {len(docs_without_cheat)}")
    if docs_without_cheat:
        for m in docs_without_cheat[:5]:
            print(f" ! {m}")
    return errors
def test_cheatsheet_quality():
    """Spot-check cheatsheet content for key tools.

    Each key tool must have a cheatsheet containing a fixed set of expected
    command strings; misses are returned as errors.
    """
    print("\n" + "=" * 60)
    print("TEST 7: Cheatsheet Quality Spot-Checks")
    print("=" * 60)
    errors = []
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    # Key tools that MUST have good cheatsheets
    key_tools = {
        "pdfid.py": ["pdfid.py", "document.pdf"],
        "pdf-parser.py": ["pdf-parser.py", "-a", "-s"],
        "oledump.py": ["oledump.py", "-s", "-v"],
        "capa": ["capa", "specimen"],
        "speakeasy": ["speakeasy", "-t"],
        "ghidra": ["ghidra"],
        "wireshark": ["wireshark"],
        "floss": ["floss"],
        "scdbgc": ["scdbgc", "/f"],
        "rtfdump.py": ["rtfdump.py"],
    }
    for tool, expected_strings in key_tools.items():
        cheat_path = os.path.join(cheat_dir, tool + ".cheat")
        if not os.path.exists(cheat_path):
            # Try without .py (sheets may use the "-py" naming convention)
            alt = tool.replace(".py", "-py") + ".cheat"
            cheat_path = os.path.join(cheat_dir, alt)
        if not os.path.exists(cheat_path):
            errors.append(f"Key tool {tool} has no cheatsheet")
            print(f" ! {tool}: NO CHEATSHEET")
            continue
        # Context manager closes the handle (the old open().read() leaked it).
        with open(cheat_path) as fh:
            content = fh.read()
        missing_strings = [s for s in expected_strings if s not in content]
        if missing_strings:
            errors.append(f"{tool} cheatsheet missing: {missing_strings}")
            print(f" ! {tool}: missing {missing_strings}")
        else:
            print(f" + {tool}: OK")
    return errors
def main():
    """Run every verification suite and exit nonzero when any issue is found."""
    suites = (
        test_master_inventory,
        test_for610_coverage,
        test_tools_db,
        test_workflows,
        test_lab_tool_references,
        test_remnux_docs_coverage,
        test_cheatsheet_quality,
    )
    all_errors = []
    for suite in suites:
        all_errors.extend(suite())
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    if all_errors:
        print(f"\n Total issues found: {len(all_errors)}")
        for e in all_errors:
            print(f" - {e}")
        sys.exit(1)
    else:
        print("\n All tests passed!")
        sys.exit(0)
if __name__ == "__main__":
    main()