f3ccc09c3d
Build comprehensive malware analysis knowledge base from 3 sources: - SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes - REMnux salt-states: 340 packages parsed from GitHub - REMnux docs: 280+ tools scraped from docs.remnux.org Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic). Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
361 lines
12 KiB
Python
361 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Comprehensive verification of generated help artifacts.
|
|
|
|
Tests:
|
|
1. All FOR610 tools with in_remnux=true have cheatsheets
|
|
2. All cheatsheet content matches researched data
|
|
3. All workflows are generated and contain correct tool references
|
|
4. tools.db entries match master inventory
|
|
5. No orphaned references (tools in labs but missing from master)
|
|
6. Rich-tier cheatsheets have usage examples from FOR610
|
|
7. REMnux docs tools have correct descriptions
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import glob
|
|
|
|
# Parent of the directory that contains this script; every data path used by
# the checks in this file is joined onto it.
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
|
|
|
|
def load_yaml(path):
    """Parse the YAML file at *path* and return the resulting Python object.

    The file is opened explicitly as UTF-8 text so that parsing does not
    depend on the platform's default locale encoding.

    Args:
        path: Filesystem path of the YAML document to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(path, encoding="utf-8") as handle:
        return yaml.safe_load(handle)
|
|
|
|
|
|
def test_master_inventory():
    """Verify master inventory integrity.

    Loads ``data/remnux/tools-master.yaml`` (relative to ``BASE_DIR``) and
    checks that every tool entry has a non-empty ``id`` and ``name`` plus
    ``sources`` and ``help_tier`` keys, and that no ``id`` is shared by more
    than one entry. A summary and at most ten error messages are printed.

    Returns:
        list[str]: One message per problem found; empty when nothing is wrong.
    """
    # Local import: only this check needs Counter.
    from collections import Counter

    print("=" * 60)
    print("TEST 1: Master Inventory Integrity")
    print("=" * 60)
    errors = []

    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    tools = master["tools"]

    # Check all tools have required fields
    for t in tools:
        tid = t.get("id", "MISSING")
        if not t.get("id"):
            errors.append(f"Tool missing id: {t}")
        if not t.get("name"):
            errors.append(f"Tool {tid} missing name")
        if "sources" not in t:
            errors.append(f"Tool {tid} missing sources")
        if "help_tier" not in t:
            errors.append(f"Tool {tid} missing help_tier")

    # Check no duplicate IDs.  Entries without an id were already reported
    # above, so they are skipped here instead of raising KeyError; counting
    # once with Counter also avoids rescanning the list for every element.
    id_counts = Counter(t["id"] for t in tools if t.get("id"))
    dupes = {tool_id for tool_id, seen in id_counts.items() if seen > 1}
    if dupes:
        errors.append(f"Duplicate IDs: {dupes}")

    print(f" Total tools: {len(tools)}")
    print(f" Errors: {len(errors)}")
    for e in errors[:10]:
        print(f" ! {e}")
    return errors
|
|
|
|
|
|
def test_for610_coverage():
    """Verify all FOR610 in_remnux tools appear in master and have cheatsheets.

    Three things are examined for every tool in ``data/for610/tools.yaml``
    whose ``in_remnux`` flag is truthy:

    * its ``id`` exists in the master inventory,
    * a ``.cheat`` file exists under one of its name variants,
    * when the tool lists ``typical_usage`` entries and a cheatsheet exists,
      the first usage entry occurs in the cheatsheet text.

    Only tools missing from the master inventory are returned as errors;
    the other two findings are printed for information.

    Returns:
        list[str]: One message per FOR610 tool absent from the master
        inventory.
    """
    print("\n" + "=" * 60)
    print("TEST 2: FOR610 Tool Coverage")
    print("=" * 60)
    errors = []

    for610 = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    master_ids = {t["id"] for t in master["tools"]}

    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    # splitext removes only the trailing extension; str.replace would also
    # strip a ".cheat" that happens to occur in the middle of a file name.
    cheat_files = {
        os.path.splitext(os.path.basename(f))[0]
        for f in glob.glob(os.path.join(cheat_dir, "*.cheat"))
    }

    for610_all = for610["tools"]
    for610_remnux = [t for t in for610_all if t.get("in_remnux")]

    # Check all FOR610 in_remnux tools are in master
    missing_from_master = []
    for t in for610_remnux:
        if t["id"] not in master_ids:
            missing_from_master.append(t["id"])
            errors.append(f"FOR610 tool '{t['id']}' ({t['name']}) not in master inventory")

    # Check all FOR610 in_remnux tools have cheatsheets
    missing_cheats = []
    for t in for610_remnux:
        name_variants = [
            t["name"].lower().replace(" ", "-"),
            t["id"],
            t["name"].lower(),
        ]
        if not any(v in cheat_files for v in name_variants):
            missing_cheats.append(t["name"])

    # Check cheatsheets of tools with usage data contain the first example
    rich_without_examples = []
    for t in for610_remnux:
        usages = t.get("typical_usage", [])
        cheat_path = os.path.join(cheat_dir, t["name"].lower().replace(" ", "-") + ".cheat")
        if not os.path.exists(cheat_path):
            cheat_path = os.path.join(cheat_dir, t["id"] + ".cheat")
        if os.path.exists(cheat_path):
            # Context manager so the handle is closed deterministically.
            with open(cheat_path, encoding="utf-8") as fh:
                content = fh.read()
            if usages and not any(u in content for u in usages[:1]):
                rich_without_examples.append(t["name"])

    print(f" FOR610 tools (all): {len(for610_all)}")
    print(f" FOR610 in REMnux: {len(for610_remnux)}")
    print(f" Missing from master: {len(missing_from_master)}")
    print(f" Missing cheatsheets: {len(missing_cheats)}")
    if missing_cheats:
        for m in missing_cheats[:5]:
            print(f" ! {m}")
    print(f" Rich without examples: {len(rich_without_examples)}")
    if rich_without_examples:
        for m in rich_without_examples[:5]:
            print(f" ! {m}")
    print(f" Errors: {len(errors)}")
    return errors
|
|
|
|
|
|
def test_tools_db():
    """Verify tools.db matches master inventory.

    ``data/generated/tools.db`` is read as pipe-separated text; lines with at
    least five fields are indexed by their first field (the tool name).  The
    check then reports master tools flagged ``in_remnux`` that have no DB
    line, DB lines whose description is the placeholder
    ``(no description available)``, and DB lines whose tier differs from the
    master ``help_tier`` (``"stub"`` when the master entry has none).

    Returns:
        list[str]: Always empty; all findings are printed only.
    """
    print("\n" + "=" * 60)
    print("TEST 3: tools.db Consistency")
    print("=" * 60)
    # NOTE(review): nothing is ever appended to this list, so this check can
    # never fail the run — confirm that informational-only is intended.
    errors = []

    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    remnux_tools = {t["name"]: t for t in master["tools"] if t.get("in_remnux")}

    db_path = os.path.join(BASE_DIR, "data/generated/tools.db")
    db_entries = {}
    with open(db_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("|")
            # Lines with fewer than five fields are silently ignored.
            if len(parts) >= 5:
                db_entries[parts[0]] = {
                    "name": parts[0],
                    "description": parts[1],
                    "category": parts[2],
                    "usage": parts[3],
                    "tier": parts[4],
                }

    # Check all REMnux tools are in DB
    missing_from_db = [name for name in remnux_tools if name not in db_entries]

    # Check no empty descriptions
    empty_descs = [e["name"] for e in db_entries.values()
                   if e["description"] == "(no description available)"]

    # Check tier consistency
    tier_mismatches = []
    for name, entry in db_entries.items():
        if name in remnux_tools:
            expected_tier = remnux_tools[name].get("help_tier", "stub")
            if entry["tier"] != expected_tier:
                tier_mismatches.append(f"{name}: db={entry['tier']} vs master={expected_tier}")

    print(f" tools.db entries: {len(db_entries)}")
    print(f" REMnux tools in master: {len(remnux_tools)}")
    print(f" Missing from DB: {len(missing_from_db)}")
    if missing_from_db:
        for m in missing_from_db[:5]:
            print(f" ! {m}")
    print(f" Empty descriptions: {len(empty_descs)}")
    if empty_descs:
        for m in empty_descs[:5]:
            print(f" ! {m}")
    print(f" Tier mismatches: {len(tier_mismatches)}")
    return errors
|
|
|
|
|
|
def test_workflows():
    """Verify all workflow files are generated and have content.

    For every workflow in ``data/for610/workflows.yaml`` a ``.txt`` file named
    after its ``id`` (underscores turned into hyphens) must exist in
    ``data/generated/workflows``, together with ``index.txt``.  Every
    generated ``.txt`` file must hold at least 50 characters.

    Returns:
        list[str]: One message per missing or too-short workflow file.
    """
    print("\n" + "=" * 60)
    print("TEST 4: Workflow Files")
    print("=" * 60)
    errors = []

    wf_src = load_yaml(os.path.join(BASE_DIR, "data/for610/workflows.yaml"))
    wf_dir = os.path.join(BASE_DIR, "data/generated/workflows")

    expected_workflows = wf_src.get("workflows", [])
    generated = glob.glob(os.path.join(wf_dir, "*.txt"))
    # splitext removes only the trailing extension; str.replace would also
    # strip a ".txt" occurring inside the file name.
    generated_names = {os.path.splitext(os.path.basename(f))[0] for f in generated}

    # Check all workflows generated
    for wf in expected_workflows:
        wf_id = wf["id"].replace("_", "-")
        if wf_id not in generated_names:
            errors.append(f"Missing workflow file: {wf_id}.txt")

    # Check index file exists
    if "index" not in generated_names:
        errors.append("Missing workflow index.txt")

    # Check each workflow file has content
    for f in generated:
        # Context manager so each handle is closed before the next is opened.
        with open(f, encoding="utf-8") as fh:
            content = fh.read()
        if len(content) < 50:
            errors.append(f"Workflow file too short: {os.path.basename(f)}")

    print(f" Expected workflows: {len(expected_workflows)}")
    print(f" Generated files: {len(generated)} (including index)")
    print(f" Errors: {len(errors)}")
    for e in errors:
        print(f" ! {e}")
    return errors
|
|
|
|
|
|
def test_lab_tool_references():
    """Verify all tools referenced in labs exist in the FOR610 tool list.

    Each ``tool_id`` under a lab's ``tools_used`` in ``data/for610/labs.yaml``
    must match the ``id`` of a tool in ``data/for610/tools.yaml``.

    NOTE(review): the master inventory is not consulted here; an earlier
    revision loaded it without using it.  Confirm whether a master-inventory
    check was also intended.

    Returns:
        list[str]: One message per lab reference to an unknown tool id.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Lab-Tool Cross-References")
    print("=" * 60)
    errors = []

    labs = load_yaml(os.path.join(BASE_DIR, "data/for610/labs.yaml"))

    for610_tools = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    for610_ids = {t["id"] for t in for610_tools["tools"]}

    # Check all tool_ids in labs exist in FOR610
    missing = set()
    for lab in labs["labs"]:
        for tu in lab.get("tools_used", []):
            tid = tu["tool_id"]
            if tid not in for610_ids:
                # The set de-duplicates the printed summary; the error list
                # keeps one entry per offending reference.
                missing.add(f"Lab {lab['id']}: tool '{tid}'")
                errors.append(f"Lab {lab['id']} references unknown tool: {tid}")

    print(f" Labs: {len(labs['labs'])}")
    print(f" Missing tool references: {len(missing)}")
    for m in sorted(missing)[:5]:
        print(f" ! {m}")
    return errors
|
|
|
|
|
|
def test_remnux_docs_coverage():
    """Check how many REMnux-documented tools have help content.

    A master-inventory tool counts as REMnux-documented when its
    ``sources.remnux_docs.covered`` value and its ``in_remnux`` flag are both
    truthy.  For each such tool a cheatsheet is looked up under the
    lower-cased, hyphenated tool name and under the tool id.

    Returns:
        list[str]: Always empty; the coverage figures are printed only.
    """
    print("\n" + "=" * 60)
    print("TEST 6: REMnux Docs Coverage in Help")
    print("=" * 60)
    errors = []

    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")

    def _docs_covered(tool):
        # Entries lacking "sources" or "remnux_docs" count as not covered
        # instead of aborting the whole run with a KeyError.
        sources = tool.get("sources") or {}
        docs = sources.get("remnux_docs") or {}
        return docs.get("covered")

    docs_tools = [t for t in master["tools"]
                  if _docs_covered(t) and t.get("in_remnux")]
    docs_with_cheat = 0
    docs_without_cheat = []

    for t in docs_tools:
        name = t["name"].lower().replace(" ", "-")
        found = any(os.path.exists(os.path.join(cheat_dir, v + ".cheat"))
                    for v in (name, t["id"]))
        if found:
            docs_with_cheat += 1
        else:
            docs_without_cheat.append(t["name"])

    print(f" REMnux-documented tools: {len(docs_tools)}")
    print(f" With cheatsheets: {docs_with_cheat}")
    print(f" Without cheatsheets: {len(docs_without_cheat)}")
    if docs_without_cheat:
        for m in docs_without_cheat[:5]:
            print(f" ! {m}")
    return errors
|
|
|
|
|
|
def test_cheatsheet_quality():
    """Spot-check cheatsheet content for key tools.

    For a fixed set of tools, the cheatsheet in ``data/generated/cheatsheets``
    must exist and contain every expected substring listed for that tool.
    The per-tool outcome is printed as it is determined.

    Returns:
        list[str]: One message per key tool that has no cheatsheet or whose
        cheatsheet lacks expected strings.
    """
    print("\n" + "=" * 60)
    print("TEST 7: Cheatsheet Quality Spot-Checks")
    print("=" * 60)
    errors = []

    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")

    # Key tools that MUST have good cheatsheets
    key_tools = {
        "pdfid.py": ["pdfid.py", "document.pdf"],
        "pdf-parser.py": ["pdf-parser.py", "-a", "-s"],
        "oledump.py": ["oledump.py", "-s", "-v"],
        "capa": ["capa", "specimen"],
        "speakeasy": ["speakeasy", "-t"],
        "ghidra": ["ghidra"],
        "wireshark": ["wireshark"],
        "floss": ["floss"],
        "scdbgc": ["scdbgc", "/f"],
        "rtfdump.py": ["rtfdump.py"],
    }

    for tool, expected_strings in key_tools.items():
        cheat_path = os.path.join(cheat_dir, tool + ".cheat")
        if not os.path.exists(cheat_path):
            # Fall back to the file name with ".py" written as "-py"
            alt = tool.replace(".py", "-py") + ".cheat"
            cheat_path = os.path.join(cheat_dir, alt)

        if not os.path.exists(cheat_path):
            errors.append(f"Key tool {tool} has no cheatsheet")
            print(f" ! {tool}: NO CHEATSHEET")
            continue

        # Context manager so the handle is closed deterministically.
        with open(cheat_path, encoding="utf-8") as fh:
            content = fh.read()
        missing_strings = [s for s in expected_strings if s not in content]
        if missing_strings:
            errors.append(f"{tool} cheatsheet missing: {missing_strings}")
            print(f" ! {tool}: missing {missing_strings}")
        else:
            print(f" + {tool}: OK")

    return errors
|
|
|
|
|
|
def main():
    """Run every verification check in order and exit with a status code.

    The error lists returned by the seven checks are concatenated.  After a
    summary banner the process exits with status 1 (listing every issue) when
    any error was collected, and with status 0 otherwise.
    """
    checks = (
        test_master_inventory,
        test_for610_coverage,
        test_tools_db,
        test_workflows,
        test_lab_tool_references,
        test_remnux_docs_coverage,
        test_cheatsheet_quality,
    )

    all_errors = []
    for check in checks:
        all_errors += check()

    rule = "=" * 60
    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    # Guard clause: the success path leaves first.
    if not all_errors:
        print("\n All tests passed!")
        sys.exit(0)

    print(f"\n Total issues found: {len(all_errors)}")
    for issue in all_errors:
        print(f" - {issue}")
    sys.exit(1)
|
|
|
|
|
|
# Run the verification suite only when this file is executed as a script.
if __name__ == "__main__":
    main()
|