Files
docker_file_analysis/scripts/verify-help-quality.py
T
tobias f3ccc09c3d Add FOR610 tool/workflow knowledge base and data pipeline
Build comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224
TLDR pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 17:38:15 +01:00

361 lines
12 KiB
Python

#!/usr/bin/env python3
"""Comprehensive verification of generated help artifacts.
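Tests (numbered as printed by the script):
1. Master inventory entries have the required fields and unique IDs
2. All FOR610 tools with in_remnux=true are in the master inventory and have
cheatsheets; cheatsheets are checked for a usage example from FOR610
3. tools.db entries match master inventory (presence, descriptions, help tiers)
4. All workflows are generated, an index exists, and no workflow file is near-empty
5. No orphaned references (tools used in labs but missing from the FOR610 tool list)
6. REMnux docs tools have cheatsheets
7. Cheatsheets for key tools contain the expected command strings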
"""
import os
import sys
import yaml
import glob
# Parent of the directory that contains this script. Every data path used by
# the tests below is built by joining onto this value.
BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir)
def load_yaml(path):
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the locale's default encoding of the machine running the check.

    Args:
        path: Filesystem path of the YAML document to read.

    Returns:
        Whatever object ``yaml.safe_load`` produces for the document.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
def test_master_inventory():
    """Verify master inventory integrity.

    Loads ``data/remnux/tools-master.yaml`` and checks that every tool entry
    has a non-empty ``id`` and ``name`` plus ``sources`` and ``help_tier``
    keys, and that no ``id`` is used by more than one entry.

    Returns:
        list[str]: One message per problem found; empty when the inventory
        is clean. Only the first 10 messages are printed.
    """
    print("=" * 60)
    print("TEST 1: Master Inventory Integrity")
    print("=" * 60)
    errors = []
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    tools = master["tools"]

    seen_ids = set()
    duplicate_ids = set()
    # Check all tools have required fields
    for t in tools:
        tool_id = t.get("id")
        tid = t.get("id", "MISSING")
        if not tool_id:
            errors.append(f"Tool missing id: {t}")
        else:
            # Track duplicates in the same pass. Entries without an id were
            # already reported above, so they must not crash the check here.
            if tool_id in seen_ids:
                duplicate_ids.add(tool_id)
            seen_ids.add(tool_id)
        if not t.get("name"):
            errors.append(f"Tool {tid} missing name")
        if "sources" not in t:
            errors.append(f"Tool {tid} missing sources")
        if "help_tier" not in t:
            errors.append(f"Tool {tid} missing help_tier")

    # Check no duplicate IDs
    if duplicate_ids:
        errors.append(f"Duplicate IDs: {duplicate_ids}")

    print(f" Total tools: {len(tools)}")
    print(f" Errors: {len(errors)}")
    for e in errors[:10]:
        print(f" ! {e}")
    return errors
def test_for610_coverage():
    """Verify all FOR610 in_remnux tools appear in master and have cheatsheets.

    Three things are measured for every FOR610 tool flagged ``in_remnux``:

    * whether its ``id`` exists in the master inventory,
    * whether a ``.cheat`` file exists under one of its name variants,
    * whether its cheatsheet contains the first ``typical_usage`` entry.

    Returns:
        list[str]: One message per FOR610 tool missing from the master
        inventory. Missing cheatsheets and missing usage examples are
        printed but are not added to the returned list.
    """
    print("\n" + "=" * 60)
    print("TEST 2: FOR610 Tool Coverage")
    print("=" * 60)
    errors = []
    for610 = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    # .get keeps this test alive when a master entry lacks an id; that
    # condition is reported by the master inventory test.
    master_ids = {t.get("id") for t in master["tools"]}
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    # Strip only the trailing extension; str.replace would also remove a
    # ".cheat" that happens to appear in the middle of a file name.
    cheat_files = {
        os.path.splitext(os.path.basename(f))[0]
        for f in glob.glob(os.path.join(cheat_dir, "*.cheat"))
    }
    for610_all = for610["tools"]
    for610_remnux = [t for t in for610_all if t.get("in_remnux")]

    # Check all FOR610 in_remnux tools are in master
    missing_from_master = []
    for t in for610_remnux:
        if t["id"] not in master_ids:
            missing_from_master.append(t["id"])
            errors.append(f"FOR610 tool '{t['id']}' ({t['name']}) not in master inventory")

    # Check all FOR610 in_remnux tools have cheatsheets
    # NOTE(review): these are reported but never added to `errors`, so a
    # missing cheatsheet does not fail the run - confirm this is intended.
    missing_cheats = []
    for t in for610_remnux:
        name_variants = (
            t["name"].lower().replace(" ", "-"),
            t["id"],
            t["name"].lower(),
        )
        if not any(v in cheat_files for v in name_variants):
            missing_cheats.append(t["name"])

    # Check cheatsheets contain the first documented usage example
    # NOTE(review): despite the "rich" label, no help tier is consulted here.
    rich_without_examples = []
    for t in for610_remnux:
        usages = t.get("typical_usage", [])
        if not usages:
            # Nothing to look for, so there is no need to read the file.
            continue
        cheat_path = os.path.join(cheat_dir, t["name"].lower().replace(" ", "-") + ".cheat")
        if not os.path.exists(cheat_path):
            cheat_path = os.path.join(cheat_dir, t["id"] + ".cheat")
        if not os.path.exists(cheat_path):
            continue
        with open(cheat_path, encoding="utf-8") as f:
            content = f.read()
        if usages[0] not in content:
            rich_without_examples.append(t["name"])

    print(f" FOR610 tools (all): {len(for610_all)}")
    print(f" FOR610 in REMnux: {len(for610_remnux)}")
    print(f" Missing from master: {len(missing_from_master)}")
    print(f" Missing cheatsheets: {len(missing_cheats)}")
    for m in missing_cheats[:5]:
        print(f" ! {m}")
    print(f" Rich without examples: {len(rich_without_examples)}")
    for m in rich_without_examples[:5]:
        print(f" ! {m}")
    print(f" Errors: {len(errors)}")
    return errors
def test_tools_db():
    """Verify tools.db matches master inventory.

    ``tools.db`` is read as a text file with one ``|``-separated record per
    line; the first five fields are taken as name, description, category,
    usage and tier. Lines with fewer than five fields are ignored.

    Returns:
        list[str]: Contains a single message when ``tools.db`` does not
        exist. Otherwise the list is empty: missing entries, placeholder
        descriptions and tier mismatches are printed only.
    """
    print("\n" + "=" * 60)
    print("TEST 3: tools.db Consistency")
    print("=" * 60)
    errors = []
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    remnux_tools = {t["name"]: t for t in master["tools"] if t.get("in_remnux")}
    db_path = os.path.join(BASE_DIR, "data/generated/tools.db")

    # Report a missing database as a test failure instead of aborting the
    # whole run with a traceback before the summary is printed.
    if not os.path.isfile(db_path):
        errors.append(f"tools.db not found: {db_path}")
        print(f" ! {errors[-1]}")
        return errors

    db_entries = {}
    with open(db_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("|")
            if len(parts) >= 5:
                name, description, category, usage, tier = parts[:5]
                db_entries[name] = {
                    "name": name,
                    "description": description,
                    "category": category,
                    "usage": usage,
                    "tier": tier,
                }

    # NOTE(review): none of the findings below are appended to `errors`, so
    # they never fail the run - confirm they are meant to be informational.

    # Check all REMnux tools are in DB
    missing_from_db = [name for name in remnux_tools if name not in db_entries]

    # Check no empty descriptions
    empty_descs = [e["name"] for e in db_entries.values()
                   if e["description"] == "(no description available)"]

    # Check tier consistency
    tier_mismatches = []
    for name, entry in db_entries.items():
        if name in remnux_tools:
            expected_tier = remnux_tools[name].get("help_tier", "stub")
            if entry["tier"] != expected_tier:
                tier_mismatches.append(f"{name}: db={entry['tier']} vs master={expected_tier}")

    print(f" tools.db entries: {len(db_entries)}")
    print(f" REMnux tools in master: {len(remnux_tools)}")
    print(f" Missing from DB: {len(missing_from_db)}")
    for m in missing_from_db[:5]:
        print(f" ! {m}")
    print(f" Empty descriptions: {len(empty_descs)}")
    for m in empty_descs[:5]:
        print(f" ! {m}")
    print(f" Tier mismatches: {len(tier_mismatches)}")
    # List the first few mismatches, as is done for the other categories.
    for m in tier_mismatches[:5]:
        print(f" ! {m}")
    return errors
def test_workflows():
    """Verify all workflow files are generated and are not near-empty.

    For every workflow in ``data/for610/workflows.yaml`` a file named after
    its ``id`` (underscores replaced by dashes) must exist in
    ``data/generated/workflows``. An ``index.txt`` must exist as well, and
    every generated ``.txt`` file must hold at least 50 characters.

    Returns:
        list[str]: One message per missing or too-short file.
    """
    print("\n" + "=" * 60)
    print("TEST 4: Workflow Files")
    print("=" * 60)
    errors = []
    wf_src = load_yaml(os.path.join(BASE_DIR, "data/for610/workflows.yaml"))
    wf_dir = os.path.join(BASE_DIR, "data/generated/workflows")
    # `or []` also covers a "workflows:" key that is present but null.
    expected_workflows = wf_src.get("workflows", []) or []
    generated = glob.glob(os.path.join(wf_dir, "*.txt"))
    # Strip only the trailing extension from each file name.
    generated_names = {os.path.splitext(os.path.basename(f))[0] for f in generated}

    # Check all workflows generated
    for wf in expected_workflows:
        wf_id = wf["id"].replace("_", "-")
        if wf_id not in generated_names:
            errors.append(f"Missing workflow file: {wf_id}.txt")

    # Check index file exists
    if "index" not in generated_names:
        errors.append("Missing workflow index.txt")

    # Check each workflow file has content
    for path in generated:
        with open(path, encoding="utf-8") as f:
            content = f.read()
        if len(content) < 50:
            errors.append(f"Workflow file too short: {os.path.basename(path)}")

    print(f" Expected workflows: {len(expected_workflows)}")
    print(f" Generated files: {len(generated)} (including index)")
    print(f" Errors: {len(errors)}")
    for e in errors:
        print(f" ! {e}")
    return errors
def test_lab_tool_references():
    """Verify all tools referenced in labs exist in the FOR610 tool list.

    Every ``tool_id`` under a lab's ``tools_used`` in
    ``data/for610/labs.yaml`` must match the ``id`` of a tool in
    ``data/for610/tools.yaml``.

    NOTE(review): the check is against the FOR610 tool list, not the master
    inventory; the master inventory was previously loaded here but never
    consulted. Confirm FOR610 is the intended reference set.

    Returns:
        list[str]: One message per lab reference to an unknown tool id.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Lab-Tool Cross-References")
    print("=" * 60)
    errors = []
    labs = load_yaml(os.path.join(BASE_DIR, "data/for610/labs.yaml"))
    for610_tools = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    for610_ids = {t["id"] for t in for610_tools["tools"]}

    # Check all tool_ids in labs exist in FOR610
    missing = set()
    for lab in labs["labs"]:
        # `or []` also covers a "tools_used:" key that is present but null.
        for tu in lab.get("tools_used") or []:
            tid = tu["tool_id"]
            if tid not in for610_ids:
                missing.add(f"Lab {lab['id']}: tool '{tid}'")
                errors.append(f"Lab {lab['id']} references unknown tool: {tid}")

    print(f" Labs: {len(labs['labs'])}")
    print(f" Missing tool references: {len(missing)}")
    for m in sorted(missing)[:5]:
        print(f" ! {m}")
    return errors
def test_remnux_docs_coverage():
    """Check how many REMnux-documented tools have help content.

    A tool counts as REMnux-documented when its ``sources.remnux_docs.covered``
    value is truthy and it is flagged ``in_remnux``. A cheatsheet is looked up
    under the lower-cased, dash-separated tool name and under the tool id.

    Returns:
        list[str]: Always empty; this test only prints coverage figures.
    """
    print("\n" + "=" * 60)
    print("TEST 6: REMnux Docs Coverage in Help")
    print("=" * 60)
    errors = []
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")

    docs_tools = []
    for t in master["tools"]:
        # Entries without "sources" are reported by the master inventory
        # test; here they are simply treated as not covered, not a crash.
        docs_info = (t.get("sources") or {}).get("remnux_docs") or {}
        if docs_info.get("covered") and t.get("in_remnux"):
            docs_tools.append(t)

    docs_with_cheat = 0
    docs_without_cheat = []
    for t in docs_tools:
        name = t["name"].lower().replace(" ", "-")
        found = any(
            os.path.exists(os.path.join(cheat_dir, stem + ".cheat"))
            for stem in (name, t["id"])
        )
        if found:
            docs_with_cheat += 1
        else:
            docs_without_cheat.append(t["name"])

    print(f" REMnux-documented tools: {len(docs_tools)}")
    print(f" With cheatsheets: {docs_with_cheat}")
    print(f" Without cheatsheets: {len(docs_without_cheat)}")
    for m in docs_without_cheat[:5]:
        print(f" ! {m}")
    return errors
def test_cheatsheet_quality():
    """Spot-check cheatsheet content for key tools.

    Each key tool must have a cheatsheet whose text contains every string
    listed for it. The cheatsheet is looked up as ``<tool>.cheat`` first and
    then with ``.py`` rewritten to ``-py``.

    Returns:
        list[str]: One message per key tool that has no cheatsheet or whose
        cheatsheet lacks expected strings.
    """
    print("\n" + "=" * 60)
    print("TEST 7: Cheatsheet Quality Spot-Checks")
    print("=" * 60)
    errors = []
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    # Key tools that MUST have good cheatsheets
    key_tools = {
        "pdfid.py": ["pdfid.py", "document.pdf"],
        "pdf-parser.py": ["pdf-parser.py", "-a", "-s"],
        "oledump.py": ["oledump.py", "-s", "-v"],
        "capa": ["capa", "specimen"],
        "speakeasy": ["speakeasy", "-t"],
        "ghidra": ["ghidra"],
        "wireshark": ["wireshark"],
        "floss": ["floss"],
        "scdbgc": ["scdbgc", "/f"],
        "rtfdump.py": ["rtfdump.py"],
    }
    for tool, expected_strings in key_tools.items():
        cheat_path = os.path.join(cheat_dir, tool + ".cheat")
        if not os.path.exists(cheat_path):
            # Fall back to the file name in which ".py" is written as "-py".
            alt = tool.replace(".py", "-py") + ".cheat"
            cheat_path = os.path.join(cheat_dir, alt)
        if not os.path.exists(cheat_path):
            errors.append(f"Key tool {tool} has no cheatsheet")
            print(f" ! {tool}: NO CHEATSHEET")
            continue
        with open(cheat_path, encoding="utf-8") as f:
            content = f.read()
        missing_strings = [s for s in expected_strings if s not in content]
        if missing_strings:
            errors.append(f"{tool} cheatsheet missing: {missing_strings}")
            print(f" ! {tool}: missing {missing_strings}")
        else:
            print(f" + {tool}: OK")
    return errors
def main():
    """Run every verification test in order, print a summary, and exit.

    The process exits with status 1 when any test returned at least one
    error message, and with status 0 otherwise.
    """
    # Execution order matches the TEST 1..7 banners printed by the tests.
    checks = (
        test_master_inventory,
        test_for610_coverage,
        test_tools_db,
        test_workflows,
        test_lab_tool_references,
        test_remnux_docs_coverage,
        test_cheatsheet_quality,
    )
    all_errors = []
    for check in checks:
        all_errors += check()

    rule = "=" * 60
    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    if not all_errors:
        print("\n All tests passed!")
        sys.exit(0)

    print(f"\n Total issues found: {len(all_errors)}")
    for issue in all_errors:
        print(f" - {issue}")
    sys.exit(1)


# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()