docker_file_analysis/scripts/verify-help-quality.py

#!/usr/bin/env python3
"""Comprehensive verification of generated help artifacts.

Tests:
1. All FOR610 tools with in_remnux=true have cheatsheets
2. All cheatsheet content matches researched data
3. All workflows are generated and contain correct tool references
4. tools.db entries match master inventory
5. No orphaned references (tools in labs but missing from master)
6. Rich-tier cheatsheets have usage examples from FOR610
7. REMnux docs tools have correct descriptions
"""

import os
import sys
import yaml
import glob

# Parent of the directory that contains this script; every data path used by
# the checks below is joined onto it.
BASE_DIR = os.path.join(os.path.dirname(__file__), os.pardir)

def load_yaml(path):
    """Parse the YAML file at *path* and return the resulting object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the locale of the machine (or container) running the checks.
    ``yaml.safe_load`` is used, so only plain YAML types are constructed.

    Raises ``OSError`` if the file cannot be opened and ``yaml.YAMLError`` if
    its content is not valid YAML.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def test_master_inventory():
    """Verify master inventory integrity.

    Loads ``data/remnux/tools-master.yaml`` and checks that every tool entry
    has a non-empty ``id`` and ``name``, carries ``sources`` and ``help_tier``
    keys, and that no ``id`` is used by more than one entry.

    Prints a summary (showing at most the first 10 problems) and returns the
    complete list of error strings; an empty list means no problem was found.
    """
    from collections import Counter

    print("=" * 60)
    print("TEST 1: Master Inventory Integrity")
    print("=" * 60)
    errors = []

    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    tools = master["tools"]

    # Check all tools have required fields
    for t in tools:
        # "or" (not a .get default) so an empty id is also shown as MISSING
        # in the follow-up messages instead of as a blank.
        tid = t.get("id") or "MISSING"
        if not t.get("id"):
            errors.append(f"Tool missing id: {t}")
        if not t.get("name"):
            errors.append(f"Tool {tid} missing name")
        if "sources" not in t:
            errors.append(f"Tool {tid} missing sources")
        if "help_tier" not in t:
            errors.append(f"Tool {tid} missing help_tier")

    # Check no duplicate IDs. Entries without an id were reported above, so
    # they are skipped here; indexing t["id"] on them would raise KeyError and
    # abort the run instead of reporting the problem.
    id_counts = Counter(t["id"] for t in tools if t.get("id"))
    dupes = {tool_id for tool_id, count in id_counts.items() if count > 1}
    if dupes:
        errors.append(f"Duplicate IDs: {dupes}")

    print(f"  Total tools: {len(tools)}")
    print(f"  Errors: {len(errors)}")
    for e in errors[:10]:
        print(f"    ! {e}")
    return errors


def test_for610_coverage():
    """Check FOR610 in_remnux tools against the master inventory and cheatsheets.

    For every tool in ``data/for610/tools.yaml`` whose ``in_remnux`` flag is
    truthy, three things are examined:

    * its ``id`` is present in the master inventory (a miss is an error);
    * a ``.cheat`` file exists under one of its name variants (reported only);
    * if it lists ``typical_usage``, the first usage string occurs in its
      cheatsheet (reported only).

    Only the first check adds to the returned error list. The other two are
    printed as counts with up to five example names each.

    Returns the list of error strings.
    """
    print("\n" + "=" * 60)
    print("TEST 2: FOR610 Tool Coverage")
    print("=" * 60)
    errors = []

    for610 = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    # .get() so a master entry that lacks an id cannot abort this check with
    # KeyError; such an entry simply matches no FOR610 tool.
    master_ids = {t.get("id") for t in master["tools"]}

    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")
    # splitext removes only the trailing extension; str.replace would also
    # strip ".cheat" from the middle of a file name.
    cheat_files = {os.path.splitext(os.path.basename(f))[0]
                   for f in glob.glob(os.path.join(cheat_dir, "*.cheat"))}

    for610_all = for610["tools"]
    for610_remnux = [t for t in for610_all if t.get("in_remnux")]

    # Check all FOR610 in_remnux tools are in master
    missing_from_master = []
    for t in for610_remnux:
        if t["id"] not in master_ids:
            missing_from_master.append(t["id"])
            errors.append(f"FOR610 tool '{t['id']}' ({t['name']}) not in master inventory")

    # Check all FOR610 in_remnux tools have cheatsheets
    missing_cheats = []
    for t in for610_remnux:
        name_variants = (
            t["name"].lower().replace(" ", "-"),
            t["id"],
            t["name"].lower(),
        )
        if not any(v in cheat_files for v in name_variants):
            missing_cheats.append(t["name"])

    # Check cheatsheets contain the first FOR610 usage example
    rich_without_examples = []
    for t in for610_remnux:
        usages = t.get("typical_usage") or []
        if not usages:
            # Nothing to look for, so the cheatsheet need not be read.
            continue
        cheat_path = os.path.join(cheat_dir, t["name"].lower().replace(" ", "-") + ".cheat")
        if not os.path.exists(cheat_path):
            cheat_path = os.path.join(cheat_dir, t["id"] + ".cheat")
        if not os.path.exists(cheat_path):
            continue
        with open(cheat_path, encoding="utf-8") as fh:
            content = fh.read()
        if usages[0] not in content:
            rich_without_examples.append(t["name"])

    print(f"  FOR610 tools (all): {len(for610_all)}")
    print(f"  FOR610 in REMnux: {len(for610_remnux)}")
    print(f"  Missing from master: {len(missing_from_master)}")
    print(f"  Missing cheatsheets: {len(missing_cheats)}")
    if missing_cheats:
        for m in missing_cheats[:5]:
            print(f"    ! {m}")
    print(f"  Rich without examples: {len(rich_without_examples)}")
    if rich_without_examples:
        for m in rich_without_examples[:5]:
            print(f"    ! {m}")
    print(f"  Errors: {len(errors)}")
    return errors


def test_tools_db():
    """Verify tools.db matches master inventory.

    ``data/generated/tools.db`` is read as pipe-delimited text, one entry per
    line; the first five fields are taken as name, description, category,
    usage and tier. Blank lines and lines with fewer than five fields are
    ignored.

    The entries are compared with the master-inventory tools whose
    ``in_remnux`` flag is truthy, matched by name:

    * tools absent from the DB and entries carrying the placeholder
      description are printed as counts with up to five examples;
    * a tier that differs from the master ``help_tier`` (``"stub"`` when the
      master has none) is printed and recorded as an error.

    Returns the list of error strings.
    """
    print("\n" + "=" * 60)
    print("TEST 3: tools.db Consistency")
    print("=" * 60)
    errors = []

    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    remnux_tools = {t["name"]: t for t in master["tools"] if t.get("in_remnux")}

    db_path = os.path.join(BASE_DIR, "data/generated/tools.db")
    db_entries = {}
    with open(db_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("|")
            if len(parts) >= 5:
                db_entries[parts[0]] = {
                    "name": parts[0],
                    "description": parts[1],
                    "category": parts[2],
                    "usage": parts[3],
                    "tier": parts[4],
                }

    # Check all REMnux tools are in DB
    # NOTE(review): kept informational. Whether the DB name field is meant to
    # equal the master "name" exactly is not visible here; confirm before
    # promoting these to errors.
    missing_from_db = [name for name in remnux_tools if name not in db_entries]

    # Check no empty descriptions (informational)
    empty_descs = [e["name"] for e in db_entries.values()
                   if e["description"] == "(no description available)"]

    # Check tier consistency. Only entries matched by name are compared, so a
    # difference here is a real disagreement between the DB and the master.
    # It is recorded as an error; previously nothing was, so this check could
    # never fail.
    tier_mismatches = []
    for name, entry in db_entries.items():
        if name in remnux_tools:
            expected_tier = remnux_tools[name].get("help_tier", "stub")
            if entry["tier"] != expected_tier:
                detail = f"{name}: db={entry['tier']} vs master={expected_tier}"
                tier_mismatches.append(detail)
                errors.append(f"tools.db tier mismatch for {detail}")

    print(f"  tools.db entries: {len(db_entries)}")
    print(f"  REMnux tools in master: {len(remnux_tools)}")
    print(f"  Missing from DB: {len(missing_from_db)}")
    if missing_from_db:
        for m in missing_from_db[:5]:
            print(f"    ! {m}")
    print(f"  Empty descriptions: {len(empty_descs)}")
    if empty_descs:
        for m in empty_descs[:5]:
            print(f"    ! {m}")
    print(f"  Tier mismatches: {len(tier_mismatches)}")
    if tier_mismatches:
        for m in tier_mismatches[:5]:
            print(f"    ! {m}")
    return errors


def test_workflows():
    """Verify that the expected workflow files were generated.

    Every workflow listed in ``data/for610/workflows.yaml`` must have a
    ``<id>.txt`` file (underscores in the id replaced by dashes) under
    ``data/generated/workflows``, an ``index.txt`` must exist, and every
    generated ``.txt`` file must hold at least 50 characters. The content of
    the files is not inspected beyond its length.

    Prints a summary and returns the list of error strings.
    """
    print("\n" + "=" * 60)
    print("TEST 4: Workflow Files")
    print("=" * 60)
    errors = []

    wf_src = load_yaml(os.path.join(BASE_DIR, "data/for610/workflows.yaml"))
    wf_dir = os.path.join(BASE_DIR, "data/generated/workflows")

    # An empty document or a null "workflows" key counts as "no workflows
    # expected" instead of crashing on None.
    expected_workflows = (wf_src or {}).get("workflows") or []
    # Sorted so the report order does not depend on directory listing order.
    generated = sorted(glob.glob(os.path.join(wf_dir, "*.txt")))
    # splitext removes only the trailing extension; str.replace would also
    # strip ".txt" from the middle of a file name.
    generated_names = {os.path.splitext(os.path.basename(f))[0] for f in generated}

    # Check all workflows generated
    for wf in expected_workflows:
        wf_id = wf["id"].replace("_", "-")
        if wf_id not in generated_names:
            errors.append(f"Missing workflow file: {wf_id}.txt")

    # Check index file exists
    if "index" not in generated_names:
        errors.append("Missing workflow index.txt")

    # Check each workflow file has content
    for path in generated:
        with open(path, encoding="utf-8") as fh:
            content = fh.read()
        if len(content) < 50:
            errors.append(f"Workflow file too short: {os.path.basename(path)}")

    print(f"  Expected workflows: {len(expected_workflows)}")
    print(f"  Generated files: {len(generated)} (including index)")
    print(f"  Errors: {len(errors)}")
    for e in errors:
        print(f"    ! {e}")
    return errors


def test_lab_tool_references():
    """Verify every tool referenced by a lab exists in the FOR610 tool list.

    Each ``tool_id`` under a lab's ``tools_used`` in ``data/for610/labs.yaml``
    is looked up among the ids in ``data/for610/tools.yaml``. Unknown ids are
    printed (up to five) and recorded as errors.

    Returns the list of error strings.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Lab-Tool Cross-References")
    print("=" * 60)
    errors = []

    labs = load_yaml(os.path.join(BASE_DIR, "data/for610/labs.yaml"))

    for610_tools = load_yaml(os.path.join(BASE_DIR, "data/for610/tools.yaml"))
    for610_ids = {t["id"] for t in for610_tools["tools"]}

    # Check all tool_ids in labs exist in FOR610
    missing = set()
    for lab in labs["labs"]:
        # "or []" so a lab whose tools_used key is present but null is
        # treated the same as a lab without the key.
        for tu in lab.get("tools_used") or []:
            tid = tu["tool_id"]
            if tid not in for610_ids:
                missing.add(f"Lab {lab['id']}: tool '{tid}'")
                errors.append(f"Lab {lab['id']} references unknown tool: {tid}")

    print(f"  Labs: {len(labs['labs'])}")
    print(f"  Missing tool references: {len(missing)}")
    for m in sorted(missing)[:5]:
        print(f"    ! {m}")
    return errors


def test_remnux_docs_coverage():
    """Check how many REMnux-documented tools have help content.

    A tool counts as REMnux-documented when its
    ``sources.remnux_docs.covered`` value and its ``in_remnux`` flag in the
    master inventory are both truthy. For each such tool a cheatsheet is
    looked for under its dashed lower-case name and under its id.

    This check is informational: it prints the counts and up to five tools
    without a cheatsheet, and the returned error list is always empty.
    """
    print("\n" + "=" * 60)
    print("TEST 6: REMnux Docs Coverage in Help")
    print("=" * 60)
    errors = []

    master = load_yaml(os.path.join(BASE_DIR, "data/remnux/tools-master.yaml"))
    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")

    docs_tools = []
    for t in master["tools"]:
        # Entries without "sources" or "remnux_docs" count as not covered
        # rather than raising KeyError.
        docs_info = (t.get("sources") or {}).get("remnux_docs") or {}
        if docs_info.get("covered") and t.get("in_remnux"):
            docs_tools.append(t)

    docs_with_cheat = 0
    docs_without_cheat = []

    for t in docs_tools:
        name = t["name"].lower().replace(" ", "-")
        found = any(os.path.exists(os.path.join(cheat_dir, v + ".cheat"))
                    for v in (name, t["id"]))
        if found:
            docs_with_cheat += 1
        else:
            docs_without_cheat.append(t["name"])

    print(f"  REMnux-documented tools: {len(docs_tools)}")
    print(f"  With cheatsheets: {docs_with_cheat}")
    print(f"  Without cheatsheets: {len(docs_without_cheat)}")
    if docs_without_cheat:
        for m in docs_without_cheat[:5]:
            print(f"    ! {m}")
    return errors


def test_cheatsheet_quality():
    """Spot-check cheatsheet content for key tools.

    For each tool in a fixed table, the cheatsheet is looked up as
    ``<tool>.cheat`` and, failing that, with ".py" rewritten to "-py". A
    missing cheatsheet is an error, and so is a cheatsheet lacking any of the
    tool's expected substrings (plain substring match).

    Prints one status line per tool and returns the list of error strings.
    """
    print("\n" + "=" * 60)
    print("TEST 7: Cheatsheet Quality Spot-Checks")
    print("=" * 60)
    errors = []

    cheat_dir = os.path.join(BASE_DIR, "data/generated/cheatsheets")

    # Key tools that MUST have good cheatsheets
    key_tools = {
        "pdfid.py": ["pdfid.py", "document.pdf"],
        "pdf-parser.py": ["pdf-parser.py", "-a", "-s"],
        "oledump.py": ["oledump.py", "-s", "-v"],
        "capa": ["capa", "specimen"],
        "speakeasy": ["speakeasy", "-t"],
        "ghidra": ["ghidra"],
        "wireshark": ["wireshark"],
        "floss": ["floss"],
        "scdbgc": ["scdbgc", "/f"],
        "rtfdump.py": ["rtfdump.py"],
    }

    for tool, expected_strings in key_tools.items():
        cheat_path = os.path.join(cheat_dir, tool + ".cheat")
        if not os.path.exists(cheat_path):
            # Fall back to the dashed spelling, e.g. "pdfid-py.cheat"
            alt = tool.replace(".py", "-py") + ".cheat"
            cheat_path = os.path.join(cheat_dir, alt)

        if not os.path.exists(cheat_path):
            errors.append(f"Key tool {tool} has no cheatsheet")
            print(f"  ! {tool}: NO CHEATSHEET")
            continue

        with open(cheat_path, encoding="utf-8") as fh:
            content = fh.read()
        missing_strings = [s for s in expected_strings if s not in content]
        if missing_strings:
            errors.append(f"{tool} cheatsheet missing: {missing_strings}")
            print(f"  ! {tool}: missing {missing_strings}")
        else:
            print(f"  + {tool}: OK")

    return errors


def main():
    """Run every check in order, print a summary, and exit.

    The process exits with status 1 when any check returned an error string
    and with status 0 otherwise.
    """
    checks = (
        test_master_inventory,
        test_for610_coverage,
        test_tools_db,
        test_workflows,
        test_lab_tool_references,
        test_remnux_docs_coverage,
        test_cheatsheet_quality,
    )

    all_errors = []
    for check in checks:
        all_errors += check()

    rule = "=" * 60
    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    if not all_errors:
        print("\n  All tests passed!")
        sys.exit(0)

    print(f"\n  Total issues found: {len(all_errors)}")
    for issue in all_errors:
        print(f"    - {issue}")
    sys.exit(1)


# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()