f3ccc09c3d
Build comprehensive malware analysis knowledge base from 3 sources: - SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes - REMnux salt-states: 340 packages parsed from GitHub - REMnux docs: 280+ tools scraped from docs.remnux.org Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic). Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
467 lines
17 KiB
Python
467 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""Build the master tool inventory by merging three sources.
|
|
|
|
Merges:
|
|
1. FOR610 course data (data/for610/tools.yaml)
|
|
2. Salt-states installation data (data/remnux/sources/salt-states.yaml)
|
|
3. REMnux docs (data/remnux/sources/remnux-docs.yaml)
|
|
|
|
Output: data/remnux/tools-master.yaml
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
|
|
# All paths are resolved relative to this script's parent directory
# (the repository root), so the script works from any CWD.
BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
# Source 1: FOR610 course tool inventory (required base — loaded unconditionally).
FOR610_TOOLS = os.path.join(BASE_DIR, "data", "for610", "tools.yaml")
# Source 2: packages parsed from the REMnux salt-states repo (optional).
SALT_STATES = os.path.join(BASE_DIR, "data", "remnux", "sources", "salt-states.yaml")
# Source 3: tools scraped from docs.remnux.org (optional).
REMNUX_DOCS = os.path.join(BASE_DIR, "data", "remnux", "sources", "remnux-docs.yaml")
# Optional hand-written descriptions / usage examples applied after merging.
ENRICHMENTS = os.path.join(BASE_DIR, "data", "remnux", "tool-enrichments.yaml")
# Destination for the merged master inventory.
OUTPUT = os.path.join(BASE_DIR, "data", "remnux", "tools-master.yaml")
|
|
|
|
# Manual override mapping for tools that have different names across sources.
# Format: normalized_key -> canonical_id
#
# NOTE: identity entries (e.g. "docker": "docker") are NOT redundant:
# find_match() returns an override target even when it is absent from the
# lookup index, so these entries pin the canonical id for tools that may
# not exist in `master` yet at match time.
NAME_OVERRIDES = {
    # Detect It Easy — CLI binary is `diec`.
    # NOTE(review): the "detect it easy" key appears unreachable, since
    # normalize_name() collapses spaces to dashes — confirm before removing.
    "die": "diec",
    "detect-it-easy": "diec",
    "detect it easy": "diec",
    # SpiderMonkey JS engine and its patched/alternative front-ends.
    "js": "spidermonkey",
    "js-patched": "spidermonkey",
    "spidermonkey-patched": "spidermonkey",
    "mozilla-spidermonkey": "spidermonkey",
    # Volatility memory-forensics framework — canonicalized to v3.
    "vol": "volatility3",
    "vol-py": "volatility3",
    "volatility-framework": "volatility3",
    "volatility": "volatility3",
    # Process Hacker was renamed System Informer.
    "process-hacker": "system-informer",
    # YARA rule collections fold into the main yara tool; yara-x stays separate.
    "yara-rules": "yara",
    "yara-forge": "yara",
    "yara-x": "yara-x",
    # JS beautifier package-name variants.
    "jsbeautifier": "js-beautify",
    "js-beautifier": "js-beautify",
    # .NET decompilers (CLI vs GUI kept distinct).
    "ilspycmd": "ilspycmd",
    "ilspy": "ilspy",
    # Debian/Ubuntu package names mapped to the tool users invoke.
    "upx-ucl": "upx",
    "unrar-free": "rar",
    "netcat-openbsd": "netcat",
    "net-tools": "net-tools",
    # The oletools suite is keyed under its best-known script.
    "oletools": "olevba",
    "pev": "readpe",
    "scdbg": "scdbgc",
    "origamindee": "origami",
    "pdftk-java": "pdftk",
    "fakenet-ng": "fakenet-ng",
    # Salt state that configures Apache to accept any IP.
    "accept-all-ips": "httpd",
    # Archive tool aliases.
    "7zip": "7zip",
    "7z": "7zip",
    "p7zip": "7zip",
    "info-zip": "unzip",
    # radare2 ecosystem.
    "cutter": "cutter",
    "r2pipe": "radare2",
    "r2": "radare2",
    # Other JS engines folded under spidermonkey.
    "stpyv8": "spidermonkey",
    "rhino-debugger": "spidermonkey",
    "powershell-core": "powershell",
    "powershell": "powershell",
    "didier-stevens-scripts": "didier-stevens-suite",
    "docker-compose": "docker",
    "docker": "docker",
    "ghidrassist-mcp": "ghidra",
    "remnux-mcp-server": "remnux-mcp-server",
}
|
|
|
|
|
|
def normalize_name(name):
    """Collapse *name* into a lowercase, dash-separated matching key.

    Known script suffixes (.py, .pl, .bat) are stripped, every run of
    non-alphanumeric characters becomes a single dash, and leading or
    trailing dashes are trimmed.
    """
    key = name.lower().strip()
    # Strip suffixes in this fixed order; each may apply once, and a
    # later strip can expose an earlier suffix removed before it.
    for suffix in (".py", ".pl", ".bat"):
        if key.endswith(suffix):
            key = key[: -len(suffix)]
    key = re.sub(r'[^a-z0-9]+', '-', key)
    return key.strip('-')
|
|
|
|
|
|
def make_id(name):
    """Build a kebab-case identifier from *name*.

    Unlike normalize_name(), script suffixes are preserved in the id as
    "-py" / "-pl" / "-bat" so distinct scripts keep distinct ids.
    """
    ident = name.lower().strip()
    for ext in ("py", "pl", "bat"):
        ident = re.sub(r'\.' + ext + '$', '-' + ext, ident)
    ident = re.sub(r'[^a-z0-9]+', '-', ident)
    return ident.strip('-')
|
|
|
|
|
|
def load_for610():
    """Return the FOR610 course tool list (the required base inventory)."""
    with open(FOR610_TOOLS) as fh:
        parsed = yaml.safe_load(fh)
    return parsed.get("tools", [])
|
|
|
|
|
|
def load_salt_states():
    """Return salt-states tool entries, or [] when the file is absent."""
    if not os.path.exists(SALT_STATES):
        print(f" Warning: {SALT_STATES} not found, skipping")
        return []
    with open(SALT_STATES) as fh:
        parsed = yaml.safe_load(fh)
    return parsed.get("tools", [])
|
|
|
|
|
|
def load_remnux_docs():
    """Return REMnux docs scraped entries, or [] when the file is absent."""
    if not os.path.exists(REMNUX_DOCS):
        print(f" Warning: {REMNUX_DOCS} not found, skipping")
        return []
    with open(REMNUX_DOCS) as fh:
        parsed = yaml.safe_load(fh)
    return parsed.get("tools", [])
|
|
|
|
|
|
def build_lookup_index(master_tools):
    """Map every known key (id, normalized name, aliases) to a canonical id.

    Later entries overwrite earlier ones on key collisions, mirroring
    plain dict-assignment semantics.
    """
    lookup = {}
    for entry in master_tools:
        canonical = entry["id"]
        # The id maps to itself so override targets resolve directly.
        lookup[canonical] = canonical
        lookup[normalize_name(entry["name"])] = canonical
        for alt in entry.get("aliases", []):
            lookup[normalize_name(alt)] = canonical
    return lookup
|
|
|
|
|
|
def find_match(name, index):
    """Resolve *name* to a canonical tool id via the lookup *index*.

    Resolution order: manual NAME_OVERRIDES, exact normalized match,
    the key with a "-py" suffix appended, then the key with trailing
    digits stripped.  Returns None when nothing matches.  An override
    target is returned even when it is absent from *index*.
    """
    key = normalize_name(name)

    # Manual overrides win over every heuristic.
    override = NAME_OVERRIDES.get(key)
    if override is not None:
        return index.get(override, override)

    if key in index:
        return index[key]

    # Many REMnux scripts are indexed with a "-py" id suffix.
    suffixed = key + "-py"
    if suffixed in index:
        return index[suffixed]

    # Version-numbered names ("foo2", "foo-3") match the bare tool.
    digitless = re.sub(r'-?\d+$', '', key)
    if digitless and digitless in index:
        return index[digitless]

    return None
|
|
|
|
|
|
def compute_help_tier(tool):
    """Rank a tool's documentation coverage.

    Tiers, best first: "rich" (FOR610 course coverage), "standard"
    (REMnux docs), "basic" (salt-states install data only), "stub"
    (no source covers it).
    """
    sources = tool.get("sources", {})

    def covered(source_key):
        return sources.get(source_key, {}).get("covered", False)

    if covered("for610"):
        return "rich"
    if covered("remnux_docs"):
        return "standard"
    if covered("salt_states"):
        return "basic"
    return "stub"
|
|
|
|
|
|
def _salt_source_entry(st, st_names, st_id):
    """Build the 'salt_states' source record for one salt entry.

    Shared by all three merge branches in main(), which previously
    constructed this identical dict in three places.
    """
    return {
        "covered": True,
        "install_method": st.get("install_method", "unknown"),
        "package_name": st_names[0] if st_names else st_id,
        "salt_state_path": st.get("salt_state_path", ""),
    }


def _docs_source_entry(dt):
    """Build the base 'remnux_docs' source record for one docs entry.

    Optional keys ("website", "anchor") are added by the caller, since
    each merge branch historically attaches a different subset.
    """
    return {
        "covered": True,
        "category": dt.get("category", ""),
        "description": dt.get("description", ""),
        "docs_url": dt.get("docs_url", ""),
    }


def main():
    """Merge the three source inventories into the master tool YAML.

    Pipeline:
      1. FOR610 course tools form the base entries.
      2. Salt-states packages enrich matches or create new entries.
      3. REMnux docs enrich matches or create new entries.
      4. Manual enrichments (descriptions, usage examples) are applied.
      5. Derived coverage flags and help tiers are computed.
      6. The sorted inventory plus coverage metadata is written to OUTPUT.
    """
    print("Building master tool inventory...")

    # --- Step 1: Load FOR610 tools as base ---
    print("\n1. Loading FOR610 tools...")
    for610_tools = load_for610()
    print(f" Loaded {len(for610_tools)} tools")

    # master: canonical id -> tool entry. FOR610 tools seed it.
    master = {}
    for t in for610_tools:
        tid = t["id"]
        entry = {
            "id": tid,
            "name": t["name"],
            "aliases": t.get("aliases", []),
            "description": t.get("description", ""),
            "in_remnux": t.get("in_remnux", False),
            "platform": t.get("platform", "linux"),
            "sources": {
                "for610": {
                    "covered": True,
                    "description": t.get("description", ""),
                    "category": t.get("category", ""),
                    "labs": t.get("labs", []),
                    "sections": t.get("for610_sections", []),
                    "typical_usage": t.get("typical_usage", []),
                    "tags": t.get("tags", []),
                },
                "salt_states": {"covered": False},
                "remnux_docs": {"covered": False},
            },
        }
        if t.get("author"):
            entry["sources"]["for610"]["author"] = t["author"]
        master[tid] = entry

    # --- Step 2: Merge salt-states ---
    print("\n2. Loading salt-states...")
    salt_tools = load_salt_states()
    print(f" Loaded {len(salt_tools)} entries")

    index = build_lookup_index(list(master.values()))
    salt_matched = 0
    salt_new = 0

    for st in salt_tools:
        st_id = st["id"]
        st_names = st.get("package_names", [st_id])

        # Try the salt id first, then every package name.
        matched_id = None
        for name in [st_id] + st_names:
            matched_id = find_match(name, index)
            if matched_id:
                break

        if matched_id and matched_id in master:
            # Enrich an existing tool with install metadata.
            master[matched_id]["sources"]["salt_states"] = _salt_source_entry(st, st_names, st_id)
            master[matched_id]["in_remnux"] = True
            salt_matched += 1
        else:
            new_id = make_id(st_id)
            # An override may map this salt id onto a canonical id that
            # does not exist yet; honor it so later sources merge there.
            if normalize_name(st_id) in NAME_OVERRIDES:
                new_id = NAME_OVERRIDES[normalize_name(st_id)]

            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": st_id,
                    "aliases": [n for n in st_names if n != st_id][:3],
                    "description": "",
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": _salt_source_entry(st, st_names, st_id),
                        "remnux_docs": {"covered": False},
                    },
                }
                # Keep the index current so later salt entries can match
                # against tools created in this loop.
                index[new_id] = new_id
                index[normalize_name(st_id)] = new_id
                for n in st_names:
                    index[normalize_name(n)] = new_id
                salt_new += 1
            else:
                # Already exists under the override ID.
                master[new_id]["sources"]["salt_states"] = _salt_source_entry(st, st_names, st_id)
                salt_matched += 1

    print(f" Matched: {salt_matched}, New: {salt_new}")

    # --- Step 3: Merge REMnux docs ---
    print("\n3. Loading REMnux docs...")
    doc_tools = load_remnux_docs()
    print(f" Loaded {len(doc_tools)} entries")

    # Rebuild index after salt-states additions.
    index = build_lookup_index(list(master.values()))
    docs_matched = 0
    docs_new = 0

    for dt in doc_tools:
        dt_name = dt.get("name", "")
        dt_id = dt.get("id", make_id(dt_name))

        # Match by display name first, then by the docs id.
        matched_id = find_match(dt_name, index) or find_match(dt_id, index)

        if matched_id and matched_id in master:
            # Enrich existing tool; website/anchor only when present.
            doc_entry = _docs_source_entry(dt)
            if dt.get("website"):
                doc_entry["website"] = dt["website"]
            if dt.get("anchor"):
                doc_entry["anchor"] = dt["anchor"]
            master[matched_id]["sources"]["remnux_docs"] = doc_entry

            # Use REMnux docs description if we don't have one.
            if not master[matched_id]["description"] and dt.get("description"):
                master[matched_id]["description"] = dt["description"]

            docs_matched += 1
        else:
            # Create a new docs-only entry.
            new_id = make_id(dt_name) if dt_name else dt_id
            if new_id not in master:
                master[new_id] = {
                    "id": new_id,
                    "name": dt_name,
                    "aliases": [],
                    "description": dt.get("description", ""),
                    "in_remnux": True,
                    "platform": "linux",
                    "sources": {
                        "for610": {"covered": False},
                        "salt_states": {"covered": False},
                        "remnux_docs": _docs_source_entry(dt),
                    },
                }
                if dt.get("website"):
                    master[new_id]["sources"]["remnux_docs"]["website"] = dt["website"]
                index[new_id] = new_id
                index[normalize_name(dt_name)] = new_id
                docs_new += 1
            else:
                # Duplicate docs entry for an id created earlier this loop.
                master[new_id]["sources"]["remnux_docs"] = _docs_source_entry(dt)
                docs_matched += 1

    print(f" Matched: {docs_matched}, New: {docs_new}")

    # --- Step 4: Apply manual enrichments ---
    print("\n4. Applying manual enrichments...")
    if os.path.exists(ENRICHMENTS):
        with open(ENRICHMENTS) as f:
            enrich_data = yaml.safe_load(f)
        enrichments = enrich_data.get("enrichments", {})
        enriched = 0
        for tool_key, enrich in enrichments.items():
            # Find the tool in master by normalized name, else take the
            # key as a literal id.
            matched_id = find_match(tool_key, index) or tool_key
            if matched_id in master:
                tool = master[matched_id]
                # An enrichment description always replaces existing text.
                # (The original branched on whether a description already
                # existed, but both branches performed this assignment.)
                if enrich.get("description"):
                    tool["description"] = enrich["description"]
                # Usage examples live under the for610 source record.
                if enrich.get("typical_usage"):
                    for610 = tool["sources"]["for610"]
                    if not for610.get("covered"):
                        # Promote to a covered record built from the enrichment.
                        for610["covered"] = True
                        for610["typical_usage"] = enrich["typical_usage"]
                        for610["tags"] = enrich.get("tags", [])
                        for610["description"] = enrich.get("description", "")
                    else:
                        # Merge usage examples, preserving order, no dupes.
                        existing = for610.get("typical_usage", [])
                        for u in enrich["typical_usage"]:
                            if u not in existing:
                                existing.append(u)
                        for610["typical_usage"] = existing
                enriched += 1
            else:
                print(f" Warning: enrichment key '{tool_key}' not found in master")
        print(f" Enriched: {enriched} tools")
    else:
        print(" No enrichments file found, skipping")

    # --- Step 5: Compute derived fields ---
    print("\n5. Computing derived fields...")
    for tool in master.values():
        tool["has_for610_coverage"] = tool["sources"]["for610"].get("covered", False)
        tool["has_remnux_docs"] = tool["sources"]["remnux_docs"].get("covered", False)
        tool["has_salt_state"] = tool["sources"]["salt_states"].get("covered", False)
        tool["help_tier"] = compute_help_tier(tool)

    # --- Step 6: Sort and output ---
    tools_list = sorted(master.values(), key=lambda t: t["id"])

    # Plain dict (not Counter) so yaml.dump emits a normal mapping.
    tiers = {}
    for t in tools_list:
        tier = t["help_tier"]
        tiers[tier] = tiers.get(tier, 0) + 1

    def _count(pred):
        """Count tools in tools_list satisfying *pred*."""
        return sum(1 for t in tools_list if pred(t))

    output = {
        "metadata": {
            "total_tools": len(tools_list),
            "in_remnux_count": _count(lambda t: t["in_remnux"]),
            "help_tier_counts": tiers,
            "source_coverage": {
                "for610_only": _count(lambda t: t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
                "remnux_docs_only": _count(lambda t: t["has_remnux_docs"] and not t["has_for610_coverage"] and not t["has_salt_state"]),
                "salt_states_only": _count(lambda t: t["has_salt_state"] and not t["has_for610_coverage"] and not t["has_remnux_docs"]),
                "all_three": _count(lambda t: t["has_for610_coverage"] and t["has_remnux_docs"] and t["has_salt_state"]),
                "for610_and_docs": _count(lambda t: t["has_for610_coverage"] and t["has_remnux_docs"]),
                "for610_and_salt": _count(lambda t: t["has_for610_coverage"] and t["has_salt_state"]),
                "docs_and_salt": _count(lambda t: t["has_remnux_docs"] and t["has_salt_state"]),
                "no_coverage": _count(lambda t: not t["has_for610_coverage"] and not t["has_remnux_docs"] and not t["has_salt_state"]),
            },
        },
        "tools": tools_list,
    }

    with open(OUTPUT, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\n{'='*50}")
    print(f"MASTER INVENTORY BUILT: {len(tools_list)} tools")
    print(f" In REMnux: {output['metadata']['in_remnux_count']}")
    print("\nHelp Tiers:")
    for tier, count in sorted(tiers.items()):
        print(f" {tier}: {count}")
    print("\nSource Coverage:")
    for key, val in output["metadata"]["source_coverage"].items():
        print(f" {key}: {val}")
    print(f"\nOutput: {OUTPUT}")


if __name__ == "__main__":
    main()
|