Add FOR610 tool/workflow knowledge base and data pipeline

Build a comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic).

Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 17:38:15 +01:00
parent 06ebb09ab0
commit f3ccc09c3d
663 changed files with 36339 additions and 1 deletions
@@ -0,0 +1,534 @@
+#!/usr/bin/env python3
+"""Generate all help artifacts from the master tool inventory.
+
+Reads data/remnux/tools-master.yaml and data/for610/workflows.yaml to produce:
+- data/generated/tools.db (pipe-delimited for find-tool)
+- data/generated/cheatsheets/*.cheat (per-tool cheat sheets)
+- data/generated/workflows/*.txt (workflow help files)
+- data/generated/tldr/*.md (TLDR pages)
+"""
+
+import os
+import re
+import yaml
+import textwrap
+
+# Parent of the directory that contains this script; every path below hangs off it.
+BASE_DIR = os.path.join(os.path.dirname(__file__), "..")
+_DATA_DIR = os.path.join(BASE_DIR, "data")
+
+# Pipeline inputs.
+MASTER = os.path.join(_DATA_DIR, "remnux", "tools-master.yaml")
+WORKFLOWS_SRC = os.path.join(_DATA_DIR, "for610", "workflows.yaml")
+RECIPES_SRC = os.path.join(_DATA_DIR, "for610", "recipes.yaml")
+
+# Root directory that receives every generated artifact.
+GEN_DIR = os.path.join(_DATA_DIR, "generated")
+
+
+def load_master():
+    """Load the master tool inventory from ``MASTER``.
+
+    Returns:
+        The parsed YAML document exactly as ``yaml.safe_load`` returns it.
+    """
+    # Decode as UTF-8 explicitly; without ``encoding`` open() falls back to the
+    # platform's locale encoding.
+    with open(MASTER, encoding="utf-8") as f:
+        return yaml.safe_load(f)
+
+
+def load_workflows():
+    """Load the workflow definitions from ``WORKFLOWS_SRC``.
+
+    Returns:
+        The parsed YAML document, or an empty dict when the file parses to
+        nothing (``yaml.safe_load`` returns ``None`` for an empty document),
+        so callers can use ``.get()`` on the result unconditionally.
+    """
+    # Decode as UTF-8 explicitly instead of relying on the locale encoding.
+    with open(WORKFLOWS_SRC, encoding="utf-8") as f:
+        return yaml.safe_load(f) or {}
+
+
+def load_recipes():
+    """Load the recipe definitions from ``RECIPES_SRC`` if that file exists.
+
+    Returns:
+        The parsed YAML document. When the file is missing, or parses to
+        nothing, ``{"recipes": []}`` is returned instead so callers always
+        receive a mapping.
+    """
+    if os.path.exists(RECIPES_SRC):
+        # Decode as UTF-8 explicitly instead of relying on the locale encoding.
+        with open(RECIPES_SRC, encoding="utf-8") as f:
+            loaded = yaml.safe_load(f)
+        if loaded:
+            return loaded
+    return {"recipes": []}
+
+
+def build_recipe_index(recipes_data):
+    """Build a mapping of tool_id -> list of recipes that use that tool.
+
+    Each recipe is indexed under every tool id it lists and, additionally,
+    under a normalized variant of that id (lower-cased, with ``-`` and ``_``
+    removed) when the variant differs from the id itself.
+
+    Args:
+        recipes_data: Mapping with an optional ``"recipes"`` list; each recipe
+            is a mapping with an optional ``"tools"`` list of tool ids.
+
+    Returns:
+        Dict mapping each key to the list of recipe mappings indexed under it.
+        A recipe appears at most once per key.
+    """
+    index = {}
+    # ``or []`` also covers keys that are present but null in the YAML.
+    for recipe in recipes_data.get("recipes") or []:
+        for tool_id in recipe.get("tools") or []:
+            normalized = tool_id.lower().replace("-", "").replace("_", "")
+            # dict.fromkeys keeps order and drops the variant when it equals the id.
+            for key in dict.fromkeys((tool_id, normalized)):
+                bucket = index.setdefault(key, [])
+                # Two tool ids of one recipe can normalize to the same key;
+                # do not list that recipe twice.
+                if not any(existing is recipe for existing in bucket):
+                    bucket.append(recipe)
+    return index
+
+
+# ============================================================
+# tools.db generator
+# ============================================================
+
+def generate_tools_db(tools):
+    """Generate pipe-delimited tools.db for find-tool.
+
+    One line is written per tool flagged ``in_remnux``, in the form
+    ``name|description|category|usage|tier``, sorted lexicographically.
+
+    Args:
+        tools: Iterable of tool mappings from the master inventory. Each needs
+            ``name`` and ``sources`` (with ``remnux_docs`` and ``for610``
+            sub-mappings).
+
+    Returns:
+        Number of entries written.
+    """
+    output_path = os.path.join(GEN_DIR, "tools.db")
+    # main() calls this generator before any other one, so the output root may
+    # not exist yet.
+    os.makedirs(GEN_DIR, exist_ok=True)
+    lines = []
+
+    for t in tools:
+        if not t.get("in_remnux"):
+            continue
+
+        for610 = t["sources"]["for610"]
+        rdocs = t["sources"]["remnux_docs"]
+
+        raw_name = t["name"]
+        name = _db_field(raw_name, "/")
+        desc = _db_field(t.get("description"), "/")[:120]
+        if not desc:
+            desc = "(no description available)"
+
+        # Get best category: REMnux docs win over FOR610.
+        cat = ""
+        if rdocs.get("covered"):
+            cat = rdocs.get("category", "")
+        elif for610.get("covered"):
+            cat = for610.get("category", "")
+        cat = _db_field(cat, "/")
+
+        # Get best usage example: first FOR610 example, else a generic --help.
+        usage = ""
+        if for610.get("covered"):
+            usages = for610.get("typical_usage") or []
+            if usages:
+                usage = usages[0]
+        if not usage:
+            usage = f"{raw_name} --help"
+        # Shell pipes become spaces here so they cannot be read as column breaks.
+        usage = _db_field(usage, " ")
+
+        tier = t.get("help_tier", "stub")
+
+        lines.append(f"{name}|{desc}|{cat}|{usage}|{tier}")
+
+    lines.sort()
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines) + "\n")
+
+    print(f"  tools.db: {len(lines)} entries")
+    return len(lines)
+
+
+def _db_field(value, pipe_sub):
+    """Flatten *value* to a single line that is safe inside a ``|``-delimited record."""
+    text = "" if value is None else str(value)
+    return text.replace("|", pipe_sub).replace("\n", " ").strip()
+
+
+# ============================================================
+# Cheatsheet generator
+# ============================================================
+
+def sanitize_filename(name):
+    """Return a lower-case file stem derived from a tool name.
+
+    Every character other than ASCII letters, digits, ``.``, ``_`` and ``-``
+    is replaced by a hyphen; leading and trailing hyphens are then removed
+    and the result is lower-cased.
+    """
+    hyphenated = re.sub(r'[^a-zA-Z0-9._-]', '-', name)
+    trimmed = hyphenated.strip('-')
+    return trimmed.lower()
+
+
+def generate_usage_comment(name, usage, index):
+    """Pick a short human-readable label for one usage example.
+
+    Args:
+        name: Tool name (accepted for interface symmetry; not consulted).
+        usage: The example command line.
+        index: Position of the example in the tool's usage list; the first
+            example (index 0) is always labelled "Basic usage".
+
+    Returns:
+        The label of the first matching rule below, or "Alternative usage"
+        when no rule matches.
+    """
+    if index == 0:
+        return "Basic usage"
+
+    lowered = usage.lower()
+    # Ordered (haystack, needles, label) rules; the first rule with any needle
+    # present in its haystack wins. Only the grep rule is case-insensitive.
+    rules = (
+        (usage, ("-vv", "--verbose"), "Verbose output with details"),
+        (usage, ("--no-static", "--no static"), "Skip static analysis, focus on dynamic"),
+        (usage, ("-n ",), "Suppress default output"),
+        (usage, ("-a ", "--all"), "Show all results"),
+        (usage, ("-s ",), "Select specific item"),
+        (usage, ("-d ",), "Dump/extract content"),
+        (usage, ("-r ",), "Recursive/follow references"),
+        (usage, ("-k ",), "Extract by keyword"),
+        (usage, ("-o ",), "Output to file"),
+        (usage, ("-f ",), "Process input file"),
+        (usage, ("-i ",), "Case-insensitive search"),
+        (lowered, ("grep",), "Filter output for specific pattern"),
+        (usage, ("--help",), "Show help"),
+        (usage, ("|",), "Pipe output for processing"),
+        (usage, (">",), "Save output to file"),
+    )
+    for haystack, needles, label in rules:
+        if any(needle in haystack for needle in needles):
+            return label
+
+    return "Alternative usage"
+
+
+def format_recipes_section(tool_id, recipe_index):
+    """Render the "Recipes" block of a cheatsheet for one tool.
+
+    The index is probed with ``tool_id`` first, then with ``tool_id`` minus
+    ``-py``, then with all hyphens removed; the first non-empty hit is used.
+
+    Returns:
+        The rendered section (starting with a blank line), or ``""`` when no
+        recipe is indexed under any of the probed keys.
+    """
+    matches = []
+    for key in (tool_id, tool_id.replace("-py", ""), tool_id.replace("-", "")):
+        matches = recipe_index.get(key, [])
+        if matches:
+            break
+    if not matches:
+        return ""
+
+    # Keep the first recipe seen for each id, in encounter order.
+    by_id = {}
+    for candidate in matches:
+        by_id.setdefault(candidate["id"], candidate)
+
+    out = ["", "# --- Recipes (multi-tool chains) ---", ""]
+    for recipe in by_id.values():
+        out.append(f"# >> {recipe['name']}")
+        out.extend(recipe.get("commands", []))
+        out.append("")
+
+    return "\n".join(out)
+
+
+def generate_cheatsheet_rich(t, recipe_index=None):
+    """Generate a rich cheatsheet for a tool with FOR610 coverage.
+
+    Layout: ``#`` header lines (name, description, optional FOR610 metadata,
+    optional docs URL), a ``%`` tag line, one commented command per usage
+    example (or a generic ``--help`` entry when there are none), and finally
+    the recipes section when ``recipe_index`` has entries for the tool.
+
+    Args:
+        t: Tool mapping with ``id``, ``name`` and ``sources`` (``for610`` and
+            ``remnux_docs`` sub-mappings).
+        recipe_index: Optional mapping as produced by ``build_recipe_index``.
+
+    Returns:
+        The cheatsheet text.
+    """
+    f610 = t["sources"]["for610"]
+    rdocs = t["sources"]["remnux_docs"]
+    name = t["name"]
+    # Keep the description on one line so it stays inside its "#" comment line.
+    desc = " ".join((t.get("description") or "").splitlines()).strip()
+    # ``or []`` tolerates keys that are present but null in the YAML.
+    labs = f610.get("labs") or []
+    sections = f610.get("sections") or []
+    tags = f610.get("tags") or []
+    usages = f610.get("typical_usage") or []
+    author = f610.get("author", "")
+
+    lines = [
+        f"# {name}",
+        f"# {desc}",
+    ]
+
+    # Labs, sections and tags are all coerced with str(): unquoted YAML values
+    # such as 1.1 load as numbers and would make str.join raise.
+    meta_parts = []
+    if labs:
+        meta_parts.append(f"FOR610 Labs: {', '.join(str(lab) for lab in labs)}")
+    if sections:
+        meta_parts.append(f"Sections: {', '.join(str(s) for s in sections)}")
+    if author:
+        meta_parts.append(f"Author: {author}")
+    if meta_parts:
+        lines.append(f"# {' | '.join(meta_parts)}")
+
+    # REMnux docs URL if available
+    if rdocs.get("covered"):
+        url = rdocs.get("docs_url", "")
+        if url:
+            lines.append(f"# Docs: {url}")
+
+    lines.append("")
+
+    # Tags: at most the first eight, falling back to the lower-cased tool name.
+    tag_str = ", ".join(str(tag) for tag in tags[:8]) if tags else name.lower()
+    lines.append(f"% {tag_str}")
+    lines.append("")
+
+    # Usage examples with descriptive comments
+    for i, usage in enumerate(usages):
+        comment = generate_usage_comment(name, usage, i)
+        lines.append(f"# {comment}")
+        lines.append(usage)
+        lines.append("")
+
+    # If no usage examples, add a basic one
+    if not usages:
+        lines.append("# Show help")
+        lines.append(f"{name} --help")
+        lines.append("")
+
+    # Append recipes section if this tool participates in any recipes
+    if recipe_index:
+        recipes_text = format_recipes_section(t["id"], recipe_index)
+        if recipes_text:
+            lines.append(recipes_text)
+
+    return "\n".join(lines)
+
+
+def generate_cheatsheet_standard(t):
+    """Render the cheatsheet text for a tool described by the REMnux docs.
+
+    The header carries the tool name, a description (the tool's own, else the
+    REMnux docs one, else ``"<name> tool"``), and the category and docs URL
+    when present. The body is a single generic ``--help`` entry.
+    """
+    docs = t["sources"]["remnux_docs"]
+    name = t["name"]
+    summary = t.get("description", "") or docs.get("description", "")
+    category = docs.get("category", "")
+    link = docs.get("docs_url", "")
+
+    header = [f"# {name}"]
+    if summary:
+        header.append(f"# {summary}")
+    else:
+        header.append(f"# {name} tool")
+    if category:
+        header.append(f"# Category: {category}")
+    if link:
+        header.append(f"# Docs: {link}")
+
+    body = [
+        "",
+        f"% {sanitize_filename(name)}",
+        "",
+        f"# Show help for {name}",
+        f"{name} --help",
+        "",
+    ]
+
+    return "\n".join(header + body)
+
+
+def generate_cheatsheet_basic(t):
+    """Render the smallest cheatsheet, for a tool known only from salt-states.
+
+    The header records the install method (default ``"unknown"``) and package
+    name (default: the tool name); the body is a single generic ``--help``
+    entry.
+    """
+    name = t["name"]
+    salt_info = t["sources"]["salt_states"]
+    method = salt_info.get("install_method", "unknown")
+    package = salt_info.get("package_name", name)
+
+    return "\n".join((
+        f"# {name}",
+        f"# Installed via: {method} ({package})",
+        "",
+        f"% {sanitize_filename(name)}",
+        "",
+        f"# Show help for {name}",
+        f"{name} --help",
+        "",
+    ))
+
+
+def generate_cheatsheets(tools, recipe_index=None):
+    """Generate per-tool cheatsheet files.
+
+    One ``<sanitized-name>.cheat`` file is written under
+    ``GEN_DIR/cheatsheets`` for every tool flagged ``in_remnux``. The
+    ``help_tier`` selects the renderer: ``"rich"``, ``"standard"``, or the
+    basic renderer for anything else.
+
+    Args:
+        tools: Iterable of tool mappings from the master inventory.
+        recipe_index: Optional recipe index, forwarded to the rich renderer.
+
+    Returns:
+        Number of files written.
+    """
+    cheat_dir = os.path.join(GEN_DIR, "cheatsheets")
+    os.makedirs(cheat_dir, exist_ok=True)
+
+    count = 0
+    for t in tools:
+        if not t.get("in_remnux"):
+            continue
+
+        tier = t.get("help_tier", "stub")
+        if tier == "rich":
+            content = generate_cheatsheet_rich(t, recipe_index=recipe_index)
+        elif tier == "standard":
+            content = generate_cheatsheet_standard(t)
+        else:
+            content = generate_cheatsheet_basic(t)
+
+        path = os.path.join(cheat_dir, sanitize_filename(t["name"]) + ".cheat")
+        # Write UTF-8 explicitly so output does not depend on the locale and
+        # non-ASCII descriptions cannot fail to encode.
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(content)
+        count += 1
+
+    print(f"  cheatsheets: {count} .cheat files")
+    return count
+
+
+# ============================================================
+# Workflow generator
+# ============================================================
+
+def _get_tool_examples(tool_name, master_tools_by_name):
+    """Get up to two example commands for a tool from the master inventory.
+
+    Args:
+        tool_name: Tool name as written in a workflow step.
+        master_tools_by_name: Lookup mapping from name/id/alias to tool mapping.
+
+    Returns:
+        The first two FOR610 ``typical_usage`` entries of the matched tool, or
+        an empty list when no tool matches or it has no FOR610 coverage.
+    """
+    lowered = tool_name.lower()
+    tool = None
+    # Try the name as given, then lower-cased (the lookup stores names and
+    # aliases lower-cased), then the kebab-case form.
+    for key in (tool_name, lowered, lowered.replace("_", "-")):
+        tool = master_tools_by_name.get(key)
+        if tool:
+            break
+    if tool and tool["sources"]["for610"].get("covered"):
+        # ``or []`` tolerates a typical_usage key that is present but null.
+        usages = tool["sources"]["for610"].get("typical_usage") or []
+        return usages[:2]
+    return []
+
+
+def generate_workflows(workflows_data, master_tools=None):
+    """Generate readable workflow help files with inline examples.
+
+    Writes one ``<workflow-id>.txt`` per workflow (underscores in the id become
+    hyphens) plus an ``index.txt`` listing all workflows, under
+    ``GEN_DIR/workflows``.
+
+    Args:
+        workflows_data: Mapping with an optional ``"workflows"`` list.
+        master_tools: Optional list of tool mappings; when given, each step
+            shows one example command per tool that has FOR610 usage examples.
+
+    Returns:
+        Number of workflow files written (the index is not counted).
+    """
+    wf_dir = os.path.join(GEN_DIR, "workflows")
+    os.makedirs(wf_dir, exist_ok=True)
+
+    # Build tool name lookup for inline examples: lower-cased name, id, aliases.
+    tools_by_name = {}
+    for t in master_tools or []:
+        tools_by_name[t["name"].lower()] = t
+        tools_by_name[t["id"]] = t
+        for alias in t.get("aliases") or []:
+            tools_by_name[alias.lower()] = t
+
+    workflows = workflows_data.get("workflows") or []
+
+    # The rendered text contains the box-drawing character U+2500, so the
+    # files are written as UTF-8 explicitly rather than in the locale encoding.
+    for wf in workflows:
+        filename = wf["id"].replace("_", "-") + ".txt"
+        with open(os.path.join(wf_dir, filename), "w", encoding="utf-8") as f:
+            f.write(_render_workflow(wf, tools_by_name))
+
+    with open(os.path.join(wf_dir, "index.txt"), "w", encoding="utf-8") as f:
+        f.write(_render_workflow_index(workflows))
+
+    count = len(workflows)
+    print(f"  workflows: {count} workflow files + index")
+    return count
+
+
+def _render_workflow(wf, tools_by_name):
+    """Return the help text for one workflow mapping."""
+    banner = "=" * 60
+    rule = "─" * 60
+
+    lines = [
+        banner,
+        f"  {wf['name']}",
+        banner,
+        "",
+        f"  {wf.get('description') or ''}",
+        "",
+    ]
+
+    related_labs = wf.get("related_labs") or []
+    if related_labs:
+        # str() because unquoted YAML lab ids such as 1.1 load as numbers.
+        lines.append(f"  Related FOR610 Labs: {', '.join(str(lab) for lab in related_labs)}")
+        lines.append("")
+
+    lines.append(rule)
+    lines.append("")
+
+    for step in wf.get("steps") or []:
+        order = step.get("order", "?")
+        step_name = step.get("name", "")
+        step_desc = step.get("description") or ""
+        step_tools = [str(tool) for tool in step.get("tools") or []]
+
+        lines.append(f"  Step {order}: {step_name}")
+        if step_tools:
+            lines.append(f"  Tools: {', '.join(step_tools)}")
+        if step_desc:
+            lines.append(textwrap.fill(step_desc, width=56, initial_indent="  ", subsequent_indent="  "))
+
+        # Add one inline command example per tool, preceded by a single blank
+        # line before the first example of the step.
+        if step_tools and tools_by_name:
+            examples_shown = False
+            for tool_name in step_tools:
+                examples = _get_tool_examples(tool_name, tools_by_name)
+                if examples:
+                    if not examples_shown:
+                        lines.append("")
+                        examples_shown = True
+                    lines.append(f"    $ {examples[0]}")
+
+        lines.append("")
+
+    lines.append(rule)
+    lines.append("  Tip: 'fhelp cheat <tool>' for full examples")
+    lines.append("       'Ctrl+G' for interactive cheatsheet browser")
+    lines.append("")
+    return "\n".join(lines)
+
+
+def _render_workflow_index(workflows):
+    """Return the text of the index file that lists every workflow."""
+    banner = "=" * 60
+    index_lines = [
+        banner,
+        "  Available Analysis Workflows",
+        banner,
+        "",
+    ]
+    for wf in workflows:
+        slug = wf["id"].replace("_", "-")
+        index_lines.append(f"  {slug}")
+        index_lines.append(f"    {wf['name']}")
+        index_lines.append(
+            textwrap.fill(wf.get("description") or "", width=56,
+                          initial_indent="    ", subsequent_indent="    ")
+        )
+        index_lines.append("")
+
+    index_lines += [
+        "─" * 60,
+        "  Usage: fhelp workflow <name>",
+        "  Example: fhelp workflow static-analysis",
+        "",
+    ]
+    return "\n".join(index_lines)
+
+
+# ============================================================
+# TLDR generator
+# ============================================================
+
+def generate_tldr(tools):
+    """Generate a TLDR-style Markdown page for each rich/standard REMnux tool.
+
+    A page is written to ``GEN_DIR/tldr/<sanitized-name>.md`` for every tool
+    flagged ``in_remnux`` whose ``help_tier`` is ``"rich"`` or ``"standard"``.
+    No comparison against upstream tldr pages is made here. Each page lists at
+    most four usage examples, falling back to ``<name> --help``.
+
+    Args:
+        tools: Iterable of tool mappings from the master inventory.
+
+    Returns:
+        Number of pages written.
+    """
+    tldr_dir = os.path.join(GEN_DIR, "tldr")
+    os.makedirs(tldr_dir, exist_ok=True)
+
+    count = 0
+    for t in tools:
+        if not t.get("in_remnux"):
+            continue
+
+        if t.get("help_tier", "stub") not in ("rich", "standard"):
+            continue
+
+        name = t["name"]
+        # Keep the description on one line so it stays inside the ">" quote line.
+        desc = " ".join((t.get("description") or "").splitlines()).strip() or f"{name} tool"
+
+        # Get usage examples
+        usages = []
+        for610 = t["sources"]["for610"]
+        if for610.get("covered"):
+            usages = for610.get("typical_usage") or []
+
+        if not usages:
+            usages = [f"{name} --help"]
+
+        # TLDR format
+        lines = [
+            f"# {name}",
+            "",
+            f"> {desc}",
+            "",
+        ]
+
+        for usage in usages[:4]:
+            lines.append(f"- Run {name}:")
+            lines.append("")
+            lines.append(f"`{usage}`")
+            lines.append("")
+
+        filename = sanitize_filename(name) + ".md"
+        # Write UTF-8 explicitly so output does not depend on the locale.
+        with open(os.path.join(tldr_dir, filename), "w", encoding="utf-8") as f:
+            f.write("\n".join(lines))
+        count += 1
+
+    print(f"  tldr: {count} pages")
+    return count
+
+
+# ============================================================
+# Main
+# ============================================================
+
+def main():
+    """Run every generator in turn and print a summary of what was produced."""
+    print("Generating help artifacts from master inventory...")
+
+    # Load all inputs before generating anything.
+    tools = load_master()["tools"]
+    workflows_data = load_workflows()
+    recipes_data = load_recipes()
+    recipe_index = build_recipe_index(recipes_data)
+
+    n_workflows = len(workflows_data.get('workflows', []))
+    n_recipes = len(recipes_data.get('recipes', []))
+    print(f"\nInput: {len(tools)} tools, {n_workflows} workflows, {n_recipes} recipes")
+    print()
+
+    # Tuple items are evaluated left to right, so the generators run in the
+    # order listed here.
+    summary = (
+        ("tools.db:      ", generate_tools_db(tools), " entries"),
+        ("cheatsheets/:  ", generate_cheatsheets(tools, recipe_index=recipe_index), " files"),
+        ("workflows/:    ", generate_workflows(workflows_data, master_tools=tools), " + index"),
+        ("tldr/:         ", generate_tldr(tools), " pages"),
+    )
+
+    print(f"\nAll artifacts generated in {GEN_DIR}/")
+    for label, produced, suffix in summary:
+        print(f"  {label}{produced}{suffix}")
+
+
+# Run the pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()