Add FOR610 tool/workflow knowledge base and data pipeline

Build comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

Master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). Pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224
TLDR pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
tobias
2026-03-28 17:38:15 +01:00
parent 06ebb09ab0
commit f3ccc09c3d
663 changed files with 36339 additions and 1 deletions
+202
View File
@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""Parse REMnux salt-states repository to extract all installed tools/packages.
Fetches the salt-states repo tree from GitHub, parses .sls files to identify
what gets installed, and outputs data/remnux/sources/salt-states.yaml.
"""
import json
import re
import urllib.request
import yaml
import os
# GitHub REST API endpoint for the REMnux/salt-states repository.
GITHUB_API = "https://api.github.com/repos/REMnux/salt-states"
# Base URL for fetching raw file contents from the master branch.
RAW_BASE = "https://raw.githubusercontent.com/REMnux/salt-states/master"
# Output YAML path, resolved relative to this script's own directory.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "salt-states.yaml")
def fetch_json(url):
    """Fetch *url* and decode the response body as JSON.

    A User-Agent header is set because the GitHub API rejects requests
    without one. Network/HTTP errors propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read().decode()
    return json.loads(body)
def fetch_text(url):
    """Fetch *url* and return the response body as text, or None on any failure.

    Best-effort by design: one unreachable file should not abort the whole
    crawl, so any exception is reported and swallowed.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return response.read().decode()
    except Exception as err:
        print(f" Warning: could not fetch {url}: {err}")
        return None
def get_sls_files():
    """Return the path of every .sls blob in the repo's master tree."""
    tree = fetch_json(f"{GITHUB_API}/git/trees/master?recursive=1")
    paths = []
    for node in tree["tree"]:
        if node["type"] == "blob" and node["path"].endswith(".sls"):
            paths.append(node["path"])
    return paths
def classify_sls_path(path):
    """Classify the install method from the directory structure.

    Matches case-insensitive substrings of *path* against an ordered rule
    table; the first hit wins (mirrors the original if/elif chain), so the
    more specific needles (e.g. "python3-package") must precede the generic
    ones (e.g. "package").
    """
    lowered = path.lower()
    rules = (
        ("python3-package", "pip"),
        ("python-package", "pip"),
        ("pip", "pip"),
        ("rubygem", "gem"),
        ("npm", "npm"),
        ("node", "npm"),
        ("perl-package", "perl"),
        ("package", "apt"),
        ("tools", "manual"),
        ("script", "script"),
    )
    for needle, method in rules:
        if needle in lowered:
            return method
    return "unknown"
def extract_tool_name_from_path(path):
    """Extract a human-readable tool name from the .sls file path.

    Returns None for orchestration/meta files that do not describe a tool.
    """
    basename = os.path.basename(path)
    # Strip only a trailing ".sls": str.replace(".sls", "") would also mangle
    # a tool name that happens to contain ".sls" in the middle.
    if basename.endswith(".sls"):
        basename = basename[:-len(".sls")]
    # Skip non-tool files
    skip = {"init", "addon", "cloud", "dedicated", "theme", "remnux-config",
            "apt-transport-https", "packages", "python3-packages", "python-packages",
            "rubygems", "perl-packages", "node-packages", "tools", "scripts"}
    if basename in skip:
        return None
    return basename
def parse_sls_content(content, path):
    """Parse a .sls file and extract package/tool information.

    Returns a list with at most one entry dict (id, package_names,
    install_method, salt_state_path, and optionally possibly_conditional);
    empty when *content* is missing or the path is a non-tool file.
    """
    tool_name = extract_tool_name_from_path(path)
    if not content or not tool_name:
        return []

    # Collect candidate package names from several salt-state patterns.
    candidates = []
    # State IDs immediately followed by a pip/pkg/gem/npm installed state.
    candidates += [m.group(1) for m in
                   re.finditer(r'(\w[\w.-]+):\s*\n\s+(?:pip|pkg|gem|npm)\.installed', content)]
    # Explicit "- name: package_name" entries in pip/pkg states; skip Jinja
    # expressions ({...}) and absolute paths (/...).
    for m in re.finditer(r'-\s+name:\s+([^\s#\n]+)', content):
        candidate = m.group(1).strip("'\"")
        if candidate and not candidate.startswith('{') and not candidate.startswith('/'):
            candidates.append(candidate)
    # Filenames downloaded via wget/curl (manual installs); ignore GPG keys.
    for m in re.finditer(r'(?:wget|curl)\s+.*?/([^/\s"]+?)(?:\s|"|$)', content):
        fname = m.group(1)
        if '.' in fname and not fname.endswith('.key'):
            candidates.append(fname)
    # Scripts/binaries deployed into /usr/local/bin (file.managed targets).
    candidates += [m.group(1) for m in re.finditer(r'/usr/local/bin/([^:\s]+)', content)]

    # Case-insensitive dedup, keeping first-seen order and dropping
    # empty or single-character leftovers.
    seen = set()
    clean_names = []
    for raw in candidates:
        name = raw.strip().strip("'\"")
        if name and name.lower() not in seen and len(name) > 1:
            seen.add(name.lower())
            clean_names.append(name)

    entry = {
        "id": tool_name,
        "package_names": clean_names or [tool_name],
        "install_method": classify_sls_path(path),
        "salt_state_path": path,
    }
    # Heuristic: a literal False near onlyif/unless suggests a gated install.
    if "False" in content and ("onlyif" in content.lower() or "unless" in content.lower()):
        entry["possibly_conditional"] = True
    return [entry]
def main():
    """Crawl the salt-states repo and write the extracted inventory as YAML."""
    print("Fetching salt-states repository tree...")
    sls_files = get_sls_files()
    print(f"Found {len(sls_files)} .sls files")

    # Only files under remnux/ describe tools; the rest is top-level
    # orchestration we skip.
    relevant = [p for p in sls_files if p.startswith("remnux/")]
    print(f" {len(relevant)} under remnux/")

    all_tools = []
    categories_seen = set()
    for index, path in enumerate(relevant):
        if index % 20 == 0:
            print(f" Processing {index}/{len(relevant)}...")
        segments = path.split("/")
        if len(segments) >= 3:
            # Second path segment is the category directory,
            # e.g. "python3-packages", "tools", "packages".
            categories_seen.add(segments[1])
        body = fetch_text(f"{RAW_BASE}/{path}")
        all_tools.extend(parse_sls_content(body, path))

    # Keep the first entry seen for each id, then sort for stable output.
    unique_tools = []
    seen_ids = set()
    for tool in all_tools:
        if tool["id"] not in seen_ids:
            seen_ids.add(tool["id"])
            unique_tools.append(tool)
    unique_tools.sort(key=lambda tool: tool["id"])

    # Tally install methods (insertion order follows the sorted tool list).
    method_counts = {}
    for tool in unique_tools:
        method = tool["install_method"]
        method_counts[method] = method_counts.get(method, 0) + 1

    output = {
        "metadata": {
            "source": "https://github.com/REMnux/salt-states",
            "branch": "master",
            "total_sls_files": len(relevant),
            "total_tools_extracted": len(unique_tools),
            "install_method_counts": method_counts,
            "salt_directories": sorted(categories_seen),
        },
        "tools": unique_tools,
    }

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} tools")
    for method, count in sorted(method_counts.items()):
        print(f" {method}: {count}")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()