Add FOR610 tool/workflow knowledge base and data pipeline
Build a comprehensive malware analysis knowledge base from 3 sources:
- SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes
- REMnux salt-states: 340 packages parsed from GitHub
- REMnux docs: 280+ tools scraped from docs.remnux.org

The master inventory merges all sources into 447 tools with help tiers
(rich/standard/basic). The pipeline generates: tools.db (397 entries),
397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR
pages, and coverage reports.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
"""Parse REMnux salt-states repository to extract all installed tools/packages.

Fetches the salt-states repo tree from GitHub, parses .sls files to identify
what gets installed, and outputs data/remnux/sources/salt-states.yaml.
"""

import json
import re
import urllib.request
import yaml
import os

# GitHub REST API root for the REMnux/salt-states repository (tree listing).
GITHUB_API = "https://api.github.com/repos/REMnux/salt-states"
# Base URL for raw file contents on the master branch.
RAW_BASE = "https://raw.githubusercontent.com/REMnux/salt-states/master"
# Output YAML path, resolved relative to this script's own directory.
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux", "sources", "salt-states.yaml")
|
||||
|
||||
|
||||
def fetch_json(url):
    """Fetch *url* and decode the response body as JSON.

    A User-Agent header is set because the GitHub API rejects
    anonymous requests without one.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read().decode()
    return json.loads(body)
|
||||
|
||||
|
||||
def fetch_text(url):
    """Fetch *url* and return the body as text, or None on any failure.

    Failures are reported as a warning on stdout instead of raised, so one
    unreachable file does not abort a whole crawl (best-effort fetch).
    """
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            return response.read().decode()
    except Exception as e:  # deliberate broad catch: log, skip, continue
        print(f" Warning: could not fetch {url}: {e}")
    return None
|
||||
|
||||
|
||||
def get_sls_files():
    """Get all .sls file paths from the repo."""
    # One recursive tree call returns every blob in the master branch.
    listing = fetch_json(f"{GITHUB_API}/git/trees/master?recursive=1")
    paths = []
    for entry in listing["tree"]:
        if entry["type"] == "blob" and entry["path"].endswith(".sls"):
            paths.append(entry["path"])
    return paths
|
||||
|
||||
|
||||
def classify_sls_path(path):
    """Classify the install method from the directory structure.

    Matches substrings of the lowercased path against an ordered rule
    table; the first rule whose needle appears wins, so more specific
    needles (e.g. "perl-package") must precede generic ones ("package").
    """
    lowered = path.lower()
    rules = (
        (("python3-package", "python-package", "pip"), "pip"),
        (("rubygem",), "gem"),
        (("npm", "node"), "npm"),
        (("perl-package",), "perl"),
        (("package",), "apt"),
        (("tools",), "manual"),
        (("script",), "script"),
    )
    for needles, method in rules:
        if any(needle in lowered for needle in needles):
            return method
    return "unknown"
|
||||
|
||||
|
||||
def extract_tool_name_from_path(path):
    """Extract a human-readable tool name from the .sls file path.

    Returns None for orchestration/infrastructure files that do not
    correspond to an individual tool.
    """
    # Basenames that are salt plumbing rather than actual tools.
    non_tools = {
        "init", "addon", "cloud", "dedicated", "theme", "remnux-config",
        "apt-transport-https", "packages", "python3-packages", "python-packages",
        "rubygems", "perl-packages", "node-packages", "tools", "scripts",
    }
    stem = os.path.basename(path).replace(".sls", "")
    return None if stem in non_tools else stem
|
||||
|
||||
|
||||
def parse_sls_content(content, path):
    """Parse a .sls file and extract package/tool information.

    Returns a list with at most one entry dict describing the tool the
    file installs, or an empty list for empty content / non-tool files.
    """
    if not content:
        return []

    tool_name = extract_tool_name_from_path(path)
    if not tool_name:
        return []

    # Collect candidate package names from several salt-state patterns.
    candidates = []

    # "<name>:" headers directly followed by pip/pkg/gem/npm .installed states.
    candidates.extend(
        m.group(1)
        for m in re.finditer(r'(\w[\w.-]+):\s*\n\s+(?:pip|pkg|gem|npm)\.installed', content)
    )

    # Explicit "- name: <package>" arguments; skip jinja templates and paths.
    for m in re.finditer(r'-\s+name:\s+([^\s#\n]+)', content):
        candidate = m.group(1).strip("'\"")
        if candidate and not candidate.startswith('{') and not candidate.startswith('/'):
            candidates.append(candidate)

    # Filenames pulled down by wget/curl (manual installs); ignore GPG keys.
    for m in re.finditer(r'(?:wget|curl)\s+.*?/([^/\s"]+?)(?:\s|"|$)', content):
        downloaded = m.group(1)
        if '.' in downloaded and not downloaded.endswith('.key'):
            candidates.append(downloaded)

    # Scripts/binaries deployed into /usr/local/bin via file.managed.
    candidates.extend(m.group(1) for m in re.finditer(r'/usr/local/bin/([^:\s]+)', content))

    # Deduplicate case-insensitively, preserving first-seen order.
    seen = set()
    clean_names = []
    for candidate in candidates:
        candidate = candidate.strip().strip("'\"")
        key = candidate.lower()
        if candidate and key not in seen and len(candidate) > 1:
            seen.add(key)
            clean_names.append(candidate)

    entry = {
        "id": tool_name,
        "package_names": clean_names or [tool_name],
        "install_method": classify_sls_path(path),
        "salt_state_path": path,
    }

    # Crude heuristic: a literal "False" plus onlyif/unless suggests the
    # install may be conditional rather than unconditional.
    lowered = content.lower()
    if "False" in content and ("onlyif" in lowered or "unless" in lowered):
        entry["possibly_conditional"] = True

    return [entry]
|
||||
|
||||
|
||||
def main():
    """Crawl the salt-states repo and write the extracted tool inventory."""
    print("Fetching salt-states repository tree...")
    sls_files = get_sls_files()
    print(f"Found {len(sls_files)} .sls files")

    # Only files under remnux/ describe installs; top-level files orchestrate.
    relevant = [p for p in sls_files if p.startswith("remnux/")]
    print(f" {len(relevant)} under remnux/")

    all_tools = []
    categories_seen = set()
    for i, path in enumerate(relevant):
        if i % 20 == 0:
            print(f" Processing {i}/{len(relevant)}...")
        # Second path segment is the category dir, e.g. "python3-packages".
        segments = path.split("/")
        if len(segments) >= 3:
            categories_seen.add(segments[1])
        all_tools.extend(parse_sls_content(fetch_text(f"{RAW_BASE}/{path}"), path))

    # Keep only the first entry per tool id, then order by id.
    ids_emitted = set()
    unique_tools = []
    for tool in all_tools:
        if tool["id"] not in ids_emitted:
            ids_emitted.add(tool["id"])
            unique_tools.append(tool)
    unique_tools.sort(key=lambda entry: entry["id"])

    # Tally install methods before assembling the output document.
    method_counts = {}
    for tool in unique_tools:
        method_counts[tool["install_method"]] = method_counts.get(tool["install_method"], 0) + 1

    output = {
        "metadata": {
            "source": "https://github.com/REMnux/salt-states",
            "branch": "master",
            "total_sls_files": len(relevant),
            "total_tools_extracted": len(unique_tools),
            "install_method_counts": method_counts,
            "salt_directories": sorted(categories_seen),
        },
        "tools": unique_tools,
    }

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        # sort_keys=False preserves the metadata key order defined above.
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} tools")
    for method, count in sorted(output["metadata"]["install_method_counts"].items()):
        print(f" {method}: {count}")
    print(f"Output: {OUTPUT_PATH}")
|
||||
Reference in New Issue
Block a user