f3ccc09c3d
Build comprehensive malware analysis knowledge base from 3 sources: - SANS FOR610 course: 120 tools, 47 labs, 15 workflows, 27 recipes - REMnux salt-states: 340 packages parsed from GitHub - REMnux docs: 280+ tools scraped from docs.remnux.org Master inventory merges all sources into 447 tools with help tiers (rich/standard/basic). Pipeline generates: tools.db (397 entries), 397 cheatsheets with multi-tool recipes, 15 workflow guides, 224 TLDR pages, and coverage reports. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
203 lines
6.5 KiB
Python
203 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Parse REMnux salt-states repository to extract all installed tools/packages.
|
|
|
|
Fetches the salt-states repo tree from GitHub, parses .sls files to identify
|
|
what gets installed, and outputs data/remnux/sources/salt-states.yaml.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import urllib.request
|
|
import yaml
|
|
import os
|
|
|
|
# REST API root of the REMnux salt-states repository (used for the tree listing).
GITHUB_API = "https://api.github.com/repos/REMnux/salt-states"
# Prefix for downloading raw file contents from the master branch.
RAW_BASE = "https://raw.githubusercontent.com/REMnux/salt-states/master"
# Destination YAML file, located relative to the directory holding this script.
OUTPUT_PATH = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    "data",
    "remnux",
    "sources",
    "salt-states.yaml",
)
|
|
|
|
|
|
def fetch_json(url):
    """Download *url* and return its body decoded as JSON.

    The request carries a fixed ``User-Agent`` header and a 30 second
    timeout. Errors from opening the URL, decoding the body, or parsing the
    JSON are not caught here and propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    with urllib.request.urlopen(request, timeout=30) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode())
|
|
|
|
|
|
def fetch_text(url):
    """Download *url* and return its body as text, or ``None`` on failure.

    This is a best-effort fetch: any exception raised while opening, reading
    or decoding the response is reported with a warning on stdout and turned
    into a ``None`` result. Building the request object happens outside the
    guarded section, so errors from that step still propagate.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "remnux-tool-parser"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            text = response.read().decode()
    except Exception as err:
        # Deliberately broad: one unreachable file must not abort the whole run.
        print(f" Warning: could not fetch {url}: {err}")
        return None
    return text
|
|
|
|
|
|
def get_sls_files():
    """Return the paths of all ``.sls`` blobs in the salt-states repository.

    The listing comes from a single recursive Git tree request against the
    ``master`` branch. GitHub marks the response with ``"truncated": true``
    when the tree is too large to return in full; in that case a warning is
    printed because the returned list may be incomplete.

    Returns:
        A list of repository-relative paths ending in ``.sls``.
    """
    tree = fetch_json(f"{GITHUB_API}/git/trees/master?recursive=1")
    if tree.get("truncated"):
        # Without this notice a partial listing would look like a complete one.
        print(" Warning: GitHub truncated the tree listing; some .sls files may be missing")
    return [
        item["path"]
        for item in tree["tree"]
        if item["path"].endswith(".sls") and item["type"] == "blob"
    ]
|
|
|
|
|
|
def classify_sls_path(path):
    """Classify the install method from the directory structure.

    Only the directory portion of *path* is inspected, so a state file such
    as ``remnux/packages/nodejs.sls`` is classified by its ``packages``
    directory ("apt") rather than by a substring of its file name. If the
    path has no directory component, the whole string is examined instead.

    Args:
        path: Repository-relative path of a ``.sls`` file, using ``/`` as
            the separator.

    Returns:
        One of "pip", "gem", "npm", "perl", "apt", "manual", "script" or
        "unknown".
    """
    directory = path.rpartition("/")[0] or path
    location = directory.lower()

    # Order matters: the specific "*-package" markers must be tested before
    # the generic "package" marker that maps to apt.
    rules = (
        (("python3-package", "python-package", "pip"), "pip"),
        (("rubygem",), "gem"),
        (("npm", "node"), "npm"),
        (("perl-package",), "perl"),
        (("package",), "apt"),
        (("tools",), "manual"),
        (("script",), "script"),
    )
    for markers, method in rules:
        if any(marker in location for marker in markers):
            return method
    return "unknown"
|
|
|
|
|
|
def extract_tool_name_from_path(path):
    """Extract a human-readable tool name from the .sls file path.

    The name is the file's base name with a trailing ``.sls`` extension
    removed. Only the suffix is stripped, so a ``.sls`` that appears in the
    middle of the name is kept.

    Args:
        path: Path of a ``.sls`` file.

    Returns:
        The tool name, or ``None`` when the file is one of the known
        non-tool files (aggregators such as ``init`` or ``packages``).
    """
    suffix = ".sls"
    basename = os.path.basename(path)
    if basename.endswith(suffix):
        basename = basename[:-len(suffix)]

    # Skip non-tool files
    skip = {
        "init", "addon", "cloud", "dedicated", "theme", "remnux-config",
        "apt-transport-https", "packages", "python3-packages", "python-packages",
        "rubygems", "perl-packages", "node-packages", "tools", "scripts",
    }
    if basename in skip:
        return None
    return basename
|
|
|
|
|
|
def parse_sls_content(content, path):
    """Build the tool entry described by one .sls file.

    Args:
        content: Text of the state file; a falsy value (``None`` or ``""``)
            yields no entries.
        path: Repository path of the file, used for the tool id, the install
            method and the ``salt_state_path`` field.

    Returns:
        An empty list when there is no content or the file is not a tool
        file, otherwise a one-element list holding a dict with the keys
        ``id``, ``package_names``, ``install_method``, ``salt_state_path``
        and, when the heuristic below fires, ``possibly_conditional``.
    """
    if not content:
        return []

    tool_name = extract_tool_name_from_path(path)
    if not tool_name:
        return []

    install_method = classify_sls_path(path)

    # Candidate names are gathered from four textual patterns, in this order.
    candidates = []

    # State ids placed directly above a pip/pkg/gem/npm ".installed" line.
    candidates.extend(
        m.group(1)
        for m in re.finditer(r'(\w[\w.-]+):\s*\n\s+(?:pip|pkg|gem|npm)\.installed', content)
    )

    # "- name: value" arguments, ignoring templated values and absolute paths.
    for m in re.finditer(r'-\s+name:\s+([^\s#\n]+)', content):
        value = m.group(1).strip("'\"")
        if value and not value.startswith(('{', '/')):
            candidates.append(value)

    # Last path segment of wget/curl downloads that looks like a file name.
    for m in re.finditer(r'(?:wget|curl)\s+.*?/([^/\s"]+?)(?:\s|"|$)', content):
        filename = m.group(1)
        if '.' in filename and not filename.endswith('.key'):
            candidates.append(filename)

    # Anything referenced under /usr/local/bin/.
    candidates.extend(
        m.group(1) for m in re.finditer(r'/usr/local/bin/([^:\s]+)', content)
    )

    # Case-insensitive de-duplication that keeps first-seen order and drops
    # names shorter than two characters.
    lowered_seen = set()
    clean_names = []
    for raw in candidates:
        cleaned = raw.strip().strip("'\"")
        if len(cleaned) <= 1:
            continue
        key = cleaned.lower()
        if key in lowered_seen:
            continue
        lowered_seen.add(key)
        clean_names.append(cleaned)

    entry = {
        "id": tool_name,
        "package_names": clean_names or [tool_name],
        "install_method": install_method,
        "salt_state_path": path,
    }

    # Heuristic flag: a literal "False" together with an onlyif/unless clause.
    if "False" in content:
        lowered_content = content.lower()
        if "onlyif" in lowered_content or "unless" in lowered_content:
            entry["possibly_conditional"] = True

    return [entry]
|
|
|
|
|
|
def main():
    """Fetch every state file under ``remnux/`` and write the tool inventory.

    Steps: list the repository's ``.sls`` files, download and parse each one
    under ``remnux/``, keep the first entry seen per tool id, sort by id, and
    dump the result with summary metadata to ``OUTPUT_PATH`` as YAML.
    Progress and a per-install-method summary are printed to stdout.
    """
    print("Fetching salt-states repository tree...")
    sls_files = get_sls_files()
    print(f"Found {len(sls_files)} .sls files")

    # Filter to relevant paths (skip top-level orchestration files)
    relevant = [f for f in sls_files if f.startswith("remnux/")]
    print(f" {len(relevant)} under remnux/")

    all_tools = []
    categories_seen = set()
    failed_fetches = 0

    for i, path in enumerate(relevant):
        if i % 20 == 0:
            print(f" Processing {i}/{len(relevant)}...")

        # Derive category from path
        parts = path.split("/")
        if len(parts) >= 3:
            # e.g., "python3-packages", "tools", "packages"
            categories_seen.add(parts[1])

        content = fetch_text(f"{RAW_BASE}/{path}")
        if content is None:
            failed_fetches += 1
        all_tools.extend(parse_sls_content(content, path))

    # Deduplicate by id, keeping the first entry encountered, then sort by id.
    first_by_id = {}
    for tool in all_tools:
        first_by_id.setdefault(tool["id"], tool)
    unique_tools = sorted(first_by_id.values(), key=lambda t: t["id"])

    # Count install methods
    method_counts = {}
    for tool in unique_tools:
        method = tool["install_method"]
        method_counts[method] = method_counts.get(method, 0) + 1

    output = {
        "metadata": {
            "source": "https://github.com/REMnux/salt-states",
            "branch": "master",
            "total_sls_files": len(relevant),
            "total_tools_extracted": len(unique_tools),
            "install_method_counts": method_counts,
            "salt_directories": sorted(categories_seen),
        },
        "tools": unique_tools,
    }

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    # allow_unicode=True emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 rather than with the platform's default encoding.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} tools")
    for method, count in sorted(method_counts.items()):
        print(f" {method}: {count}")
    if failed_fetches:
        # Files that failed to download contribute no tools; make that visible.
        print(f" Warning: {failed_fetches} file(s) could not be fetched and were skipped")
    print(f"Output: {OUTPUT_PATH}")
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|