#!/usr/bin/env python3
"""Parse REMnux salt-states repository to extract all installed tools/packages.

Fetches the salt-states repo tree from GitHub, parses .sls files to identify
what gets installed, and outputs data/remnux/sources/salt-states.yaml.
"""
import json
import os
import re
import urllib.request
from collections import Counter

import yaml

GITHUB_API = "https://api.github.com/repos/REMnux/salt-states"
RAW_BASE = "https://raw.githubusercontent.com/REMnux/salt-states/master"
OUTPUT_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "remnux",
                           "sources", "salt-states.yaml")

_USER_AGENT = "remnux-tool-parser"

# Basenames that are orchestration/aggregation states, not individual tools.
_SKIP_BASENAMES = frozenset({
    "init", "addon", "cloud", "dedicated", "theme", "remnux-config",
    "apt-transport-https", "packages", "python3-packages", "python-packages",
    "rubygems", "perl-packages", "node-packages", "tools", "scripts",
})

# Compiled once; parse_sls_content runs these over every fetched .sls file.
# "<name>:\n  pip.installed" style state declarations.
_STATE_DECL_RE = re.compile(r'(\w[\w.-]+):\s*\n\s+(?:pip|pkg|gem|npm)\.installed')
# "- name: <package>" arguments inside pip/pkg states.
_NAME_ARG_RE = re.compile(r'-\s+name:\s+([^\s#\n]+)')
# Filenames downloaded via wget/curl (manual installs).
_DOWNLOAD_RE = re.compile(r'(?:wget|curl)\s+.*?/([^/\s"]+?)(?:\s|"|$)')
# Scripts/binaries deployed into /usr/local/bin via file.managed targets.
_BIN_TARGET_RE = re.compile(r'/usr/local/bin/([^:\s]+)')


def fetch_json(url):
    """Fetch *url* (with a custom User-Agent) and return the parsed JSON body."""
    req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode())


def fetch_text(url):
    """Fetch *url* as text; return None (after a warning) on any network/HTTP error.

    OSError covers urllib.error.URLError/HTTPError and socket timeouts; we
    deliberately do NOT catch broader exceptions so programming errors surface.
    """
    req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode()
    except OSError as e:
        print(f" Warning: could not fetch {url}: {e}")
        return None


def get_sls_files():
    """Get all .sls file paths from the repo.

    Uses the recursive git-trees endpoint; warns if GitHub truncated the
    listing (happens for very large trees), since results would be incomplete.
    """
    tree = fetch_json(f"{GITHUB_API}/git/trees/master?recursive=1")
    if tree.get("truncated"):
        print(" Warning: GitHub tree listing truncated; results may be incomplete")
    return [item["path"] for item in tree["tree"]
            if item["path"].endswith(".sls") and item["type"] == "blob"]


def classify_sls_path(path):
    """Classify the install method from the directory structure.

    Check order matters: e.g. "python3-packages" must match before the
    generic "package" (apt) check, and "perl-package" before "package".
    """
    parts = path.lower()
    if "python3-package" in parts or "python-package" in parts:
        return "pip"
    if "pip" in parts:
        return "pip"
    if "rubygem" in parts:
        return "gem"
    if "npm" in parts or "node" in parts:
        return "npm"
    if "perl-package" in parts:
        return "perl"
    if "package" in parts:
        return "apt"
    if "tools" in parts:
        return "manual"
    if "script" in parts:
        return "script"
    return "unknown"


def extract_tool_name_from_path(path):
    """Extract a human-readable tool name from the .sls file path.

    Returns None for aggregation/orchestration files (init.sls, packages.sls,
    etc.) that do not correspond to a single tool.
    """
    # splitext (not str.replace) so ".sls" is only stripped as an extension,
    # never from the middle of a filename.
    basename, _ext = os.path.splitext(os.path.basename(path))
    if basename in _SKIP_BASENAMES:
        return None
    return basename


def _extract_package_names(content):
    """Collect candidate package names from a .sls body.

    Sources: state declarations, "- name:" arguments, wget/curl download
    filenames, and /usr/local/bin deploy targets. Deduplicated
    case-insensitively, preserving first-seen order and original casing.
    """
    candidates = [m.group(1) for m in _STATE_DECL_RE.finditer(content)]
    for match in _NAME_ARG_RE.finditer(content):
        name = match.group(1).strip("'\"")
        # Skip Jinja expressions ({{ ... }}) and absolute filesystem paths.
        if name and not name.startswith('{') and not name.startswith('/'):
            candidates.append(name)
    for match in _DOWNLOAD_RE.finditer(content):
        fname = match.group(1)
        # Require a dot (looks like a filename) and ignore GPG key files.
        if '.' in fname and not fname.endswith('.key'):
            candidates.append(fname)
    candidates.extend(m.group(1) for m in _BIN_TARGET_RE.finditer(content))

    seen = set()
    clean_names = []
    for name in candidates:
        name = name.strip().strip("'\"")
        # len > 1 filters single-character regex noise.
        if name and len(name) > 1 and name.lower() not in seen:
            seen.add(name.lower())
            clean_names.append(name)
    return clean_names


def parse_sls_content(content, path):
    """Parse a .sls file and extract package/tool information.

    Returns a list with at most one entry dict (id, package_names,
    install_method, salt_state_path, optional possibly_conditional), or []
    when *content* is empty/None or *path* is not a per-tool state.
    """
    if not content:
        return []
    tool_name = extract_tool_name_from_path(path)
    if not tool_name:
        return []

    clean_names = _extract_package_names(content)
    entry = {
        "id": tool_name,
        # Fall back to the tool's own name when no package name was found.
        "package_names": clean_names if clean_names else [tool_name],
        "install_method": classify_sls_path(path),
        "salt_state_path": path,
    }
    # Heuristic: onlyif/unless guards plus a "False" literal suggest the
    # state may not install unconditionally.
    if "False" in content and (
            "onlyif" in content.lower() or "unless" in content.lower()):
        entry["possibly_conditional"] = True
    return [entry]


def main():
    """Fetch the repo tree, parse every remnux/ .sls state, write the YAML report."""
    print("Fetching salt-states repository tree...")
    sls_files = get_sls_files()
    print(f"Found {len(sls_files)} .sls files")

    # Filter to relevant paths (skip top-level orchestration files).
    relevant = [f for f in sls_files if f.startswith("remnux/")]
    print(f" {len(relevant)} under remnux/")

    all_tools = []
    categories_seen = set()
    for i, path in enumerate(relevant):
        if i % 20 == 0:
            print(f" Processing {i}/{len(relevant)}...")
        # Derive category from path, e.g. "python3-packages", "tools", "packages".
        parts = path.split("/")
        if len(parts) >= 3:
            categories_seen.add(parts[1])
        content = fetch_text(f"{RAW_BASE}/{path}")
        all_tools.extend(parse_sls_content(content, path))

    # Deduplicate by id, keeping the first occurrence.
    seen_ids = set()
    unique_tools = []
    for tool in all_tools:
        if tool["id"] not in seen_ids:
            seen_ids.add(tool["id"])
            unique_tools.append(tool)
    unique_tools.sort(key=lambda t: t["id"])

    method_counts = Counter(t["install_method"] for t in unique_tools)
    output = {
        "metadata": {
            "source": "https://github.com/REMnux/salt-states",
            "branch": "master",
            "total_sls_files": len(relevant),
            "total_tools_extracted": len(unique_tools),
            "install_method_counts": dict(method_counts),
            "salt_directories": sorted(categories_seen),
        },
        "tools": unique_tools,
    }

    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False,
                  allow_unicode=True)

    print(f"\nDone! Extracted {len(unique_tools)} tools")
    for method, count in sorted(method_counts.items()):
        print(f" {method}: {count}")
    print(f"Output: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()