diff --git a/.gitignore b/.gitignore index 9bce8fc..423ade8 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ tmp Icon? ._* __MACOSX/ +__pycache__/ diff --git a/.what_db.json b/.what_db.json deleted file mode 100644 index 283543f..0000000 --- a/.what_db.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "version": "1.0", - "tools": { - "tools/security/scan_vt.py": { - "path": "tools/security/scan_vt.py", - "name": "scan_vt.py", - "type": "python script", - "summary": "Scans files against VirusTotal using MD5 hashes and displays detection results with positives/total ratios and permalink.", - "purpose": "Malware detection and threat analysis", - "short_description": "VirusTotal file scanner with detection ratios", - "executable": true - }, - "tools/security/imphash.py": { - "path": "tools/security/imphash.py", - "name": "imphash.py", - "type": "python script", - "summary": "Calculates and displays the import hash (imphash) of PE files using pefile library for malware analysis.", - "purpose": "Malware analysis and PE file fingerprinting", - "short_description": "PE import hash calculator", - "executable": true - }, - "tools/security/scapy_arp.py": { - "path": "tools/security/scapy_arp.py", - "name": "scapy_arp.py", - "type": "python script", - "summary": "Multi-threaded ARP network scanner using Scapy to discover live hosts on a /24 network range with MAC addresses.", - "purpose": "Network discovery and reconnaissance", - "short_description": "threaded ARP network scanner", - "executable": true - }, - "tools/data/domgrep.py": { - "path": "tools/data/domgrep.py", - "name": "domgrep.py", - "type": "python script", - "summary": "Extracts domain names from URLs read from stdin, filtering out IP addresses and handling malformed URLs gracefully.", - "purpose": "Data extraction and URL processing", - "short_description": "extract domains from URL lists", - "executable": true - }, - "tools/data/unum.py": { - "path": "tools/data/unum.py", - "name": "unum.py", - "type": "python script", - "summary": "Analyzes Unicode characters showing decimal/hex codes, categories, and official Unicode names with proper formatting.", - "purpose": "Text analysis and Unicode debugging", - "short_description": "detailed Unicode character analyzer", - "executable": true - }, - "tools/forensics/chechsqlite.py": { - "path": "tools/forensics/chechsqlite.py", - "name": "chechsqlite.py", - "type": "python script", - "summary": "Scans SQLite databases for tables containing password or hash-related columns for security analysis.", - "purpose": "Database security analysis", - "short_description": "find password/hash columns in SQLite DBs", - "executable": true - }, - "tools/hashing/scatterhash.py": { - "path": "tools/hashing/scatterhash.py", - "name": "scatterhash.py", - "type": "python script", - "summary": "Performs sparse hashing of large files by sampling blocks across the file for efficient integrity checking and validation.", - "purpose": "Large file integrity verification", - "short_description": "sparse hashing for huge files", - "executable": true - }, - "tools/hashing/libarchivesum.py": { - "path": "tools/hashing/libarchivesum.py", - "name": "libarchivesum.py", - "type": "python script", - "summary": "Calculates hashes of individual files within archives (zip, tar, etc.) without extracting them.", - "purpose": "Archive analysis and integrity checking", - "short_description": "like md5sum but for files inside archives", - "executable": true - }, - "tools/system/ltop.py": { - "path": "tools/system/ltop.py", - "name": "ltop.py", - "type": "python script", - "summary": "Real-time frequency counter for stdin lines, showing top N most common entries with live updates using curses.", - "purpose": "Log analysis and monitoring", - "short_description": "like top but for line frequency in streams", - "executable": true - }, - "tools/network/ipgrep": { - "path": "tools/network/ipgrep", - "name": "ipgrep", - "type": "shell script", - "summary": "Comprehensive IP and MAC address extractor with sorting, deduplication, ping testing, and DNS resolution capabilities.", - "purpose": "Network analysis and IP processing", - "short_description": "advanced IP/MAC extractor with ping testing", - "executable": true - }, - "tools/security/certwipe": { - "path": "tools/security/certwipe", - "name": "certwipe", - "type": "shell script", - "summary": "Professional disk wiping tool supporting ATA SecureErase with frozen disk handling and fallback to dc3dd overwriting.", - "purpose": "Data destruction and security", - "short_description": "professional disk wiper with SecureErase", - "executable": true - }, - "tools/system/watchgrowth.sh": { - "path": "tools/system/watchgrowth.sh", - "name": "watchgrowth.sh", - "type": "shell script", - "summary": "Monitors file/directory size growth in real-time, showing transfer speeds and optional progress percentage.", - "purpose": "File monitoring and transfer analysis", - "short_description": "real-time file growth monitor", - "executable": true - }, - "projects/timesketch/deploy_timesketch.sh": { - "path": "projects/timesketch/deploy_timesketch.sh", - "name": "deploy_timesketch.sh", - "type": "shell script", - "summary": "Automated deployment script for Timesketch digital forensics timeline analysis platform with Docker Compose setup.", - "purpose": "Digital forensics infrastructure deployment", - "short_description": "deploy Timesketch forensic timeline platform", - "executable": true - }, - "tools/system/backup_docker.sh": { - "path": "tools/system/backup_docker.sh", - "name": "backup_docker.sh", - "type": "shell script", - "summary": "Comprehensive Docker Compose stack backup including images, configs, and volumes with incremental storage optimization.", - "purpose": "Container infrastructure backup", - "short_description": "backup entire Docker Compose stacks", - "executable": true - }, - "tools/cloud/cloudsend.py": { - "path": "tools/cloud/cloudsend.py", - "name": "cloudsend.py", - "type": "python script", - "summary": "Uploads files to NextCloud/OwnCloud public shares with optional GPG encryption support via command line interface.", - "purpose": "Cloud file sharing and backup", - "short_description": "upload files to NextCloud public shares", - "executable": true - }, - "tools/cloud/vqa3.py": { - "path": "tools/cloud/vqa3.py", - "name": "vqa3.py", - "type": "python script", - "summary": "AI-powered image classification using OpenAI CLIP models for content categorization with customizable classification categories.", - "purpose": "AI image analysis and content filtering", - "short_description": "AI image classifier using CLIP models", - "executable": true - } - } -} diff --git a/README.md b/README.md index 3cf8d72..ccf2120 100644 --- a/README.md +++ b/README.md @@ -41,11 +41,165 @@ Applied to the current tree, the remaining rough edges are: ## Top-Level Files -- `what`: repository search helper. It can list known tools, search by query, and progressively falls back from Ollama-based natural-language search to `fzf` or plain grep. -- `.what_db.json`: the metadata database used by `what`. It stores short descriptions for known tools. +- `what`: README-driven repository search helper. It uses one local Ollama model and searches only the catalog below. - `README.md`: this guide. - `.gitignore`: standard repository ignore rules. +## Tool Catalog + +Format: `path | goal | usage`. This section is intentionally compact so `what` can pass it to a small local model without dragging the whole repository into context. + +### Active Tools + +- `what` | goal: search this repository's tool catalog with Ollama only | usage: `./what "query"` or `./what -l` + +### Security + +- `tools/security/scan_vt.py` | goal: check file hashes against VirusTotal | usage: `python3 tools/security/scan_vt.py sample.bin` +- `tools/security/imphash.py` | goal: calculate PE import hashes for malware triage | usage: `python3 tools/security/imphash.py file.exe` +- `tools/security/scapy_arp.py` | goal: scan a local network with ARP requests | usage: `python3 tools/security/scapy_arp.py` +- `tools/security/simple_portscan.py` | goal: do a lightweight TCP port scan | usage: `python3 tools/security/simple_portscan.py host` +- `tools/security/smtpbanner.py` | goal: grab SMTP banners from remote servers | usage: `python3 tools/security/smtpbanner.py host` +- `tools/security/testpw.py` | goal: test password candidates against a target workflow | usage: `python3 tools/security/testpw.py ...` +- `tools/security/vt_download.py` | goal: download malware samples or data from VirusTotal-related workflows | usage: `python3 tools/security/vt_download.py ...` +- `tools/security/vt_ip.py` | goal: enrich IP addresses with VirusTotal intel | usage: `python3 tools/security/vt_ip.py 8.8.8.8` +- `tools/security/vt_pdns.py` | goal: query passive DNS style data from VirusTotal workflows | usage: `python3 tools/security/vt_pdns.py domain.tld` +- `tools/security/certwipe` | goal: wipe disks with secure-erase oriented steps | usage: `tools/security/certwipe /dev/sdX` + +### Forensics + +- `tools/forensics/chechsqlite.py` | goal: inspect SQLite databases for password or hash style columns | usage: `python3 tools/forensics/chechsqlite.py sample.db` +- `tools/forensics/extractfolder.py` | goal: bulk-extract or sort files from a folder workflow | usage: `python3 tools/forensics/extractfolder.py input_dir` +- `tools/forensics/process_leak.py` | goal: inspect process-leak style artifacts | usage: `python3 tools/forensics/process_leak.py artifact` +- `tools/forensics/mailunpack` | goal: extract mail attachments inside a constrained container workflow | usage: `tools/forensics/mailunpack message.eml` +- `tools/forensics/showgm.sh` | goal: open image GPS EXIF coordinates in Google Maps | usage: `tools/forensics/showgm.sh image.jpg` +- `tools/forensics/showosm.sh` | goal: open image GPS EXIF coordinates in OpenStreetMap | usage: `tools/forensics/showosm.sh image.jpg` + +### Data And Text + +- `tools/data/domgrep.py` | goal: extract domain names specifically from URLs or mixed text input | usage: `cat urls.txt | python3 tools/data/domgrep.py` +- `tools/data/geturls.py` | goal: extract full raw URLs from text when you want links rather than domains | usage: `python3 tools/data/geturls.py file.txt` +- `tools/data/unum.py` | goal: inspect Unicode code points and names | usage: `echo "text" | python3 tools/data/unum.py` +- `tools/data/quickchardet.py` | goal: guess file encoding quickly | usage: `python3 tools/data/quickchardet.py file.txt` +- `tools/data/json_save.py` | goal: normalize or save JSON fragments from text streams | usage: `python3 tools/data/json_save.py ...` +- `tools/data/kv_parse.py` | goal: parse key-value formatted text | usage: `python3 tools/data/kv_parse.py input.txt` +- `tools/data/vba_chr_decode.py` | goal: decode VBA `Chr(...)` obfuscation patterns | usage: `python3 tools/data/vba_chr_decode.py macro.txt` +- `tools/data/concat.py` | goal: concatenate structured text inputs in a repeatable way | usage: `python3 tools/data/concat.py file1 file2` +- `tools/data/split_linewise.py` | goal: split text into line-based chunks | usage: `python3 tools/data/split_linewise.py input.txt` +- `tools/data/uniq.py` | goal: remove duplicate lines while preserving first occurrence order | usage: `python3 tools/data/uniq.py file.txt` +- `tools/data/urldecode.py` | goal: URL-decode strings from stdin or files | usage: `python3 tools/data/urldecode.py` +- `tools/data/between` | goal: print text between delimiters | usage: `tools/data/between START END < file.txt` +- `tools/data/csv_get` | goal: extract selected CSV fields quickly | usage: `tools/data/csv_get file.csv column` +- `tools/data/csv2dot` | goal: turn CSV relationships into Graphviz dot edges | usage: `tools/data/csv2dot` + +### Hashing And Archives + +- `tools/hashing/libarchivesum.py` | goal: hash files inside archives without full extraction | usage: `python3 tools/hashing/libarchivesum.py archive.zip` +- `tools/hashing/scatterhash.py` | goal: hash very large files by sparse sampling when you need a fingerprint rather than a comparison | usage: `python3 tools/hashing/scatterhash.py huge.img` +- `tools/hashing/hashzip.py` | goal: hash ZIP contents or metadata for comparison | usage: `python3 tools/hashing/hashzip.py sample.zip` +- `tools/hashing/tarsum.py` | goal: compute tar-oriented checksums in Python | usage: `python3 tools/hashing/tarsum.py archive.tar` +- `tools/hashing/sparsecmp.sh` | goal: compare very large files or block devices by sampling chunks at fixed offsets | usage: `tools/hashing/sparsecmp.sh source target 100` +- `tools/hashing/trunc_by_hash.py` | goal: find the byte length where a rolling hash matches a target digest | usage: `python3 tools/hashing/trunc_by_hash.py HASH file.bin` + +### Network And Cloud + +- `tools/network/ipgrep` | goal: extract IP or MAC indicators from text | usage: `cat file.txt | tools/network/ipgrep` +- `tools/network/fritzshark.sh` | goal: inspect or capture FritzBox traffic workflows | usage: `tools/network/fritzshark.sh` +- `tools/network/fritzshark2.sh` | goal: alternate FritzBox traffic workflow | usage: `tools/network/fritzshark2.sh` +- `tools/network/get_ntp.py` | goal: query NTP information from remote systems | usage: `python3 tools/network/get_ntp.py host` +- `tools/network/get_stp.sh` | goal: inspect spanning-tree data on a network | usage: `tools/network/get_stp.sh device` +- `tools/cloud/cloudsend.py` | goal: upload files to Nextcloud or OwnCloud shares | usage: `python3 tools/cloud/cloudsend.py file` +- `tools/cloud/cloudsend.sh` | goal: shell wrapper for cloud share upload workflows | usage: `tools/cloud/cloudsend.sh file` +- `tools/cloud/docker_pull.py` | goal: download image layers from a container registry without `docker pull` | usage: `python3 tools/cloud/docker_pull.py ubuntu:latest` +- `tools/cloud/speech.py` | goal: run cloud-backed speech or transcription tasks | usage: `python3 tools/cloud/speech.py input` +- `tools/cloud/vqa3.py` | goal: classify images with a local or model-backed VQA workflow | usage: `python3 tools/cloud/vqa3.py image.jpg` +- `tools/cloud/youtube_resolve.sh` | goal: resolve direct media URLs from YouTube-like inputs | usage: `tools/cloud/youtube_resolve.sh URL` + +### Formats, System, And Text Experiments + +- `tools/formats/convert2pdf.sh` | goal: convert documents into PDF form | usage: `tools/formats/convert2pdf.sh input.docx` +- `tools/formats/flatpdf.sh` | goal: flatten or normalize PDFs for downstream handling | usage: `tools/formats/flatpdf.sh input.pdf` +- `tools/formats/openflattenpdf.sh` | goal: flatten a PDF through PostScript and open the result | usage: `tools/formats/openflattenpdf.sh input.pdf` +- `tools/formats/rename.mime.py` | goal: rename or sort files by MIME type | usage: `python3 tools/formats/rename.mime.py` +- `tools/system/backup_docker.sh` | goal: back up a Docker Compose stack | usage: `tools/system/backup_docker.sh docker-compose.yml` +- `tools/system/restore_docker.sh` | goal: restore a saved Docker workflow | usage: `tools/system/restore_docker.sh` +- `tools/system/watchgrowth.sh` | goal: watch a file or directory grow over time | usage: `tools/system/watchgrowth.sh path` +- `tools/system/ltop.py` | goal: show the most frequent lines from a stream like `top` | usage: `tail -f log | python3 tools/system/ltop.py` +- `tools/system/noerr` | goal: run a command with stderr suppressed | usage: `tools/system/noerr some command` +- `tools/system/wipe.sh` | goal: perform destructive wipe or cleanup steps | usage: `tools/system/wipe.sh target` +- `tools/text/probability.py` | goal: run a small text probability experiment | usage: `python3 tools/text/probability.py` +- `tools/text/depth` | goal: inspect text depth or nesting characteristics | usage: `tools/text/depth input.txt` + +### CTF Helpers + +- `tools/ctf/filtertext.py` | goal: filter challenge text to useful fragments | usage: `python3 tools/ctf/filtertext.py input.txt` +- `tools/ctf/getjs.py` | goal: extract JavaScript from challenge pages | usage: `python3 tools/ctf/getjs.py page.html` +- `tools/ctf/guess.py` | goal: brute-force or guess through a challenge search space | usage: `python3 tools/ctf/guess.py ...` +- `tools/ctf/ps_.py` | goal: run a CTF-specific parsing or post-processing step | usage: `python3 tools/ctf/ps_.py ...` +- `tools/ctf/search.py` | goal: search challenge artifacts for signals | usage: `python3 tools/ctf/search.py input` +- `tools/ctf/submit_flag.sh` | goal: submit flags to a challenge endpoint | usage: `tools/ctf/submit_flag.sh FLAG` +- `tools/ctf/transpose.py` | goal: transpose text or matrix-like challenge data | usage: `python3 tools/ctf/transpose.py input` + +### Go Tools And Small Projects + +- `tools/go/bincmp/gobincmp.go` | goal: compare files or directories with fuzzy hashing | usage: `go run tools/go/bincmp/gobincmp.go left right` +- `tools/go/gopname/pname.go` | goal: demo process-title renaming with `gspt` | usage: `go run tools/go/gopname/pname.go` +- `tools/go/tarsum/tarsum.go` | goal: print a SHA-256 checksum for a tar file | usage: `go run tools/go/tarsum/tarsum.go archive.tar` +- `projects/go-tools/go/goipgrep/` | goal: production-grade IP and MAC extractor with ping, DNS, and lookup support | usage: `projects/go-tools/go/goipgrep/scripts/build.sh` +- `projects/go-tools/go/csv2json/csv2json.go` | goal: convert CSV input to JSON | usage: `go run projects/go-tools/go/csv2json/csv2json.go` +- `projects/go-tools/go/gobetween/gobetween.go` | goal: extract text between delimiters in Go | usage: `go run projects/go-tools/go/gobetween/gobetween.go` +- `projects/go-tools/go/goinfo/goinfo.go` | goal: inspect file or system information in Go | usage: `go run projects/go-tools/go/goinfo/goinfo.go` +- `projects/go-tools/go/gosoft/gosoft.go` | goal: enumerate installed software from multiple package sources | usage: `go run projects/go-tools/go/gosoft/gosoft.go` +- `projects/go-tools/go/gouniq/gouniq.go` | goal: remove duplicate lines in Go | usage: `go run projects/go-tools/go/gouniq/gouniq.go < file.txt` +- `projects/rust-tools/between.rs` | goal: Rust version of between-delimiter extraction | usage: `rustc projects/rust-tools/between.rs && ./between` +- `projects/rust-tools/uniq.rs` | goal: Rust uniq implementation preserving first occurrences | usage: `rustc projects/rust-tools/uniq.rs && ./uniq file.txt` +- `projects/rust-tools/uniq2.rs` | goal: alternate Rust uniq implementation | usage: `rustc projects/rust-tools/uniq2.rs && ./uniq2 file.txt` +- `projects/puzzlebox/` | goal: solve voxel and puzzlebox search problems with several solver variants | usage: `python3 projects/puzzlebox/solve.py` +- `projects/timesketch/deploy_timesketch.sh` | goal: deploy a Timesketch environment | usage: `projects/timesketch/deploy_timesketch.sh` + +### Admin And Setup Scripts + +- `scripts/proxy/get_proxy.sh` | goal: print current proxy settings | usage: `scripts/proxy/get_proxy.sh` +- `scripts/proxy/update_apt_proxy.sh` | goal: write apt proxy configuration | usage: `scripts/proxy/update_apt_proxy.sh host port` +- `scripts/proxy/update_bashrc_proxy.sh` | goal: add shell proxy exports to a bash config | usage: `scripts/proxy/update_bashrc_proxy.sh host port` +- `scripts/proxy/update_service_proxy.sh` | goal: apply proxy settings to service units | usage: `scripts/proxy/update_service_proxy.sh service` +- `scripts/display/3_screen_setup.sh` | goal: apply a fixed three-monitor `xrandr` layout | usage: `scripts/display/3_screen_setup.sh` +- `scripts/display/notebook_extended.sh` | goal: apply a laptop-plus-external-display layout | usage: `scripts/display/notebook_extended.sh` +- `scripts/display/reset_screens.sh` | goal: reset screen outputs to a known state | usage: `scripts/display/reset_screens.sh` +- `scripts/display/single_fullhd.sh` | goal: force a single full-HD laptop display mode | usage: `scripts/display/single_fullhd.sh` +- `scripts/display/toggle_display.sh` | goal: toggle an external display workflow | usage: `scripts/display/toggle_display.sh` +- `scripts/display/toggle_touchpad` | goal: toggle touchpad state on or off | usage: `scripts/display/toggle_touchpad` +- `scripts/setup/automountctl` | goal: manage automount-related setup | usage: `scripts/setup/automountctl` +- `scripts/setup/disable_ubuntu_telemetry.sh` | goal: disable Ubuntu telemetry packages and endpoints | usage: `sudo scripts/setup/disable_ubuntu_telemetry.sh` +- `scripts/setup/mount_container` | goal: mount or unmount LUKS container files listed in `.containers` manifests | usage: `scripts/setup/mount_container mount` +- `scripts/setup/share.sh` | goal: run a local sharing workflow | usage: `scripts/setup/share.sh` +- `scripts/setup/terminal-logs.sh` | goal: configure or collect terminal logging | usage: `scripts/setup/terminal-logs.sh` +- `scripts/windows/Get-ZimmermanTools.ps1` | goal: download Zimmerman forensic tools on Windows | usage: `powershell -File scripts/windows/Get-ZimmermanTools.ps1` +- `scripts/windows/getscreen.psm1` | goal: provide PowerShell screen-capture helpers | usage: `Import-Module scripts/windows/getscreen.psm1` +- `scripts/windows/sbom.ps1` | goal: generate or inspect SBOM-related data in PowerShell | usage: `powershell -File scripts/windows/sbom.ps1` + +### Config And Reference Entry Points + +- `config/visidata/` | goal: install and use the local VisiData config plus plugins | usage: `cd config/visidata && ./install.sh --link` +- `config/install.sh` | goal: bootstrap local environment configuration | usage: `config/install.sh` +- `config/z.sh` | goal: provide a shell directory-jump helper | usage: `source config/z.sh` +- `config/shell/completions/eslogger.zsh` | goal: add Zsh completion for Apple's `eslogger` | usage: `source config/shell/completions/eslogger.zsh` + +### Archived Or Narrow Tools + +- `archive/experimental/ctf_primefac.py` | goal: factor a hard-coded challenge integer with `primefac` | usage: `python3 archive/experimental/ctf_primefac.py` +- `archive/experimental/screen2.js` | goal: capture a specific webpage screenshot with PhantomJS | usage: `phantomjs archive/experimental/screen2.js` +- `archive/experimental/screenshot.js` | goal: capture screenshots for multiple URLs with PhantomJS | usage: `phantomjs archive/experimental/screenshot.js URL ...` +- `archive/experimental/usbreset.c` | goal: reset a USB device from Linux userspace | usage: `gcc archive/experimental/usbreset.c -o usbreset` +- `archive/experimental/sep_test.sh` | goal: binary-search Docker image tags for a malware detection change | usage: `archive/experimental/sep_test.sh image start_tag end_tag` +- `archive/experimental/flm.py` | goal: keep an old experimental script available for salvage | usage: `python3 archive/experimental/flm.py` +- `archive/experimental/fuzz.sh` | goal: keep an old shell fuzzing experiment available for salvage | usage: `archive/experimental/fuzz.sh` +- `archive/experimental/hydrogentest.py` | goal: keep an old experiment available for salvage | usage: `python3 archive/experimental/hydrogentest.py` +- `archive/experimental/kv.py` | goal: keep an old key-value parsing experiment available for salvage | usage: `python3 archive/experimental/kv.py` +- `archive/experimental/lpic.sh` | goal: keep an old system experiment available for salvage | usage: `archive/experimental/lpic.sh` +- `archive/experimental/matplottest.py` | goal: keep an old plotting experiment available for salvage | usage: `python3 archive/experimental/matplottest.py` +- `archive/experimental/rootshell.c` | goal: keep a dangerous historical C example archived rather than active | usage: `do not run; reference only` + ## `tools/`: Standalone Utilities ### Security @@ -159,4 +313,4 @@ Applied to the current tree, the remaining rough edges are: - If you want a more complete tool with tests or a build flow, check `projects/`. - If you want local environment setup or terminal customizations, check `config/`. - If you want containers, check `dockerfiles/`. -- If you are unsure, run `./what -l` or search through `.what_db.json`. +- If you are unsure, run `./what -l` or ask `./what "query"`. diff --git a/what b/what index 24569ad..6a9d225 100755 --- a/what +++ b/what @@ -1,423 +1,303 @@ #!/usr/bin/env python3 """ -'what' - Smart repository search tool with progressive enhancement - -Fallback hierarchy: -1. Ollama + Gemma2 (natural language search) -2. fzf (fuzzy finding) -3. grep (simple text search) +`what` - README-driven repository search using Ollama only. Usage: - what # Find tools matching query - what -h # Show help - what -l # List all tools with short descriptions - what -a # Add new file to database + what # Find tools matching a natural-language query + what -l # List catalogued tools + what --model ... # Override the default Ollama model """ -import os -import sys -import json +from __future__ import annotations + import argparse -import subprocess -import shutil -from pathlib import Path +import json +import os import re +import subprocess +import sys +from pathlib import Path -# Configuration -REPO_ROOT = Path(__file__).parent.absolute() -DB_FILE = REPO_ROOT / ".what_db.json" +REPO_ROOT = Path(__file__).parent.resolve() +README_PATH = REPO_ROOT / "README.md" +DEFAULT_MODEL = os.environ.get("WHAT_OLLAMA_MODEL", "ministral-3:3b") +CATALOG_HEADING = "## Tool Catalog" +ENTRY_RE = re.compile( + r"^- `([^`]+)` \| goal: (.*?) \| usage: (.*)$" +) +TOKEN_RE = re.compile(r"[a-z0-9_.+-]+") -class WhatTool: - def __init__(self): - self.db_path = DB_FILE - self.data = self.load_db() - - # Detect available tools - self.has_ollama = self.check_ollama() - self.has_fzf = shutil.which('fzf') is not None - - def load_db(self): - """Load the tool database""" - if self.db_path.exists(): - try: - with open(self.db_path, 'r') as f: - return json.load(f) - except json.JSONDecodeError: - print(f"Warning: Corrupted database {self.db_path}, creating new one") - - return { - "version": "1.0", - "tools": {} - } - - def save_db(self): - """Save the tool database""" - with open(self.db_path, 'w') as f: - json.dump(self.data, f, indent=2, sort_keys=True) - - def check_ollama(self): - """Check if ollama with gemma2 is available""" - try: - result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=5) - if result.returncode == 0: - # Check if gemma2 model is available - models = result.stdout.lower() - return 'gemma2' in models - except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError): - pass - return False - - def get_file_type(self, filepath): - """Determine file type""" - if not filepath.exists(): - return "missing" - - if filepath.is_dir(): - return "directory" - - # Check if executable - is_executable = os.access(filepath, os.X_OK) - - # Check extension - suffix = filepath.suffix.lower() - - if suffix == '.py': - return "python script" if is_executable else "python module" - elif suffix == '.sh': - return "shell script" - elif suffix == '.go': - return "go program" - elif suffix == '.js': - return "javascript" - elif suffix == '.ps1': - return "powershell script" - elif suffix == '.rs': - return "rust program" - elif suffix in ['.c', '.cpp']: - return "c/c++ source" - elif suffix == '.awk': - return "awk script" - elif not suffix and is_executable: - return "binary executable" - elif not suffix: - return "script" - else: - return f"{suffix[1:]} file" - - def analyze_file_with_ollama(self, filepath): - """Analyze file using Ollama Gemma2""" - try: - # Read file content (limit size for analysis) - content = "" - if filepath.stat().st_size > 50000: # Skip very large files - content = "[File too large for analysis]" - else: - try: - with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: - content = f.read()[:10000] # First 10KB - except: - content = "[Binary or unreadable file]" - - prompt = f""" -Analyze this code/script file and provide ONLY a JSON response with these fields: -Filename: {filepath.name} -File type: {self.get_file_type(filepath)} -Content preview: -{content[:2000]} +class WhatError(Exception): + pass -Respond with ONLY this JSON structure: -{{ - "summary": "Brief 1-2 sentence summary of what this tool does and how it works", - "purpose": "What this tool is used for (e.g., 'Network analysis', 'File processing', 'Security scanning')", - "short_description": "Very short description for listings (e.g., 'like md5sum but for files inside tarballs')" -}} -""" - result = subprocess.run([ - 'ollama', 'run', 'gemma2:2b', prompt - ], capture_output=True, text=True, timeout=30) - - if result.returncode == 0: - # Extract JSON from response - response = result.stdout.strip() - - # Try to find JSON in the response - json_match = re.search(r'\{.*\}', response, re.DOTALL) - if json_match: - return json.loads(json_match.group()) - - except (subprocess.TimeoutExpired, json.JSONDecodeError, Exception) as e: - print(f"Ollama analysis failed: {e}") - - return None - - def add_file_interactive(self, filepath): - """Add file with interactive prompts""" - rel_path = str(filepath.relative_to(REPO_ROOT)) - file_type = self.get_file_type(filepath) - - print(f"\nAdding: {rel_path}") - print(f"Type: {file_type}") - print() - - if self.has_ollama: - print("Analyzing with Ollama Gemma2...") - analysis = self.analyze_file_with_ollama(filepath) - - if analysis: - print("AI Analysis complete. Review and edit if needed:") - summary = input(f"Summary [{analysis.get('summary', '')}]: ").strip() - purpose = input(f"Purpose [{analysis.get('purpose', '')}]: ").strip() - short_desc = input(f"Short description [{analysis.get('short_description', '')}]: ").strip() - - # Use AI suggestions if user didn't provide alternatives - summary = summary or analysis.get('summary', '') - purpose = purpose or analysis.get('purpose', '') - short_desc = short_desc or analysis.get('short_description', '') - else: - print("AI analysis failed, using manual input:") - summary = input("Summary (what it does and how): ").strip() - purpose = input("Purpose (what it's used for): ").strip() - short_desc = input("Short description (for listings): ").strip() - else: - print("Manual input (Ollama not available):") - summary = input("Summary (what it does and how): ").strip() - purpose = input("Purpose (what it's used for): ").strip() - short_desc = input("Short description (for listings): ").strip() - - # Store in database - self.data["tools"][rel_path] = { - "path": rel_path, - "name": filepath.name, - "type": file_type, - "summary": summary, - "purpose": purpose, - "short_description": short_desc, - "executable": os.access(filepath, os.X_OK) - } - - self.save_db() - print(f"✓ Added {rel_path} to database") - - def search_with_ollama(self, query): - """Search using natural language with Ollama""" - try: - tools_info = [] - for tool_data in self.data["tools"].values(): - tools_info.append(f"{tool_data['name']}: {tool_data['summary']} (Purpose: {tool_data['purpose']})") - - tools_text = "\n".join(tools_info) - - prompt = f""" -Given this query: "{query}" +def load_readme() -> str: + if not README_PATH.exists(): + raise WhatError(f"README not found at {README_PATH}") + return README_PATH.read_text(encoding="utf-8") -Find the most relevant tools from this list. Respond with ONLY the tool names (one per line) in order of relevance: -{tools_text} +def extract_catalog(readme_text: str) -> list[dict[str, str]]: + in_catalog = False + entries: list[dict[str, str]] = [] + + for raw_line in readme_text.splitlines(): + line = raw_line.rstrip() + + if line == CATALOG_HEADING: + in_catalog = True + continue + + if in_catalog and line.startswith("## "): + break + + if not in_catalog: + continue + + match = ENTRY_RE.match(line) + if not match: + continue + + path, goal, usage = match.groups() + entries.append( + { + "path": path, + "goal": goal.strip(), + "usage": usage.strip(), + } + ) + + if not entries: + raise WhatError( + "No tool catalog entries found in README. " + f"Expected entries under '{CATALOG_HEADING}'." + ) + + return entries + + +def ensure_ollama_available(model: str) -> None: + if not shutil_which("ollama"): + raise WhatError("`ollama` is not installed or not in PATH.") + + try: + result = subprocess.run( + ["ollama", "list"], + capture_output=True, + text=True, + timeout=10, + check=False, + ) + except subprocess.SubprocessError as exc: + raise WhatError(f"Failed to talk to Ollama: {exc}") from exc + + if result.returncode != 0: + stderr = result.stderr.strip() or "unknown error" + raise WhatError(f"Ollama is unavailable: {stderr}") + + models = result.stdout.lower() + if model.lower() not in models: + raise WhatError( + f"Model '{model}' is not available locally. " + "Pull it first with `ollama pull ...`." + ) + + +def shutil_which(binary: str) -> str | None: + for directory in os.environ.get("PATH", "").split(os.pathsep): + candidate = Path(directory) / binary + if candidate.is_file() and os.access(candidate, os.X_OK): + return str(candidate) + return None + + +def build_prompt(query: str, entries: list[dict[str, str]]) -> str: + catalog_lines = [ + f'- {entry["path"]} | goal: {entry["goal"]} | usage: {entry["usage"]}' + for entry in entries + ] + catalog = "\n".join(catalog_lines) + + return f"""You are selecting tools from a repository catalog. +Use only the catalog below. Prefer direct matches. Use archived tools only if they clearly fit the request. + +Return strict JSON only. The response must be a JSON array with up to 8 objects. +Each object must contain: +- "path": exact catalog path +- "reason": one short sentence + +Do not invent paths. Do not include markdown. +Prefer the entry whose action best matches the query: compare beats hash for comparison queries, open beats convert for opening queries, and mount beats inspect for mount queries. Query: {query} -Response (tool names only, one per line, max 10): +Catalog: +{catalog} """ - - result = subprocess.run([ - 'ollama', 'run', 'gemma2:2b', prompt - ], capture_output=True, text=True, timeout=20) - - if result.returncode == 0: - tool_names = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()] - - # Find matching tools in database - matches = [] - for tool_name in tool_names[:10]: # Limit to top 10 - for tool_data in self.data["tools"].values(): - if tool_data['name'] == tool_name: - matches.append(tool_data) - break - - return matches - - except Exception as e: - print(f"Ollama search failed: {e}") - - return None - - def search_with_fzf(self, query): - """Search using fzf fuzzy finder""" - try: - # Prepare search data for fzf - search_lines = [] - for tool_data in self.data["tools"].values(): - line = f"{tool_data['name']} # {tool_data['short_description']} | {tool_data['path']}" - search_lines.append(line) - - search_input = "\n".join(search_lines) - - # Run fzf with initial query - result = subprocess.run([ - 'fzf', '--filter', query, '--no-sort' - ], input=search_input, capture_output=True, text=True) - - if result.returncode == 0: - matches = [] - for line in result.stdout.strip().split('\n'): - if ' | ' in line: - path = line.split(' | ')[-1] - if path in self.data["tools"]: - matches.append(self.data["tools"][path]) - - return matches - - except Exception as e: - print(f"fzf search failed: {e}") - - return None - - def search_with_grep(self, query): - """Fallback search using grep-like functionality""" - matches = [] - query_lower = query.lower() - - for tool_data in self.data["tools"].values(): - # Search in name, summary, purpose, and short description - searchable = f"{tool_data['name']} {tool_data['summary']} {tool_data['purpose']} {tool_data['short_description']}".lower() - - if query_lower in searchable: - matches.append(tool_data) - - # Simple relevance scoring - def score_match(tool): - score = 0 - query_lower = query.lower() - if query_lower in tool['name'].lower(): - score += 10 - if query_lower in tool['short_description'].lower(): - score += 5 - if query_lower in tool['summary'].lower(): - score += 3 - if query_lower in tool['purpose'].lower(): - score += 2 - return score - - matches.sort(key=score_match, reverse=True) - return matches[:20] # Limit results - - def search(self, query): - """Search using the best available method""" - if not query: - return [] - - print(f"Searching for: {query}") - - # Try Ollama first - if self.has_ollama: - print("Using Ollama Gemma2 for natural language search...") - results = self.search_with_ollama(query) - if results is not None: - return results - print("Ollama search failed, falling back to fzf...") - - # Try fzf - if self.has_fzf: - print("Using fzf for fuzzy search...") - results = self.search_with_fzf(query) - if results is not None: - return results - print("fzf search failed, falling back to grep...") - - # Fallback to grep - print("Using basic text search...") - return self.search_with_grep(query) - - def list_all_tools(self): - """List all tools with short descriptions""" - if not self.data["tools"]: - print("No tools in database. Use 'what -a ' to add tools.") - return - - print("Available tools:") - print() - - # Sort by name - tools = sorted(self.data["tools"].values(), key=lambda x: x['name']) - - # Calculate max name length for alignment - max_name_len = max(len(tool['name']) for tool in tools) - - for tool in tools: - executable_mark = "*" if tool.get('executable', False) else " " - name_padded = tool['name'].ljust(max_name_len) - print(f"{executable_mark}{name_padded} # {tool['short_description']}") - - def show_search_results(self, results): - """Display search results""" - if not results: - print("No tools found matching your query.") - return - - print(f"\nFound {len(results)} tool(s):") - print() - - for i, tool in enumerate(results, 1): - executable_mark = "*" if tool.get('executable', False) else " " - print(f"{i:2d}. {executable_mark}{tool['name']}") - print(f" Path: {tool['path']}") - print(f" Type: {tool['type']}") - print(f" Purpose: {tool['purpose']}") - print(f" Summary: {tool['summary']}") - print() -def main(): - parser = argparse.ArgumentParser(description="Smart repository search tool") - parser.add_argument("query", nargs="?", help="Search query") - parser.add_argument("-l", "--list", action="store_true", - help="List all tools with short descriptions") - parser.add_argument("-a", "--add", metavar="PATH", - help="Add new file to database") - + +def tokenize(text: str) -> set[str]: + return set(TOKEN_RE.findall(text.lower())) + + +def shortlist_entries(query: str, entries: list[dict[str, str]], limit: int = 28) -> list[dict[str, str]]: + query_tokens = tokenize(query) + if not query_tokens: + return entries[:limit] + + scored: list[tuple[int, dict[str, str]]] = [] + for entry in entries: + haystack = f'{entry["path"]} {entry["goal"]} {entry["usage"]}'.lower() + entry_tokens = tokenize(haystack) + overlap = len(query_tokens & entry_tokens) + substring_hits = sum(1 for token in query_tokens if token in haystack) + archive_penalty = 1 if entry["path"].startswith("archive/") else 0 + score = overlap * 5 + substring_hits - archive_penalty + scored.append((score, entry)) + + scored.sort(key=lambda item: item[0], reverse=True) + best = [entry for score, entry in scored if score > 0][:limit] + return best or entries[:limit] + + +def extract_json_array(output: str) -> list[dict[str, str]]: + match = re.search(r"\[\s*\{.*\}\s*\]", output, re.DOTALL) + payload = match.group(0) if match else output + + data = json.loads(payload) + if not isinstance(data, list): + raise WhatError("Model output must be a JSON array.") + + normalized: list[dict[str, str]] = [] + for item in data: + if not isinstance(item, dict): + continue + path = str(item.get("path", "")).strip() + reason = str(item.get("reason", "")).strip() + if path: + normalized.append({"path": path, "reason": reason}) + return normalized + + +def run_ollama_once(prompt: str, model: str) -> str: + try: + result = subprocess.run( + ["ollama", "run", model, prompt], + capture_output=True, + text=True, + timeout=60, + check=False, + ) + except subprocess.SubprocessError as exc: + raise WhatError(f"Ollama run failed: {exc}") from exc + + if result.returncode != 0: + stderr = result.stderr.strip() or "unknown error" + raise WhatError(f"Ollama run failed: {stderr}") + + return result.stdout.strip() + + +def run_ollama(prompt: str, model: str) -> list[dict[str, str]]: + first_output = run_ollama_once(prompt, model) + try: + return extract_json_array(first_output) + except (json.JSONDecodeError, WhatError): + repair_prompt = ( + "Rewrite the following response as strict JSON only.\n" + 'Target format: [{"path":"exact catalog path","reason":"short reason"}]\n' + "Do not add markdown or commentary.\n\n" + f"Response to repair:\n{first_output}\n" + ) + repaired_output = run_ollama_once(repair_prompt, model) + try: + return extract_json_array(repaired_output) + except (json.JSONDecodeError, WhatError) as exc: + raise WhatError( + "Model output was not valid JSON after repair. " + f"Raw output was:\n{repaired_output}" + ) from exc + + +def search(query: str, entries: list[dict[str, str]], model: str) -> list[dict[str, str]]: + ensure_ollama_available(model) + prompt_entries = shortlist_entries(query, entries) + raw_results = run_ollama(build_prompt(query, prompt_entries), model) + entry_map = {entry["path"]: entry for entry in entries} + + results: list[dict[str, str]] = [] + seen: set[str] = set() + for item in raw_results: + path = item["path"] + if path not in entry_map or path in seen: + continue + seen.add(path) + merged = dict(entry_map[path]) + merged["reason"] = item.get("reason", "") + results.append(merged) + return results + + +def list_entries(entries: list[dict[str, str]]) -> None: + for entry in entries: + print(f'{entry["path"]}') + print(f' goal: {entry["goal"]}') + print(f' usage: {entry["usage"]}') + + +def show_results(query: str, results: list[dict[str, str]], model: str) -> None: + if not results: + print(f"No catalogued tool matched: {query}") + return + + print(f"Model: {model}") + print(f"Query: {query}") + print() + + for idx, item in enumerate(results, 1): + print(f"{idx}. {item['path']}") + print(f" Goal: {item['goal']}") + print(f" Usage: {item['usage']}") + if item.get("reason"): + print(f" Why: {item['reason']}") + print() + + +def main() -> int: + parser = argparse.ArgumentParser(description="README-driven repository search using Ollama") + parser.add_argument("query", nargs="?", help="Natural-language search query") + parser.add_argument("-l", "--list", action="store_true", help="List catalogued tools") + parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Ollama model to use (default: {DEFAULT_MODEL})") args = parser.parse_args() - - tool = WhatTool() - + + try: + entries = extract_catalog(load_readme()) + except WhatError as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + if args.list: - tool.list_all_tools() - return - - if args.add: - filepath = Path(args.add) - if not filepath.exists(): - print(f"Error: File {filepath} does not exist") - sys.exit(1) - - if not filepath.is_relative_to(REPO_ROOT): - print(f"Error: File must be within the repository ({REPO_ROOT})") - sys.exit(1) - - tool.add_file_interactive(filepath) - return - + list_entries(entries) + return 0 + if not args.query: parser.print_help() print() - print("Available search methods:") - if tool.has_ollama: - print(" ✓ Ollama + Gemma2 (natural language)") - else: - print(" ✗ Ollama + Gemma2 (not available)") - - if tool.has_fzf: - print(" ✓ fzf (fuzzy finding)") - else: - print(" ✗ fzf (not available)") - - print(" ✓ grep (basic text search)") - return - - # Perform search - results = tool.search(args.query) - tool.show_search_results(results) + print(f"Catalog source: {README_PATH}") + print(f"Default model: {args.model}") + return 0 + + try: + results = search(args.query, entries, args.model) + except WhatError as exc: + print(f"Error: {exc}", file=sys.stderr) + return 1 + + show_results(args.query, results, args.model) + return 0 + if __name__ == "__main__": - main() + raise SystemExit(main())