Rewrite `what` around a README catalog and Ollama

Remove the JSON tool database and move tool metadata into a compact README catalog.
Make `what` README-driven and Ollama-only, with shortlist generation and JSON-repair retry handling.
Pull qwen3.5:2b and ministral-3:3b, compare them on fixed repository queries, and set ministral-3:3b as the default model.
Tighten README wording so similar tools like domgrep/geturls and sparsecmp/scatterhash rank correctly.
This commit is contained in:
tke
2026-03-07 20:39:24 +01:00
parent fd515742b5
commit 559fa38c04
4 changed files with 433 additions and 547 deletions

1
.gitignore vendored
View File

@@ -5,3 +5,4 @@ tmp
Icon? Icon?
._* ._*
__MACOSX/ __MACOSX/
__pycache__/

View File

@@ -1,149 +0,0 @@
{
"version": "1.0",
"tools": {
"tools/security/scan_vt.py": {
"path": "tools/security/scan_vt.py",
"name": "scan_vt.py",
"type": "python script",
"summary": "Scans files against VirusTotal using MD5 hashes and displays detection results with positives/total ratios and permalink.",
"purpose": "Malware detection and threat analysis",
"short_description": "VirusTotal file scanner with detection ratios",
"executable": true
},
"tools/security/imphash.py": {
"path": "tools/security/imphash.py",
"name": "imphash.py",
"type": "python script",
"summary": "Calculates and displays the import hash (imphash) of PE files using pefile library for malware analysis.",
"purpose": "Malware analysis and PE file fingerprinting",
"short_description": "PE import hash calculator",
"executable": true
},
"tools/security/scapy_arp.py": {
"path": "tools/security/scapy_arp.py",
"name": "scapy_arp.py",
"type": "python script",
"summary": "Multi-threaded ARP network scanner using Scapy to discover live hosts on a /24 network range with MAC addresses.",
"purpose": "Network discovery and reconnaissance",
"short_description": "threaded ARP network scanner",
"executable": true
},
"tools/data/domgrep.py": {
"path": "tools/data/domgrep.py",
"name": "domgrep.py",
"type": "python script",
"summary": "Extracts domain names from URLs read from stdin, filtering out IP addresses and handling malformed URLs gracefully.",
"purpose": "Data extraction and URL processing",
"short_description": "extract domains from URL lists",
"executable": true
},
"tools/data/unum.py": {
"path": "tools/data/unum.py",
"name": "unum.py",
"type": "python script",
"summary": "Analyzes Unicode characters showing decimal/hex codes, categories, and official Unicode names with proper formatting.",
"purpose": "Text analysis and Unicode debugging",
"short_description": "detailed Unicode character analyzer",
"executable": true
},
"tools/forensics/chechsqlite.py": {
"path": "tools/forensics/chechsqlite.py",
"name": "chechsqlite.py",
"type": "python script",
"summary": "Scans SQLite databases for tables containing password or hash-related columns for security analysis.",
"purpose": "Database security analysis",
"short_description": "find password/hash columns in SQLite DBs",
"executable": true
},
"tools/hashing/scatterhash.py": {
"path": "tools/hashing/scatterhash.py",
"name": "scatterhash.py",
"type": "python script",
"summary": "Performs sparse hashing of large files by sampling blocks across the file for efficient integrity checking and validation.",
"purpose": "Large file integrity verification",
"short_description": "sparse hashing for huge files",
"executable": true
},
"tools/hashing/libarchivesum.py": {
"path": "tools/hashing/libarchivesum.py",
"name": "libarchivesum.py",
"type": "python script",
"summary": "Calculates hashes of individual files within archives (zip, tar, etc.) without extracting them.",
"purpose": "Archive analysis and integrity checking",
"short_description": "like md5sum but for files inside archives",
"executable": true
},
"tools/system/ltop.py": {
"path": "tools/system/ltop.py",
"name": "ltop.py",
"type": "python script",
"summary": "Real-time frequency counter for stdin lines, showing top N most common entries with live updates using curses.",
"purpose": "Log analysis and monitoring",
"short_description": "like top but for line frequency in streams",
"executable": true
},
"tools/network/ipgrep": {
"path": "tools/network/ipgrep",
"name": "ipgrep",
"type": "shell script",
"summary": "Comprehensive IP and MAC address extractor with sorting, deduplication, ping testing, and DNS resolution capabilities.",
"purpose": "Network analysis and IP processing",
"short_description": "advanced IP/MAC extractor with ping testing",
"executable": true
},
"tools/security/certwipe": {
"path": "tools/security/certwipe",
"name": "certwipe",
"type": "shell script",
"summary": "Professional disk wiping tool supporting ATA SecureErase with frozen disk handling and fallback to dc3dd overwriting.",
"purpose": "Data destruction and security",
"short_description": "professional disk wiper with SecureErase",
"executable": true
},
"tools/system/watchgrowth.sh": {
"path": "tools/system/watchgrowth.sh",
"name": "watchgrowth.sh",
"type": "shell script",
"summary": "Monitors file/directory size growth in real-time, showing transfer speeds and optional progress percentage.",
"purpose": "File monitoring and transfer analysis",
"short_description": "real-time file growth monitor",
"executable": true
},
"projects/timesketch/deploy_timesketch.sh": {
"path": "projects/timesketch/deploy_timesketch.sh",
"name": "deploy_timesketch.sh",
"type": "shell script",
"summary": "Automated deployment script for Timesketch digital forensics timeline analysis platform with Docker Compose setup.",
"purpose": "Digital forensics infrastructure deployment",
"short_description": "deploy Timesketch forensic timeline platform",
"executable": true
},
"tools/system/backup_docker.sh": {
"path": "tools/system/backup_docker.sh",
"name": "backup_docker.sh",
"type": "shell script",
"summary": "Comprehensive Docker Compose stack backup including images, configs, and volumes with incremental storage optimization.",
"purpose": "Container infrastructure backup",
"short_description": "backup entire Docker Compose stacks",
"executable": true
},
"tools/cloud/cloudsend.py": {
"path": "tools/cloud/cloudsend.py",
"name": "cloudsend.py",
"type": "python script",
"summary": "Uploads files to NextCloud/OwnCloud public shares with optional GPG encryption support via command line interface.",
"purpose": "Cloud file sharing and backup",
"short_description": "upload files to NextCloud public shares",
"executable": true
},
"tools/cloud/vqa3.py": {
"path": "tools/cloud/vqa3.py",
"name": "vqa3.py",
"type": "python script",
"summary": "AI-powered image classification using OpenAI CLIP models for content categorization with customizable classification categories.",
"purpose": "AI image analysis and content filtering",
"short_description": "AI image classifier using CLIP models",
"executable": true
}
}
}

160
README.md
View File

@@ -41,11 +41,165 @@ Applied to the current tree, the remaining rough edges are:
## Top-Level Files ## Top-Level Files
- `what`: repository search helper. It can list known tools, search by query, and progressively falls back from Ollama-based natural-language search to `fzf` or plain grep. - `what`: README-driven repository search helper. It uses one local Ollama model and searches only the catalog below.
- `.what_db.json`: the metadata database used by `what`. It stores short descriptions for known tools.
- `README.md`: this guide. - `README.md`: this guide.
- `.gitignore`: standard repository ignore rules. - `.gitignore`: standard repository ignore rules.
## Tool Catalog
Format: `path | goal | usage`. This section is intentionally compact so `what` can pass it to a small local model without dragging the whole repository into context.
### Active Tools
- `what` | goal: search this repository's tool catalog with Ollama only | usage: `./what "query"` or `./what -l`
### Security
- `tools/security/scan_vt.py` | goal: check file hashes against VirusTotal | usage: `python3 tools/security/scan_vt.py sample.bin`
- `tools/security/imphash.py` | goal: calculate PE import hashes for malware triage | usage: `python3 tools/security/imphash.py file.exe`
- `tools/security/scapy_arp.py` | goal: scan a local network with ARP requests | usage: `python3 tools/security/scapy_arp.py`
- `tools/security/simple_portscan.py` | goal: do a lightweight TCP port scan | usage: `python3 tools/security/simple_portscan.py host`
- `tools/security/smtpbanner.py` | goal: grab SMTP banners from remote servers | usage: `python3 tools/security/smtpbanner.py host`
- `tools/security/testpw.py` | goal: test password candidates against a target workflow | usage: `python3 tools/security/testpw.py ...`
- `tools/security/vt_download.py` | goal: download malware samples or data from VirusTotal-related workflows | usage: `python3 tools/security/vt_download.py ...`
- `tools/security/vt_ip.py` | goal: enrich IP addresses with VirusTotal intel | usage: `python3 tools/security/vt_ip.py 8.8.8.8`
- `tools/security/vt_pdns.py` | goal: query passive DNS style data from VirusTotal workflows | usage: `python3 tools/security/vt_pdns.py domain.tld`
- `tools/security/certwipe` | goal: wipe disks with secure-erase oriented steps | usage: `tools/security/certwipe /dev/sdX`
### Forensics
- `tools/forensics/chechsqlite.py` | goal: inspect SQLite databases for password or hash style columns | usage: `python3 tools/forensics/chechsqlite.py sample.db`
- `tools/forensics/extractfolder.py` | goal: bulk-extract or sort files from a folder workflow | usage: `python3 tools/forensics/extractfolder.py input_dir`
- `tools/forensics/process_leak.py` | goal: inspect process-leak style artifacts | usage: `python3 tools/forensics/process_leak.py artifact`
- `tools/forensics/mailunpack` | goal: extract mail attachments inside a constrained container workflow | usage: `tools/forensics/mailunpack message.eml`
- `tools/forensics/showgm.sh` | goal: open image GPS EXIF coordinates in Google Maps | usage: `tools/forensics/showgm.sh image.jpg`
- `tools/forensics/showosm.sh` | goal: open image GPS EXIF coordinates in OpenStreetMap | usage: `tools/forensics/showosm.sh image.jpg`
### Data And Text
- `tools/data/domgrep.py` | goal: extract domain names specifically from URLs or mixed text input | usage: `cat urls.txt | python3 tools/data/domgrep.py`
- `tools/data/geturls.py` | goal: extract full raw URLs from text when you want links rather than domains | usage: `python3 tools/data/geturls.py file.txt`
- `tools/data/unum.py` | goal: inspect Unicode code points and names | usage: `echo "text" | python3 tools/data/unum.py`
- `tools/data/quickchardet.py` | goal: guess file encoding quickly | usage: `python3 tools/data/quickchardet.py file.txt`
- `tools/data/json_save.py` | goal: normalize or save JSON fragments from text streams | usage: `python3 tools/data/json_save.py ...`
- `tools/data/kv_parse.py` | goal: parse key-value formatted text | usage: `python3 tools/data/kv_parse.py input.txt`
- `tools/data/vba_chr_decode.py` | goal: decode VBA `Chr(...)` obfuscation patterns | usage: `python3 tools/data/vba_chr_decode.py macro.txt`
- `tools/data/concat.py` | goal: concatenate structured text inputs in a repeatable way | usage: `python3 tools/data/concat.py file1 file2`
- `tools/data/split_linewise.py` | goal: split text into line-based chunks | usage: `python3 tools/data/split_linewise.py input.txt`
- `tools/data/uniq.py` | goal: remove duplicate lines while preserving first occurrence order | usage: `python3 tools/data/uniq.py file.txt`
- `tools/data/urldecode.py` | goal: URL-decode strings from stdin or files | usage: `python3 tools/data/urldecode.py`
- `tools/data/between` | goal: print text between delimiters | usage: `tools/data/between START END < file.txt`
- `tools/data/csv_get` | goal: extract selected CSV fields quickly | usage: `tools/data/csv_get file.csv column`
- `tools/data/csv2dot` | goal: turn CSV relationships into Graphviz dot edges | usage: `tools/data/csv2dot`
### Hashing And Archives
- `tools/hashing/libarchivesum.py` | goal: hash files inside archives without full extraction | usage: `python3 tools/hashing/libarchivesum.py archive.zip`
- `tools/hashing/scatterhash.py` | goal: hash very large files by sparse sampling when you need a fingerprint rather than a comparison | usage: `python3 tools/hashing/scatterhash.py huge.img`
- `tools/hashing/hashzip.py` | goal: hash ZIP contents or metadata for comparison | usage: `python3 tools/hashing/hashzip.py sample.zip`
- `tools/hashing/tarsum.py` | goal: compute tar-oriented checksums in Python | usage: `python3 tools/hashing/tarsum.py archive.tar`
- `tools/hashing/sparsecmp.sh` | goal: compare very large files or block devices by sampling chunks at fixed offsets | usage: `tools/hashing/sparsecmp.sh source target 100`
- `tools/hashing/trunc_by_hash.py` | goal: find the byte length where a rolling hash matches a target digest | usage: `python3 tools/hashing/trunc_by_hash.py HASH file.bin`
### Network And Cloud
- `tools/network/ipgrep` | goal: extract IP or MAC indicators from text | usage: `cat file.txt | tools/network/ipgrep`
- `tools/network/fritzshark.sh` | goal: inspect or capture FritzBox traffic workflows | usage: `tools/network/fritzshark.sh`
- `tools/network/fritzshark2.sh` | goal: alternate FritzBox traffic workflow | usage: `tools/network/fritzshark2.sh`
- `tools/network/get_ntp.py` | goal: query NTP information from remote systems | usage: `python3 tools/network/get_ntp.py host`
- `tools/network/get_stp.sh` | goal: inspect spanning-tree data on a network | usage: `tools/network/get_stp.sh device`
- `tools/cloud/cloudsend.py` | goal: upload files to Nextcloud or OwnCloud shares | usage: `python3 tools/cloud/cloudsend.py file`
- `tools/cloud/cloudsend.sh` | goal: shell wrapper for cloud share upload workflows | usage: `tools/cloud/cloudsend.sh file`
- `tools/cloud/docker_pull.py` | goal: download image layers from a container registry without `docker pull` | usage: `python3 tools/cloud/docker_pull.py ubuntu:latest`
- `tools/cloud/speech.py` | goal: run cloud-backed speech or transcription tasks | usage: `python3 tools/cloud/speech.py input`
- `tools/cloud/vqa3.py` | goal: classify images with a local or model-backed VQA workflow | usage: `python3 tools/cloud/vqa3.py image.jpg`
- `tools/cloud/youtube_resolve.sh` | goal: resolve direct media URLs from YouTube-like inputs | usage: `tools/cloud/youtube_resolve.sh URL`
### Formats, System, And Text Experiments
- `tools/formats/convert2pdf.sh` | goal: convert documents into PDF form | usage: `tools/formats/convert2pdf.sh input.docx`
- `tools/formats/flatpdf.sh` | goal: flatten or normalize PDFs for downstream handling | usage: `tools/formats/flatpdf.sh input.pdf`
- `tools/formats/openflattenpdf.sh` | goal: flatten a PDF through PostScript and open the result | usage: `tools/formats/openflattenpdf.sh input.pdf`
- `tools/formats/rename.mime.py` | goal: rename or sort files by MIME type | usage: `python3 tools/formats/rename.mime.py`
- `tools/system/backup_docker.sh` | goal: back up a Docker Compose stack | usage: `tools/system/backup_docker.sh docker-compose.yml`
- `tools/system/restore_docker.sh` | goal: restore a saved Docker workflow | usage: `tools/system/restore_docker.sh`
- `tools/system/watchgrowth.sh` | goal: watch a file or directory grow over time | usage: `tools/system/watchgrowth.sh path`
- `tools/system/ltop.py` | goal: show the most frequent lines from a stream like `top` | usage: `tail -f log | python3 tools/system/ltop.py`
- `tools/system/noerr` | goal: run a command with stderr suppressed | usage: `tools/system/noerr some command`
- `tools/system/wipe.sh` | goal: perform destructive wipe or cleanup steps | usage: `tools/system/wipe.sh target`
- `tools/text/probability.py` | goal: run a small text probability experiment | usage: `python3 tools/text/probability.py`
- `tools/text/depth` | goal: inspect text depth or nesting characteristics | usage: `tools/text/depth input.txt`
### CTF Helpers
- `tools/ctf/filtertext.py` | goal: filter challenge text to useful fragments | usage: `python3 tools/ctf/filtertext.py input.txt`
- `tools/ctf/getjs.py` | goal: extract JavaScript from challenge pages | usage: `python3 tools/ctf/getjs.py page.html`
- `tools/ctf/guess.py` | goal: brute-force or guess through a challenge search space | usage: `python3 tools/ctf/guess.py ...`
- `tools/ctf/ps_.py` | goal: run a CTF-specific parsing or post-processing step | usage: `python3 tools/ctf/ps_.py ...`
- `tools/ctf/search.py` | goal: search challenge artifacts for signals | usage: `python3 tools/ctf/search.py input`
- `tools/ctf/submit_flag.sh` | goal: submit flags to a challenge endpoint | usage: `tools/ctf/submit_flag.sh FLAG`
- `tools/ctf/transpose.py` | goal: transpose text or matrix-like challenge data | usage: `python3 tools/ctf/transpose.py input`
### Go Tools And Small Projects
- `tools/go/bincmp/gobincmp.go` | goal: compare files or directories with fuzzy hashing | usage: `go run tools/go/bincmp/gobincmp.go left right`
- `tools/go/gopname/pname.go` | goal: demo process-title renaming with `gspt` | usage: `go run tools/go/gopname/pname.go`
- `tools/go/tarsum/tarsum.go` | goal: print a SHA-256 checksum for a tar file | usage: `go run tools/go/tarsum/tarsum.go archive.tar`
- `projects/go-tools/go/goipgrep/` | goal: production-grade IP and MAC extractor with ping, DNS, and lookup support | usage: `projects/go-tools/go/goipgrep/scripts/build.sh`
- `projects/go-tools/go/csv2json/csv2json.go` | goal: convert CSV input to JSON | usage: `go run projects/go-tools/go/csv2json/csv2json.go`
- `projects/go-tools/go/gobetween/gobetween.go` | goal: extract text between delimiters in Go | usage: `go run projects/go-tools/go/gobetween/gobetween.go`
- `projects/go-tools/go/goinfo/goinfo.go` | goal: inspect file or system information in Go | usage: `go run projects/go-tools/go/goinfo/goinfo.go`
- `projects/go-tools/go/gosoft/gosoft.go` | goal: enumerate installed software from multiple package sources | usage: `go run projects/go-tools/go/gosoft/gosoft.go`
- `projects/go-tools/go/gouniq/gouniq.go` | goal: remove duplicate lines in Go | usage: `go run projects/go-tools/go/gouniq/gouniq.go < file.txt`
- `projects/rust-tools/between.rs` | goal: Rust version of between-delimiter extraction | usage: `rustc projects/rust-tools/between.rs && ./between`
- `projects/rust-tools/uniq.rs` | goal: Rust uniq implementation preserving first occurrences | usage: `rustc projects/rust-tools/uniq.rs && ./uniq file.txt`
- `projects/rust-tools/uniq2.rs` | goal: alternate Rust uniq implementation | usage: `rustc projects/rust-tools/uniq2.rs && ./uniq2 file.txt`
- `projects/puzzlebox/` | goal: solve voxel and puzzlebox search problems with several solver variants | usage: `python3 projects/puzzlebox/solve.py`
- `projects/timesketch/deploy_timesketch.sh` | goal: deploy a Timesketch environment | usage: `projects/timesketch/deploy_timesketch.sh`
### Admin And Setup Scripts
- `scripts/proxy/get_proxy.sh` | goal: print current proxy settings | usage: `scripts/proxy/get_proxy.sh`
- `scripts/proxy/update_apt_proxy.sh` | goal: write apt proxy configuration | usage: `scripts/proxy/update_apt_proxy.sh host port`
- `scripts/proxy/update_bashrc_proxy.sh` | goal: add shell proxy exports to a bash config | usage: `scripts/proxy/update_bashrc_proxy.sh host port`
- `scripts/proxy/update_service_proxy.sh` | goal: apply proxy settings to service units | usage: `scripts/proxy/update_service_proxy.sh service`
- `scripts/display/3_screen_setup.sh` | goal: apply a fixed three-monitor `xrandr` layout | usage: `scripts/display/3_screen_setup.sh`
- `scripts/display/notebook_extended.sh` | goal: apply a laptop-plus-external-display layout | usage: `scripts/display/notebook_extended.sh`
- `scripts/display/reset_screens.sh` | goal: reset screen outputs to a known state | usage: `scripts/display/reset_screens.sh`
- `scripts/display/single_fullhd.sh` | goal: force a single full-HD laptop display mode | usage: `scripts/display/single_fullhd.sh`
- `scripts/display/toggle_display.sh` | goal: toggle an external display workflow | usage: `scripts/display/toggle_display.sh`
- `scripts/display/toggle_touchpad` | goal: toggle touchpad state on or off | usage: `scripts/display/toggle_touchpad`
- `scripts/setup/automountctl` | goal: manage automount-related setup | usage: `scripts/setup/automountctl`
- `scripts/setup/disable_ubuntu_telemetry.sh` | goal: disable Ubuntu telemetry packages and endpoints | usage: `sudo scripts/setup/disable_ubuntu_telemetry.sh`
- `scripts/setup/mount_container` | goal: mount or unmount LUKS container files listed in `.containers` manifests | usage: `scripts/setup/mount_container mount`
- `scripts/setup/share.sh` | goal: run a local sharing workflow | usage: `scripts/setup/share.sh`
- `scripts/setup/terminal-logs.sh` | goal: configure or collect terminal logging | usage: `scripts/setup/terminal-logs.sh`
- `scripts/windows/Get-ZimmermanTools.ps1` | goal: download Zimmerman forensic tools on Windows | usage: `powershell -File scripts/windows/Get-ZimmermanTools.ps1`
- `scripts/windows/getscreen.psm1` | goal: provide PowerShell screen-capture helpers | usage: `Import-Module scripts/windows/getscreen.psm1`
- `scripts/windows/sbom.ps1` | goal: generate or inspect SBOM-related data in PowerShell | usage: `powershell -File scripts/windows/sbom.ps1`
### Config And Reference Entry Points
- `config/visidata/` | goal: install and use the local VisiData config plus plugins | usage: `cd config/visidata && ./install.sh --link`
- `config/install.sh` | goal: bootstrap local environment configuration | usage: `config/install.sh`
- `config/z.sh` | goal: provide a shell directory-jump helper | usage: `source config/z.sh`
- `config/shell/completions/eslogger.zsh` | goal: add Zsh completion for Apple's `eslogger` | usage: `source config/shell/completions/eslogger.zsh`
### Archived Or Narrow Tools
- `archive/experimental/ctf_primefac.py` | goal: factor a hard-coded challenge integer with `primefac` | usage: `python3 archive/experimental/ctf_primefac.py`
- `archive/experimental/screen2.js` | goal: capture a specific webpage screenshot with PhantomJS | usage: `phantomjs archive/experimental/screen2.js`
- `archive/experimental/screenshot.js` | goal: capture screenshots for multiple URLs with PhantomJS | usage: `phantomjs archive/experimental/screenshot.js URL ...`
- `archive/experimental/usbreset.c` | goal: reset a USB device from Linux userspace | usage: `gcc archive/experimental/usbreset.c -o usbreset`
- `archive/experimental/sep_test.sh` | goal: binary-search Docker image tags for a malware detection change | usage: `archive/experimental/sep_test.sh image start_tag end_tag`
- `archive/experimental/flm.py` | goal: keep an old experimental script available for salvage | usage: `python3 archive/experimental/flm.py`
- `archive/experimental/fuzz.sh` | goal: keep an old shell fuzzing experiment available for salvage | usage: `archive/experimental/fuzz.sh`
- `archive/experimental/hydrogentest.py` | goal: keep an old experiment available for salvage | usage: `python3 archive/experimental/hydrogentest.py`
- `archive/experimental/kv.py` | goal: keep an old key-value parsing experiment available for salvage | usage: `python3 archive/experimental/kv.py`
- `archive/experimental/lpic.sh` | goal: keep an old system experiment available for salvage | usage: `archive/experimental/lpic.sh`
- `archive/experimental/matplottest.py` | goal: keep an old plotting experiment available for salvage | usage: `python3 archive/experimental/matplottest.py`
- `archive/experimental/rootshell.c` | goal: keep a dangerous historical C example archived rather than active | usage: `do not run; reference only`
## `tools/`: Standalone Utilities ## `tools/`: Standalone Utilities
### Security ### Security
@@ -159,4 +313,4 @@ Applied to the current tree, the remaining rough edges are:
- If you want a more complete tool with tests or a build flow, check `projects/`. - If you want a more complete tool with tests or a build flow, check `projects/`.
- If you want local environment setup or terminal customizations, check `config/`. - If you want local environment setup or terminal customizations, check `config/`.
- If you want containers, check `dockerfiles/`. - If you want containers, check `dockerfiles/`.
- If you are unsure, run `./what -l` or search through `.what_db.json`. - If you are unsure, run `./what -l` or ask `./what "query"`.

670
what
View File

@@ -1,423 +1,303 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
'what' - Smart repository search tool with progressive enhancement `what` - README-driven repository search using Ollama only.
Fallback hierarchy:
1. Ollama + Gemma2 (natural language search)
2. fzf (fuzzy finding)
3. grep (simple text search)
Usage: Usage:
what <query> # Find tools matching query what <query> # Find tools matching a natural-language query
what -h # Show help what -l # List catalogued tools
what -l # List all tools with short descriptions what --model <model> ... # Override the default Ollama model
what -a <filepath> # Add new file to database
""" """
import os from __future__ import annotations
import sys
import json
import argparse import argparse
import subprocess import json
import shutil import os
from pathlib import Path
import re import re
import subprocess
import sys
from pathlib import Path
# Configuration REPO_ROOT = Path(__file__).parent.resolve()
REPO_ROOT = Path(__file__).parent.absolute() README_PATH = REPO_ROOT / "README.md"
DB_FILE = REPO_ROOT / ".what_db.json" DEFAULT_MODEL = os.environ.get("WHAT_OLLAMA_MODEL", "ministral-3:3b")
CATALOG_HEADING = "## Tool Catalog"
ENTRY_RE = re.compile(
r"^- `([^`]+)` \| goal: (.*?) \| usage: (.*)$"
)
TOKEN_RE = re.compile(r"[a-z0-9_.+-]+")
class WhatTool:
def __init__(self):
self.db_path = DB_FILE
self.data = self.load_db()
# Detect available tools
self.has_ollama = self.check_ollama()
self.has_fzf = shutil.which('fzf') is not None
def load_db(self):
"""Load the tool database"""
if self.db_path.exists():
try:
with open(self.db_path, 'r') as f:
return json.load(f)
except json.JSONDecodeError:
print(f"Warning: Corrupted database {self.db_path}, creating new one")
return {
"version": "1.0",
"tools": {}
}
def save_db(self):
"""Save the tool database"""
with open(self.db_path, 'w') as f:
json.dump(self.data, f, indent=2, sort_keys=True)
def check_ollama(self):
"""Check if ollama with gemma2 is available"""
try:
result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=5)
if result.returncode == 0:
# Check if gemma2 model is available
models = result.stdout.lower()
return 'gemma2' in models
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
pass
return False
def get_file_type(self, filepath):
"""Determine file type"""
if not filepath.exists():
return "missing"
if filepath.is_dir():
return "directory"
# Check if executable
is_executable = os.access(filepath, os.X_OK)
# Check extension
suffix = filepath.suffix.lower()
if suffix == '.py':
return "python script" if is_executable else "python module"
elif suffix == '.sh':
return "shell script"
elif suffix == '.go':
return "go program"
elif suffix == '.js':
return "javascript"
elif suffix == '.ps1':
return "powershell script"
elif suffix == '.rs':
return "rust program"
elif suffix in ['.c', '.cpp']:
return "c/c++ source"
elif suffix == '.awk':
return "awk script"
elif not suffix and is_executable:
return "binary executable"
elif not suffix:
return "script"
else:
return f"{suffix[1:]} file"
def analyze_file_with_ollama(self, filepath):
"""Analyze file using Ollama Gemma2"""
try:
# Read file content (limit size for analysis)
content = ""
if filepath.stat().st_size > 50000: # Skip very large files
content = "[File too large for analysis]"
else:
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()[:10000] # First 10KB
except:
content = "[Binary or unreadable file]"
prompt = f"""
Analyze this code/script file and provide ONLY a JSON response with these fields:
Filename: {filepath.name} class WhatError(Exception):
File type: {self.get_file_type(filepath)} pass
Content preview:
{content[:2000]}
Respond with ONLY this JSON structure:
{{
"summary": "Brief 1-2 sentence summary of what this tool does and how it works",
"purpose": "What this tool is used for (e.g., 'Network analysis', 'File processing', 'Security scanning')",
"short_description": "Very short description for listings (e.g., 'like md5sum but for files inside tarballs')"
}}
"""
result = subprocess.run([ def load_readme() -> str:
'ollama', 'run', 'gemma2:2b', prompt if not README_PATH.exists():
], capture_output=True, text=True, timeout=30) raise WhatError(f"README not found at {README_PATH}")
return README_PATH.read_text(encoding="utf-8")
if result.returncode == 0:
# Extract JSON from response
response = result.stdout.strip()
# Try to find JSON in the response
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
return json.loads(json_match.group())
except (subprocess.TimeoutExpired, json.JSONDecodeError, Exception) as e:
print(f"Ollama analysis failed: {e}")
return None
def add_file_interactive(self, filepath):
"""Add file with interactive prompts"""
rel_path = str(filepath.relative_to(REPO_ROOT))
file_type = self.get_file_type(filepath)
print(f"\nAdding: {rel_path}")
print(f"Type: {file_type}")
print()
if self.has_ollama:
print("Analyzing with Ollama Gemma2...")
analysis = self.analyze_file_with_ollama(filepath)
if analysis:
print("AI Analysis complete. Review and edit if needed:")
summary = input(f"Summary [{analysis.get('summary', '')}]: ").strip()
purpose = input(f"Purpose [{analysis.get('purpose', '')}]: ").strip()
short_desc = input(f"Short description [{analysis.get('short_description', '')}]: ").strip()
# Use AI suggestions if user didn't provide alternatives
summary = summary or analysis.get('summary', '')
purpose = purpose or analysis.get('purpose', '')
short_desc = short_desc or analysis.get('short_description', '')
else:
print("AI analysis failed, using manual input:")
summary = input("Summary (what it does and how): ").strip()
purpose = input("Purpose (what it's used for): ").strip()
short_desc = input("Short description (for listings): ").strip()
else:
print("Manual input (Ollama not available):")
summary = input("Summary (what it does and how): ").strip()
purpose = input("Purpose (what it's used for): ").strip()
short_desc = input("Short description (for listings): ").strip()
# Store in database
self.data["tools"][rel_path] = {
"path": rel_path,
"name": filepath.name,
"type": file_type,
"summary": summary,
"purpose": purpose,
"short_description": short_desc,
"executable": os.access(filepath, os.X_OK)
}
self.save_db()
print(f"✓ Added {rel_path} to database")
def search_with_ollama(self, query):
"""Search using natural language with Ollama"""
try:
tools_info = []
for tool_data in self.data["tools"].values():
tools_info.append(f"{tool_data['name']}: {tool_data['summary']} (Purpose: {tool_data['purpose']})")
tools_text = "\n".join(tools_info)
prompt = f"""
Given this query: "{query}"
Find the most relevant tools from this list. Respond with ONLY the tool names (one per line) in order of relevance:
def extract_catalog(readme_text: str) -> list[dict[str, str]]:
    """Parse the `## Tool Catalog` section of the README.

    Scans line by line: everything before the catalog heading is skipped,
    the next `## ` heading terminates the catalog, and each bullet matching
    ENTRY_RE becomes a {"path", "goal", "usage"} dict.

    Raises:
        WhatError: when no catalog entries are found at all.
    """
    entries: list[dict[str, str]] = []
    inside = False
    for row in readme_text.splitlines():
        stripped = row.rstrip()
        if not inside:
            # Nothing counts until we have seen the catalog heading itself.
            inside = stripped == CATALOG_HEADING
            continue
        if stripped.startswith("## "):
            break  # next top-level section ends the catalog
        hit = ENTRY_RE.match(stripped)
        if hit is None:
            continue  # sub-headings, prose, blank lines
        tool_path, tool_goal, tool_usage = hit.groups()
        entries.append({
            "path": tool_path,
            "goal": tool_goal.strip(),
            "usage": tool_usage.strip(),
        })
    if not entries:
        raise WhatError(
            "No tool catalog entries found in README. "
            f"Expected entries under '{CATALOG_HEADING}'."
        )
    return entries
def ensure_ollama_available(model: str) -> None:
    """Verify the `ollama` CLI is installed, responsive, and has *model* pulled.

    Raises:
        WhatError: if ollama is missing from PATH, cannot be queried,
            exits non-zero, or does not list *model* locally.
    """
    if not shutil_which("ollama"):
        raise WhatError("`ollama` is not installed or not in PATH.")
    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError covers the binary vanishing between the PATH check and
        # the spawn (FileNotFoundError is not a SubprocessError).
        raise WhatError(f"Failed to talk to Ollama: {exc}") from exc
    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama is unavailable: {stderr}")
    # `ollama list` prints one model per line; a lowercase substring check
    # accepts both exact "name:tag" and bare-name queries.
    models = result.stdout.lower()
    if model.lower() not in models:
        raise WhatError(
            f"Model '{model}' is not available locally. "
            f"Pull it first with `ollama pull {model}`."
        )
def shutil_which(binary: str) -> str | None:
for directory in os.environ.get("PATH", "").split(os.pathsep):
candidate = Path(directory) / binary
if candidate.is_file() and os.access(candidate, os.X_OK):
return str(candidate)
return None
def build_prompt(query: str, entries: list[dict[str, str]]) -> str:
    """Render the tool-selection prompt sent to the model for *query*.

    Each catalog entry becomes one "- path | goal | usage" bullet; the model
    is instructed to reply with a strict JSON array of up to 8 objects,
    each carrying an exact catalog "path" and a short "reason".
    """
    catalog_lines = [
        f'- {entry["path"]} | goal: {entry["goal"]} | usage: {entry["usage"]}'
        for entry in entries
    ]
    catalog = "\n".join(catalog_lines)
    return f"""You are selecting tools from a repository catalog.
Use only the catalog below. Prefer direct matches. Use archived tools only if they clearly fit the request.
Return strict JSON only. The response must be a JSON array with up to 8 objects.
Each object must contain:
- "path": exact catalog path
- "reason": one short sentence
Do not invent paths. Do not include markdown.
Prefer the entry whose action best matches the query: compare beats hash for comparison queries, open beats convert for opening queries, and mount beats inspect for mount queries.

Query: {query}

Catalog:
{catalog}
"""
# NOTE(review): orphaned tail of the legacy `search_with_ollama` method —
# its `try:` and prompt construction sit further up in this file, fused
# with newer code by the merge. It shells out to a hard-coded gemma2:2b
# model and maps each returned tool name back onto a database entry.
result = subprocess.run([
'ollama', 'run', 'gemma2:2b', prompt
], capture_output=True, text=True, timeout=20)
if result.returncode == 0:
# Each non-blank stdout line is treated as one candidate tool name.
tool_names = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()]
# Find matching tools in database
matches = []
for tool_name in tool_names[:10]: # Limit to top 10
# First database entry with an exactly matching name wins.
for tool_data in self.data["tools"].values():
if tool_data['name'] == tool_name:
matches.append(tool_data)
break
return matches
# Best-effort: any failure (missing binary, timeout, parse error) is
# reported and signalled with None so the caller can try another backend.
except Exception as e:
print(f"Ollama search failed: {e}")
return None
def search_with_fzf(self, query):
    """Search using fzf fuzzy finder"""
    try:
        # One line per tool: "name # description | path" — the trailing
        # path segment lets us map selections back to database entries.
        feed_lines = [
            f"{info['name']} # {info['short_description']} | {info['path']}"
            for info in self.data["tools"].values()
        ]
        proc = subprocess.run(
            ['fzf', '--filter', query, '--no-sort'],
            input="\n".join(feed_lines),
            capture_output=True,
            text=True,
        )
        if proc.returncode == 0:
            found = []
            for out_line in proc.stdout.strip().split('\n'):
                if ' | ' not in out_line:
                    continue
                tool_path = out_line.split(' | ')[-1]
                if tool_path in self.data["tools"]:
                    found.append(self.data["tools"][tool_path])
            return found
    except Exception as e:
        print(f"fzf search failed: {e}")
        return None
def search_with_grep(self, query):
    """Fallback search using grep-like functionality"""
    needle = query.lower()
    # Substring match across every descriptive field of each tool.
    hits = [
        info
        for info in self.data["tools"].values()
        if needle in f"{info['name']} {info['summary']} {info['purpose']} {info['short_description']}".lower()
    ]

    def relevance(info):
        # Weighted field hits: name matches dominate, purpose matters least.
        points = 0
        if needle in info['name'].lower():
            points += 10
        if needle in info['short_description'].lower():
            points += 5
        if needle in info['summary'].lower():
            points += 3
        if needle in info['purpose'].lower():
            points += 2
        return points

    hits.sort(key=relevance, reverse=True)
    return hits[:20]  # Limit results
def search(self, query):
    """Search using the best available method"""
    if not query:
        return []
    print(f"Searching for: {query}")
    # Ordered backends: each entry is (start banner, failure notice, callable).
    attempts = []
    if self.has_ollama:
        attempts.append((
            "Using Ollama Gemma2 for natural language search...",
            "Ollama search failed, falling back to fzf...",
            self.search_with_ollama,
        ))
    if self.has_fzf:
        attempts.append((
            "Using fzf for fuzzy search...",
            "fzf search failed, falling back to grep...",
            self.search_with_fzf,
        ))
    for banner, failure_notice, backend in attempts:
        print(banner)
        results = backend(query)
        if results is not None:
            return results
        print(failure_notice)
    # Last resort: plain substring matching always works.
    print("Using basic text search...")
    return self.search_with_grep(query)
def list_all_tools(self):
    """List all tools with short descriptions"""
    catalog = self.data["tools"]
    if not catalog:
        print("No tools in database. Use 'what -a <file>' to add tools.")
        return
    print("Available tools:")
    print()
    # Alphabetical listing with names padded to a common width.
    ordered = sorted(catalog.values(), key=lambda entry: entry['name'])
    width = max(len(entry['name']) for entry in ordered)
    for entry in ordered:
        # "*" marks tools whose file carries the executable bit.
        marker = "*" if entry.get('executable', False) else " "
        print(f"{marker}{entry['name'].ljust(width)} # {entry['short_description']}")
def show_search_results(self, results):
    """Display search results"""
    if not results:
        print("No tools found matching your query.")
        return
    print(f"\nFound {len(results)} tool(s):")
    print()
    rank = 1
    for tool in results:
        # "*" marks executables, mirroring the list view.
        marker = "*" if tool.get('executable', False) else " "
        print(f"{rank:2d}. {marker}{tool['name']}")
        print(f" Path: {tool['path']}")
        print(f" Type: {tool['type']}")
        print(f" Purpose: {tool['purpose']}")
        print(f" Summary: {tool['summary']}")
        print()
        rank += 1
def tokenize(text: str) -> set[str]:
    """Lower-case *text* and return the set of word tokens matched by TOKEN_RE."""
    return set(TOKEN_RE.findall(text.lower()))
def shortlist_entries(query: str, entries: list[dict[str, str]], limit: int = 28) -> list[dict[str, str]]:
    """Return up to *limit* catalog entries most relevant to *query*.

    Scoring per entry: 5 points per token shared with the query, 1 point
    per query token appearing as a substring, minus 1 for archived tools
    so active tools win ties. Entries scoring zero are dropped; if nothing
    scores, the first *limit* entries are returned unchanged.
    """
    query_tokens = tokenize(query)
    if not query_tokens:
        return entries[:limit]
    scored: list[tuple[int, dict[str, str]]] = []
    for entry in entries:
        haystack = f'{entry["path"]} {entry["goal"]} {entry["usage"]}'.lower()
        entry_tokens = tokenize(haystack)
        overlap = len(query_tokens & entry_tokens)
        substring_hits = sum(1 for token in query_tokens if token in haystack)
        archive_penalty = 1 if entry["path"].startswith("archive/") else 0
        score = overlap * 5 + substring_hits - archive_penalty
        scored.append((score, entry))
    scored.sort(key=lambda item: item[0], reverse=True)
    best = [entry for score, entry in scored if score > 0][:limit]
    return best or entries[:limit]
def extract_json_array(output: str) -> list[dict[str, str]]:
    """Locate the first JSON array of objects in *output* and normalize it.

    Falls back to parsing the whole string when no bracketed array is
    found. Non-dict elements and entries without a path are dropped;
    path/reason values are stringified and stripped.
    """
    found = re.search(r"\[\s*\{.*\}\s*\]", output, re.DOTALL)
    candidate = found.group(0) if found else output
    parsed = json.loads(candidate)
    if not isinstance(parsed, list):
        raise WhatError("Model output must be a JSON array.")
    cleaned: list[dict[str, str]] = []
    for element in parsed:
        if not isinstance(element, dict):
            continue
        tool_path = str(element.get("path", "")).strip()
        why = str(element.get("reason", "")).strip()
        if not tool_path:
            continue
        cleaned.append({"path": tool_path, "reason": why})
    return cleaned
def run_ollama_once(prompt: str, model: str) -> str:
    """Run *prompt* through `ollama run <model>` once and return stripped stdout.

    Raises:
        WhatError: if the process cannot be started, times out, or exits non-zero.
    """
    try:
        result = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True,
            timeout=60,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError (e.g. FileNotFoundError for a missing binary) is not a
        # SubprocessError; catching both keeps the error surfaced as WhatError.
        raise WhatError(f"Ollama run failed: {exc}") from exc
    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama run failed: {stderr}")
    return result.stdout.strip()
def run_ollama(prompt: str, model: str) -> list[dict[str, str]]:
    """Query the model once; on malformed JSON, ask it to repair its own reply.

    Raises:
        WhatError: if even the repaired output cannot be parsed as JSON.
    """
    initial = run_ollama_once(prompt, model)
    try:
        return extract_json_array(initial)
    except (json.JSONDecodeError, WhatError):
        # Single retry: feed the broken reply back with strict formatting rules.
        fixup = (
            "Rewrite the following response as strict JSON only.\n"
            'Target format: [{"path":"exact catalog path","reason":"short reason"}]\n'
            "Do not add markdown or commentary.\n\n"
            f"Response to repair:\n{initial}\n"
        )
        repaired = run_ollama_once(fixup, model)
        try:
            return extract_json_array(repaired)
        except (json.JSONDecodeError, WhatError) as exc:
            raise WhatError(
                "Model output was not valid JSON after repair. "
                f"Raw output was:\n{repaired}"
            ) from exc
def search(query: str, entries: list[dict[str, str]], model: str) -> list[dict[str, str]]:
    """Resolve *query* against the catalog via Ollama and return matched entries.

    Shortlists the catalog first to keep the prompt small, then filters the
    model's picks down to real, not-yet-seen catalog paths, attaching the
    model's reason to each returned entry.
    """
    ensure_ollama_available(model)
    shortlisted = shortlist_entries(query, entries)
    picks = run_ollama(build_prompt(query, shortlisted), model)
    by_path = {entry["path"]: entry for entry in entries}
    matched: list[dict[str, str]] = []
    accepted: set[str] = set()
    for pick in picks:
        candidate = pick["path"]
        # Drop hallucinated paths and duplicates.
        if candidate not in by_path or candidate in accepted:
            continue
        accepted.add(candidate)
        hit = dict(by_path[candidate])
        hit["reason"] = pick.get("reason", "")
        matched.append(hit)
    return matched
def list_entries(entries: list[dict[str, str]]) -> None:
    """Print each catalog entry as a path followed by its goal and usage lines."""
    for item in entries:
        print(item["path"])
        print(f' goal: {item["goal"]}')
        print(f' usage: {item["usage"]}')
def show_results(query: str, results: list[dict[str, str]], model: str) -> None:
    """Pretty-print ranked results, or a miss message when nothing matched."""
    if not results:
        print(f"No catalogued tool matched: {query}")
        return
    print(f"Model: {model}")
    print(f"Query: {query}")
    print()
    position = 1
    for item in results:
        print(f"{position}. {item['path']}")
        print(f" Goal: {item['goal']}")
        print(f" Usage: {item['usage']}")
        # Reason is optional — only present when the model supplied one.
        if item.get("reason"):
            print(f" Why: {item['reason']}")
        print()
        position += 1
def main() -> int:
    """CLI entry point: list the catalog, or run a natural-language search.

    Returns a process exit code: 0 on success (including help/list paths),
    1 when the README catalog or the Ollama search fails.
    """
    parser = argparse.ArgumentParser(description="README-driven repository search using Ollama")
    parser.add_argument("query", nargs="?", help="Natural-language search query")
    parser.add_argument("-l", "--list", action="store_true", help="List catalogued tools")
    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Ollama model to use (default: {DEFAULT_MODEL})")
    args = parser.parse_args()

    # The README is the single source of truth for the tool catalog.
    try:
        entries = extract_catalog(load_readme())
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    if args.list:
        list_entries(entries)
        return 0

    if not args.query:
        parser.print_help()
        print()
        print(f"Catalog source: {README_PATH}")
        print(f"Default model: {args.model}")
        return 0

    try:
        results = search(args.query, entries, args.model)
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    show_results(args.query, results, args.model)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())