Rewrite `what` around a README catalog and Ollama

Remove the JSON tool database and move tool metadata into a compact README catalog.
Make `what` README-driven and Ollama-only, with shortlist generation and JSON-repair retry handling.
Pull qwen3.5:2b and ministral-3:3b, compare them on fixed repository queries, and set ministral-3:3b as the default model.
Tighten README wording so similar tools like domgrep/geturls and sparsecmp/scatterhash rank correctly.
This commit is contained in:
tke
2026-03-07 20:39:24 +01:00
parent fd515742b5
commit 559fa38c04
4 changed files with 433 additions and 547 deletions

1
.gitignore vendored
View File

@@ -5,3 +5,4 @@ tmp
Icon? Icon?
._* ._*
__MACOSX/ __MACOSX/
__pycache__/

View File

@@ -1,149 +0,0 @@
{
"version": "1.0",
"tools": {
"tools/security/scan_vt.py": {
"path": "tools/security/scan_vt.py",
"name": "scan_vt.py",
"type": "python script",
"summary": "Scans files against VirusTotal using MD5 hashes and displays detection results with positives/total ratios and permalink.",
"purpose": "Malware detection and threat analysis",
"short_description": "VirusTotal file scanner with detection ratios",
"executable": true
},
"tools/security/imphash.py": {
"path": "tools/security/imphash.py",
"name": "imphash.py",
"type": "python script",
"summary": "Calculates and displays the import hash (imphash) of PE files using pefile library for malware analysis.",
"purpose": "Malware analysis and PE file fingerprinting",
"short_description": "PE import hash calculator",
"executable": true
},
"tools/security/scapy_arp.py": {
"path": "tools/security/scapy_arp.py",
"name": "scapy_arp.py",
"type": "python script",
"summary": "Multi-threaded ARP network scanner using Scapy to discover live hosts on a /24 network range with MAC addresses.",
"purpose": "Network discovery and reconnaissance",
"short_description": "threaded ARP network scanner",
"executable": true
},
"tools/data/domgrep.py": {
"path": "tools/data/domgrep.py",
"name": "domgrep.py",
"type": "python script",
"summary": "Extracts domain names from URLs read from stdin, filtering out IP addresses and handling malformed URLs gracefully.",
"purpose": "Data extraction and URL processing",
"short_description": "extract domains from URL lists",
"executable": true
},
"tools/data/unum.py": {
"path": "tools/data/unum.py",
"name": "unum.py",
"type": "python script",
"summary": "Analyzes Unicode characters showing decimal/hex codes, categories, and official Unicode names with proper formatting.",
"purpose": "Text analysis and Unicode debugging",
"short_description": "detailed Unicode character analyzer",
"executable": true
},
"tools/forensics/chechsqlite.py": {
"path": "tools/forensics/chechsqlite.py",
"name": "chechsqlite.py",
"type": "python script",
"summary": "Scans SQLite databases for tables containing password or hash-related columns for security analysis.",
"purpose": "Database security analysis",
"short_description": "find password/hash columns in SQLite DBs",
"executable": true
},
"tools/hashing/scatterhash.py": {
"path": "tools/hashing/scatterhash.py",
"name": "scatterhash.py",
"type": "python script",
"summary": "Performs sparse hashing of large files by sampling blocks across the file for efficient integrity checking and validation.",
"purpose": "Large file integrity verification",
"short_description": "sparse hashing for huge files",
"executable": true
},
"tools/hashing/libarchivesum.py": {
"path": "tools/hashing/libarchivesum.py",
"name": "libarchivesum.py",
"type": "python script",
"summary": "Calculates hashes of individual files within archives (zip, tar, etc.) without extracting them.",
"purpose": "Archive analysis and integrity checking",
"short_description": "like md5sum but for files inside archives",
"executable": true
},
"tools/system/ltop.py": {
"path": "tools/system/ltop.py",
"name": "ltop.py",
"type": "python script",
"summary": "Real-time frequency counter for stdin lines, showing top N most common entries with live updates using curses.",
"purpose": "Log analysis and monitoring",
"short_description": "like top but for line frequency in streams",
"executable": true
},
"tools/network/ipgrep": {
"path": "tools/network/ipgrep",
"name": "ipgrep",
"type": "shell script",
"summary": "Comprehensive IP and MAC address extractor with sorting, deduplication, ping testing, and DNS resolution capabilities.",
"purpose": "Network analysis and IP processing",
"short_description": "advanced IP/MAC extractor with ping testing",
"executable": true
},
"tools/security/certwipe": {
"path": "tools/security/certwipe",
"name": "certwipe",
"type": "shell script",
"summary": "Professional disk wiping tool supporting ATA SecureErase with frozen disk handling and fallback to dc3dd overwriting.",
"purpose": "Data destruction and security",
"short_description": "professional disk wiper with SecureErase",
"executable": true
},
"tools/system/watchgrowth.sh": {
"path": "tools/system/watchgrowth.sh",
"name": "watchgrowth.sh",
"type": "shell script",
"summary": "Monitors file/directory size growth in real-time, showing transfer speeds and optional progress percentage.",
"purpose": "File monitoring and transfer analysis",
"short_description": "real-time file growth monitor",
"executable": true
},
"projects/timesketch/deploy_timesketch.sh": {
"path": "projects/timesketch/deploy_timesketch.sh",
"name": "deploy_timesketch.sh",
"type": "shell script",
"summary": "Automated deployment script for Timesketch digital forensics timeline analysis platform with Docker Compose setup.",
"purpose": "Digital forensics infrastructure deployment",
"short_description": "deploy Timesketch forensic timeline platform",
"executable": true
},
"tools/system/backup_docker.sh": {
"path": "tools/system/backup_docker.sh",
"name": "backup_docker.sh",
"type": "shell script",
"summary": "Comprehensive Docker Compose stack backup including images, configs, and volumes with incremental storage optimization.",
"purpose": "Container infrastructure backup",
"short_description": "backup entire Docker Compose stacks",
"executable": true
},
"tools/cloud/cloudsend.py": {
"path": "tools/cloud/cloudsend.py",
"name": "cloudsend.py",
"type": "python script",
"summary": "Uploads files to NextCloud/OwnCloud public shares with optional GPG encryption support via command line interface.",
"purpose": "Cloud file sharing and backup",
"short_description": "upload files to NextCloud public shares",
"executable": true
},
"tools/cloud/vqa3.py": {
"path": "tools/cloud/vqa3.py",
"name": "vqa3.py",
"type": "python script",
"summary": "AI-powered image classification using OpenAI CLIP models for content categorization with customizable classification categories.",
"purpose": "AI image analysis and content filtering",
"short_description": "AI image classifier using CLIP models",
"executable": true
}
}
}

160
README.md
View File

@@ -41,11 +41,165 @@ Applied to the current tree, the remaining rough edges are:
## Top-Level Files ## Top-Level Files
- `what`: repository search helper. It can list known tools, search by query, and progressively falls back from Ollama-based natural-language search to `fzf` or plain grep. - `what`: README-driven repository search helper. It uses one local Ollama model and searches only the catalog below.
- `.what_db.json`: the metadata database used by `what`. It stores short descriptions for known tools.
- `README.md`: this guide. - `README.md`: this guide.
- `.gitignore`: standard repository ignore rules. - `.gitignore`: standard repository ignore rules.
## Tool Catalog
Format: `path | goal | usage`. This section is intentionally compact so `what` can pass it to a small local model without dragging the whole repository into context.
### Active Tools
- `what` | goal: search this repository's tool catalog with Ollama only | usage: `./what "query"` or `./what -l`
### Security
- `tools/security/scan_vt.py` | goal: check file hashes against VirusTotal | usage: `python3 tools/security/scan_vt.py sample.bin`
- `tools/security/imphash.py` | goal: calculate PE import hashes for malware triage | usage: `python3 tools/security/imphash.py file.exe`
- `tools/security/scapy_arp.py` | goal: scan a local network with ARP requests | usage: `python3 tools/security/scapy_arp.py`
- `tools/security/simple_portscan.py` | goal: do a lightweight TCP port scan | usage: `python3 tools/security/simple_portscan.py host`
- `tools/security/smtpbanner.py` | goal: grab SMTP banners from remote servers | usage: `python3 tools/security/smtpbanner.py host`
- `tools/security/testpw.py` | goal: test password candidates against a target workflow | usage: `python3 tools/security/testpw.py ...`
- `tools/security/vt_download.py` | goal: download malware samples or data from VirusTotal-related workflows | usage: `python3 tools/security/vt_download.py ...`
- `tools/security/vt_ip.py` | goal: enrich IP addresses with VirusTotal intel | usage: `python3 tools/security/vt_ip.py 8.8.8.8`
- `tools/security/vt_pdns.py` | goal: query passive DNS style data from VirusTotal workflows | usage: `python3 tools/security/vt_pdns.py domain.tld`
- `tools/security/certwipe` | goal: wipe disks with secure-erase oriented steps | usage: `tools/security/certwipe /dev/sdX`
### Forensics
- `tools/forensics/chechsqlite.py` | goal: inspect SQLite databases for password or hash style columns | usage: `python3 tools/forensics/chechsqlite.py sample.db`
- `tools/forensics/extractfolder.py` | goal: bulk-extract or sort files from a folder workflow | usage: `python3 tools/forensics/extractfolder.py input_dir`
- `tools/forensics/process_leak.py` | goal: inspect process-leak style artifacts | usage: `python3 tools/forensics/process_leak.py artifact`
- `tools/forensics/mailunpack` | goal: extract mail attachments inside a constrained container workflow | usage: `tools/forensics/mailunpack message.eml`
- `tools/forensics/showgm.sh` | goal: open image GPS EXIF coordinates in Google Maps | usage: `tools/forensics/showgm.sh image.jpg`
- `tools/forensics/showosm.sh` | goal: open image GPS EXIF coordinates in OpenStreetMap | usage: `tools/forensics/showosm.sh image.jpg`
### Data And Text
- `tools/data/domgrep.py` | goal: extract domain names specifically from URLs or mixed text input | usage: `cat urls.txt | python3 tools/data/domgrep.py`
- `tools/data/geturls.py` | goal: extract full raw URLs from text when you want links rather than domains | usage: `python3 tools/data/geturls.py file.txt`
- `tools/data/unum.py` | goal: inspect Unicode code points and names | usage: `echo "text" | python3 tools/data/unum.py`
- `tools/data/quickchardet.py` | goal: guess file encoding quickly | usage: `python3 tools/data/quickchardet.py file.txt`
- `tools/data/json_save.py` | goal: normalize or save JSON fragments from text streams | usage: `python3 tools/data/json_save.py ...`
- `tools/data/kv_parse.py` | goal: parse key-value formatted text | usage: `python3 tools/data/kv_parse.py input.txt`
- `tools/data/vba_chr_decode.py` | goal: decode VBA `Chr(...)` obfuscation patterns | usage: `python3 tools/data/vba_chr_decode.py macro.txt`
- `tools/data/concat.py` | goal: concatenate structured text inputs in a repeatable way | usage: `python3 tools/data/concat.py file1 file2`
- `tools/data/split_linewise.py` | goal: split text into line-based chunks | usage: `python3 tools/data/split_linewise.py input.txt`
- `tools/data/uniq.py` | goal: remove duplicate lines while preserving first occurrence order | usage: `python3 tools/data/uniq.py file.txt`
- `tools/data/urldecode.py` | goal: URL-decode strings from stdin or files | usage: `python3 tools/data/urldecode.py`
- `tools/data/between` | goal: print text between delimiters | usage: `tools/data/between START END < file.txt`
- `tools/data/csv_get` | goal: extract selected CSV fields quickly | usage: `tools/data/csv_get file.csv column`
- `tools/data/csv2dot` | goal: turn CSV relationships into Graphviz dot edges | usage: `tools/data/csv2dot`
### Hashing And Archives
- `tools/hashing/libarchivesum.py` | goal: hash files inside archives without full extraction | usage: `python3 tools/hashing/libarchivesum.py archive.zip`
- `tools/hashing/scatterhash.py` | goal: hash very large files by sparse sampling when you need a fingerprint rather than a comparison | usage: `python3 tools/hashing/scatterhash.py huge.img`
- `tools/hashing/hashzip.py` | goal: hash ZIP contents or metadata for comparison | usage: `python3 tools/hashing/hashzip.py sample.zip`
- `tools/hashing/tarsum.py` | goal: compute tar-oriented checksums in Python | usage: `python3 tools/hashing/tarsum.py archive.tar`
- `tools/hashing/sparsecmp.sh` | goal: compare very large files or block devices by sampling chunks at fixed offsets | usage: `tools/hashing/sparsecmp.sh source target 100`
- `tools/hashing/trunc_by_hash.py` | goal: find the byte length where a rolling hash matches a target digest | usage: `python3 tools/hashing/trunc_by_hash.py HASH file.bin`
### Network And Cloud
- `tools/network/ipgrep` | goal: extract IP or MAC indicators from text | usage: `cat file.txt | tools/network/ipgrep`
- `tools/network/fritzshark.sh` | goal: inspect or capture FritzBox traffic workflows | usage: `tools/network/fritzshark.sh`
- `tools/network/fritzshark2.sh` | goal: alternate FritzBox traffic workflow | usage: `tools/network/fritzshark2.sh`
- `tools/network/get_ntp.py` | goal: query NTP information from remote systems | usage: `python3 tools/network/get_ntp.py host`
- `tools/network/get_stp.sh` | goal: inspect spanning-tree data on a network | usage: `tools/network/get_stp.sh device`
- `tools/cloud/cloudsend.py` | goal: upload files to Nextcloud or OwnCloud shares | usage: `python3 tools/cloud/cloudsend.py file`
- `tools/cloud/cloudsend.sh` | goal: shell wrapper for cloud share upload workflows | usage: `tools/cloud/cloudsend.sh file`
- `tools/cloud/docker_pull.py` | goal: download image layers from a container registry without `docker pull` | usage: `python3 tools/cloud/docker_pull.py ubuntu:latest`
- `tools/cloud/speech.py` | goal: run cloud-backed speech or transcription tasks | usage: `python3 tools/cloud/speech.py input`
- `tools/cloud/vqa3.py` | goal: classify images with a local or model-backed VQA workflow | usage: `python3 tools/cloud/vqa3.py image.jpg`
- `tools/cloud/youtube_resolve.sh` | goal: resolve direct media URLs from YouTube-like inputs | usage: `tools/cloud/youtube_resolve.sh URL`
### Formats, System, And Text Experiments
- `tools/formats/convert2pdf.sh` | goal: convert documents into PDF form | usage: `tools/formats/convert2pdf.sh input.docx`
- `tools/formats/flatpdf.sh` | goal: flatten or normalize PDFs for downstream handling | usage: `tools/formats/flatpdf.sh input.pdf`
- `tools/formats/openflattenpdf.sh` | goal: flatten a PDF through PostScript and open the result | usage: `tools/formats/openflattenpdf.sh input.pdf`
- `tools/formats/rename.mime.py` | goal: rename or sort files by MIME type | usage: `python3 tools/formats/rename.mime.py`
- `tools/system/backup_docker.sh` | goal: back up a Docker Compose stack | usage: `tools/system/backup_docker.sh docker-compose.yml`
- `tools/system/restore_docker.sh` | goal: restore a saved Docker workflow | usage: `tools/system/restore_docker.sh`
- `tools/system/watchgrowth.sh` | goal: watch a file or directory grow over time | usage: `tools/system/watchgrowth.sh path`
- `tools/system/ltop.py` | goal: show the most frequent lines from a stream like `top` | usage: `tail -f log | python3 tools/system/ltop.py`
- `tools/system/noerr` | goal: run a command with stderr suppressed | usage: `tools/system/noerr some command`
- `tools/system/wipe.sh` | goal: perform destructive wipe or cleanup steps | usage: `tools/system/wipe.sh target`
- `tools/text/probability.py` | goal: run a small text probability experiment | usage: `python3 tools/text/probability.py`
- `tools/text/depth` | goal: inspect text depth or nesting characteristics | usage: `tools/text/depth input.txt`
### CTF Helpers
- `tools/ctf/filtertext.py` | goal: filter challenge text to useful fragments | usage: `python3 tools/ctf/filtertext.py input.txt`
- `tools/ctf/getjs.py` | goal: extract JavaScript from challenge pages | usage: `python3 tools/ctf/getjs.py page.html`
- `tools/ctf/guess.py` | goal: brute-force or guess through a challenge search space | usage: `python3 tools/ctf/guess.py ...`
- `tools/ctf/ps_.py` | goal: run a CTF-specific parsing or post-processing step | usage: `python3 tools/ctf/ps_.py ...`
- `tools/ctf/search.py` | goal: search challenge artifacts for signals | usage: `python3 tools/ctf/search.py input`
- `tools/ctf/submit_flag.sh` | goal: submit flags to a challenge endpoint | usage: `tools/ctf/submit_flag.sh FLAG`
- `tools/ctf/transpose.py` | goal: transpose text or matrix-like challenge data | usage: `python3 tools/ctf/transpose.py input`
### Go Tools And Small Projects
- `tools/go/bincmp/gobincmp.go` | goal: compare files or directories with fuzzy hashing | usage: `go run tools/go/bincmp/gobincmp.go left right`
- `tools/go/gopname/pname.go` | goal: demo process-title renaming with `gspt` | usage: `go run tools/go/gopname/pname.go`
- `tools/go/tarsum/tarsum.go` | goal: print a SHA-256 checksum for a tar file | usage: `go run tools/go/tarsum/tarsum.go archive.tar`
- `projects/go-tools/go/goipgrep/` | goal: production-grade IP and MAC extractor with ping, DNS, and lookup support | usage: `projects/go-tools/go/goipgrep/scripts/build.sh`
- `projects/go-tools/go/csv2json/csv2json.go` | goal: convert CSV input to JSON | usage: `go run projects/go-tools/go/csv2json/csv2json.go`
- `projects/go-tools/go/gobetween/gobetween.go` | goal: extract text between delimiters in Go | usage: `go run projects/go-tools/go/gobetween/gobetween.go`
- `projects/go-tools/go/goinfo/goinfo.go` | goal: inspect file or system information in Go | usage: `go run projects/go-tools/go/goinfo/goinfo.go`
- `projects/go-tools/go/gosoft/gosoft.go` | goal: enumerate installed software from multiple package sources | usage: `go run projects/go-tools/go/gosoft/gosoft.go`
- `projects/go-tools/go/gouniq/gouniq.go` | goal: remove duplicate lines in Go | usage: `go run projects/go-tools/go/gouniq/gouniq.go < file.txt`
- `projects/rust-tools/between.rs` | goal: Rust version of between-delimiter extraction | usage: `rustc projects/rust-tools/between.rs && ./between`
- `projects/rust-tools/uniq.rs` | goal: Rust uniq implementation preserving first occurrences | usage: `rustc projects/rust-tools/uniq.rs && ./uniq file.txt`
- `projects/rust-tools/uniq2.rs` | goal: alternate Rust uniq implementation | usage: `rustc projects/rust-tools/uniq2.rs && ./uniq2 file.txt`
- `projects/puzzlebox/` | goal: solve voxel and puzzlebox search problems with several solver variants | usage: `python3 projects/puzzlebox/solve.py`
- `projects/timesketch/deploy_timesketch.sh` | goal: deploy a Timesketch environment | usage: `projects/timesketch/deploy_timesketch.sh`
### Admin And Setup Scripts
- `scripts/proxy/get_proxy.sh` | goal: print current proxy settings | usage: `scripts/proxy/get_proxy.sh`
- `scripts/proxy/update_apt_proxy.sh` | goal: write apt proxy configuration | usage: `scripts/proxy/update_apt_proxy.sh host port`
- `scripts/proxy/update_bashrc_proxy.sh` | goal: add shell proxy exports to a bash config | usage: `scripts/proxy/update_bashrc_proxy.sh host port`
- `scripts/proxy/update_service_proxy.sh` | goal: apply proxy settings to service units | usage: `scripts/proxy/update_service_proxy.sh service`
- `scripts/display/3_screen_setup.sh` | goal: apply a fixed three-monitor `xrandr` layout | usage: `scripts/display/3_screen_setup.sh`
- `scripts/display/notebook_extended.sh` | goal: apply a laptop-plus-external-display layout | usage: `scripts/display/notebook_extended.sh`
- `scripts/display/reset_screens.sh` | goal: reset screen outputs to a known state | usage: `scripts/display/reset_screens.sh`
- `scripts/display/single_fullhd.sh` | goal: force a single full-HD laptop display mode | usage: `scripts/display/single_fullhd.sh`
- `scripts/display/toggle_display.sh` | goal: toggle an external display workflow | usage: `scripts/display/toggle_display.sh`
- `scripts/display/toggle_touchpad` | goal: toggle touchpad state on or off | usage: `scripts/display/toggle_touchpad`
- `scripts/setup/automountctl` | goal: manage automount-related setup | usage: `scripts/setup/automountctl`
- `scripts/setup/disable_ubuntu_telemetry.sh` | goal: disable Ubuntu telemetry packages and endpoints | usage: `sudo scripts/setup/disable_ubuntu_telemetry.sh`
- `scripts/setup/mount_container` | goal: mount or unmount LUKS container files listed in `.containers` manifests | usage: `scripts/setup/mount_container mount`
- `scripts/setup/share.sh` | goal: run a local sharing workflow | usage: `scripts/setup/share.sh`
- `scripts/setup/terminal-logs.sh` | goal: configure or collect terminal logging | usage: `scripts/setup/terminal-logs.sh`
- `scripts/windows/Get-ZimmermanTools.ps1` | goal: download Zimmerman forensic tools on Windows | usage: `powershell -File scripts/windows/Get-ZimmermanTools.ps1`
- `scripts/windows/getscreen.psm1` | goal: provide PowerShell screen-capture helpers | usage: `Import-Module scripts/windows/getscreen.psm1`
- `scripts/windows/sbom.ps1` | goal: generate or inspect SBOM-related data in PowerShell | usage: `powershell -File scripts/windows/sbom.ps1`
### Config And Reference Entry Points
- `config/visidata/` | goal: install and use the local VisiData config plus plugins | usage: `cd config/visidata && ./install.sh --link`
- `config/install.sh` | goal: bootstrap local environment configuration | usage: `config/install.sh`
- `config/z.sh` | goal: provide a shell directory-jump helper | usage: `source config/z.sh`
- `config/shell/completions/eslogger.zsh` | goal: add Zsh completion for Apple's `eslogger` | usage: `source config/shell/completions/eslogger.zsh`
### Archived Or Narrow Tools
- `archive/experimental/ctf_primefac.py` | goal: factor a hard-coded challenge integer with `primefac` | usage: `python3 archive/experimental/ctf_primefac.py`
- `archive/experimental/screen2.js` | goal: capture a specific webpage screenshot with PhantomJS | usage: `phantomjs archive/experimental/screen2.js`
- `archive/experimental/screenshot.js` | goal: capture screenshots for multiple URLs with PhantomJS | usage: `phantomjs archive/experimental/screenshot.js URL ...`
- `archive/experimental/usbreset.c` | goal: reset a USB device from Linux userspace | usage: `gcc archive/experimental/usbreset.c -o usbreset`
- `archive/experimental/sep_test.sh` | goal: binary-search Docker image tags for a malware detection change | usage: `archive/experimental/sep_test.sh image start_tag end_tag`
- `archive/experimental/flm.py` | goal: keep an old experimental script available for salvage | usage: `python3 archive/experimental/flm.py`
- `archive/experimental/fuzz.sh` | goal: keep an old shell fuzzing experiment available for salvage | usage: `archive/experimental/fuzz.sh`
- `archive/experimental/hydrogentest.py` | goal: keep an old experiment available for salvage | usage: `python3 archive/experimental/hydrogentest.py`
- `archive/experimental/kv.py` | goal: keep an old key-value parsing experiment available for salvage | usage: `python3 archive/experimental/kv.py`
- `archive/experimental/lpic.sh` | goal: keep an old system experiment available for salvage | usage: `archive/experimental/lpic.sh`
- `archive/experimental/matplottest.py` | goal: keep an old plotting experiment available for salvage | usage: `python3 archive/experimental/matplottest.py`
- `archive/experimental/rootshell.c` | goal: keep a dangerous historical C example archived rather than active | usage: `do not run; reference only`
## `tools/`: Standalone Utilities ## `tools/`: Standalone Utilities
### Security ### Security
@@ -159,4 +313,4 @@ Applied to the current tree, the remaining rough edges are:
- If you want a more complete tool with tests or a build flow, check `projects/`. - If you want a more complete tool with tests or a build flow, check `projects/`.
- If you want local environment setup or terminal customizations, check `config/`. - If you want local environment setup or terminal customizations, check `config/`.
- If you want containers, check `dockerfiles/`. - If you want containers, check `dockerfiles/`.
- If you are unsure, run `./what -l` or search through `.what_db.json`. - If you are unsure, run `./what -l` or ask `./what "query"`.

670
what
View File

@@ -1,423 +1,303 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
'what' - Smart repository search tool with progressive enhancement `what` - README-driven repository search using Ollama only.
Fallback hierarchy:
1. Ollama + Gemma2 (natural language search)
2. fzf (fuzzy finding)
3. grep (simple text search)
Usage: Usage:
what <query> # Find tools matching query what <query> # Find tools matching a natural-language query
what -h # Show help what -l # List catalogued tools
what -l # List all tools with short descriptions what --model <model> ... # Override the default Ollama model
what -a <filepath> # Add new file to database
""" """
import os from __future__ import annotations
import sys
import json
import argparse import argparse
import subprocess import json
import shutil import os
from pathlib import Path
import re import re
import subprocess
import sys
from pathlib import Path
# Configuration REPO_ROOT = Path(__file__).parent.resolve()
REPO_ROOT = Path(__file__).parent.absolute() README_PATH = REPO_ROOT / "README.md"
DB_FILE = REPO_ROOT / ".what_db.json" DEFAULT_MODEL = os.environ.get("WHAT_OLLAMA_MODEL", "ministral-3:3b")
CATALOG_HEADING = "## Tool Catalog"
ENTRY_RE = re.compile(
r"^- `([^`]+)` \| goal: (.*?) \| usage: (.*)$"
)
TOKEN_RE = re.compile(r"[a-z0-9_.+-]+")
class WhatTool:
def __init__(self):
self.db_path = DB_FILE
self.data = self.load_db()
# Detect available tools
self.has_ollama = self.check_ollama()
self.has_fzf = shutil.which('fzf') is not None
def load_db(self):
"""Load the tool database"""
if self.db_path.exists():
try:
with open(self.db_path, 'r') as f:
return json.load(f)
except json.JSONDecodeError:
print(f"Warning: Corrupted database {self.db_path}, creating new one")
return {
"version": "1.0",
"tools": {}
}
def save_db(self):
"""Save the tool database"""
with open(self.db_path, 'w') as f:
json.dump(self.data, f, indent=2, sort_keys=True)
def check_ollama(self):
"""Check if ollama with gemma2 is available"""
try:
result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=5)
if result.returncode == 0:
# Check if gemma2 model is available
models = result.stdout.lower()
return 'gemma2' in models
except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError):
pass
return False
def get_file_type(self, filepath):
"""Determine file type"""
if not filepath.exists():
return "missing"
if filepath.is_dir():
return "directory"
# Check if executable
is_executable = os.access(filepath, os.X_OK)
# Check extension
suffix = filepath.suffix.lower()
if suffix == '.py':
return "python script" if is_executable else "python module"
elif suffix == '.sh':
return "shell script"
elif suffix == '.go':
return "go program"
elif suffix == '.js':
return "javascript"
elif suffix == '.ps1':
return "powershell script"
elif suffix == '.rs':
return "rust program"
elif suffix in ['.c', '.cpp']:
return "c/c++ source"
elif suffix == '.awk':
return "awk script"
elif not suffix and is_executable:
return "binary executable"
elif not suffix:
return "script"
else:
return f"{suffix[1:]} file"
def analyze_file_with_ollama(self, filepath):
"""Analyze file using Ollama Gemma2"""
try:
# Read file content (limit size for analysis)
content = ""
if filepath.stat().st_size > 50000: # Skip very large files
content = "[File too large for analysis]"
else:
try:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()[:10000] # First 10KB
except:
content = "[Binary or unreadable file]"
prompt = f"""
Analyze this code/script file and provide ONLY a JSON response with these fields:
Filename: {filepath.name} class WhatError(Exception):
File type: {self.get_file_type(filepath)} pass
Content preview:
{content[:2000]}
Respond with ONLY this JSON structure:
{{
"summary": "Brief 1-2 sentence summary of what this tool does and how it works",
"purpose": "What this tool is used for (e.g., 'Network analysis', 'File processing', 'Security scanning')",
"short_description": "Very short description for listings (e.g., 'like md5sum but for files inside tarballs')"
}}
"""
result = subprocess.run([ def load_readme() -> str:
'ollama', 'run', 'gemma2:2b', prompt if not README_PATH.exists():
], capture_output=True, text=True, timeout=30) raise WhatError(f"README not found at {README_PATH}")
return README_PATH.read_text(encoding="utf-8")
if result.returncode == 0:
# Extract JSON from response
response = result.stdout.strip()
# Try to find JSON in the response
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
return json.loads(json_match.group())
except (subprocess.TimeoutExpired, json.JSONDecodeError, Exception) as e:
print(f"Ollama analysis failed: {e}")
return None
def add_file_interactive(self, filepath):
"""Add file with interactive prompts"""
rel_path = str(filepath.relative_to(REPO_ROOT))
file_type = self.get_file_type(filepath)
print(f"\nAdding: {rel_path}")
print(f"Type: {file_type}")
print()
if self.has_ollama:
print("Analyzing with Ollama Gemma2...")
analysis = self.analyze_file_with_ollama(filepath)
if analysis:
print("AI Analysis complete. Review and edit if needed:")
summary = input(f"Summary [{analysis.get('summary', '')}]: ").strip()
purpose = input(f"Purpose [{analysis.get('purpose', '')}]: ").strip()
short_desc = input(f"Short description [{analysis.get('short_description', '')}]: ").strip()
# Use AI suggestions if user didn't provide alternatives
summary = summary or analysis.get('summary', '')
purpose = purpose or analysis.get('purpose', '')
short_desc = short_desc or analysis.get('short_description', '')
else:
print("AI analysis failed, using manual input:")
summary = input("Summary (what it does and how): ").strip()
purpose = input("Purpose (what it's used for): ").strip()
short_desc = input("Short description (for listings): ").strip()
else:
print("Manual input (Ollama not available):")
summary = input("Summary (what it does and how): ").strip()
purpose = input("Purpose (what it's used for): ").strip()
short_desc = input("Short description (for listings): ").strip()
# Store in database
self.data["tools"][rel_path] = {
"path": rel_path,
"name": filepath.name,
"type": file_type,
"summary": summary,
"purpose": purpose,
"short_description": short_desc,
"executable": os.access(filepath, os.X_OK)
}
self.save_db()
print(f"✓ Added {rel_path} to database")
def search_with_ollama(self, query):
"""Search using natural language with Ollama"""
try:
tools_info = []
for tool_data in self.data["tools"].values():
tools_info.append(f"{tool_data['name']}: {tool_data['summary']} (Purpose: {tool_data['purpose']})")
tools_text = "\n".join(tools_info)
prompt = f"""
Given this query: "{query}"
Find the most relevant tools from this list. Respond with ONLY the tool names (one per line) in order of relevance:
def extract_catalog(readme_text: str) -> list[dict[str, str]]:
    """Parse the `## Tool Catalog` section of the README.

    Scans line by line: everything before the catalog heading is skipped,
    the next `## ` heading terminates the catalog, and each bullet matching
    ENTRY_RE becomes a {"path", "goal", "usage"} dict.

    Raises:
        WhatError: when no catalog entries are found at all.
    """
    entries: list[dict[str, str]] = []
    inside = False
    for row in readme_text.splitlines():
        stripped = row.rstrip()
        if not inside:
            # Nothing counts until we have seen the catalog heading itself.
            inside = stripped == CATALOG_HEADING
            continue
        if stripped.startswith("## "):
            break  # next top-level section ends the catalog
        hit = ENTRY_RE.match(stripped)
        if hit is None:
            continue  # sub-headings, prose, blank lines
        tool_path, tool_goal, tool_usage = hit.groups()
        entries.append({
            "path": tool_path,
            "goal": tool_goal.strip(),
            "usage": tool_usage.strip(),
        })
    if not entries:
        raise WhatError(
            "No tool catalog entries found in README. "
            f"Expected entries under '{CATALOG_HEADING}'."
        )
    return entries
def ensure_ollama_available(model: str) -> None:
    """Verify the `ollama` CLI is installed, responsive, and has *model* pulled.

    Raises:
        WhatError: if ollama is missing from PATH, cannot be queried,
            exits non-zero, or does not list *model* locally.
    """
    if not shutil_which("ollama"):
        raise WhatError("`ollama` is not installed or not in PATH.")
    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError covers the binary vanishing between the PATH check and
        # the spawn (FileNotFoundError is not a SubprocessError).
        raise WhatError(f"Failed to talk to Ollama: {exc}") from exc
    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama is unavailable: {stderr}")
    # `ollama list` prints one model per line; a lowercase substring check
    # accepts both exact "name:tag" and bare-name queries.
    models = result.stdout.lower()
    if model.lower() not in models:
        raise WhatError(
            f"Model '{model}' is not available locally. "
            f"Pull it first with `ollama pull {model}`."
        )
def shutil_which(binary: str) -> str | None:
for directory in os.environ.get("PATH", "").split(os.pathsep):
candidate = Path(directory) / binary
if candidate.is_file() and os.access(candidate, os.X_OK):
return str(candidate)
return None
def build_prompt(query: str, entries: list[dict[str, str]]) -> str:
    """Render the tool-selection prompt sent to the model for *query*.

    Each catalog entry becomes one "- path | goal | usage" bullet; the model
    is instructed to reply with a strict JSON array of up to 8 objects,
    each carrying an exact catalog "path" and a short "reason".
    """
    catalog_lines = [
        f'- {entry["path"]} | goal: {entry["goal"]} | usage: {entry["usage"]}'
        for entry in entries
    ]
    catalog = "\n".join(catalog_lines)
    return f"""You are selecting tools from a repository catalog.
Use only the catalog below. Prefer direct matches. Use archived tools only if they clearly fit the request.
Return strict JSON only. The response must be a JSON array with up to 8 objects.
Each object must contain:
- "path": exact catalog path
- "reason": one short sentence
Do not invent paths. Do not include markdown.
Prefer the entry whose action best matches the query: compare beats hash for comparison queries, open beats convert for opening queries, and mount beats inspect for mount queries.

Query: {query}

Catalog:
{catalog}
"""
# NOTE(review): orphaned tail of the legacy `search_with_ollama` method —
# its `try:` and prompt construction sit further up in this file, fused
# with newer code by the merge. It shells out to a hard-coded gemma2:2b
# model and maps each returned tool name back onto a database entry.
result = subprocess.run([
'ollama', 'run', 'gemma2:2b', prompt
], capture_output=True, text=True, timeout=20)
if result.returncode == 0:
# Each non-blank stdout line is treated as one candidate tool name.
tool_names = [line.strip() for line in result.stdout.strip().split('\n') if line.strip()]
# Find matching tools in database
matches = []
for tool_name in tool_names[:10]: # Limit to top 10
# First database entry with an exactly matching name wins.
for tool_data in self.data["tools"].values():
if tool_data['name'] == tool_name:
matches.append(tool_data)
break
return matches
# Best-effort: any failure (missing binary, timeout, parse error) is
# reported and signalled with None so the caller can try another backend.
except Exception as e:
print(f"Ollama search failed: {e}")
return None
def search_with_fzf(self, query):
    """Search using fzf fuzzy finder"""
    try:
        # One line per tool: "name # description | path" — the trailing
        # path segment lets us map selections back to database entries.
        feed_lines = [
            f"{info['name']} # {info['short_description']} | {info['path']}"
            for info in self.data["tools"].values()
        ]
        proc = subprocess.run(
            ['fzf', '--filter', query, '--no-sort'],
            input="\n".join(feed_lines),
            capture_output=True,
            text=True,
        )
        if proc.returncode == 0:
            found = []
            for out_line in proc.stdout.strip().split('\n'):
                if ' | ' not in out_line:
                    continue
                tool_path = out_line.split(' | ')[-1]
                if tool_path in self.data["tools"]:
                    found.append(self.data["tools"][tool_path])
            return found
    except Exception as e:
        print(f"fzf search failed: {e}")
        return None
def search_with_grep(self, query):
    """Fallback search using grep-like functionality"""
    needle = query.lower()
    # Substring match across every descriptive field of each tool.
    hits = [
        info
        for info in self.data["tools"].values()
        if needle in f"{info['name']} {info['summary']} {info['purpose']} {info['short_description']}".lower()
    ]

    def relevance(info):
        # Weighted field hits: name matches dominate, purpose matters least.
        points = 0
        if needle in info['name'].lower():
            points += 10
        if needle in info['short_description'].lower():
            points += 5
        if needle in info['summary'].lower():
            points += 3
        if needle in info['purpose'].lower():
            points += 2
        return points

    hits.sort(key=relevance, reverse=True)
    return hits[:20]  # Limit results
def search(self, query):
    """Search using the best available method"""
    if not query:
        return []
    print(f"Searching for: {query}")
    # Ordered backends: each entry is (start banner, failure notice, callable).
    attempts = []
    if self.has_ollama:
        attempts.append((
            "Using Ollama Gemma2 for natural language search...",
            "Ollama search failed, falling back to fzf...",
            self.search_with_ollama,
        ))
    if self.has_fzf:
        attempts.append((
            "Using fzf for fuzzy search...",
            "fzf search failed, falling back to grep...",
            self.search_with_fzf,
        ))
    for banner, failure_notice, backend in attempts:
        print(banner)
        results = backend(query)
        if results is not None:
            return results
        print(failure_notice)
    # Last resort: plain substring matching always works.
    print("Using basic text search...")
    return self.search_with_grep(query)
def list_all_tools(self):
    """List all tools with short descriptions"""
    catalog = self.data["tools"]
    if not catalog:
        print("No tools in database. Use 'what -a <file>' to add tools.")
        return
    print("Available tools:")
    print()
    # Alphabetical listing with names padded to a common width.
    ordered = sorted(catalog.values(), key=lambda entry: entry['name'])
    width = max(len(entry['name']) for entry in ordered)
    for entry in ordered:
        # "*" marks tools whose file carries the executable bit.
        marker = "*" if entry.get('executable', False) else " "
        print(f"{marker}{entry['name'].ljust(width)} # {entry['short_description']}")
def show_search_results(self, results):
    """Display search results"""
    if not results:
        print("No tools found matching your query.")
        return
    print(f"\nFound {len(results)} tool(s):")
    print()
    rank = 1
    for tool in results:
        # "*" marks executables, mirroring the list view.
        marker = "*" if tool.get('executable', False) else " "
        print(f"{rank:2d}. {marker}{tool['name']}")
        print(f" Path: {tool['path']}")
        print(f" Type: {tool['type']}")
        print(f" Purpose: {tool['purpose']}")
        print(f" Summary: {tool['summary']}")
        print()
        rank += 1
def tokenize(text: str) -> set[str]:
    """Lower-case *text* and return the set of word tokens matched by TOKEN_RE."""
    return set(TOKEN_RE.findall(text.lower()))
def shortlist_entries(query: str, entries: list[dict[str, str]], limit: int = 28) -> list[dict[str, str]]:
    """Return up to *limit* catalog entries most relevant to *query*.

    Scoring per entry: 5 points per token shared with the query, 1 point
    per query token appearing as a substring, minus 1 for archived tools
    so active tools win ties. Entries scoring zero are dropped; if nothing
    scores, the first *limit* entries are returned unchanged.
    """
    query_tokens = tokenize(query)
    if not query_tokens:
        return entries[:limit]
    scored: list[tuple[int, dict[str, str]]] = []
    for entry in entries:
        haystack = f'{entry["path"]} {entry["goal"]} {entry["usage"]}'.lower()
        entry_tokens = tokenize(haystack)
        overlap = len(query_tokens & entry_tokens)
        substring_hits = sum(1 for token in query_tokens if token in haystack)
        archive_penalty = 1 if entry["path"].startswith("archive/") else 0
        score = overlap * 5 + substring_hits - archive_penalty
        scored.append((score, entry))
    scored.sort(key=lambda item: item[0], reverse=True)
    best = [entry for score, entry in scored if score > 0][:limit]
    return best or entries[:limit]
def extract_json_array(output: str) -> list[dict[str, str]]:
    """Locate the first JSON array of objects in *output* and normalize it.

    Falls back to parsing the whole string when no bracketed array is
    found. Non-dict elements and entries without a path are dropped;
    path/reason values are stringified and stripped.
    """
    found = re.search(r"\[\s*\{.*\}\s*\]", output, re.DOTALL)
    candidate = found.group(0) if found else output
    parsed = json.loads(candidate)
    if not isinstance(parsed, list):
        raise WhatError("Model output must be a JSON array.")
    cleaned: list[dict[str, str]] = []
    for element in parsed:
        if not isinstance(element, dict):
            continue
        tool_path = str(element.get("path", "")).strip()
        why = str(element.get("reason", "")).strip()
        if not tool_path:
            continue
        cleaned.append({"path": tool_path, "reason": why})
    return cleaned
def run_ollama_once(prompt: str, model: str) -> str:
    """Run *prompt* through `ollama run <model>` once and return stripped stdout.

    Raises:
        WhatError: if the process cannot be started, times out, or exits non-zero.
    """
    try:
        result = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True,
            timeout=60,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError (e.g. FileNotFoundError for a missing binary) is not a
        # SubprocessError; catching both keeps the error surfaced as WhatError.
        raise WhatError(f"Ollama run failed: {exc}") from exc
    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama run failed: {stderr}")
    return result.stdout.strip()
def run_ollama(prompt: str, model: str) -> list[dict[str, str]]:
    """Query the model once; on malformed JSON, ask it to repair its own reply.

    Raises:
        WhatError: if even the repaired output cannot be parsed as JSON.
    """
    initial = run_ollama_once(prompt, model)
    try:
        return extract_json_array(initial)
    except (json.JSONDecodeError, WhatError):
        # Single retry: feed the broken reply back with strict formatting rules.
        fixup = (
            "Rewrite the following response as strict JSON only.\n"
            'Target format: [{"path":"exact catalog path","reason":"short reason"}]\n'
            "Do not add markdown or commentary.\n\n"
            f"Response to repair:\n{initial}\n"
        )
        repaired = run_ollama_once(fixup, model)
        try:
            return extract_json_array(repaired)
        except (json.JSONDecodeError, WhatError) as exc:
            raise WhatError(
                "Model output was not valid JSON after repair. "
                f"Raw output was:\n{repaired}"
            ) from exc
def search(query: str, entries: list[dict[str, str]], model: str) -> list[dict[str, str]]:
    """Resolve *query* against the catalog via Ollama and return matched entries.

    Shortlists the catalog first to keep the prompt small, then filters the
    model's picks down to real, not-yet-seen catalog paths, attaching the
    model's reason to each returned entry.
    """
    ensure_ollama_available(model)
    shortlisted = shortlist_entries(query, entries)
    picks = run_ollama(build_prompt(query, shortlisted), model)
    by_path = {entry["path"]: entry for entry in entries}
    matched: list[dict[str, str]] = []
    accepted: set[str] = set()
    for pick in picks:
        candidate = pick["path"]
        # Drop hallucinated paths and duplicates.
        if candidate not in by_path or candidate in accepted:
            continue
        accepted.add(candidate)
        hit = dict(by_path[candidate])
        hit["reason"] = pick.get("reason", "")
        matched.append(hit)
    return matched
def list_entries(entries: list[dict[str, str]]) -> None:
    """Print each catalog entry as a path followed by its goal and usage lines."""
    for item in entries:
        print(item["path"])
        print(f' goal: {item["goal"]}')
        print(f' usage: {item["usage"]}')
def show_results(query: str, results: list[dict[str, str]], model: str) -> None:
    """Pretty-print ranked results, or a miss message when nothing matched."""
    if not results:
        print(f"No catalogued tool matched: {query}")
        return
    print(f"Model: {model}")
    print(f"Query: {query}")
    print()
    position = 1
    for item in results:
        print(f"{position}. {item['path']}")
        print(f" Goal: {item['goal']}")
        print(f" Usage: {item['usage']}")
        # Reason is optional — only present when the model supplied one.
        if item.get("reason"):
            print(f" Why: {item['reason']}")
        print()
        position += 1
def main() -> int:
    """CLI entry point: list the catalog, or run a natural-language search.

    Returns a process exit code: 0 on success (including help/list paths),
    1 when the README catalog or the Ollama search fails.
    """
    parser = argparse.ArgumentParser(description="README-driven repository search using Ollama")
    parser.add_argument("query", nargs="?", help="Natural-language search query")
    parser.add_argument("-l", "--list", action="store_true", help="List catalogued tools")
    parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Ollama model to use (default: {DEFAULT_MODEL})")
    args = parser.parse_args()

    # The README is the single source of truth for the tool catalog.
    try:
        entries = extract_catalog(load_readme())
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    if args.list:
        list_entries(entries)
        return 0

    if not args.query:
        parser.print_help()
        print()
        print(f"Catalog source: {README_PATH}")
        print(f"Default model: {args.model}")
        return 0

    try:
        results = search(args.query, entries, args.model)
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    show_results(args.query, results, args.model)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())