#!/usr/bin/env python3
"""
`what` - README-driven repository search using Ollama only.

The tool reads the "## Tool Catalog" section of the repository README,
shortlists the entries that lexically overlap with the query, and asks a
local Ollama model to pick the best matches from that shortlist.

Usage:
    what QUERY               # Find tools matching a natural-language query
    what -l                  # List catalogued tools
    what --model NAME QUERY  # Override the default Ollama model
"""

from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).parent.resolve()
README_PATH = REPO_ROOT / "README.md"
DEFAULT_MODEL = os.environ.get("WHAT_OLLAMA_MODEL", "ministral-3:3b")
CATALOG_HEADING = "## Tool Catalog"
# One catalog bullet: - `path` | goal: ... | usage: ...
ENTRY_RE = re.compile(r"^- `([^`]+)` \| goal: (.*?) \| usage: (.*)$")
TOKEN_RE = re.compile(r"[a-z0-9_.+-]+")


class WhatError(Exception):
    """User-facing failure; `main` prints it and exits with status 1."""


def load_readme() -> str:
    """Return the text of the repository README.

    Raises:
        WhatError: if the README is missing, unreadable, or not valid UTF-8.
    """
    # EAFP: read directly instead of exists()-then-read, so a file that
    # disappears or cannot be read still yields a clean WhatError.
    try:
        return README_PATH.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise WhatError(f"README not found at {README_PATH}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise WhatError(f"Could not read README at {README_PATH}: {exc}") from exc


def extract_catalog(readme_text: str) -> list[dict[str, str]]:
    """Parse the tool catalog section out of README text.

    Only lines between the `CATALOG_HEADING` line and the next "## " heading
    are considered; lines there that do not match `ENTRY_RE` are ignored.

    Returns:
        One dict per entry with the keys "path", "goal" and "usage".

    Raises:
        WhatError: if no entry is found.
    """
    in_catalog = False
    entries: list[dict[str, str]] = []

    for raw_line in readme_text.splitlines():
        line = raw_line.rstrip()
        if line == CATALOG_HEADING:
            in_catalog = True
            continue
        if not in_catalog:
            continue
        if line.startswith("## "):
            # The next second-level heading closes the catalog section.
            break

        match = ENTRY_RE.match(line)
        if not match:
            continue
        path, goal, usage = match.groups()
        entries.append(
            {
                "path": path,
                "goal": goal.strip(),
                "usage": usage.strip(),
            }
        )

    if not entries:
        raise WhatError(
            "No tool catalog entries found in README. "
            f"Expected entries under '{CATALOG_HEADING}'."
        )
    return entries


def _canonical_model_name(model: str) -> str:
    """Lower-case a model name and add the implicit ":latest" tag if missing."""
    name = model.strip().lower()
    # Only the last path segment can carry a tag; a colon earlier on may be
    # a registry port (e.g. "localhost:5000/model").
    last_segment = name.rsplit("/", 1)[-1]
    return name if ":" in last_segment else f"{name}:latest"


def _installed_models(listing: str) -> set[str]:
    """Return the lower-cased NAME column of `ollama list` output."""
    names: set[str] = set()
    for line in listing.splitlines():
        fields = line.split()
        if not fields:
            continue
        name = fields[0].lower()
        if name == "name":
            # Header row; a real model is always listed with a ":tag".
            continue
        names.add(name)
    return names


def ensure_ollama_available(model: str) -> None:
    """Check that the `ollama` CLI works and that `model` is installed.

    Raises:
        WhatError: if `ollama` is not on PATH, `ollama list` fails, or the
            model is not in the list of locally installed models.
    """
    if not shutil_which("ollama"):
        raise WhatError("`ollama` is not installed or not in PATH.")

    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError covers launch failures that are not SubprocessError.
        raise WhatError(f"Failed to talk to Ollama: {exc}") from exc

    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama is unavailable: {stderr}")

    # Compare against the NAME column only. A substring test on the whole
    # table would accept fragments such as "b" or the header word "name".
    if _canonical_model_name(model) not in _installed_models(result.stdout):
        raise WhatError(
            f"Model '{model}' is not available locally. "
            "Pull it first with `ollama pull ...`."
        )


def shutil_which(binary: str) -> str | None:
    """Return the full path of `binary` found on PATH, or None.

    Thin wrapper kept for backward compatibility; the lookup itself is done
    by `shutil.which`.
    """
    return shutil.which(binary)


def build_prompt(query: str, entries: list[dict[str, str]]) -> str:
    """Build the selection prompt for `query` over the given catalog entries."""
    catalog = "\n".join(
        f'- {entry["path"]} | goal: {entry["goal"]} | usage: {entry["usage"]}'
        for entry in entries
    )
    prompt_lines = [
        "You are selecting tools from a repository catalog.",
        "Use only the catalog below.",
        "Prefer direct matches. Use archived tools only if they clearly fit the request.",
        "",
        "Return strict JSON only. The response must be a JSON array with up to 8 objects.",
        "Each object must contain:",
        '- "path": exact catalog path',
        '- "reason": one short sentence',
        "",
        "Do not invent paths. Do not include markdown.",
        "Prefer the entry whose action best matches the query: compare beats hash "
        "for comparison queries, open beats convert for opening queries, and mount "
        "beats inspect for mount queries.",
        "",
        f"Query: {query}",
        "",
        "Catalog:",
        catalog,
        "",
    ]
    return "\n".join(prompt_lines)


def tokenize(text: str) -> set[str]:
    """Return the set of lower-cased `TOKEN_RE` tokens found in `text`."""
    return set(TOKEN_RE.findall(text.lower()))


def shortlist_entries(
    query: str,
    entries: list[dict[str, str]],
    limit: int = 28,
) -> list[dict[str, str]]:
    """Pick at most `limit` entries that lexically overlap with `query`.

    Each entry scores 5 per shared token, plus 1 per query token that occurs
    as a substring, minus 1 if its path starts with "archive/". Entries with
    a positive score are returned best-first. If the query has no tokens or
    nothing scores above zero, the first `limit` entries are returned.
    """
    query_tokens = tokenize(query)
    if not query_tokens:
        return entries[:limit]

    scored: list[tuple[int, dict[str, str]]] = []
    for entry in entries:
        haystack = f'{entry["path"]} {entry["goal"]} {entry["usage"]}'.lower()
        overlap = len(query_tokens & tokenize(haystack))
        substring_hits = sum(1 for token in query_tokens if token in haystack)
        archive_penalty = 1 if entry["path"].startswith("archive/") else 0
        scored.append((overlap * 5 + substring_hits - archive_penalty, entry))

    # The sort is stable, so equally scored entries keep catalog order.
    scored.sort(key=lambda item: item[0], reverse=True)
    best = [entry for score, entry in scored if score > 0][:limit]
    return best or entries[:limit]


def _locate_json_array(text: str) -> list[object]:
    """Find and decode a JSON array embedded anywhere in `text`.

    The first array containing at least one object is preferred, so a stray
    "[1]" in surrounding prose does not shadow the real answer. Otherwise the
    first decodable array is returned.

    Raises:
        json.JSONDecodeError: if `text` has no array and is not valid JSON.
        WhatError: if `text` is valid JSON but contains no array.
    """
    decoder = json.JSONDecoder()
    fallback: list[object] | None = None

    position = text.find("[")
    while position != -1:
        try:
            value, end = decoder.raw_decode(text, position)
        except json.JSONDecodeError:
            position = text.find("[", position + 1)
            continue
        if any(isinstance(item, dict) for item in value):
            return value
        if fallback is None:
            fallback = value
        # Resume after this array so its nested brackets are not rescanned.
        position = text.find("[", end)

    if fallback is not None:
        return fallback

    # No array anywhere: let json.loads raise for malformed output, and
    # report well-formed non-array JSON as a WhatError.
    json.loads(text)
    raise WhatError("Model output must be a JSON array.")


def extract_json_array(output: str) -> list[dict[str, str]]:
    """Extract the model's answer from raw output.

    Tolerates markdown fences and surrounding prose. Items that are not
    objects, or that have a missing, null or empty "path", are dropped.

    Returns:
        Dicts with stripped string values under "path" and "reason".

    Raises:
        json.JSONDecodeError: if no JSON can be decoded from `output`.
        WhatError: if the decoded JSON contains no array.
    """
    normalized: list[dict[str, str]] = []
    for item in _locate_json_array(output):
        if not isinstance(item, dict):
            continue
        raw_path = item.get("path")
        raw_reason = item.get("reason")
        # A JSON null must not become the literal string "None".
        path = "" if raw_path is None else str(raw_path).strip()
        reason = "" if raw_reason is None else str(raw_reason).strip()
        if path:
            normalized.append({"path": path, "reason": reason})
    return normalized


def run_ollama_once(prompt: str, model: str) -> str:
    """Run `ollama run MODEL PROMPT` once and return its stripped stdout.

    Raises:
        WhatError: if the process cannot be started, times out after 60
            seconds, or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True,
            timeout=60,
            check=False,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        raise WhatError(f"Ollama run failed: {exc}") from exc

    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama run failed: {stderr}")
    return result.stdout.strip()


def run_ollama(prompt: str, model: str) -> list[dict[str, str]]:
    """Query the model and parse its answer, with one repair round.

    If the first response cannot be parsed, the model is asked once to
    rewrite that response as strict JSON.

    Raises:
        WhatError: if Ollama fails or the repaired output is still invalid.
    """
    first_output = run_ollama_once(prompt, model)
    try:
        return extract_json_array(first_output)
    except (json.JSONDecodeError, WhatError):
        pass  # Fall through to the repair round below.

    repair_prompt = (
        "Rewrite the following response as strict JSON only.\n"
        'Target format: [{"path":"exact catalog path","reason":"short reason"}]\n'
        "Do not add markdown or commentary.\n\n"
        f"Response to repair:\n{first_output}\n"
    )
    repaired_output = run_ollama_once(repair_prompt, model)
    try:
        return extract_json_array(repaired_output)
    except (json.JSONDecodeError, WhatError) as exc:
        raise WhatError(
            "Model output was not valid JSON after repair. "
            f"Raw output was:\n{repaired_output}"
        ) from exc


def search(query: str, entries: list[dict[str, str]], model: str) -> list[dict[str, str]]:
    """Return the catalog entries the model selects for `query`.

    Paths that are not in the catalog, and repeated paths, are discarded.
    Each result is a copy of the catalog entry plus a "reason" key.

    Raises:
        WhatError: if Ollama is unavailable or its output cannot be parsed.
    """
    ensure_ollama_available(model)
    prompt_entries = shortlist_entries(query, entries)
    raw_results = run_ollama(build_prompt(query, prompt_entries), model)

    entry_map = {entry["path"]: entry for entry in entries}
    results: list[dict[str, str]] = []
    seen: set[str] = set()
    for item in raw_results:
        path = item["path"]
        if path not in entry_map or path in seen:
            continue
        seen.add(path)
        merged = dict(entry_map[path])
        merged["reason"] = item.get("reason", "")
        results.append(merged)
    return results


def list_entries(entries: list[dict[str, str]]) -> None:
    """Print every catalog entry with its goal and usage."""
    for entry in entries:
        print(f'{entry["path"]}')
        print(f' goal: {entry["goal"]}')
        print(f' usage: {entry["usage"]}')


def show_results(query: str, results: list[dict[str, str]], model: str) -> None:
    """Print the search results, or a no-match message if there are none."""
    if not results:
        print(f"No catalogued tool matched: {query}")
        return

    print(f"Model: {model}")
    print(f"Query: {query}")
    print()
    for idx, item in enumerate(results, 1):
        print(f"{idx}. {item['path']}")
        print(f" Goal: {item['goal']}")
        print(f" Usage: {item['usage']}")
        if item.get("reason"):
            print(f" Why: {item['reason']}")
        print()


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Args:
        argv: argument list to parse; None means use `sys.argv[1:]`.

    Returns:
        Process exit status: 0 on success, 1 if a `WhatError` occurred.
    """
    parser = argparse.ArgumentParser(
        description="README-driven repository search using Ollama"
    )
    parser.add_argument("query", nargs="?", help="Natural-language search query")
    parser.add_argument("-l", "--list", action="store_true", help="List catalogued tools")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Ollama model to use (default: {DEFAULT_MODEL})",
    )
    args = parser.parse_args(argv)

    try:
        entries = extract_catalog(load_readme())
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    if args.list:
        list_entries(entries)
        return 0

    if not args.query:
        parser.print_help()
        print()
        print(f"Catalog source: {README_PATH}")
        print(f"Default model: {args.model}")
        return 0

    try:
        results = search(args.query, entries, args.model)
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    show_results(args.query, results, args.model)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())