Remove the JSON tool database and move tool metadata into a compact README catalog. Make `what` README-driven and Ollama-only, with shortlist generation and JSON-repair retry handling. Pull qwen3.5:2b and ministral-3:3b, compare them on fixed repository queries, and set ministral-3:3b as the default model. Tighten README wording so that similar tools such as domgrep/geturls and sparsecmp/scatterhash rank correctly.
304 lines
9.1 KiB
Python
Executable File
304 lines
9.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
"""
|
|
`what` - README-driven repository search using Ollama only.
|
|
|
|
Usage:
|
|
what <query> # Find tools matching a natural-language query
|
|
what -l # List catalogued tools
|
|
what --model <model> ... # Override the default Ollama model
|
|
"""
|
|
|
|
from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
|
|
|
|
# Repository layout: this script lives at the repo root, next to README.md.
REPO_ROOT = Path(__file__).parent.resolve()

README_PATH = REPO_ROOT / "README.md"

# Default Ollama model; override via WHAT_OLLAMA_MODEL or the --model flag.
DEFAULT_MODEL = os.environ.get("WHAT_OLLAMA_MODEL", "ministral-3:3b")

# README section that holds the tool catalog; parsing stops at the next "## ".
CATALOG_HEADING = "## Tool Catalog"

# Catalog entry shape: - `path` | goal: ... | usage: ...
ENTRY_RE = re.compile(
    r"^- `([^`]+)` \| goal: (.*?) \| usage: (.*)$"
)

# Word-like tokens used for query/entry overlap scoring during shortlisting.
TOKEN_RE = re.compile(r"[a-z0-9_.+-]+")
|
|
|
|
|
|
class WhatError(Exception):
    """Base error for all `what` failures (catalog parsing, Ollama, JSON)."""

    pass
|
|
|
|
|
|
def load_readme() -> str:
    """Read the repository README as UTF-8 text.

    Raises:
        WhatError: when the README file does not exist.
    """
    if README_PATH.exists():
        return README_PATH.read_text(encoding="utf-8")
    raise WhatError(f"README not found at {README_PATH}")
|
|
|
|
|
|
def extract_catalog(readme_text: str) -> list[dict[str, str]]:
    """Parse the README's tool-catalog section into path/goal/usage dicts.

    Only lines between CATALOG_HEADING and the next "## " heading are
    considered; lines that do not match ENTRY_RE are silently skipped.

    Raises:
        WhatError: when no entries are found under the catalog heading.
    """
    inside = False
    catalog: list[dict[str, str]] = []

    for raw in readme_text.splitlines():
        text = raw.rstrip()

        if text == CATALOG_HEADING:
            inside = True
            continue

        if inside and text.startswith("## "):
            break  # next section begins; the catalog is done

        if not inside:
            continue

        parsed = ENTRY_RE.match(text)
        if parsed is None:
            continue

        tool_path, goal_text, usage_text = parsed.groups()
        catalog.append(
            {
                "path": tool_path,
                "goal": goal_text.strip(),
                "usage": usage_text.strip(),
            }
        )

    if not catalog:
        raise WhatError(
            "No tool catalog entries found in README. "
            f"Expected entries under '{CATALOG_HEADING}'."
        )

    return catalog
|
|
|
|
|
|
def ensure_ollama_available(model: str) -> None:
    """Verify the `ollama` CLI is on PATH, responds, and has *model* pulled.

    Raises:
        WhatError: if the binary is missing, the daemon is unreachable,
            or the requested model has not been pulled locally.
    """
    if not shutil_which("ollama"):
        raise WhatError("`ollama` is not installed or not in PATH.")

    try:
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
    except subprocess.SubprocessError as exc:
        raise WhatError(f"Failed to talk to Ollama: {exc}") from exc

    if result.returncode != 0:
        stderr = result.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama is unavailable: {stderr}")

    # Parse model names from the first column of `ollama list` rather than
    # substring-searching the whole stdout: a raw substring check can
    # false-positive (e.g. "llama3" is contained in "codellama3:latest",
    # and sizes/dates in the table could also match short queries).
    wanted = model.lower()
    available: set[str] = set()
    for line in result.stdout.splitlines():
        fields = line.split()
        if not fields or fields[0].lower() == "name":
            continue  # skip blank lines and the table header
        name = fields[0].lower()
        available.add(name)
        available.add(name.split(":", 1)[0])  # allow tag-less references

    if wanted not in available and wanted.split(":", 1)[0] not in available:
        raise WhatError(
            f"Model '{model}' is not available locally. "
            "Pull it first with `ollama pull ...`."
        )
|
|
|
|
|
|
def shutil_which(binary: str) -> str | None:
|
|
for directory in os.environ.get("PATH", "").split(os.pathsep):
|
|
candidate = Path(directory) / binary
|
|
if candidate.is_file() and os.access(candidate, os.X_OK):
|
|
return str(candidate)
|
|
return None
|
|
|
|
|
|
def build_prompt(query: str, entries: list[dict[str, str]]) -> str:
    """Render the model prompt: selection rules, the query, and the catalog."""
    rendered = "\n".join(
        f'- {item["path"]} | goal: {item["goal"]} | usage: {item["usage"]}'
        for item in entries
    )

    return f"""You are selecting tools from a repository catalog.
Use only the catalog below. Prefer direct matches. Use archived tools only if they clearly fit the request.

Return strict JSON only. The response must be a JSON array with up to 8 objects.
Each object must contain:
- "path": exact catalog path
- "reason": one short sentence

Do not invent paths. Do not include markdown.
Prefer the entry whose action best matches the query: compare beats hash for comparison queries, open beats convert for opening queries, and mount beats inspect for mount queries.

Query: {query}

Catalog:
{rendered}
"""
|
|
|
|
|
|
def tokenize(text: str) -> set[str]:
    """Lower-case *text* and split it into the set of word-like tokens.

    Tokens are maximal runs of [a-z0-9_.+-], the same pattern as the
    module-level TOKEN_RE.
    """
    lowered = text.lower()
    return {token for token in re.findall(r"[a-z0-9_.+-]+", lowered)}
|
|
|
|
|
|
def shortlist_entries(query: str, entries: list[dict[str, str]], limit: int = 28) -> list[dict[str, str]]:
    """Rank catalog entries against *query* and return at most *limit* of them.

    Scoring favors whole-token overlap (weight 5) over raw substring hits
    and nudges archived tools down by one point. Entries with a positive
    score win; if the query is empty or nothing scores, fall back to the
    first *limit* entries in catalog order.
    """
    wanted = tokenize(query)
    if not wanted:
        return entries[:limit]

    ranked: list[tuple[int, dict[str, str]]] = []
    for entry in entries:
        blob = f'{entry["path"]} {entry["goal"]} {entry["usage"]}'.lower()
        overlap_score = 5 * len(wanted & tokenize(blob))
        substring_score = sum(token in blob for token in wanted)
        penalty = int(entry["path"].startswith("archive/"))
        ranked.append((overlap_score + substring_score - penalty, entry))

    # Sort on the score only (dicts are not orderable); Python's sort is
    # stable, so ties keep catalog order.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    winners = [entry for points, entry in ranked if points > 0][:limit]
    return winners or entries[:limit]
|
|
|
|
|
|
def extract_json_array(output: str) -> list[dict[str, str]]:
    """Pull a JSON array of {"path", "reason"} objects out of model output.

    First tries to isolate an array-of-objects span inside surrounding
    chatter; otherwise parses the whole string. Non-dict items and items
    without a "path" are dropped; values are stringified and stripped.

    Raises:
        WhatError: if the parsed payload is not a JSON list.
        json.JSONDecodeError: if the payload is not valid JSON at all.
    """
    found = re.search(r"\[\s*\{.*\}\s*\]", output, re.DOTALL)
    candidate = found.group(0) if found else output

    parsed = json.loads(candidate)
    if not isinstance(parsed, list):
        raise WhatError("Model output must be a JSON array.")

    cleaned: list[dict[str, str]] = []
    for element in parsed:
        if not isinstance(element, dict):
            continue
        tool_path = str(element.get("path", "")).strip()
        why = str(element.get("reason", "")).strip()
        if tool_path:
            cleaned.append({"path": tool_path, "reason": why})
    return cleaned
|
|
|
|
|
|
def run_ollama_once(prompt: str, model: str) -> str:
    """Invoke `ollama run` a single time and return its stripped stdout.

    Raises:
        WhatError: on subprocess failure/timeout or a non-zero exit code.
    """
    try:
        completed = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True,
            timeout=60,
            check=False,
        )
    except subprocess.SubprocessError as exc:
        raise WhatError(f"Ollama run failed: {exc}") from exc

    if completed.returncode != 0:
        detail = completed.stderr.strip() or "unknown error"
        raise WhatError(f"Ollama run failed: {detail}")

    return completed.stdout.strip()
|
|
|
|
|
|
def run_ollama(prompt: str, model: str) -> list[dict[str, str]]:
    """Query the model, retrying once with a JSON-repair prompt on bad output.

    Raises:
        WhatError: if even the repaired response cannot be parsed as JSON.
    """
    attempt = run_ollama_once(prompt, model)
    try:
        return extract_json_array(attempt)
    except (json.JSONDecodeError, WhatError):
        # First response was not parseable: ask the model to reformat it.
        repair_prompt = (
            "Rewrite the following response as strict JSON only.\n"
            'Target format: [{"path":"exact catalog path","reason":"short reason"}]\n'
            "Do not add markdown or commentary.\n\n"
            f"Response to repair:\n{attempt}\n"
        )
        second = run_ollama_once(repair_prompt, model)
        try:
            return extract_json_array(second)
        except (json.JSONDecodeError, WhatError) as exc:
            raise WhatError(
                "Model output was not valid JSON after repair. "
                f"Raw output was:\n{second}"
            ) from exc
|
|
|
|
|
|
def search(query: str, entries: list[dict[str, str]], model: str) -> list[dict[str, str]]:
    """Resolve *query* against the catalog via Ollama and return merged hits.

    Each result is a copy of the catalog entry for a model-chosen path with
    the model's one-line "reason" attached. Paths the model invented and
    duplicates are dropped; model output order is preserved.
    """
    ensure_ollama_available(model)
    candidates = shortlist_entries(query, entries)
    picks = run_ollama(build_prompt(query, candidates), model)

    by_path = {entry["path"]: entry for entry in entries}
    chosen: list[dict[str, str]] = []
    already: set[str] = set()
    for pick in picks:
        tool_path = pick["path"]
        if tool_path in already or tool_path not in by_path:
            continue
        already.add(tool_path)
        hit = dict(by_path[tool_path])
        hit["reason"] = pick.get("reason", "")
        chosen.append(hit)
    return chosen
|
|
|
|
|
|
def list_entries(entries: list[dict[str, str]]) -> None:
    """Print every catalog entry: path line, then indented goal and usage."""
    for entry in entries:
        block = (
            entry["path"],
            f'  goal: {entry["goal"]}',
            f'  usage: {entry["usage"]}',
        )
        print("\n".join(block))
|
|
|
|
|
|
def show_results(query: str, results: list[dict[str, str]], model: str) -> None:
    """Pretty-print ranked results, or a no-match notice when empty."""
    if not results:
        print(f"No catalogued tool matched: {query}")
        return

    print(f"Model: {model}")
    print(f"Query: {query}")
    print()

    for position, hit in enumerate(results, 1):
        print(f"{position}. {hit['path']}")
        print(f"   Goal: {hit['goal']}")
        print(f"   Usage: {hit['usage']}")
        why = hit.get("reason")
        if why:
            print(f"   Why: {why}")
        print()
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point. Returns the process exit status."""
    arg_parser = argparse.ArgumentParser(description="README-driven repository search using Ollama")
    arg_parser.add_argument("query", nargs="?", help="Natural-language search query")
    arg_parser.add_argument("-l", "--list", action="store_true", help="List catalogued tools")
    arg_parser.add_argument("--model", default=DEFAULT_MODEL, help=f"Ollama model to use (default: {DEFAULT_MODEL})")
    options = arg_parser.parse_args()

    try:
        catalog = extract_catalog(load_readme())
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    if options.list:
        list_entries(catalog)
        return 0

    if not options.query:
        # No query given: show usage plus where the catalog and model come from.
        arg_parser.print_help()
        print()
        print(f"Catalog source: {README_PATH}")
        print(f"Default model: {options.model}")
        return 0

    try:
        matches = search(options.query, catalog, options.model)
    except WhatError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    show_results(options.query, matches, options.model)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell via SystemExit.
    raise SystemExit(main())
|