gists/what

#!/usr/bin/env python3

"""
'what' - Smart repository search tool with progressive enhancement

Fallback hierarchy:
1. Ollama + Gemma2 (natural language search)
2. fzf (fuzzy finding)
3. grep (simple text search)

Usage:
    what <query>        # Find tools matching query
    what -h             # Show help
    what -l             # List all tools with short descriptions
    what -a <filepath>  # Add new file to database
"""

import os
import sys
import json
import argparse
import subprocess
import shutil
from pathlib import Path
import re

# Configuration
# The directory holding this script is treated as the repository root, and the
# tool database is a hidden JSON file stored directly inside it.
REPO_ROOT = Path(__file__).absolute().parent
DB_FILE = REPO_ROOT.joinpath(".what_db.json")

class WhatTool:
    def __init__(self):
        """Load the tool database and probe which search backends can be used."""
        self.db_path = DB_FILE
        self.data = self.load_db()

        # Backend detection runs once here; search() and main() read the flags.
        fzf_binary = shutil.which('fzf')
        self.has_ollama = self.check_ollama()
        self.has_fzf = fzf_binary is not None

    def load_db(self):
        """Load the tool database from ``self.db_path``.

        Returns:
            The parsed database dict. A fresh, empty database is returned when
            the file does not exist, is not valid JSON, or holds something
            other than a JSON object. A loaded database always has a dict under
            ``"tools"``, so callers can index it without further checks.
        """
        fresh = {
            "version": "1.0",
            "tools": {}
        }

        # Open first and handle "missing" as an exception: checking exists()
        # beforehand would leave a window in which the file can disappear.
        try:
            with open(self.db_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return fresh
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Warning: Corrupted database {self.db_path}, creating new one")
            return fresh

        # Syntactically valid JSON can still have the wrong shape (e.g. "[]").
        if not isinstance(loaded, dict):
            print(f"Warning: Corrupted database {self.db_path}, creating new one")
            return fresh

        if not isinstance(loaded.get("tools"), dict):
            loaded["tools"] = {}
        return loaded

    def save_db(self):
        """Write ``self.data`` to ``self.db_path`` as indented, key-sorted JSON.

        The document is serialized in memory first and written to a temporary
        sibling file that then replaces the database. A value that cannot be
        encoded, or a write that is interrupted, therefore never leaves a
        truncated database behind.

        Raises:
            TypeError: If ``self.data`` contains a value JSON cannot encode.
            OSError: If the file cannot be written or replaced.
        """
        # Serialize before touching the disk so encoding errors are harmless.
        payload = json.dumps(self.data, indent=2, sort_keys=True)

        target = Path(self.db_path)
        scratch = target.with_name(target.name + ".tmp")
        try:
            with open(scratch, 'w', encoding='utf-8') as f:
                f.write(payload)
            os.replace(scratch, target)
        except OSError:
            # Best-effort cleanup of the partial scratch file; the original
            # error is the one worth reporting, so it is re-raised below.
            try:
                os.unlink(scratch)
            except OSError:
                pass
            raise

    def check_ollama(self):
        """Report whether Ollama is runnable and has the ``gemma2:2b`` model.

        The analysis and search methods of this class invoke
        ``ollama run gemma2:2b``, so that exact model name must appear in the
        first column of ``ollama list``; another gemma2 variant is not enough.

        Returns:
            ``True`` when ``ollama list`` succeeds and lists ``gemma2:2b``;
            ``False`` when Ollama is missing, not executable, times out, exits
            non-zero, or does not list the model.
        """
        model = 'gemma2:2b'
        try:
            result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, timeout=5)
        except (OSError, ValueError, subprocess.SubprocessError):
            # OSError: binary missing or not executable; ValueError: output
            # could not be decoded; SubprocessError: includes the timeout.
            return False

        if result.returncode != 0:
            return False

        # Compare against the first whitespace-separated token of each line.
        # The header row is harmless because it never equals a model name.
        listed = {
            line.split()[0].lower()
            for line in result.stdout.splitlines()
            if line.strip()
        }
        return model in listed

    def get_file_type(self, filepath):
        """Classify *filepath* into a short, human-readable type label.

        Args:
            filepath: ``pathlib.Path`` to inspect.

        Returns:
            ``"missing"`` or ``"directory"`` for those cases; otherwise a label
            chosen from the lower-cased file extension, with the execute
            permission deciding between the two variants for ``.py`` files and
            for files without an extension. Unknown extensions yield
            ``"<ext> file"``.
        """
        if not filepath.exists():
            return "missing"
        if filepath.is_dir():
            return "directory"

        runnable = os.access(filepath, os.X_OK)
        ext = filepath.suffix.lower()

        # The two cases whose label depends on the execute bit.
        if not ext:
            return "binary executable" if runnable else "script"
        if ext == '.py':
            return "python script" if runnable else "python module"

        labels = {
            '.sh': "shell script",
            '.go': "go program",
            '.js': "javascript",
            '.ps1': "powershell script",
            '.rs': "rust program",
            '.c': "c/c++ source",
            '.cpp': "c/c++ source",
            '.awk': "awk script",
        }
        return labels.get(ext, f"{ext[1:]} file")

    def analyze_file_with_ollama(self, filepath):
        """Ask the Ollama ``gemma2:2b`` model to describe *filepath*.

        Args:
            filepath: ``pathlib.Path`` of the file to analyze.

        Returns:
            The JSON object parsed from the model's reply (the prompt requests
            the keys ``summary``, ``purpose`` and ``short_description``), or
            ``None`` when the file cannot be inspected, Ollama cannot be run,
            fails or times out, or the reply contains no parseable JSON object.
        """
        preview_chars = 2000  # how much of the file is embedded in the prompt

        try:
            if filepath.stat().st_size > 50000:  # Skip very large files
                content = "[File too large for analysis]"
            else:
                try:
                    # Only the preview is needed, so read no more than that.
                    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read(preview_chars)
                except OSError:
                    content = "[Binary or unreadable file]"

            # The prompt travels as a process argument, which cannot contain
            # NUL characters; binary-ish files would otherwise abort the call.
            content = content.replace('\x00', '')

            prompt = f"""
Analyze this code/script file and provide ONLY a JSON response with these fields:

Filename: {filepath.name}
File type: {self.get_file_type(filepath)}
Content preview:
{content}

Respond with ONLY this JSON structure:
{{
    "summary": "Brief 1-2 sentence summary of what this tool does and how it works",
    "purpose": "What this tool is used for (e.g., 'Network analysis', 'File processing', 'Security scanning')",
    "short_description": "Very short description for listings (e.g., 'like md5sum but for files inside tarballs')"
}}
"""

            result = subprocess.run([
                'ollama', 'run', 'gemma2:2b', prompt
            ], capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                # The model may wrap the JSON in chatter or code fences, so
                # take everything from the first "{" to the last "}".
                json_match = re.search(r'\{.*\}', result.stdout, re.DOTALL)
                if json_match:
                    return json.loads(json_match.group())

        except (OSError, ValueError, subprocess.SubprocessError) as e:
            # OSError: stat/exec problems; ValueError: undecodable output or
            # invalid JSON (JSONDecodeError is a ValueError); SubprocessError:
            # includes the timeout.
            print(f"Ollama analysis failed: {e}")

        return None

    def add_file_interactive(self, filepath):
        """Add *filepath* to the database, prompting for its descriptions.

        When Ollama is available its analysis is offered as the default answer
        for each prompt (an empty reply accepts the suggestion). Otherwise, or
        when the analysis fails, the three fields are typed in by hand. The
        record is stored under the path relative to ``REPO_ROOT`` and the
        database is saved immediately.

        Args:
            filepath: ``pathlib.Path`` of a file inside the repository.

        Raises:
            ValueError: If *filepath* is not located under ``REPO_ROOT``.
        """
        try:
            rel_path = str(filepath.relative_to(REPO_ROOT))
        except ValueError:
            # A relative path, or one spelled through a symlink, is not
            # lexically under REPO_ROOT; compare the resolved locations.
            rel_path = str(filepath.resolve().relative_to(REPO_ROOT.resolve()))
        file_type = self.get_file_type(filepath)

        print(f"\nAdding: {rel_path}")
        print(f"Type: {file_type}")
        print()

        def ask_manually():
            """Prompt for the three description fields without suggestions."""
            return (
                input("Summary (what it does and how): ").strip(),
                input("Purpose (what it's used for): ").strip(),
                input("Short description (for listings): ").strip(),
            )

        analysis = None
        if self.has_ollama:
            print("Analyzing with Ollama Gemma2...")
            analysis = self.analyze_file_with_ollama(filepath)

        if analysis:
            print("AI Analysis complete. Review and edit if needed:")
            # Coerce suggestions to text: the model may answer with null or a
            # non-string value, and the searches call string methods on these
            # fields.
            suggested = {
                field: str(analysis.get(field) or '')
                for field in ('summary', 'purpose', 'short_description')
            }
            # An empty answer keeps the AI suggestion.
            summary = input(f"Summary [{suggested['summary']}]: ").strip() or suggested['summary']
            purpose = input(f"Purpose [{suggested['purpose']}]: ").strip() or suggested['purpose']
            short_desc = (
                input(f"Short description [{suggested['short_description']}]: ").strip()
                or suggested['short_description']
            )
        elif self.has_ollama:
            print("AI analysis failed, using manual input:")
            summary, purpose, short_desc = ask_manually()
        else:
            print("Manual input (Ollama not available):")
            summary, purpose, short_desc = ask_manually()

        # Store in database
        self.data["tools"][rel_path] = {
            "path": rel_path,
            "name": filepath.name,
            "type": file_type,
            "summary": summary,
            "purpose": purpose,
            "short_description": short_desc,
            "executable": os.access(filepath, os.X_OK)
        }

        self.save_db()
        print(f"✓ Added {rel_path} to database")

    def search_with_ollama(self, query):
        """Rank database tools against *query* with the Ollama ``gemma2:2b`` model.

        The model is shown every tool's name, summary and purpose and asked to
        reply with tool names, one per line. Each reply line is matched back to
        a database name, first verbatim and then after removing common chat
        decoration (list markers, Markdown emphasis or backticks, a trailing
        ``: explanation``).

        Args:
            query: Natural-language search text.

        Returns:
            Up to 10 tool records in the order the model listed them, each at
            most once; an empty list when the database is empty or no reply
            line matched a tool; ``None`` when Ollama could not be run, exited
            non-zero, or anything else went wrong.
        """
        def name_candidates(line):
            """Yield progressively cleaned-up readings of one reply line."""
            raw = line.strip()
            # Verbatim first, so unusual names such as "2.py" still match.
            yield raw
            unlisted = re.sub(r'^(?:[-*\u2022]\s+|\d+[.)]\s+)', '', raw)
            decoration = ' \t`*"\''
            yield unlisted.strip(decoration)
            yield unlisted.split(':', 1)[0].strip(decoration)

        try:
            registry = self.data["tools"]
            if not registry:
                # Nothing to rank, so skip the slow model call entirely.
                return []

            # Several tools may share a file name (different directories).
            by_name = {}
            tools_info = []
            for key, tool_data in registry.items():
                by_name.setdefault(tool_data['name'], []).append((key, tool_data))
                tools_info.append(f"{tool_data['name']}: {tool_data['summary']} (Purpose: {tool_data['purpose']})")

            tools_text = "\n".join(tools_info)

            prompt = f"""
Given this query: "{query}"

Find the most relevant tools from this list. Respond with ONLY the tool names (one per line) in order of relevance:

{tools_text}

Query: {query}

Response (tool names only, one per line, max 10):
"""

            result = subprocess.run([
                'ollama', 'run', 'gemma2:2b', prompt
            ], capture_output=True, text=True, timeout=20)

            if result.returncode == 0:
                matches = []
                seen = set()
                for line in result.stdout.splitlines():
                    if not line.strip():
                        continue
                    hits = next(
                        (by_name[name] for name in name_candidates(line) if name in by_name),
                        (),
                    )
                    for key, tool_data in hits:
                        if key not in seen:  # the model may repeat itself
                            seen.add(key)
                            matches.append(tool_data)

                return matches[:10]  # Limit to top 10

        except Exception as e:
            # Deliberately broad: any failure here only means "use the next
            # search method", which the caller does when it receives None.
            print(f"Ollama search failed: {e}")

        return None

    def search_with_fzf(self, query):
        """Filter the database through ``fzf --filter`` and map hits to records.

        Every tool is rendered as ``"<name> # <short_description> | <path>"``;
        the text after the last ``" | "`` of each line fzf prints is used as
        the database key of the matching record.

        Args:
            query: Text handed to fzf as the filter expression.

        Returns:
            The matching tool records in fzf's output order, or ``None`` when
            fzf exits non-zero or any error occurs.
        """
        try:
            registry = self.data["tools"]

            # One line per tool; the trailing path lets us find the record again.
            fzf_input = "\n".join(
                f"{entry['name']} # {entry['short_description']} | {entry['path']}"
                for entry in registry.values()
            )

            proc = subprocess.run(
                ['fzf', '--filter', query, '--no-sort'],
                input=fzf_input, capture_output=True, text=True,
            )
            if proc.returncode != 0:
                return None

            found = []
            for row in proc.stdout.strip().split('\n'):
                if ' | ' not in row:
                    continue
                key = row.split(' | ')[-1]
                if key in registry:
                    found.append(registry[key])
            return found

        except Exception as e:
            print(f"fzf search failed: {e}")

        return None

    def search_with_grep(self, query):
        """Fallback search: case-insensitive term matching over the database.

        The query is split on whitespace and a tool matches when every term
        occurs somewhere in its name, summary, purpose or short description.
        Anything that contains the whole query as one phrase therefore still
        matches, and multi-word queries no longer need the words to be adjacent.

        Args:
            query: Search text.

        Returns:
            At most 20 tool records, best first. Each term scores 10 for a hit
            in the name, 5 in the short description, 3 in the summary and 2 in
            the purpose; ties keep database order.
        """
        needle = query.lower()
        # A query made only of whitespace has no terms; match it literally.
        terms = needle.split() or [needle]

        weights = (
            ('name', 10),
            ('short_description', 5),
            ('summary', 3),
            ('purpose', 2),
        )

        scored = []
        for tool_data in self.data["tools"].values():
            # Tolerate missing or non-string fields in hand-edited databases.
            fields = {
                field: str(tool_data.get(field) or '').lower()
                for field, _ in weights
            }
            haystack = " ".join(
                fields[field]
                for field in ('name', 'summary', 'purpose', 'short_description')
            )
            if not all(term in haystack for term in terms):
                continue

            score = sum(
                weight
                for field, weight in weights
                for term in terms
                if term in fields[field]
            )
            scored.append((score, tool_data))

        # The sort is stable, so equally scored tools stay in database order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [tool_data for _, tool_data in scored[:20]]  # Limit results

    def search(self, query):
        """Search the database with the best available backend.

        Backends are tried in order: Ollama, then fzf, then plain text search.
        A backend that fails or finds nothing hands over to the next one, so an
        unhelpful model reply does not hide matches a simpler search would find.

        Args:
            query: Search text; empty or whitespace-only queries match nothing.

        Returns:
            A list of tool records (possibly empty).
        """
        if not query or not query.strip():
            return []

        print(f"Searching for: {query}")

        # Try Ollama first
        if self.has_ollama:
            print("Using Ollama Gemma2 for natural language search...")
            results = self.search_with_ollama(query)
            if results:
                return results
            # Name the backend that will actually run next.
            next_backend = "fzf" if self.has_fzf else "grep"
            outcome = "failed" if results is None else "found nothing"
            print(f"Ollama search {outcome}, falling back to {next_backend}...")

        # Try fzf
        if self.has_fzf:
            print("Using fzf for fuzzy search...")
            results = self.search_with_fzf(query)
            if results:
                return results
            outcome = "failed" if results is None else "found nothing"
            print(f"fzf search {outcome}, falling back to grep...")

        # Fallback to grep
        print("Using basic text search...")
        return self.search_with_grep(query)

    def list_all_tools(self):
        """Print every tool, sorted by name, with its short description.

        Names are padded to a common width so the ``#`` separators line up, and
        a leading ``*`` marks tools recorded as executable. A hint is printed
        instead when the database is empty.
        """
        registry = self.data["tools"]
        if not registry:
            print("No tools in database. Use 'what -a <file>' to add tools.")
            return

        print("Available tools:")
        print()

        entries = sorted(registry.values(), key=lambda entry: entry['name'])

        # Column width is that of the longest name.
        width = max(len(entry['name']) for entry in entries)

        for entry in entries:
            marker = "*" if entry.get('executable', False) else " "
            print(f"{marker}{entry['name']:<{width}}  #  {entry['short_description']}")

    def show_search_results(self, results):
        """Print a numbered, multi-line report of *results*.

        Args:
            results: Sequence of tool records; a falsy value prints a
                "nothing found" message instead.
        """
        if not results:
            print("No tools found matching your query.")
            return

        print(f"\nFound {len(results)} tool(s):")
        print()

        detail_rows = (
            ("Path", 'path'),
            ("Type", 'type'),
            ("Purpose", 'purpose'),
            ("Summary", 'summary'),
        )

        for rank, entry in enumerate(results, start=1):
            # "*" flags tools recorded as executable.
            marker = "*" if entry.get('executable', False) else " "
            print(f"{rank:2d}. {marker}{entry['name']}")
            for label, field in detail_rows:
                print(f"    {label}: {entry[field]}")
            print()

def main():
    """Command-line entry point.

    Parses the arguments and dispatches to one of: listing all tools (``-l``),
    adding a file to the database (``-a PATH``), searching (``QUERY``), or
    printing the help together with the available search methods when no
    argument is given. Exits with status 1 when the file passed to ``-a`` does
    not exist or lies outside the repository.
    """
    parser = argparse.ArgumentParser(description="Smart repository search tool")
    parser.add_argument("query", nargs="?", help="Search query")
    parser.add_argument("-l", "--list", action="store_true",
                       help="List all tools with short descriptions")
    parser.add_argument("-a", "--add", metavar="PATH",
                       help="Add new file to database")

    args = parser.parse_args()

    tool = WhatTool()

    if args.list:
        tool.list_all_tools()
        return

    if args.add:
        filepath = Path(args.add)
        if not filepath.exists():
            print(f"Error: File {filepath} does not exist")
            sys.exit(1)

        # The argument is usually relative to the current directory, so it has
        # to be made absolute before it can be compared with REPO_ROOT. Try the
        # lexical form first (keeps symlinked files inside the repository
        # addable), then the fully resolved form (covers a repository that is
        # itself reached through a symlink).
        rel_path = None
        for normalise in (os.path.abspath, os.path.realpath):
            candidate = Path(normalise(filepath))
            root = Path(normalise(REPO_ROOT))
            if candidate.is_relative_to(root):
                rel_path = candidate.relative_to(root)
                break

        if rel_path is None:
            print(f"Error: File must be within the repository ({REPO_ROOT})")
            sys.exit(1)

        # Hand over a path that is literally under REPO_ROOT.
        tool.add_file_interactive(REPO_ROOT / rel_path)
        return

    if not args.query:
        parser.print_help()
        print()
        print("Available search methods:")
        if tool.has_ollama:
            print("  ✓ Ollama + Gemma2 (natural language)")
        else:
            print("  ✗ Ollama + Gemma2 (not available)")

        if tool.has_fzf:
            print("  ✓ fzf (fuzzy finding)")
        else:
            print("  ✗ fzf (not available)")

        print("  ✓ grep (basic text search)")
        return

    # Perform search
    results = tool.search(args.query)
    tool.show_search_results(results)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()