Files
gists/config/visidata/plugins/lookupcore.py
tobias 84d912ac0a visidata: add IOC types with cached, throttled lookups
Centralize provider caching and rate-limit handling, then add Domain/URL/Hash IOC types and safer VT/IPInfo key resolution so lookups stay reliable on free-tier APIs.
2026-02-21 23:20:42 +01:00

337 lines
8.8 KiB
Python

"""
Lookup/caching helpers shared across local plugins.
Depends on VisiData (`vd`) because options are stored in vd.options.
"""
from __future__ import annotations

import hashlib
import os
import pickle
import sqlite3
import threading
import time
from datetime import timezone
from email.utils import parsedate_to_datetime
from typing import Any, Callable, Dict, Optional

from visidata import vd
# ---------------------------------------------------------------------------
# Tunable options. All are global (sheettype=None) and prefixed "tke_".
# ---------------------------------------------------------------------------

# Cache storage and entry lifetimes.
vd.option(
    "tke_cache_db_path",
    os.path.expanduser("~/.visidata_cache.db"),
    "sqlite cache db path for local lookups (pickle-serialized)",
    sheettype=None,
)
vd.option(
    "tke_lookup_cache_ttl", 60 * 60 * 24, "lookup cache ttl in seconds", sheettype=None
)
# Failed lookups are cached too, with a short TTL, so a broken provider
# doesn't get hammered in a tight loop.
vd.option(
    "tke_lookup_error_ttl",
    5 * 60,
    "cache ttl in seconds for failed lookups (to avoid tight loops)",
    sheettype=None,
)
# HTTP behavior.
vd.option("tke_lookup_timeout", 10, "HTTP lookup timeout in seconds", sheettype=None)
vd.option(
    "tke_http_retries",
    1,
    "number of retries for transient HTTP failures",
    sheettype=None,
)
# Provider-specific minimum delay between requests (seconds).
vd.option(
    "tke_throttle_default_sec",
    0.0,
    "default min delay between HTTP requests",
    sheettype=None,
)
# NOTE(review): 16s spacing looks sized for VT's free tier (~4 req/min) — confirm.
vd.option(
    "tke_throttle_vt_sec",
    16.0,
    "min delay between VirusTotal API requests",
    sheettype=None,
)
vd.option(
    "tke_throttle_ipinfo_sec", 0.5, "min delay between ipinfo requests", sheettype=None
)
vd.option(
    "tke_throttle_ipapi_sec", 1.0, "min delay between ipapi.co requests", sheettype=None
)
vd.option(
    "tke_throttle_ipwho_sec", 0.5, "min delay between ipwho.is requests", sheettype=None
)
vd.option(
    "tke_throttle_rdap_sec", 1.0, "min delay between RDAP requests", sheettype=None
)
vd.option(
    "tke_throttle_mb_sec",
    1.0,
    "min delay between MalwareBazaar requests",
    sheettype=None,
)
# API keys/tokens (optional unless otherwise stated by the provider).
vd.option("tke_ipinfo_token", "", "ipinfo token (optional)", sheettype=None)
vd.option("tke_ipapi_key", "", "ipapi.co API key (optional)", sheettype=None)
vd.option(
    "tke_vt_api_key", "", "VirusTotal API key (required for VT lookups)", sheettype=None
)
vd.option(
    "tke_maxmind_mmdb_path",
    "",
    "path to GeoLite2/GeoIP2 .mmdb file for offline MaxMind lookups",
    sheettype=None,
)
def opt(name: str, default: Any = "") -> Any:
    """Read a VisiData option value, returning *default* on any failure."""
    try:
        value = getattr(vd.options, name)
    except Exception:
        return default
    return value
def cache_path() -> str:
    """Absolute path of the sqlite cache db, honoring the option override."""
    configured = opt("tke_cache_db_path", "")
    if not configured:
        configured = os.path.expanduser("~/.visidata_cache.db")
    return os.path.expanduser(str(configured))
def auth_tag(secret: str) -> str:
    """Short, non-reversible fingerprint of *secret*, safe to embed in cache keys."""
    if secret:
        digest = hashlib.sha256(secret.encode("utf-8")).hexdigest()
        return digest[:12]
    return "noauth"
def cache_ttl() -> int:
    """TTL in seconds for successful lookup cache entries (default: one day)."""
    one_day = 60 * 60 * 24
    try:
        return int(opt("tke_lookup_cache_ttl", one_day))
    except Exception:
        return one_day
def error_ttl() -> int:
    """TTL in seconds for cached lookup failures (default: five minutes)."""
    five_minutes = 5 * 60
    try:
        return int(opt("tke_lookup_error_ttl", five_minutes))
    except Exception:
        return five_minutes
def http_timeout() -> int:
    """HTTP request timeout in seconds (default: 10)."""
    fallback = 10
    try:
        return int(opt("tke_lookup_timeout", fallback))
    except Exception:
        return fallback
def sqlite_getset(
    key: str,
    fn: Callable[[], Any],
    *,
    max_age: Optional[int] = None,
    error_max_age: Optional[int] = None,
):
    """SQLite+pickle cache: return the cached value for *key* or compute+store it.

    `key` should NOT contain secrets; include `auth_tag()` instead.

    A cached value is returned while younger than `max_age` seconds; a cached
    None (a remembered failure) uses `error_max_age` instead when given. A
    TTL of None means "never expires". If the db can't be used for any
    reason, falls back to calling `fn()` directly (note: if `fn()` itself
    raises mid-transaction, it is retried once via that same fallback).
    """
    try:
        path = cache_path()
        parent = os.path.dirname(path)
        # makedirs("") raises FileNotFoundError; a bare filename (cwd-relative
        # db path) needs no directory created.
        if parent:
            os.makedirs(parent, exist_ok=True)
        conn = sqlite3.connect(path, timeout=2)
        # sqlite3's context manager only manages transactions, never closes
        # the connection -- close explicitly to avoid leaking handles.
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute(
                "CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value BLOB, timestamp INTEGER)"
            )
            cur = conn.cursor()
            cur.execute("SELECT value, timestamp FROM cache WHERE key=?", (key,))
            row = cur.fetchone()
            now = int(time.time())
            if row:
                val_blob, ts = row
                cached_val = pickle.loads(val_blob)
                age = now - int(ts)
                ttl = max_age
                # Failed lookups (stored as None) expire on their own schedule.
                if cached_val is None and error_max_age is not None:
                    ttl = error_max_age
                if ttl is None or age <= int(ttl):
                    return cached_val
            val = fn()
            cur.execute(
                "INSERT OR REPLACE INTO cache (key, value, timestamp) VALUES (?, ?, ?)",
                (key, pickle.dumps(val), now),
            )
            conn.commit()
            return val
        finally:
            conn.close()
    except Exception:
        # Best-effort cache: any db problem degrades to an uncached call.
        return fn()
def read_key_from_file(path: str) -> str:
    """Return the first line of the file at *path*, stripped, or "" on any error."""
    try:
        with open(os.path.expanduser(path)) as fh:
            first_line = fh.readline()
    except Exception:
        return ""
    return first_line.strip()
# Shared throttle state; all access goes through _rate_lock since lookups
# may run from multiple threads.
_rate_lock = threading.Lock()
# provider tag -> monotonic time before which the next request may not start
_next_allowed_at: Dict[str, float] = {}
# provider tag -> monotonic deadline imposed by a server Retry-After response
_retry_after_until: Dict[str, float] = {}
def _provider_for_url(url: str) -> str:
u = str(url).lower()
if "virustotal.com" in u:
return "vt"
if "ipinfo.io" in u:
return "ipinfo"
if "ipapi.co" in u:
return "ipapi"
if "ipwho.is" in u:
return "ipwho"
if "rdap" in u:
return "rdap"
if "mb-api.abuse.ch" in u:
return "mb"
return "default"
def _provider_delay(provider: str) -> float:
    """Minimum spacing in seconds between requests to *provider* (never negative)."""
    option_names = {
        "vt": "tke_throttle_vt_sec",
        "ipinfo": "tke_throttle_ipinfo_sec",
        "ipapi": "tke_throttle_ipapi_sec",
        "ipwho": "tke_throttle_ipwho_sec",
        "rdap": "tke_throttle_rdap_sec",
        "mb": "tke_throttle_mb_sec",
    }
    option_name = option_names.get(provider, "tke_throttle_default_sec")
    try:
        delay = float(opt(option_name, 0.0))
    except Exception:
        return 0.0
    return delay if delay > 0.0 else 0.0
def _wait_for_slot(provider: str) -> None:
    """Block until the throttle window for *provider* allows another request."""
    now = time.monotonic()
    with _rate_lock:
        # The slot opens at the latest of: now, the scheduled next slot, and
        # any server-imposed Retry-After deadline.
        scheduled = _next_allowed_at.get(provider, 0.0)
        penalty = _retry_after_until.get(provider, 0.0)
        slot = max(now, scheduled, penalty)
        # Reserve the following slot before releasing the lock so concurrent
        # callers queue up behind us instead of racing.
        _next_allowed_at[provider] = slot + _provider_delay(provider)
    remaining = slot - now
    if remaining > 0:
        time.sleep(remaining)
def _mark_retry_after(provider: str, retry_after_s: float) -> None:
    """Record a server-requested backoff for *provider*; only extends the deadline."""
    if retry_after_s > 0:
        deadline = time.monotonic() + retry_after_s
        with _rate_lock:
            if deadline > _retry_after_until.get(provider, 0.0):
                _retry_after_until[provider] = deadline
def _parse_retry_after(value: str) -> Optional[float]:
v = (value or "").strip()
if not v:
return None
try:
sec = float(v)
if sec >= 0:
return sec
except Exception:
pass
try:
dt = parsedate_to_datetime(v)
if dt is None:
return None
# parsedate_to_datetime can return naive dt; treat as UTC then.
if dt.tzinfo is None:
return max(0.0, dt.timestamp() - time.time())
return max(0.0, dt.timestamp() - time.time())
except Exception:
return None
def _request_json(
    method: str,
    url: str,
    *,
    headers: Optional[Dict[str, str]] = None,
    data: Optional[Dict[str, Any]] = None,
    provider: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Perform a throttled HTTP request and return the parsed JSON body.

    Returns None on any failure: `requests` missing, network errors after all
    retries, rate limiting (429), non-2xx status, or an unparseable body.
    `provider` selects the throttle bucket; derived from `url` when omitted.
    `data`, if given, is sent as a form-encoded request body.
    """
    try:
        import requests  # optional dep
    except Exception:
        return None
    prov = provider or _provider_for_url(url)
    try:
        retries = max(0, int(opt("tke_http_retries", 1)))
    except Exception:
        retries = 1
    # One initial attempt plus `retries` re-attempts.
    for attempt in range(retries + 1):
        # Honor per-provider pacing and any active Retry-After deadline.
        _wait_for_slot(prov)
        try:
            r = requests.request(
                method,
                url,
                headers=headers,
                data=data,
                timeout=http_timeout(),
            )
        except Exception:
            # Network-level failure: retry if attempts remain, else give up.
            if attempt < retries:
                continue
            return None
        if r.status_code == 429:
            # Rate limited: record the server's backoff (or a provider-sized
            # fallback) so concurrent callers also wait, then maybe retry.
            ra = _parse_retry_after(r.headers.get("Retry-After", ""))
            if ra is None:
                ra = max(1.0, _provider_delay(prov))
            _mark_retry_after(prov, ra)
            if attempt < retries:
                continue
            return None
        if not r.ok:
            # Retry only server-side (5xx) errors; other statuses are treated
            # as permanent failures for this request.
            if 500 <= r.status_code < 600 and attempt < retries:
                _mark_retry_after(prov, max(1.0, _provider_delay(prov)))
                continue
            return None
        try:
            return r.json()
        except Exception:
            # 2xx response whose body isn't valid JSON.
            return None
    return None
def http_get_json(
    url: str,
    *,
    headers: Optional[Dict[str, str]] = None,
    provider: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """GET *url* through the throttled JSON fetcher; None on any failure."""
    request_kwargs = {"headers": headers, "provider": provider}
    return _request_json("GET", url, **request_kwargs)
def http_post_json(
    url: str,
    data: Dict[str, Any],
    *,
    headers: Optional[Dict[str, str]] = None,
    provider: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """POST form *data* to *url* through the throttled JSON fetcher; None on failure."""
    request_kwargs = {"headers": headers, "data": data, "provider": provider}
    return _request_json("POST", url, **request_kwargs)