Rework to CIRCL hashlookup offline bloom (SHA-1)

Replace the self-built 2021 NSRL RDS md5 bloom with CIRCL's offline
hashlookup-full.bloom (SHA-1, NSRL + more), downloaded at build. Old
single-hash CLI preserved (now SHA-1); 'analyse -d <dir>' runs
hashlookup-forensic-analyser against the bundled bloom.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
tabledevil
2026-06-10 13:38:48 +02:00
parent 0d374d6bdb
commit 09ab281881
6 changed files with 97 additions and 246 deletions
+32 -13
View File
@@ -1,19 +1,38 @@
FROM alpine AS builder
# tabledevil/nsrl — known-file hash filter, now backed by CIRCL hashlookup.
#
# Replaces the old self-built NSRL RDS md5 bloom (frozen at RDS 2.72 / 2021)
# with CIRCL's hashlookup-full.bloom: SHA-1, NSRL + many more known-good
# sources, refreshed upstream. Downloaded at build time (~1 GB) so lookups
# are fully offline; the bot rebuilds on a monthly cadence.
#
# # single hashes (old CLI preserved, now SHA-1):
# docker run --rm tabledevil/nsrl <sha1> [<sha1> ...]
# cat sha1s.txt | docker run --rm -i tabledevil/nsrl -s -0 # only misses
#
# # analyse a whole directory tree (hashlookup-forensic-analyser):
# docker run --rm -v /evidence:/data:ro tabledevil/nsrl analyse -d /data
FROM python:3.12-slim
COPY nsrl /nsrl
RUN apk add -U tini alpine-sdk python3 python3-dev py3-pip p7zip \
&& python3 -m pip install git+https://github.com/jaybaird/python-bloomfilter/ \
&& /nsrl/shrink_nsrl.sh \
&& apk del --purge alpine-sdk py3-pip python3-dev p7zip \
&& rm -rf /tmp/* /root/.cache /var/cache/apk/* /nsrl/shrink_nsrl.sh
# OS packages: curl fetches the bloom filter and git clones the analyser
# during the build; ca-certificates backs their HTTPS connections.
# NOTE(review): libmagic1 is presumably the native library behind the
# filemagic Python package — confirm.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      ca-certificates \
      curl \
      git \
      libmagic1 \
 && rm -rf /var/lib/apt/lists/*
FROM alpine
LABEL maintainer="tabledevil"
COPY --from=builder / /
# Python dependencies plus the directory-mode tool. flor is imported by
# search.py to read the bloom file; the checkout in /opt/hfa is what
# entrypoint.sh launches for "analyse".
# NOTE(review): the remaining pip packages are presumably requirements of the
# analyser — confirm. Neither the pip packages nor the git checkout are
# pinned, so every rebuild may pick up different versions.
RUN pip install --no-cache-dir flor requests pytz filemagic \
&& git clone --depth=1 https://github.com/hashlookup/hashlookup-forensic-analyser /opt/hfa
# The bloom filter is the data payload — fetched fresh every (monthly) rebuild.
# The response headers of the download itself are dumped with -D, so the
# Last-Modified value written to bloom.info describes exactly the file that
# was stored (a separate HEAD request could already see a newer upstream
# object). --retry covers transient failures of the large transfer and
# `test -s` rejects an empty result. A missing Last-Modified header stays
# non-fatal, as before.
RUN mkdir -p /nsrl \
 && curl -fsSL --retry 3 \
      -D /tmp/bloom.headers \
      -o /nsrl/hashlookup-full.bloom \
      https://cra.circl.lu/hashlookup/hashlookup-full.bloom \
 && test -s /nsrl/hashlookup-full.bloom \
 && { echo "source = https://cra.circl.lu/hashlookup/hashlookup-full.bloom"; \
      grep -i '^last-modified' /tmp/bloom.headers || true; } > /nsrl/bloom.info \
 && rm -f /tmp/bloom.headers
# Lookup CLI and dispatcher script. --chmod sets the executable bit while
# copying, which avoids a separate `RUN chmod` layer (requires BuildKit).
COPY nsrl/search.py /nsrl/search.py
COPY --chmod=755 entrypoint.sh /entrypoint.sh
WORKDIR /nsrl
ENTRYPOINT ["/sbin/tini","--","python3","/nsrl/search.py"]
ENTRYPOINT ["/entrypoint.sh"]
CMD ["-h"]
Executable
+13
View File
@@ -0,0 +1,13 @@
#!/bin/bash
# nsrl entrypoint — routes the command line to one of two tools:
#   analyse|analyze [args]  -> hashlookup-forensic-analyser with the bundled
#                              bloom filter (e.g. analyse -d /data)
#   anything else           -> search.py (old NSRL CLI, SHA-1):
#                              <sha1> ... | -s | -h
set -euo pipefail

case "${1:-}" in
    analyse|analyze)
        # Drop the sub-command word; the rest goes to the analyser untouched.
        shift
        exec python3 /opt/hfa/bin/hashlookup-analyser.py \
            --bloomfilters /nsrl/hashlookup-full.bloom "$@"
        ;;
    *)
        exec python3 /nsrl/search.py "$@"
        ;;
esac
-118
View File
@@ -1,118 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import binascii
import os
import configparser
from pybloom import BloomFilter
import argparse
# reference - http://stackoverflow.com/a/9631635
def blocks(this_file, size=65536):
    """Yield successive chunks read from ``this_file``.

    Each chunk is whatever ``this_file.read(size)`` returns; iteration stops
    at the first empty (falsy) read.
    """
    chunk = this_file.read(size)
    while chunk:
        yield chunk
        chunk = this_file.read(size)
def main():
    """Build a bloom filter from a delimited hash list and save it to disk.

    Settings come from /nsrl/nsrl.conf, optionally merged with a user config
    given via -f; individual command-line options override single values.
    The resulting settings (including a computed hash_count) are written
    back to /nsrl/nsrl.conf.

    The filter is written to the path given with -o/--outputfile, or to
    'nsrl.bloom' in the current directory when the option is omitted.
    """
    parser = argparse.ArgumentParser(prog='build.py')
    parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
    # Plain options: the config file and the individual settings are merged,
    # not exclusive, so no (nested) argument groups are needed.
    parser.add_argument('-f', "--config", help='Config file with all settings')
    parser.add_argument('-e', '--error-rate', type=float, help="Error Rate for False-Positives")
    parser.add_argument('-n', '--hashcount', type=int, help="Provide the hashcount")
    parser.add_argument('-c', '--column', type=int, help="Which Column of inputfile should be processed (0,1,...)")
    parser.add_argument('-l', '--label', help="What kind of Data is being processed (MD5,filenames,...)")
    parser.add_argument('-d', '--delimiter', help="Which char is used to delimit columns in inputfile")
    parser.add_argument('-i', '--inputfile', help="Path of input file")
    parser.add_argument('-o', '--outputfile', help="Path of output file")
    args = parser.parse_args()

    # The default config is always read; a user config (if it exists) is
    # layered on top of it.
    default_config_file = '/nsrl/nsrl.conf'
    configfiles = [default_config_file]
    if args.config is not None and os.path.isfile(args.config):
        configfiles.append(args.config)

    conf = configparser.ConfigParser()
    conf.read(configfiles)
    # conf.set() raises NoSectionError when no config file provided [config].
    if not conf.has_section("config"):
        conf.add_section("config")

    # Command-line overrides. A falsy value (option absent, 0, "") leaves the
    # configured value untouched, as before.
    overrides = (
        ("error_rate", args.error_rate),
        ("hash_count", args.hashcount),
        ("hashfile_type", args.label),
        ("hashfile_delimiter", args.delimiter),
        ("hashfile_path", args.inputfile),
    )
    for option, value in overrides:
        if value:
            conf.set("config", option, str(value))
    # Column 0 is a legitimate choice, so compare against None instead of
    # relying on truthiness.
    if args.column is not None:
        conf.set("config", "hashfile_column", str(args.column))

    # Effective settings; the fallbacks apply when neither a config file nor
    # the command line supplied a value.
    nsrl_path = conf.get("config", "hashfile_path", fallback='/nsrl/NSRLFile.txt')
    error_rate = conf.getfloat("config", "error_rate", fallback=0.01)
    hashfile_delimiter = conf.get("config", "hashfile_delimiter", fallback=',')
    hashfile_column = conf.getint("config", "hashfile_column", fallback=0)
    hashfile_type = conf.get("config", "hashfile_type", fallback='Hash')
    output_path = args.outputfile or 'nsrl.bloom'

    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(nsrl_path):
        print("[BUILDING] Reading in NSRL Database")
        if not conf.has_option("config", "hash_count"):
            with open(nsrl_path) as f_line:
                # Strip off header
                _ = f_line.readline()
                print("[BUILDING] Calculating number of entries in Inputfile...")
                num_lines = sum(bl.count("\n") for bl in blocks(f_line))
            conf.set("config", 'hash_count', str(num_lines))
        else:
            num_lines = conf.getint("config", "hash_count")
        print("[BUILDING] There are {} {}s in the Database".format(num_lines, hashfile_type))
        with open(nsrl_path) as f_nsrl:
            # Strip off header
            _ = f_nsrl.readline()
            print("[BUILDING] Creating bloomfilter")
            bf = BloomFilter(num_lines, error_rate)
            print("[BUILDING] Inserting {} into bloomfilter".format(hashfile_type))
            # The hash is taken from the configured column (hashfile_column).
            for line in f_nsrl:
                try:
                    # Inside the try so a short or blank line is reported
                    # instead of aborting the build; strip() drops the
                    # newline when the hash sits in the last column.
                    hashline = line.split(hashfile_delimiter)[hashfile_column].strip().strip('"')
                    if hashline:
                        bf.add(binascii.unhexlify(hashline))
                except Exception as e:
                    print("[ERROR] %s" % e)
            print("[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf)))
            with open(output_path, 'wb') as nb:
                bf.tofile(nb)
            print("[BUILDING] Complete")
    else:
        print("[ERROR] No such file or directory: %s" % nsrl_path)

    # save config
    with open(default_config_file, 'w') as configfile:
        conf.write(configfile)
if __name__ == "__main__":
main()
-10
View File
@@ -1,10 +0,0 @@
[config]
rds_url = https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip
rds_name = Reduced Modern
version_url = https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/version.txt
hashfile_name = NSRLFile.txt
hashfile_path = /nsrl/NSRLFile.txt
hashfile_type = md5
hashfile_column = 1
hashfile_delimiter = ,
error_rate = 0.01
+52 -47
View File
@@ -1,64 +1,69 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Known-file SHA-1 lookup against the offline CIRCL hashlookup bloom filter.
# CLI is compatible with the old NSRL md5 search.py (hashes as args or -s
# stdin; -0/-1 to suppress hits/misses; -v verbose) — but hashes are SHA-1.
import argparse
import binascii
import configparser
import re
import sys
from pybloom import BloomFilter
from flor import BloomFilter
BLOOM_PATH = "/nsrl/hashlookup-full.bloom"
INFO_PATH = "/nsrl/bloom.info"
SHA1_RE = re.compile(r"^[0-9a-fA-F]{40}$")
def main():
default_config_file='/nsrl/nsrl.conf'
config = configparser.ConfigParser()
config.read(default_config_file)
#add commandline options
hash_type=config.get('config','hashfile_type')
parser = argparse.ArgumentParser(prog='nsrl')
parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
parser.add_argument("-0", "--no-hits", help="Suppress Output of matching hashes", action="store_true", required=False)
parser.add_argument("-1", "--no-misses", help="Suppress Output of mismatching hashes", action="store_true", required=False)
parser = argparse.ArgumentParser(
prog="nsrl",
description="Offline known-file lookup (CIRCL hashlookup bloom, SHA-1).",
)
parser.add_argument("-v", "--verbose", action="store_true",
help="Display verbose output")
parser.add_argument("-0", "--no-hits", action="store_true",
help="Suppress output of matching (known) hashes")
parser.add_argument("-1", "--no-misses", action="store_true",
help="Suppress output of unknown hashes")
inputs = parser.add_mutually_exclusive_group(required=True)
inputs.add_argument('hash', metavar='<{}>'.format(hash_type), type=str, nargs='*', default=[], help='{} hash to search for.'.format(hash_type))
inputs.add_argument('-s','--stdin',help="Read hashes from stdin", action="store_true")
inputs.add_argument("hash", metavar="<sha1>", type=str, nargs="*",
default=[], help="SHA-1 hash(es) to look up")
inputs.add_argument("-s", "--stdin", action="store_true",
help="Read hashes from stdin (one per line)")
args = parser.parse_args()
if args.verbose:
print("Version INFO: {}".format(config.get('config',"rds_version")))
print("Error Rate: {}".format(config.get('config',"error_rate")))
print("Build Date: {}".format(config.get('config',"build_date")))
print("Filename: {}".format(config.get('config',"hashfile_name")))
print("Hashcount: {}".format(config.get('config',"hash_count")))
try:
    # Echo the bloom provenance file to stderr; the with-block closes the
    # handle instead of leaving that to garbage collection.
    with open(INFO_PATH) as info_fh:
        sys.stderr.write(info_fh.read())
except OSError:
    # Best effort only: a missing or unreadable info file must not prevent
    # the lookup itself.
    pass
bf = BloomFilter()
with open(BLOOM_PATH, "rb") as fh:
bf.read(fh)
if args.stdin:
hashlist = [line.strip() for line in sys.stdin if line.strip()]
else:
hashlist = args.hash
with open('nsrl.bloom', 'rb') as nb:
bf = BloomFilter.fromfile(nb)
rc = 0
for hash_hex in hashlist:
if not SHA1_RE.match(hash_hex):
print(f"!:{hash_hex} (not a sha1)", file=sys.stderr)
rc = 2
continue
# hashlookup blooms store uppercase-hex SHA-1 strings
is_known = hash_hex.upper().encode() in bf
if args.verbose:
print(f"{hash_hex}:{is_known}")
elif (is_known and not args.no_hits) or (not is_known and not args.no_misses):
if args.no_hits != args.no_misses:
print(hash_hex)
else:
print(f"{'+' if is_known else '-'}:{hash_hex}")
return rc
if args.stdin:
hashlist=[hash.strip() for hash in sys.stdin.readlines()]
else:
hashlist=args.hash
for hash_hex in hashlist:
hash = binascii.unhexlify(hash_hex)
output=""
# only print output if for mismatches if selected
hash_is_a_match=(hash in bf)
if (hash_is_a_match and not args.no_hits) or (not hash_is_a_match and not args.no_misses):
#output
if args.verbose:
output = "{}:{}".format(hash_hex,hash_is_a_match)
elif args.no_hits != args.no_misses :
output = "{}".format(hash_hex)
else:
output = "{}:{}".format("+"if hash_is_a_match else "-",hash_hex)
print(output)
return
if __name__ == "__main__":
try:
main()
except Exception as e:
print("Error: %s" % e)
sys.exit(main())
-58
View File
@@ -1,58 +0,0 @@
#!/bin/sh
# Build-time helper: obtains the NSRL RDS zip (a zip already present in /nsrl
# or a download from rds_url), extracts NSRLFile.txt, builds the bloom filter
# with build.py, deletes the raw inputs and appends build metadata to
# /nsrl/nsrl.conf.
#set -x

conf_file=/nsrl/nsrl.conf

# conf_value KEY — print the value of the first line in nsrl.conf that starts
# with KEY: everything after the first '=', leading whitespace removed.
conf_value() {
    grep "^$1" "${conf_file}" | head -n1 | cut -f2- -d= | grep -o -E '\S+.*'
}

#read information from config
rds_url=$(conf_value rds_url)
echo "rds_url=${rds_url}"
version_url=$(conf_value version_url)
echo "version_url=${version_url}"
error_rate=$(conf_value error_rate)
echo "error_rate=${error_rate}"
rds_name=$(conf_value rds_name)
echo "rds_name=${rds_name}"
hashfile_name=$(conf_value hashfile_name)
echo "hashfile_name=${hashfile_name}"
# %m is the month; %M (minutes) in the date part produced a wrong build date.
build_date=$(date +%Y%m%d_%H%M)
echo "build_date=${build_date}"

#check if a zipfile was provided
# Take the first match explicitly: `[ -f /nsrl/*.zip ]` is an error as soon as
# the glob expands to more than one file.
zip_filename=$(ls /nsrl/*.zip 2>/dev/null | head -n1)
if [ -n "${zip_filename}" ] && [ -f "${zip_filename}" ]; then
    echo "[INFO] ZIP-File Exists : ${zip_filename}"
    zip_md5=$(md5sum "${zip_filename}" | cut -f1 -d" ")
    rds_version=$(stat -c %y "${zip_filename}")
else
    zip_filename="/nsrl/${rds_url##*/}"
    echo "[INFO] Downloading NSRL Sets (${rds_name}):"
    echo "[INFO] URL: ${rds_url}"
    #Downloading and Hashing at the same time
    zip_md5=$(wget -O - "${rds_url}" 2>/dev/null | tee "${zip_filename}" | md5sum | cut -f1 -d" ")
    rds_version=$(wget -O - "${version_url}" 2>/dev/null | tee "/nsrl/.version" | head -n1)
fi

echo "[INFO] Unzip NSRL Database zip to /nsrl/ ..."
7z e "${zip_filename}" -o/nsrl/ NSRLFile.txt -r

echo "[INFO] Counting Hashes in /nsrl/${hashfile_name} ..."
#counting lines in hashfile without headline
# POSIX arithmetic expansion; `let` is not guaranteed under /bin/sh.
hash_count=$(( $(wc -l < "/nsrl/${hashfile_name}") - 1 ))
echo "[INFO] /nsrl/${hashfile_name} contains ${hash_count} Hashes"

echo "[INFO] Build bloomfilter from NSRL Database ..."
cd /nsrl && python3 /nsrl/build.py -e "${error_rate}" -n "${hash_count}"

echo "[INFO] Listing created files ..."
ls -lah /nsrl

echo "[INFO] Deleting all unused files ..."
rm -f /nsrl/*.zip /nsrl/*.txt
ls -lah /nsrl

#update config
echo "[INFO] Update Config ..."
{
    echo "build_date = ${build_date}"
    echo "zip_filename = ${zip_filename}"
    echo "zip_md5 = ${zip_md5}"
    echo "rds_version = ${rds_version}"
} >> "${conf_file}"
cat "${conf_file}"