Rework to CIRCL hashlookup offline bloom (SHA-1)
Replace the self-built 2021 NSRL RDS md5 bloom with CIRCL's offline hashlookup-full.bloom (SHA-1, NSRL + more), downloaded at build. Old single-hash CLI preserved (now SHA-1); 'analyse -d <dir>' runs hashlookup-forensic-analyser against the bundled bloom. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+32
-13
@@ -1,19 +1,38 @@
|
|||||||
# tabledevil/nsrl — known-file hash filter, now backed by CIRCL hashlookup.
#
# Replaces the old self-built NSRL RDS md5 bloom (frozen at RDS 2.72 / 2021)
# with CIRCL's hashlookup-full.bloom: SHA-1, NSRL + many more known-good
# sources, refreshed upstream. Downloaded at build time (~1 GB) so lookups
# are fully offline; the bot rebuilds on a monthly cadence.
#
#   # single hashes (old CLI preserved, now SHA-1):
#   docker run --rm tabledevil/nsrl <sha1> [<sha1> ...]
#   cat sha1s.txt | docker run --rm -i tabledevil/nsrl -s -0     # only misses
#
#   # analyse a whole directory tree (hashlookup-forensic-analyser):
#   docker run --rm -v /evidence:/data:ro tabledevil/nsrl analyse -d /data

FROM python:3.12-slim

# Runtime/build tools. tini is the container init: /entrypoint.sh exec's
# python3, which would otherwise run as PID 1 without an init in front of it
# to forward signals and reap children.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        git \
        libmagic1 \
        tini \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies plus the directory analyser used by "analyse".
# NOTE(review): packages and the clone are unpinned, so each rebuild picks up
# whatever is current upstream — confirm this is intended.
RUN pip install --no-cache-dir flor requests pytz filemagic \
    && git clone --depth=1 https://github.com/hashlookup/hashlookup-forensic-analyser /opt/hfa

# The bloom filter is the data payload — fetched fresh every (monthly) rebuild.
# The response headers of the download itself are dumped with -D so that the
# Last-Modified value recorded in bloom.info belongs to the bytes that were
# actually stored (no second HEAD request). A missing header is tolerated.
RUN mkdir -p /nsrl \
    && curl -fsSL -D /tmp/bloom.headers -o /nsrl/hashlookup-full.bloom \
         https://cra.circl.lu/hashlookup/hashlookup-full.bloom \
    && { echo "source = https://cra.circl.lu/hashlookup/hashlookup-full.bloom"; \
         grep -i '^last-modified' /tmp/bloom.headers || true; } > /nsrl/bloom.info \
    && rm -f /tmp/bloom.headers

COPY nsrl/search.py /nsrl/search.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

WORKDIR /nsrl

# entrypoint.sh dispatches between the single-hash CLI and "analyse".
ENTRYPOINT ["/usr/bin/tini", "--", "/entrypoint.sh"]
CMD ["-h"]
|
||||||
|
|||||||
Executable
+13
@@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Container entrypoint for the nsrl image: routes the command line to one of
# two tools based on the first argument.
#   analyse|analyze [args]  -> hashlookup-forensic-analyser, always pointed at
#                              the bundled bloom filter (e.g. analyse -d /data)
#   anything else           -> /nsrl/search.py (single-hash SHA-1 lookup CLI:
#                              <sha1> ... | -s | -h)
set -euo pipefail

case "${1:-}" in
    analyse|analyze)
        # Drop the sub-command word; everything after it goes to the analyser.
        shift
        exec python3 /opt/hfa/bin/hashlookup-analyser.py \
            --bloomfilters /nsrl/hashlookup-full.bloom "$@"
        ;;
esac

# Default: hand all arguments to the lookup CLI unchanged.
exec python3 /nsrl/search.py "$@"
|
||||||
-118
@@ -1,118 +0,0 @@
|
|||||||
# !/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import binascii
|
|
||||||
import os
|
|
||||||
import configparser
|
|
||||||
from pybloom import BloomFilter
|
|
||||||
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# reference - http://stackoverflow.com/a/9631635
|
|
||||||
def blocks(this_file, size=65536):
    """Yield successive chunks read from ``this_file``.

    Each chunk is whatever ``this_file.read(size)`` returns; iteration stops
    at the first empty (falsy) read.
    """
    chunk = this_file.read(size)
    while chunk:
        yield chunk
        chunk = this_file.read(size)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Build a bloom filter from a delimited hash list and persist the config.

    Settings are read from /nsrl/nsrl.conf (plus an optional user config given
    with -f) and may be overridden on the command line. One column of the
    input file is hex-decoded line by line and inserted into a BloomFilter,
    which is written to --outputfile (default: nsrl.bloom in the current
    directory). The merged configuration is always written back to
    /nsrl/nsrl.conf, even when the input file is missing.
    """
    parser = argparse.ArgumentParser(prog='build.py')
    parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
    parser.add_argument('-f', "--config", help='Config file with all settings')
    # A plain argument group on the parser: nesting a group inside a mutually
    # exclusive group is deprecated in argparse and never made these options
    # exclusive with -f anyway.
    settings = parser.add_argument_group('settings')
    settings.add_argument('-e', '--error-rate', type=float, help="Error Rate for False-Positives")
    settings.add_argument('-n', '--hashcount', type=int, help="Provide the hashcount")
    settings.add_argument('-c', '--column', type=int, help="Which Column of inputfile should be processed (0,1,...)")
    settings.add_argument('-l', '--label', help="What kind of Data is beeing processed (MD5,filenames,...)")
    settings.add_argument('-d', '--delimiter', help="Which char is used to delimit columns in inputfile")
    settings.add_argument('-i', '--inputfile', help="Path of input file")
    settings.add_argument('-o', '--outputfile', help="Path of output file")

    args = parser.parse_args()

    # The default config is always read; a user config is layered on top of
    # it when it exists.
    default_config_file = '/nsrl/nsrl.conf'
    configfiles = [default_config_file]
    if args.config is not None and os.path.isfile(args.config):
        configfiles.append(args.config)

    conf = configparser.ConfigParser()
    conf.read(configfiles)
    # conf.read() silently skips unreadable files; make sure the section the
    # rest of this function writes to exists.
    if not conf.has_section("config"):
        conf.add_section("config")

    # Command-line values override the config files.
    if args.error_rate:
        conf.set("config", "error_rate", str(args.error_rate))
    if args.hashcount:
        conf.set("config", "hash_count", str(args.hashcount))
    # Column 0 is a valid choice, so test against None rather than truthiness.
    if args.column is not None:
        conf.set("config", "hashfile_column", str(args.column))
    if args.label:
        conf.set("config", "hashfile_type", str(args.label))
    if args.delimiter:
        conf.set("config", "hashfile_delimiter", str(args.delimiter))
    if args.inputfile:
        conf.set("config", "hashfile_path", str(args.inputfile))

    # Effective settings, with built-in defaults for anything not configured.
    nsrl_path = conf.get("config", "hashfile_path", fallback='/nsrl/NSRLFile.txt')
    error_rate = conf.getfloat("config", 'error_rate', fallback=0.01)
    hashfile_delimiter = conf.get("config", 'hashfile_delimiter', fallback=',')
    hashfile_column = conf.getint("config", 'hashfile_column', fallback=0)
    hashfile_type = conf.get("config", 'hashfile_type', fallback='Hash')
    output_path = args.outputfile or 'nsrl.bloom'

    print("[BUILDING] Using error-rate: {}".format(error_rate))
    if os.path.isfile(nsrl_path):
        print("[BUILDING] Reading in NSRL Database")
        if not conf.has_option("config", "hash_count"):
            # No count supplied: count the newlines after the header line.
            with open(nsrl_path) as f_line:
                _ = f_line.readline()
                print("[BUILDING] Calculating number of entries in Inputfile...")
                num_lines = sum(bl.count("\n") for bl in blocks(f_line))
                conf.set("config", 'hash_count', str(num_lines))
        else:
            num_lines = conf.getint("config", "hash_count")
        print("[BUILDING] There are {} {}s in the Database".format(num_lines, hashfile_type))
        with open(nsrl_path) as f_nsrl:
            # Skip the header line.
            _ = f_nsrl.readline()
            print("[BUILDING] Creating bloomfilter")
            bf = BloomFilter(num_lines, error_rate)
            print("[BUILDING] Inserting {} into bloomfilter".format(hashfile_type))
            for line in f_nsrl:
                # Take the configured column; drop surrounding whitespace (the
                # trailing newline when it is the last column) and the quotes.
                hashline = line.split(hashfile_delimiter)[hashfile_column].strip().strip('"')
                if hashline:
                    try:
                        bf.add(binascii.unhexlify(hashline))
                    except Exception as e:
                        # Best effort: report the bad line and keep going.
                        print("[ERROR] %s" % e)
            print("[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf)))
            with open(output_path, 'wb') as nb:
                bf.tofile(nb)
            print("[BUILDING] Complete")
    else:
        print("[ERROR] No such file or directory: %s" % nsrl_path)

    # Persist the merged configuration (including a computed hash_count).
    with open(default_config_file, 'w') as configfile:
        conf.write(configfile)


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
[config]
|
|
||||||
rds_url = https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip
|
|
||||||
rds_name = Reduced Modern
|
|
||||||
version_url = https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/version.txt
|
|
||||||
hashfile_name = NSRLFile.txt
|
|
||||||
hashfile_path = /nsrl/NSRLFile.txt
|
|
||||||
hashfile_type = md5
|
|
||||||
hashfile_column = 1
|
|
||||||
hashfile_delimiter = ,
|
|
||||||
error_rate = 0.01
|
|
||||||
+52
-47
@@ -1,64 +1,69 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# Known-file SHA-1 lookup against the offline CIRCL hashlookup bloom filter.
|
||||||
|
# CLI is compatible with the old NSRL md5 search.py (hashes as args or -s
|
||||||
|
# stdin; -0/-1 to suppress hits/misses; -v verbose) — but hashes are SHA-1.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import binascii
|
import re
|
||||||
import configparser
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from pybloom import BloomFilter
|
from flor import BloomFilter
|
||||||
|
|
||||||
|
BLOOM_PATH = "/nsrl/hashlookup-full.bloom"
|
||||||
|
INFO_PATH = "/nsrl/bloom.info"
|
||||||
|
SHA1_RE = re.compile(r"^[0-9a-fA-F]{40}$")
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Look up SHA-1 hashes in the bundled bloom filter and print the results.

    Hashes come from the positional arguments or, with -s, from stdin (one per
    line, blank lines ignored). Output per hash:

    * neither or both of -0/-1 given: ``+:<hash>`` for known, ``-:<hash>`` for unknown
    * exactly one of -0/-1 given:     the bare hash
    * -v:                             ``<hash>:True`` / ``<hash>:False``

    -0 suppresses known hashes and -1 suppresses unknown ones in every output
    mode, including -v. With -v the contents of the bloom info file are first
    written to stderr when that file is readable.

    Returns:
        0 when every input was a well-formed SHA-1, 2 when at least one input
        was rejected (rejections are reported on stderr as ``!:<input>``).
    """
    parser = argparse.ArgumentParser(
        prog="nsrl",
        description="Offline known-file lookup (CIRCL hashlookup bloom, SHA-1).",
    )
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Display verbose output")
    parser.add_argument("-0", "--no-hits", action="store_true",
                        help="Suppress output of matching (known) hashes")
    parser.add_argument("-1", "--no-misses", action="store_true",
                        help="Suppress output of unknown hashes")
    inputs = parser.add_mutually_exclusive_group(required=True)
    inputs.add_argument("hash", metavar="<sha1>", type=str, nargs="*",
                        default=[], help="SHA-1 hash(es) to look up")
    inputs.add_argument("-s", "--stdin", action="store_true",
                        help="Read hashes from stdin (one per line)")
    args = parser.parse_args()

    if args.verbose:
        # The info file is optional metadata; a missing file is not an error.
        try:
            with open(INFO_PATH) as info:
                sys.stderr.write(info.read())
        except OSError:
            pass

    bf = BloomFilter()
    with open(BLOOM_PATH, "rb") as fh:
        bf.read(fh)

    if args.stdin:
        # Consumed lazily so results appear as input lines arrive.
        hashlist = (entry for entry in map(str.strip, sys.stdin) if entry)
    else:
        hashlist = args.hash

    # Exactly one of -0/-1: the remaining lines are all of one kind, so the
    # "+:" / "-:" marker is dropped.
    bare_output = args.no_hits != args.no_misses

    rc = 0
    for hash_hex in hashlist:
        # fullmatch: unlike match() with "$", does not accept a trailing newline.
        if not SHA1_RE.fullmatch(hash_hex):
            print(f"!:{hash_hex} (not a sha1)", file=sys.stderr)
            rc = 2
            continue
        # hashlookup blooms store uppercase-hex SHA-1 strings
        is_known = hash_hex.upper().encode() in bf
        # Apply the -0/-1 filters before choosing an output format so that -v
        # honours them too.
        if (is_known and args.no_hits) or (not is_known and args.no_misses):
            continue
        if args.verbose:
            print(f"{hash_hex}:{is_known}")
        elif bare_output:
            print(hash_hex)
        else:
            print(f"{'+' if is_known else '-'}:{hash_hex}")
    return rc


if __name__ == "__main__":
    sys.exit(main())
|
|
||||||
|
|||||||
@@ -1,58 +0,0 @@
|
|||||||
#!/bin/sh
# Build-time helper: obtain the NSRL RDS zip (one already present in /nsrl, or
# downloaded from rds_url), extract the hash list, build the bloom filter with
# build.py, delete the bulky inputs and append build metadata to nsrl.conf.
#set -x

CONF=/nsrl/nsrl.conf

# conf_get KEY — print the value part of the config line(s) matching KEY,
# with leading whitespace removed.
conf_get() {
    grep "$1" "${CONF}" | cut -f2- -d= | grep -o -E '\S+.*'
}

#read information from config
rds_url=$(conf_get 'rds_url')
echo "rds_url=${rds_url}"
version_url=$(conf_get 'version_url')
echo "version_url=${version_url}"
error_rate=$(conf_get 'error_rate')
echo "error_rate=${error_rate}"
rds_name=$(conf_get 'rds_name')
echo "rds_name=${rds_name}"
hashfile_name=$(conf_get 'hashfile_name')
echo "hashfile_name=${hashfile_name}"
# %m is the month; %M (minutes) belongs only in the time part.
build_date=$(date +%Y%m%d_%H%M)
echo "build_date=${build_date}"

#check if a zipfile was provided (first regular file matching /nsrl/*.zip)
zip_filename=""
for candidate in /nsrl/*.zip; do
    if [ -f "${candidate}" ]; then
        zip_filename=${candidate}
        break
    fi
done

if [ -n "${zip_filename}" ]; then
    echo "[INFO] ZIP-File Exists : ${zip_filename}"
    zip_md5=$(md5sum "${zip_filename}" | cut -f1 -d" ")
    rds_version=$(stat -c %y "${zip_filename}" )
else
    zip_filename="/nsrl/${rds_url##*/}"
    echo "[INFO] Downloading NSRL Sets (${rds_name}):"
    echo "[INFO] URL: ${rds_url}"
    #Downloading and Hashing at the same time
    zip_md5=$(wget -O - "${rds_url}" 2>/dev/null | tee "${zip_filename}" | md5sum | cut -f1 -d" ")
    rds_version=$(wget -O - "${version_url}" 2>/dev/null | tee "/nsrl/.version" | head -n1 )
    # wget's exit status is hidden by the pipeline, so check the result.
    if [ ! -s "${zip_filename}" ]; then
        echo "[ERROR] Download of ${rds_url} failed or returned no data" >&2
        exit 1
    fi
fi

echo "[INFO] Unzip NSRL Database zip to /nsrl/ ..."
7z e "${zip_filename}" -o/nsrl/ "${hashfile_name}" -r

echo "[INFO] Counting Hashes in /nsrl/${hashfile_name} ..."
#counting lines in hashfile without headline
hash_count=$(( $(wc -l < "/nsrl/${hashfile_name}") - 1 ))
echo "[INFO] /nsrl/${hashfile_name} contains ${hash_count} Hashes"

echo "[INFO] Build bloomfilter from NSRL Database ..."
cd /nsrl && python3 /nsrl/build.py -e "${error_rate}" -n "${hash_count}"
echo "[INFO] Listing created files ..."
ls -lah /nsrl

echo "[INFO] Deleting all unused files ..."
rm -f /nsrl/*.zip /nsrl/*.txt
ls -lah /nsrl

#update config
echo "[INFO] Update Config ..."
{
    echo "build_date = ${build_date}"
    echo "zip_filename = ${zip_filename}"
    echo "zip_md5 = ${zip_md5}"
    echo "rds_version = ${rds_version}"
} >> "${CONF}"
cat "${CONF}"
|
|
||||||
Reference in New Issue
Block a user