Replace the self-built NSRL md5 bloom with CIRCL's hashlookup bloom (SHA-1).

Reviewer changes folded into the post-image:
  * Dockerfile: run under tini again (python as PID 1 ignores SIGTERM), and
    record Last-Modified from the download response itself instead of a
    second HEAD request.
  * nsrl/search.py: -0/-1 now also filter -v output, as in the removed
    implementation; the positional is shown as <sha1> in --help; bloom.info
    is read through a context manager.
  * Usage comments spell out the <sha1> placeholder.

Line breaks and indentation were rebuilt from a whitespace-collapsed copy of
the patch; the blob ids on the "index" lines are those of the original patch.

diff --git a/Dockerfile b/Dockerfile
index bbf9398..d359778 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,41 @@
-FROM alpine AS builder
+# tabledevil/nsrl — known-file hash filter, now backed by CIRCL hashlookup.
+#
+# Replaces the old self-built NSRL RDS md5 bloom (frozen at RDS 2.72 / 2021)
+# with CIRCL's hashlookup-full.bloom: SHA-1, NSRL + many more known-good
+# sources, refreshed upstream. Downloaded at build time (~1 GB) so lookups
+# are fully offline; the bot rebuilds on a monthly cadence.
+#
+#   # single hashes (old CLI preserved, now SHA-1):
+#   docker run --rm tabledevil/nsrl <sha1> [<sha1> ...]
+#   cat sha1s.txt | docker run --rm -i tabledevil/nsrl -s -0   # only misses
+#
+#   # analyse a whole directory tree (hashlookup-forensic-analyser):
+#   docker run --rm -v /evidence:/data:ro tabledevil/nsrl analyse -d /data
+FROM python:3.12-slim
 
-COPY nsrl /nsrl
-RUN apk add -U tini alpine-sdk python3 python3-dev py3-pip p7zip \
-  && python3 -m pip install git+https://github.com/jaybaird/python-bloomfilter/ \
-  && /nsrl/shrink_nsrl.sh \
-  && apk del --purge alpine-sdk py3-pip python3-dev p7zip \
-  && rm -rf /tmp/* /root/.cache /var/cache/apk/* /nsrl/shrink_nsrl.sh
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates curl git libmagic1 tini \
+    && rm -rf /var/lib/apt/lists/*
 
-FROM alpine
-LABEL maintainer="tabledevil"
-COPY --from=builder / /
+RUN pip install --no-cache-dir flor requests pytz filemagic \
+    && git clone --depth=1 https://github.com/hashlookup/hashlookup-forensic-analyser /opt/hfa
+
+# The bloom filter is the data payload — fetched fresh every (monthly) rebuild.
+# Response headers are captured from the same request that fetches the payload,
+# so bloom.info always describes the file that is actually in the image.
+RUN mkdir -p /nsrl \
+    && curl -fsSL -D /tmp/bloom.headers -o /nsrl/hashlookup-full.bloom \
+        https://cra.circl.lu/hashlookup/hashlookup-full.bloom \
+    && { echo "source = https://cra.circl.lu/hashlookup/hashlookup-full.bloom"; \
+         grep -i '^last-modified' /tmp/bloom.headers || true; } > /nsrl/bloom.info \
+    && rm -f /tmp/bloom.headers
+
+COPY nsrl/search.py /nsrl/search.py
+COPY entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
 
 WORKDIR /nsrl
-
-ENTRYPOINT ["/sbin/tini","--","python3","/nsrl/search.py"]
-
+# tini as PID 1 forwards signals; a bare python PID 1 would ignore SIGTERM.
+ENTRYPOINT ["/usr/bin/tini", "--", "/entrypoint.sh"]
 CMD ["-h"]
 
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100755
index 0000000..a438039
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# nsrl entrypoint — dispatches between single-hash lookup and directory mode.
+#   <sha1>... | -s | -h        -> search.py (old NSRL CLI, SHA-1)
+#   analyse [args]             -> hashlookup-forensic-analyser with the
+#                                 bundled bloom (e.g. analyse -d /data)
+set -euo pipefail
+
+if [ "${1:-}" = "analyse" ] || [ "${1:-}" = "analyze" ]; then
+    shift
+    exec python3 /opt/hfa/bin/hashlookup-analyser.py \
+        --bloomfilters /nsrl/hashlookup-full.bloom "$@"
+fi
+exec python3 /nsrl/search.py "$@"
diff --git a/nsrl/build.py b/nsrl/build.py
deleted file mode 100644
index ffa1052..0000000
--- a/nsrl/build.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# !/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import binascii
-import os
-import configparser
-from pybloom import BloomFilter
-
-
-import argparse
-
-
-
-# reference - http://stackoverflow.com/a/9631635
-def blocks(this_file, size=65536):
-    while True:
-        b = this_file.read(size)
-        if not b:
-            break
-        yield b
-
-
-def main():
-    parser = argparse.ArgumentParser(prog='build.py')
-    parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
-    config = parser.add_mutually_exclusive_group()
-    config.add_argument('-f', "--config", help='Config file with all settings')
-    settings = config.add_argument_group()
-    settings.add_argument('-e','--error-rate', type=float, help="Error Rate for False-Positives")
-    settings.add_argument('-n','--hashcount',type=int, help="Provide the hashcount")
-    settings.add_argument('-c','--column', type=int, help="Which Column of inputfile should be processed (0,1,...)")
-    settings.add_argument('-l','--label', help="What kind of Data is beeing processed (MD5,filenames,...)")
-    settings.add_argument('-d','--delimiter', help="Which char is used to delimit columns in inputfile")
-    settings.add_argument('-i','--inputfile', help="Path of input file")
-    settings.add_argument('-o','--outputfile', help="Path of input file")
-
-
-    args = parser.parse_args()
-
-    #check if config-file was given
-    default_config_file='/nsrl/nsrl.conf'
-    configfiles=[default_config_file]
-    if not args.config is None:
-        #add user config
-        if os.path.isfile(args.config):
-            configfiles.append(args.config)
-
-    #build config
-    conf = configparser.ConfigParser()
-    conf.read(configfiles)
-    #add commandline options
-    # conf=config["config"]
-    if args.error_rate:
-        conf.set("config","error_rate",str(args.error_rate))
-    if args.hashcount:
-        conf.set("config","hash_count",str(args.hashcount))
-    if args.column:
-        conf.set("config","hashfile_column",str(args.column))
-    if args.label:
-        conf.set("config","hashfile_type",str(args.label))
-    if args.delimiter:
-        conf.set("config","hashfile_delimiter",str(args.delimiter))
-    if args.inputfile:
-        conf.set("config","hashfile_path",str(args.inputfile))
-
-    nsrl_path='/nsrl/NSRLFile.txt'
-    error_rate=0.01
-    hashfile_delimiter=','
-    hashfile_column=0
-    hashfile_type='Hash'
-    nsrl_path=conf.get("config","hashfile_path")
-    error_rate=conf.getfloat("config",'error_rate')
-    hashfile_delimiter=conf.get("config",'hashfile_delimiter')
-    hashfile_column=conf.getint("config",'hashfile_column')
-    hashfile_type=conf.get("config",'hashfile_type')
-
-    print("[BUILDING] Using error-rate: {}".format(error_rate))
-    if os.path.isfile(nsrl_path):
-        print("[BUILDING] Reading in NSRL Database")
-        if not conf.has_option("config","hash_count"):
-            with open(nsrl_path) as f_line:
-                # Strip off header
-                _ = f_line.readline()
-                print("[BUILDING] Calculating number of entries in Inputfile...")
-                num_lines = sum(bl.count("\n") for bl in blocks(f_line))
-                conf.set("config",'hash_count',str(num_lines))
-        else:
-            num_lines=conf.getint("config","hash_count")
-        print("[BUILDING] There are {} {}s in the Database".format(num_lines,hashfile_type))
-        with open(nsrl_path) as f_nsrl:
-            # Strip off header
-            _ = f_nsrl.readline()
-            print("[BUILDING] Creating bloomfilter")
-            bf = BloomFilter(num_lines, error_rate)
-            print("[BUILDING] Inserting {} into bloomfilter".format(hashfile_type))
-            # sha1 hash is in column 0
-            for line in f_nsrl:
-                hashline = line.split(hashfile_delimiter)[hashfile_column].strip('"')
-                if hashline:
-                    try:
-                        hash = binascii.unhexlify(hashline)
-                        bf.add(hash)
-                    except Exception as e:
-                        print("[ERROR] %s" % e)
-            print("[BUILDING] NSRL bloomfilter contains {} items.".format(len(bf)))
-            with open('nsrl.bloom', 'wb') as nb:
-                bf.tofile(nb)
-            print("[BUILDING] Complete")
-    else:
-        print("[ERROR] No such file or directory: %s", nsrl_path)
-
-    #save config
-    with open(default_config_file,'w') as configfile:
-        conf.write(configfile)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/nsrl/nsrl.conf b/nsrl/nsrl.conf
deleted file mode 100644
index 7578c39..0000000
--- a/nsrl/nsrl.conf
+++ /dev/null
@@ -1,10 +0,0 @@
-[config]
-rds_url = https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/rds_modernm.zip
-rds_name = Reduced Modern
-version_url = https://s3.amazonaws.com/rds.nsrl.nist.gov/RDS/current/version.txt
-hashfile_name = NSRLFile.txt
-hashfile_path = /nsrl/NSRLFile.txt
-hashfile_type = md5
-hashfile_column = 1
-hashfile_delimiter = ,
-error_rate = 0.01
diff --git a/nsrl/search.py b/nsrl/search.py
index 9e0a977..e9d5ae1 100755
--- a/nsrl/search.py
+++ b/nsrl/search.py
@@ -1,64 +1,76 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
+# Known-file SHA-1 lookup against the offline CIRCL hashlookup bloom filter.
+# CLI is compatible with the old NSRL md5 search.py (hashes as args or -s
+# stdin; -0/-1 to suppress hits/misses; -v verbose) — but hashes are SHA-1.
 
 import argparse
-import binascii
-import configparser
+import re
 import sys
 
-from pybloom import BloomFilter
+from flor import BloomFilter
+
+BLOOM_PATH = "/nsrl/hashlookup-full.bloom"
+INFO_PATH = "/nsrl/bloom.info"
+SHA1_RE = re.compile(r"^[0-9a-fA-F]{40}$")
+
 
 def main():
-    default_config_file='/nsrl/nsrl.conf'
-    config = configparser.ConfigParser()
-    config.read(default_config_file)
-    #add commandline options
-    hash_type=config.get('config','hashfile_type')
-
-    parser = argparse.ArgumentParser(prog='nsrl')
-    parser.add_argument("-v", "--verbose", help="Display verbose output message", action="store_true", required=False)
-    parser.add_argument("-0", "--no-hits", help="Suppress Output of matching hashes", action="store_true", required=False)
-    parser.add_argument("-1", "--no-misses", help="Suppress Output of mismatching hashes", action="store_true", required=False)
+    """Look up SHA-1 hashes in the bundled bloom filter and print the results.
+
+    Returns the exit status: 0, or 2 if any input was not a 40-hex-digit
+    SHA-1 (such inputs are reported on stderr and skipped).
+    """
+    parser = argparse.ArgumentParser(
+        prog="nsrl",
+        description="Offline known-file lookup (CIRCL hashlookup bloom, SHA-1).",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="Display verbose output")
+    parser.add_argument("-0", "--no-hits", action="store_true",
+                        help="Suppress output of matching (known) hashes")
+    parser.add_argument("-1", "--no-misses", action="store_true",
+                        help="Suppress output of unknown hashes")
     inputs = parser.add_mutually_exclusive_group(required=True)
-    inputs.add_argument('hash', metavar='<{}>'.format(hash_type), type=str, nargs='*', default=[], help='{} hash to search for.'.format(hash_type))
-    inputs.add_argument('-s','--stdin',help="Read hashes from stdin", action="store_true")
+    inputs.add_argument("hash", metavar="<sha1>", type=str, nargs="*",
+                        default=[], help="SHA-1 hash(es) to look up")
+    inputs.add_argument("-s", "--stdin", action="store_true",
+                        help="Read hashes from stdin (one per line)")
 
 
     args = parser.parse_args()
 
     if args.verbose:
-        print("Version INFO: {}".format(config.get('config',"rds_version")))
-        print("Error Rate: {}".format(config.get('config',"error_rate")))
-        print("Build Date: {}".format(config.get('config',"build_date")))
-        print("Filename: {}".format(config.get('config',"hashfile_name")))
-        print("Hashcount: {}".format(config.get('config',"hash_count")))
+        try:
+            with open(INFO_PATH) as info:
+                sys.stderr.write(info.read())
+        except OSError:
+            pass  # build metadata is best-effort; lookups work without it
+    bf = BloomFilter()
+    with open(BLOOM_PATH, "rb") as fh:
+        bf.read(fh)
+    if args.stdin:
+        hashlist = [line.strip() for line in sys.stdin if line.strip()]
+    else:
+        hashlist = args.hash
 
-    with open('nsrl.bloom', 'rb') as nb:
-        bf = BloomFilter.fromfile(nb)
+    rc = 0
+    for hash_hex in hashlist:
+        if not SHA1_RE.match(hash_hex):
+            print(f"!:{hash_hex} (not a sha1)", file=sys.stderr)
+            rc = 2
+            continue
+        # hashlookup blooms store uppercase-hex SHA-1 strings
+        is_known = hash_hex.upper().encode() in bf
+        # Apply -0/-1 before choosing a format so they also filter -v output.
+        if (is_known and not args.no_hits) or (not is_known and not args.no_misses):
+            if args.verbose:
+                print(f"{hash_hex}:{is_known}")
+            elif args.no_hits != args.no_misses:
+                print(hash_hex)
+            else:
+                print(f"{'+' if is_known else '-'}:{hash_hex}")
+    return rc
 
-    if args.stdin:
-        hashlist=[hash.strip() for hash in sys.stdin.readlines()]
-    else:
-        hashlist=args.hash
-    for hash_hex in hashlist:
-        hash = binascii.unhexlify(hash_hex)
-        output=""
-
-        # only print output if for mismatches if selected
-        hash_is_a_match=(hash in bf)
-        if (hash_is_a_match and not args.no_hits) or (not hash_is_a_match and not args.no_misses):
-            #output
-            if args.verbose:
-                output = "{}:{}".format(hash_hex,hash_is_a_match)
-            elif args.no_hits != args.no_misses :
-                output = "{}".format(hash_hex)
-            else:
-                output = "{}:{}".format("+"if hash_is_a_match else "-",hash_hex)
-            print(output)
-    return
 
 if __name__ == "__main__":
-    try:
-        main()
-    except Exception as e:
-        print("Error: %s" % e)
+    sys.exit(main())
diff --git a/nsrl/shrink_nsrl.sh b/nsrl/shrink_nsrl.sh
deleted file mode 100755
index f43836b..0000000
--- a/nsrl/shrink_nsrl.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/sh
-#set -x
-
-#read information from config
-rds_url=$(cat /nsrl/nsrl.conf | grep 'rds_url' | cut -f2- -d= | grep -o -E '\S+.*')
-echo "rds_url=${rds_url}"
-version_url=$(cat /nsrl/nsrl.conf | grep 'version_url' | cut -f2- -d= | grep -o -E '\S+.*')
-echo "version_url=${version_url}"
-error_rate=$(cat /nsrl/nsrl.conf | grep 'error_rate' | cut -f2- -d=| grep -o -E '\S+.*' )
-echo "error_rate=${error_rate}"
-rds_name=$(cat /nsrl/nsrl.conf | grep 'rds_name' | cut -f2- -d= | grep -o -E '\S+.*')
-echo "rds_name=${rds_name}"
-hashfile_name=$(cat /nsrl/nsrl.conf | grep 'hashfile_name' | cut -f2- -d= | grep -o -E '\S+.*')
-echo "hashfile_name=${hashfile_name}"
-build_date=$(date +%Y%M%d_%H%M)
-echo "build_date=${build_date}"
-ls /nsrl/*.zip
-#check if a zipfile was provided
-if [ -f /nsrl/*.zip ]; then
-    zip_filename=$(ls /nsrl/*.zip | head -n1)
-    echo "[INFO] ZIP-File Exists : ${zip_filename}"
-    zip_md5=$(md5sum "${zip_filename}" | cut -f1 -d" ")
-    rds_version=$(stat -c %y "${zip_filename}" )
-else
-    zip_filename=${rds_url##*/}
-    echo "[INFO] Downloading NSRL Sets (${rds_name}):"
-    echo "[INFO] URL: ${rds_url}"
-    #Downloading and Hashing at the same time
-    zip_md5=$(wget -O - "${rds_url}" 2>/dev/null | tee "/nsrl/${zip_filename}" | md5sum | cut -f1 -d" ")
-    rds_version=$(wget -O - "${version_url}" 2>/dev/null | tee "/nsrl/.version" | head -n1 )
-    zip_filename=$(ls /nsrl/*.zip | head -n1)
-fi
-
-
-echo "[INFO] Unzip NSRL Database zip to /nsrl/ ..."
-7z e "${zip_filename}" -o/nsrl/ NSRLFile.txt -r
-
-echo "[INFO] Counting Hashes in /nsrl/${hashfile_name} ..."
-#counting lines in hashfile without headline
-let hash_count=$(cat "/nsrl/${hashfile_name}"|wc -l )-1
-echo "[INFO] /nsrl/${hashfile_name} contains ${hash_count} Hashes"
-
-echo "[INFO] Build bloomfilter from NSRL Database ..."
-cd /nsrl && python3 /nsrl/build.py -e "${error_rate}" -n "${hash_count}"
-echo "[INFO] Listing created files ..."
-ls -lah /nsrl
-
-echo "[INFO] Deleting all unused files ..."
-rm -f /nsrl/*.zip /nsrl/*.txt
-ls -lah /nsrl
-
-#update config
-echo "[INFO] Update Config ..."
-echo "build_date = ${build_date}" >> /nsrl/nsrl.conf
-echo "zip_filename = ${zip_filename}" >> /nsrl/nsrl.conf
-echo "zip_md5 = ${zip_md5}" >> /nsrl/nsrl.conf
-echo "rds_version = ${rds_version}" >> /nsrl/nsrl.conf
-cat /nsrl/nsrl.conf