#!/usr/bin/python3
"""Sparsely hash large files: only a given percentage of each file is read and hashed."""

import sys
import hashlib
import os
import math
import argparse

import numpy as np


def even_select(N, M):
    """Return an int array of length N in which the min(M, N) blocks to hash are
    marked 0 and the remaining blocks are marked 1, spread as evenly as possible."""
    if M >= N:
        # Hash every block; also avoids divmod(N, 0) below when M == N.
        return np.zeros(N, dtype=int)
    if M > N / 2:
        # Most blocks are hashed: mark the N-M blocks to skip.
        cut = np.zeros(N, dtype=int)
        q, r = divmod(N, N - M)
        indices = [q * i + min(i, r) for i in range(N - M)]
        cut[indices] = True
    else:
        # Most blocks are skipped: mark the M blocks to hash.
        cut = np.ones(N, dtype=int)
        q, r = divmod(N, M)
        indices = [q * i + min(i, r) for i in range(M)]
        cut[indices] = False
    return cut


def get_offsets(blocksize, blockcount, blocks_to_hash):
    """Yield the byte offsets of the blocks selected for hashing."""
    selection = even_select(blockcount, blocks_to_hash)
    for i in range(blockcount):
        if selection[i] == 0:
            yield int(blocksize * i)


def get_hash(file, hashalgo, spread=-1, maxsize=-1, blocks_to_hash=-1):
    """Hash evenly spread blocks of an open binary file and return a
    'hexdigest;blocks;filesize;algo;name' record."""
    h = hashlib.new(hashalgo)
    filesize = os.path.getsize(file.name)
    fs_block_size = os.stat(file.name).st_blksize
    if fs_block_size % h.block_size != 0:
        raise ValueError(f"Filesystem block size {fs_block_size} is not a multiple "
                         f"of hash block size {h.block_size}")
    blocksize = fs_block_size
    blockcount = math.ceil(filesize / blocksize)
    if blocks_to_hash == -1:
        # Derive the block budget from the requested percentage, capped by maxsize.
        blocks_to_hash = math.ceil(blockcount * spread / 100)
        if (blocks_to_hash * blocksize) > maxsize:
            blocks_to_hash = math.ceil(maxsize / blocksize)
    if filesize > blocksize:
        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
            file.seek(offset)
            data = file.read(blocksize)
            # Feed the block to the hash in hash-block-sized chunks.
            for i in range(0, len(data), h.block_size):
                h.update(data[i:i + h.block_size])
    else:
        # The file fits in a single block: hash it whole.
        h.update(file.read(blocksize))
    return f"{h.hexdigest()};{blocks_to_hash};{filesize};{hashalgo};{file.name}"


def main():
    parser = argparse.ArgumentParser(
        description='Sparsely hash large files. Only a given percentage of the file is actually hashed.')
    parser.add_argument('-p', metavar='N', dest="spread", type=int, default=10,
                        help='Percentage of file to hash. 0 < N < 100 (default=10)')
    parser.add_argument('-s', metavar='N', dest="size", type=int, default=10,
                        help='Maximum amount of data per file in MB (default=10)')
    parser.add_argument('-c', dest="hashalgo", default="md5",
                        help='Select a hash algorithm (default=md5)')
    parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
    parser.add_argument('-v', dest="validate", action='store_true',
                        help='Read output file of a previous run and validate the hashes')
    parser.add_argument('-1', dest="mismatches", action='store_false',
                        help='Suppress mismatches')
    parser.add_argument('-0', dest="matches", action='store_false',
                        help='Suppress matches')
    args = parser.parse_args()

    hashalgo = args.hashalgo
    spread = args.spread
    maxsize = args.size * 1024 * 1024
    # Echo the parsed options to stderr so they do not pollute the hash list on stdout.
    print(args, file=sys.stderr)

    if not args.validate:
        for infile in args.file:
            print(get_hash(infile, hashalgo, spread, maxsize))
    else:
        print("validating")
        for line in args.file[0]:
            line = line.decode().strip()
            stored_hash, blocks_hashed, filesize, hashalgo, file = line.split(';')
            blocks_hashed = int(blocks_hashed)
            filesize = int(filesize)
            if os.path.isfile(file):
                if os.path.getsize(file) != filesize:
                    result = "BAD_SIZE"
                else:
                    # Re-hash the same number of blocks and compare digests.
                    with open(file, 'rb') as f:
                        rehash = get_hash(f, hashalgo, blocks_to_hash=blocks_hashed)
                    if stored_hash == rehash.split(";")[0]:
                        result = "OK"
                    else:
                        result = "BAD_HASH"
            else:
                result = "FILE_NOT_FOUND"
            if args.mismatches and result != "OK":
                print(f"{result};{line}")
            elif args.matches and result == "OK":
                print(f"{result};{line}")


if __name__ == "__main__":
    main()
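
# ---------------------------------------------------------------------------
# Example usage (a sketch; the script name and file names below are
# illustrative, not taken from the project):
#
#   # Hash ~10% of each file (the default), at most 10 MB per file, with md5:
#   ./sparsehash.py big1.iso big2.iso > hashes.txt
#
#   # Hash 5% per file with sha256 and a 50 MB cap:
#   ./sparsehash.py -p 5 -s 50 -c sha256 big1.iso > hashes.txt
#
#   # Validate a previous run; prints OK / BAD_SIZE / BAD_HASH / FILE_NOT_FOUND
#   # for each line of the hash list:
#   ./sparsehash.py -v hashes.txt
# ---------------------------------------------------------------------------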