From 19fb083b883983e2ff2fca6f3b700ab26cf9fb01 Mon Sep 17 00:00:00 2001
From: tke
Date: Wed, 10 Jan 2024 11:51:50 +0100
Subject: [PATCH] Refactor scatterhash.py: Align with filesystem block size, improve hashing efficiency, and restructure argument parsing

---
Review notes (free-form text in this section is ignored when the patch is applied):
 * main() no longer prints the parsed argparse Namespace. That line went to
   stdout ahead of the "hash;blocks;size;algo;file" records, so a saved run
   could not be read back with -v: the five-way unpack of line.split(';')
   fails on it.
 * get_hash() passes each block that was read to the hash with a single
   update() call instead of slicing it into h.block_size pieces. The digest
   is identical; only the per-slice interpreter overhead is gone.
 * Validation mode opens each listed file in a `with` block so the handle is
   closed once the file has been re-hashed.
 * Line breaks and indentation were restored from a whitespace-collapsed copy
   of this patch. Indentation of context and removed lines is inferred from
   the code structure; verify against the tree (or apply with
   --ignore-whitespace) if the hunk is rejected.
 * The post-image blob id on the index line predates these review changes.
 * Heads-up: blocksize now comes from st_blksize instead of
   h.block_size*65535, so digests depend on the filesystem a file is stored
   on and output written by the previous version will not validate.

 tools/scatterhash.py | 92 +++++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/tools/scatterhash.py b/tools/scatterhash.py
index b137da6..32eb267 100755
--- a/tools/scatterhash.py
+++ b/tools/scatterhash.py
@@ -27,59 +27,71 @@ def get_offsets(blocksize, blockcount,blocks_to_hash):
         yield offset
 
 def get_hash(file,hashalgo,spread=-1,maxsize=-1,blocks_to_hash=-1):
-    h=hashlib.new(hashalgo)
+    h = hashlib.new(hashalgo)
     filesize = os.path.getsize(file.name)
-    blocksize = h.block_size*65535
+    fs_block_size = os.stat(file.name).st_blksize
+    if fs_block_size % h.block_size != 0:
+        raise ValueError(f"Filesystem block size {fs_block_size} is not a multiple of hash block size {h.block_size}")
+    blocksize = fs_block_size
     blockcount = math.ceil(filesize/blocksize)
     if blocks_to_hash == -1 :
         blocks_to_hash = math.ceil(blockcount*spread/100)
         if (blocks_to_hash * blocksize) > maxsize:
             blocks_to_hash = math.ceil(maxsize/blocksize)
-    if filesize>blocksize:
-        for of in get_offsets(blocksize,blockcount,blocks_to_hash):
-            file.seek(of)
-            h.update(file.read(blocksize))
+    if filesize > blocksize:
+        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
+            file.seek(offset)
+            # One update() per read: hashlib buffers partial blocks internally.
+            h.update(file.read(blocksize))
     else:
         h.update(file.read(blocksize))
-    result="{};{};{};{};{}".format(h.hexdigest(),blocks_to_hash,filesize,hashalgo,file.name)
+    result = f"{h.hexdigest()};{blocks_to_hash};{filesize};{hashalgo};{file.name}"
     return result
 
 
-parser = argparse.ArgumentParser(description='Sparsly hash large files. Only a given percentage of the file is actualy hashed.')
-parser.add_argument('-p',metavar='N', action="store",dest="spread",type=int, nargs='?',default=10,help='percentage of file to hash. 0 < N < 100 (default=10)')
-parser.add_argument('-s',metavar='N', action="store",dest="size",type=int, nargs='?',default=10,help='maximum amount of data per file in MB')
-parser.add_argument('-c', action="store",dest="hashalgo",nargs='?',default="md5",help='select an hashalgorithm (default=md5)')
-parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
-parser.add_argument('-v', default=False, dest="validate", action='store_true', help='read output-file of previous run and validate hashes')
-parser.add_argument('-1', default=True, dest="mismatches", action='store_false', help='suppress mismatches')
-parser.add_argument('-0', default=True, dest="matches", action='store_false', help='suppress matches')
-args = parser.parse_args()
+def main():
+    """Parse the command line, then hash the given files or validate a previous run's output."""
+    parser = argparse.ArgumentParser(description='Sparsely hash large files. Only a given percentage of the file is actually hashed.')
+    parser.add_argument('-p', metavar='N', dest="spread", type=int, default=10, help='Percentage of file to hash. 0 < N < 100 (default=10)')
+    parser.add_argument('-s', metavar='N', dest="size", type=int, default=10, help='Maximum amount of data per file in MB')
+    parser.add_argument('-c', dest="hashalgo", default="md5", help='Select a hash algorithm (default=md5)')
+    parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
+    parser.add_argument('-v', dest="validate", action='store_true', help='Read output-file of previous run and validate hashes')
+    parser.add_argument('-1', dest="mismatches", action='store_false', help='Suppress mismatches')
+    parser.add_argument('-0', dest="matches", action='store_false', help='Suppress matches')
+
+    args = parser.parse_args()
 
-if not args.validate:
     hashalgo = args.hashalgo
     spread = args.spread
     maxsize = args.size * 1024 * 1024
-    for infile in args.file:
-        print(get_hash(infile,hashalgo,spread,maxsize))
-else:
-    print("validating")
-    for line in args.file[0]:
-        line=line.decode().strip()
-        hash, blocks_hashed, filesize, hashalgo, file = line.split(';')
-        blocks_hashed=int(blocks_hashed)
-        filesize=int(filesize)
-        if os.path.isfile(file):
-            if os.path.getsize(file) != filesize:
-                result="BAD_SIZE"
-            else:
-                rehash=get_hash(open(file,'rb'),hashalgo,blocks_to_hash=blocks_hashed)
-                if hash == rehash.split(";")[0]:
-                    result = "OK"
+    if not args.validate:
+        for infile in args.file:
+            print(get_hash(infile, hashalgo, spread, maxsize))
+    else:
+        print("validating")
+        for line in args.file[0]:
+            line = line.decode().strip()
+            hash, blocks_hashed, filesize, hashalgo, file = line.split(';')
+            blocks_hashed = int(blocks_hashed)
+            filesize = int(filesize)
+            if os.path.isfile(file):
+                if os.path.getsize(file) != filesize:
+                    result="BAD_SIZE"
                 else:
-                    result = "BAD_HASH"
-        else:
-            result="FILE_NOT_FOUND"
-        if args.mismatches and not result == "OK":
-            print("{};{}".format(result,line))
-        elif args.matches and result == "OK":
-            print("{};{}".format(result,line))
+                    with open(file, 'rb') as candidate:
+                        rehash = get_hash(candidate, hashalgo, blocks_to_hash=blocks_hashed)
+                    if hash == rehash.split(";")[0]:
+                        result = "OK"
+                    else:
+                        result = "BAD_HASH"
+            else:
+                result="FILE_NOT_FOUND"
+
+            if args.mismatches and result != "OK":
+                print(f"{result};{line}")
+            elif args.matches and result == "OK":
+                print(f"{result};{line}")
+
+if __name__ == "__main__":
+    main()