From 19fb083b883983e2ff2fca6f3b700ab26cf9fb01 Mon Sep 17 00:00:00 2001
From: tke <tobias.kessels@certbw.de>
Date: Wed, 10 Jan 2024 11:51:50 +0100
Subject: [PATCH] Refactor scatterhash.py: Align with filesystem block size,
 improve hashing efficiency, and restructure argument parsing

---
 tools/scatterhash.py | 92 +++++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/tools/scatterhash.py b/tools/scatterhash.py
index b137da6..32eb267 100755
--- a/tools/scatterhash.py
+++ b/tools/scatterhash.py
@@ -27,59 +27,71 @@ def get_offsets(blocksize, blockcount,blocks_to_hash):
             yield offset
 
 def get_hash(file,hashalgo,spread=-1,maxsize=-1,blocks_to_hash=-1):
-    h=hashlib.new(hashalgo)
+    """Hash a sparse sample of ``file``; return 'digest;blocks;filesize;algo;name'."""
+    h = hashlib.new(hashalgo)
     filesize = os.path.getsize(file.name)
-    blocksize = h.block_size*65535
+    # st_blksize is platform-dependent (absent on Windows); fall back to the
+    # former block size when it is missing or zero. Hashes accept any length.
+    fs_block_size = getattr(os.stat(file.name), "st_blksize", 0)
+    blocksize = fs_block_size or h.block_size * 65535
     blockcount = math.ceil(filesize/blocksize)
     if blocks_to_hash == -1 :
         blocks_to_hash = math.ceil(blockcount*spread/100)
         if (blocks_to_hash * blocksize) > maxsize:
             blocks_to_hash = math.ceil(maxsize/blocksize)
-    if filesize>blocksize:
-        for of in get_offsets(blocksize,blockcount,blocks_to_hash):
-            file.seek(of)
-            h.update(file.read(blocksize))
+    if filesize > blocksize:
+        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
+            file.seek(offset)
+            # hashlib consumes arbitrary-length input, so no per-chunk slicing.
+            h.update(file.read(blocksize))
     else:
         h.update(file.read(blocksize))
-    result="{};{};{};{};{}".format(h.hexdigest(),blocks_to_hash,filesize,hashalgo,file.name)
+    result = f"{h.hexdigest()};{blocks_to_hash};{filesize};{hashalgo};{file.name}"
     return result
 
-parser = argparse.ArgumentParser(description='Sparsly hash large files. Only a given percentage of the file is actualy hashed.')
 
-parser.add_argument('-p',metavar='N', action="store",dest="spread",type=int, nargs='?',default=10,help='percentage of file to hash. 0 < N < 100 (default=10)')
-parser.add_argument('-s',metavar='N', action="store",dest="size",type=int, nargs='?',default=10,help='maximum amount of data per file in MB')
-parser.add_argument('-c', action="store",dest="hashalgo",nargs='?',default="md5",help='select an hashalgorithm (default=md5)')
-parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
-parser.add_argument('-v', default=False, dest="validate", action='store_true', help='read output-file of previous run and validate hashes')
-parser.add_argument('-1', default=True, dest="mismatches", action='store_false', help='suppress mismatches')
-parser.add_argument('-0', default=True, dest="matches", action='store_false', help='suppress matches')
-args = parser.parse_args()
+def main():
+    """Parse the command line, then hash the given files or validate a hash list."""
+    parser = argparse.ArgumentParser(description='Sparsely hash large files. Only a given percentage of the file is actually hashed.')
+    parser.add_argument('-p', metavar='N', dest="spread", type=int, default=10, help='Percentage of file to hash. 0 < N < 100 (default=10)')
+    parser.add_argument('-s', metavar='N', dest="size", type=int, default=10, help='Maximum amount of data per file in MB')
+    parser.add_argument('-c', dest="hashalgo", default="md5", help='Select a hash algorithm (default=md5)')
+    parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
+    parser.add_argument('-v', dest="validate", action='store_true', help='Read output-file of previous run and validate hashes')
+    parser.add_argument('-1', dest="mismatches", action='store_false', help='Suppress mismatches')
+    parser.add_argument('-0', dest="matches", action='store_false', help='Suppress matches')
+
+    args = parser.parse_args()
 
-if not args.validate:
     hashalgo = args.hashalgo
     spread = args.spread
     maxsize = args.size * 1024 * 1024
-    for infile in args.file:
-        print(get_hash(infile,hashalgo,spread,maxsize))
-else:
-    print("validating")
-    for line in args.file[0]:
-        line=line.decode().strip()
-        hash, blocks_hashed, filesize, hashalgo, file = line.split(';')
-        blocks_hashed=int(blocks_hashed)
-        filesize=int(filesize)
-        if os.path.isfile(file):
-            if os.path.getsize(file) != filesize:
-                result="BAD_SIZE"
-            else:
-                rehash=get_hash(open(file,'rb'),hashalgo,blocks_to_hash=blocks_hashed)
-                if hash == rehash.split(";")[0]:
-                    result = "OK"
+    if not args.validate:
+        for infile in args.file:
+            print(get_hash(infile, hashalgo, spread, maxsize))
+    else:
+        print("validating")
+        for line in args.file[0]:
+            line = line.decode().strip()
+            digest, blocks_hashed, filesize, hashalgo, file = line.split(';', 4)
+            blocks_hashed = int(blocks_hashed)
+            filesize = int(filesize)
+            if os.path.isfile(file):
+                if os.path.getsize(file) != filesize:
+                    result="BAD_SIZE"
                 else:
-                    result = "BAD_HASH"
-        else:
-            result="FILE_NOT_FOUND"
-        if args.mismatches and not result == "OK":
-            print("{};{}".format(result,line))
-        elif args.matches and result == "OK":
-            print("{};{}".format(result,line))
+                    with open(file, 'rb') as handle:
+                        rehash = get_hash(handle, hashalgo, blocks_to_hash=blocks_hashed)
+                    if digest == rehash.split(";")[0]:
+                        result = "OK"
+                    else:
+                        result = "BAD_HASH"
+            else:
+                result="FILE_NOT_FOUND"
+            # Report the line only if its category (match/mismatch) is enabled.
+            show = args.matches if result == "OK" else args.mismatches
+            if show:
+                print(f"{result};{line}")
+
+if __name__ == "__main__":
+    main()