Files
gists/tools/scatterhash.py

98 lines
3.8 KiB
Python
Executable File

#!/usr/bin/python3
import sys
import hashlib
import os
import numpy as np
import math
import argparse
def even_select(N, M):
    """Return an int array of length N containing exactly M zeros, spread evenly.

    A 0 marks a block selected for hashing, a 1 marks a skipped block.

    Args:
        N: total number of blocks.
        M: desired number of selected blocks; values outside [0, N] are clamped.

    Returns:
        numpy int array of length N with min(max(M, 0), N) evenly-spaced zeros.
    """
    # Clamp M so the divmod calls below never divide by zero
    # (M == N and M == 0 previously raised ZeroDivisionError).
    M = max(0, min(M, N))
    if M == N:
        return np.zeros(N, dtype=int)  # select every block
    if M == 0:
        return np.ones(N, dtype=int)   # select no blocks
    if M > N / 2:
        # Fewer skips than selections: start all-selected and
        # spread the N-M skipped blocks evenly.
        cut = np.zeros(N, dtype=int)
        q, r = divmod(N, N - M)
        cut[[q * i + min(i, r) for i in range(N - M)]] = True
    else:
        # Start all-skipped and spread the M selections evenly.
        cut = np.ones(N, dtype=int)
        q, r = divmod(N, M)
        cut[[q * i + min(i, r) for i in range(M)]] = False
    return cut
def get_offsets(blocksize, blockcount, blocks_to_hash):
    """Yield the byte offset of every block chosen for hashing.

    even_select() picks blocks_to_hash of the blockcount blocks (a 0 in
    its result marks a selected block); each selected block's offset is
    blocksize times its index.
    """
    for index, skipped in enumerate(even_select(blockcount, blocks_to_hash)):
        if not skipped:
            yield int(blocksize * index)
def get_hash(file, hashalgo, spread=-1, maxsize=-1, blocks_to_hash=-1):
    """Sparsely hash an open binary file and return a result record string.

    The number of filesystem-sized blocks fed to the hash is either given
    explicitly (blocks_to_hash) or derived from a percentage of the file
    (spread) capped at maxsize bytes; the blocks are spread evenly over
    the file.

    Args:
        file: file object opened in binary mode; file.name must be its path.
        hashalgo: any algorithm name accepted by hashlib.new().
        spread: percentage of blocks to hash; ignored when blocks_to_hash
            is given.
        maxsize: upper bound in bytes on the amount of data hashed, or -1
            for no bound.
        blocks_to_hash: explicit block count (validation uses this so a
            re-hash covers exactly the recorded number of blocks), or -1
            to derive it from spread/maxsize.

    Returns:
        "hexdigest;blocks_to_hash;filesize;hashalgo;filename"

    Raises:
        ValueError: if the filesystem block size is not a multiple of the
            hash algorithm's internal block size.
    """
    h = hashlib.new(hashalgo)
    filesize = os.path.getsize(file.name)
    fs_block_size = os.stat(file.name).st_blksize
    if fs_block_size % h.block_size != 0:
        raise ValueError(f"Filesystem block size {fs_block_size} is not a multiple of hash block size {h.block_size}")
    blocksize = fs_block_size
    blockcount = math.ceil(filesize / blocksize)
    if blocks_to_hash == -1:
        blocks_to_hash = math.ceil(blockcount * spread / 100)
    # BUG FIX: apply the size cap only when one was requested. Previously
    # maxsize=-1 (the default, used by validation mode) made the comparison
    # always true and clamped blocks_to_hash to ceil(-1/blocksize) == 0,
    # after which even_select() crashed with a ZeroDivisionError.
    if maxsize != -1 and (blocks_to_hash * blocksize) > maxsize:
        blocks_to_hash = math.ceil(maxsize / blocksize)
    if filesize > blocksize:
        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
            file.seek(offset)
            # One update per block; equivalent digest to the previous
            # h.block_size-sized sub-chunking of the same data.
            h.update(file.read(blocksize))
    else:
        # File fits in a single block: hash its entire contents.
        h.update(file.read(blocksize))
    return f"{h.hexdigest()};{blocks_to_hash};{filesize};{hashalgo};{file.name}"
def main():
    """CLI entry point: sparsely hash the given files, or with -v parse a
    previous run's output file and re-validate each recorded hash."""
    parser = argparse.ArgumentParser(description='Sparsely hash large files. Only a given percentage of the file is actually hashed.')
    parser.add_argument('-p', metavar='N', dest="spread", type=int, default=10, help='Percentage of file to hash. 0 < N < 100 (default=10)')
    parser.add_argument('-s', metavar='N', dest="size", type=int, default=10, help='Maximum amount of data per file in MB')
    parser.add_argument('-c', dest="hashalgo", default="md5", help='Select a hash algorithm (default=md5)')
    parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
    parser.add_argument('-v', dest="validate", action='store_true', help='Read output-file of previous run and validate hashes')
    parser.add_argument('-1', dest="mismatches", action='store_false', help='Suppress mismatches')
    parser.add_argument('-0', dest="matches", action='store_false', help='Suppress matches')
    args = parser.parse_args()
    hashalgo = args.hashalgo
    spread = args.spread
    maxsize = args.size * 1024 * 1024
    # (removed debug `print(args)`: it polluted stdout, which -v mode
    # later needs to parse back line by line)
    if not args.validate:
        for infile in args.file:
            print(get_hash(infile, hashalgo, spread, maxsize))
    else:
        print("validating")
        for line in args.file[0]:
            line = line.decode().strip()
            # Record format: hash;blocks;filesize;algo;filename.
            # The filename is the LAST field and may itself contain ';',
            # so split at most 4 times.
            digest, blocks_hashed, filesize, hashalgo, filename = line.split(';', 4)
            blocks_hashed = int(blocks_hashed)
            filesize = int(filesize)
            if not os.path.isfile(filename):
                result = "FILE_NOT_FOUND"
            elif os.path.getsize(filename) != filesize:
                # Cheap pre-check: a changed size means the hash can't match.
                result = "BAD_SIZE"
            else:
                # Re-hash exactly the recorded number of blocks so the same
                # offsets are covered; close the handle (original leaked it).
                with open(filename, 'rb') as f:
                    rehash = get_hash(f, hashalgo, blocks_to_hash=blocks_hashed)
                result = "OK" if digest == rehash.split(";")[0] else "BAD_HASH"
            if args.mismatches and result != "OK":
                print(f"{result};{line}")
            elif args.matches and result == "OK":
                print(f"{result};{line}")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()