Refactor scatterhash.py: Align with filesystem block size, improve hashing efficiency, and restructure argument parsing
@@ -29,36 +29,44 @@ def get_offsets(blocksize, blockcount,blocks_to_hash):
 def get_hash(file,hashalgo,spread=-1,maxsize=-1,blocks_to_hash=-1):
     h = hashlib.new(hashalgo)
     filesize = os.path.getsize(file.name)
-    blocksize = h.block_size*65535
+    fs_block_size = os.stat(file.name).st_blksize
+    if fs_block_size % h.block_size != 0:
+        raise ValueError(f"Filesystem block size {fs_block_size} is not a multiple of hash block size {h.block_size}")
+    blocksize = fs_block_size
     blockcount = math.ceil(filesize/blocksize)
     if blocks_to_hash == -1 :
         blocks_to_hash = math.ceil(blockcount*spread/100)
     if (blocks_to_hash * blocksize) > maxsize:
         blocks_to_hash = math.ceil(maxsize/blocksize)
     if filesize > blocksize:
-        for of in get_offsets(blocksize,blockcount,blocks_to_hash):
-            file.seek(of)
-            h.update(file.read(blocksize))
+        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
+            file.seek(offset)
+            data = file.read(blocksize)
+            for i in range(0, len(data), h.block_size):
+                h.update(data[i:i + h.block_size])
     else:
         h.update(file.read(blocksize))
-    result="{};{};{};{};{}".format(h.hexdigest(),blocks_to_hash,filesize,hashalgo,file.name)
+    result = f"{h.hexdigest()};{blocks_to_hash};{filesize};{hashalgo};{file.name}"
     return result

-parser = argparse.ArgumentParser(description='Sparsly hash large files. Only a given percentage of the file is actualy hashed.')
-parser.add_argument('-p',metavar='N', action="store",dest="spread",type=int, nargs='?',default=10,help='percentage of file to hash. 0 < N < 100 (default=10)')
-parser.add_argument('-s',metavar='N', action="store",dest="size",type=int, nargs='?',default=10,help='maximum amount of data per file in MB')
-parser.add_argument('-c', action="store",dest="hashalgo",nargs='?',default="md5",help='select an hashalgorithm (default=md5)')
+
+def main():
+    parser = argparse.ArgumentParser(description='Sparsely hash large files. Only a given percentage of the file is actually hashed.')
+    parser.add_argument('-p', metavar='N', dest="spread", type=int, default=10, help='Percentage of file to hash. 0 < N < 100 (default=10)')
+    parser.add_argument('-s', metavar='N', dest="size", type=int, default=10, help='Maximum amount of data per file in MB')
+    parser.add_argument('-c', dest="hashalgo", default="md5", help='Select a hash algorithm (default=md5)')
     parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
-parser.add_argument('-v', default=False, dest="validate", action='store_true', help='read output-file of previous run and validate hashes')
-parser.add_argument('-1', default=True, dest="mismatches", action='store_false', help='suppress mismatches')
-parser.add_argument('-0', default=True, dest="matches", action='store_false', help='suppress matches')
+    parser.add_argument('-v', dest="validate", action='store_true', help='Read output-file of previous run and validate hashes')
+    parser.add_argument('-1', dest="mismatches", action='store_false', help='Suppress mismatches')
+    parser.add_argument('-0', dest="matches", action='store_false', help='Suppress matches')

     args = parser.parse_args()

-if not args.validate:
     hashalgo = args.hashalgo
     spread = args.spread
     maxsize = args.size * 1024 * 1024
+    print(args)
+    if not args.validate:
         for infile in args.file:
             print(get_hash(infile, hashalgo, spread, maxsize))
     else:
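The hashing loop above relies on get_offsets (named in the hunk header), whose body falls outside this diff. For orientation only, a helper with that signature would plausibly pick blocks_to_hash block indices deterministically, so a later -v validation run revisits the same blocks, and yield their byte offsets. A minimal sketch under those assumptions, not the code in this repository:

import random

def get_offsets(blocksize, blockcount, blocks_to_hash):
    # Sketch only: select blocks_to_hash distinct block indices, seeded from
    # blockcount so repeated runs pick the same blocks, and yield the byte
    # offset of each selected block in ascending order.
    rng = random.Random(blockcount)
    count = min(blocks_to_hash, blockcount)
    for index in sorted(rng.sample(range(blockcount), count)):
        yield index * blocksize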
@@ -79,7 +87,11 @@ else:
 result = "BAD_HASH"
 else:
 result="FILE_NOT_FOUND"
-if args.mismatches and not result == "OK":
-print("{};{}".format(result,line))
+if args.mismatches and result != "OK":
+print(f"{result};{line}")
 elif args.matches and result == "OK":
-print("{};{}".format(result,line))
+print(f"{result};{line}")

+if __name__ == "__main__":
+    main()
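For reference, the record written by get_hash and read back by the -v validation pass now has the shape hexdigest;blocks_to_hash;filesize;hashalgo;filename. A small hypothetical helper (not part of this commit) that splits one such output line back into its fields:

def parse_record(line):
    # The filename comes last, so limit the split to four separators in case
    # the path itself contains a semicolon.
    digest, blocks_to_hash, filesize, hashalgo, filename = line.rstrip("\n").split(";", 4)
    return {
        "digest": digest,
        "blocks_to_hash": int(blocks_to_hash),
        "filesize": int(filesize),
        "hashalgo": hashalgo,
        "filename": filename,
    }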