Update scatterhash to honor a maxsize

This commit is contained in:
TKE
2020-12-23 11:49:36 +01:00
parent b9428d9af7
commit 3b2585abe5

View File

@@ -26,29 +26,34 @@ def get_offsets(blocksize, blockcount,blocks_to_hash):
offset = int(blocksize*i) offset = int(blocksize*i)
yield offset yield offset
def get_hash(file, hashalgo, spread, maxsize=None):
    """Return a sparse hash record for an open binary file.

    The file is treated as a sequence of blocks of ``h.block_size * 65535``
    bytes.  Only a subset of those blocks is fed to the hash: the number of
    blocks is ``spread`` percent of the block count (rounded up), optionally
    capped by ``maxsize``.  Files that fit into a single block are hashed
    with a single read from the current file position.

    Args:
        file: Binary file object opened for reading.  It must expose a
            ``name`` attribute usable with ``os.path.getsize``.
        hashalgo: Algorithm name accepted by ``hashlib.new``.
        spread: Percentage of the file's blocks to hash.
        maxsize: Upper bound in bytes for the amount of data to hash, or
            ``None`` for no limit.  The cap is rounded up to whole blocks.

    Returns:
        A string ``"<hexdigest>;<blocks_to_hash>;<filesize>;<hashalgo>;<name>"``.
    """
    h = hashlib.new(hashalgo)
    filesize = os.path.getsize(file.name)
    blocksize = h.block_size * 65535
    blockcount = math.ceil(filesize / blocksize)
    blocks_to_hash = math.ceil(blockcount * spread / 100)
    if maxsize is not None and blocks_to_hash * blocksize > maxsize:
        blocks_to_hash = math.ceil(maxsize / blocksize)

    if filesize > blocksize:
        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
            # Seek on the file that was passed in, not on a module-level
            # variable that merely happens to refer to the same object.
            file.seek(offset)
            h.update(file.read(blocksize))
    else:
        h.update(file.read(blocksize))

    return "{};{};{};{};{}".format(
        h.hexdigest(), blocks_to_hash, filesize, hashalgo, file.name
    )
# Command-line entry point: parse the options, then print one hash record
# per input file.
parser = argparse.ArgumentParser(description='Sparsly hash large files. Only a given percentage of the file is actualy hashed.')
# Each option requires a value.  With nargs='?' and no const, a bare flag
# would store None and fail later with a TypeError instead of a usage error.
parser.add_argument('-p', metavar='N', action="store", dest="spread", type=int, default=10, help='percentage of file to hash. 0 < N < 100 (default=10)')
parser.add_argument('-s', metavar='N', action="store", dest="size", type=int, default=10, help='maximum amount of data per file in MB')
parser.add_argument('-c', action="store", dest="hashalgo", default="md5", help='select an hashalgorithm (default=md5)')
parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
args = parser.parse_args()

hashalgo = args.hashalgo
spread = args.spread
# The -s option is given in MB; get_hash is called with a byte count.
maxsize = args.size * 1024 * 1024

# NOTE: the loop variable stays a module-level name called `infile`; other
# code in this file may refer to it.
for infile in args.file:
    # argparse.FileType opens every file up front; close each one as soon
    # as it has been hashed so handles are not held until exit.
    with infile:
        hashvalue = get_hash(infile, hashalgo, spread, maxsize)
    print(hashvalue)