reworked scatterhash.py

uses argparse now and is a bit faster due to hashing at least 4M chunks
This commit is contained in:
Tobias Kessels
2018-12-19 17:50:41 +01:00
parent 3d77ba9320
commit e5c6be9edc

View File

@@ -3,6 +3,8 @@ import sys
import hashlib import hashlib
import os import os
import numpy as np import numpy as np
import math
import argparse
def even_select(N, M): def even_select(N, M):
if M > N/2: if M > N/2:
@@ -18,32 +20,38 @@ def even_select(N, M):
return cut return cut
def get_offsets(blocksize, blockcount, blocks_to_hash):
    """Yield the byte offsets of the blocks chosen to be hashed.

    even_select() marks which of the `blockcount` blocks to keep
    (entries equal to 0); each kept block's offset is its index
    times `blocksize`.
    """
    keep = even_select(blockcount, blocks_to_hash)
    for block_index in range(blockcount):
        if keep[block_index] != 0:
            continue
        yield int(blocksize * block_index)
def get_hash(file, hashalgo, spread):
    """Sparsely hash an open binary file and return a summary record.

    Only roughly `spread` percent of the file's blocks are fed into the
    hash, so the result is a fast fingerprint, not a full checksum.

    Parameters:
        file     -- file object opened in binary mode (must have .name)
        hashalgo -- algorithm name accepted by hashlib.new(), e.g. "md5"
        spread   -- percentage (0-100) of the file's blocks to hash

    Returns:
        "<hexdigest>;<spread>;<filesize>;<hashalgo>;<filename>"
    """
    h = hashlib.new(hashalgo)
    filesize = os.path.getsize(file.name)
    # Large read unit: e.g. 64-byte md5 block_size * 65535 ~= 4 MB per chunk.
    blocksize = h.block_size * 65535
    blockcount = math.ceil(filesize / blocksize)
    blocks_to_hash = math.ceil(blockcount * spread / 100)
    if filesize > blocksize:
        for of in get_offsets(blocksize, blockcount, blocks_to_hash):
            # BUG FIX: original code called infile.seek(of), seeking on the
            # caller's global loop variable instead of the `file` argument;
            # it only worked because that variable happened to be the same
            # object. Seek on the parameter explicitly.
            file.seek(of)
            h.update(file.read(blocksize))
    else:
        # File fits in a single chunk: hash it whole.
        h.update(file.read(blocksize))
    result = "{};{};{};{};{}".format(h.hexdigest(), spread, filesize, hashalgo, file.name)
    return result
# Script entry: parse the command line, then sparse-hash each given file.
parser = argparse.ArgumentParser(
    description='Sparsly hash large files. Only a given percentage of the file is actualy hashed.')
parser.add_argument(
    '-p', metavar='N', action="store", dest="spread", type=int, nargs='?', default=10,
    help='percentage of file to hash. 0 < N < 100 (default=10)')
parser.add_argument(
    '-c', action="store", dest="hashalgo", nargs='?', default="md5",
    help='select an hashalgorithm (default=md5)')
parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
args = parser.parse_args()

hashalgo = args.hashalgo
spread = args.spread
for infile in args.file:
    hashvalue = get_hash(infile, hashalgo, spread)
    print(hashvalue)