#!/usr/bin/python3
"""Sparsely hash large files.

Only a given percentage of each file is actually hashed: the file is split
into fixed-size blocks, an evenly spread subset of those blocks is selected,
and only the selected blocks are fed to the hash. One result line is printed
per file in the format:

    digest;blocks_hashed;filesize;algorithm;filename
"""
import argparse
import hashlib
import math
import os

import numpy as np


def even_select(N, M):
    """Return a 0/1 mask of length N with exactly M zeros spread evenly.

    A zero at index i means block i will be hashed.
    """
    M = max(1, min(M, N))  # clamp to a sane range to avoid divmod() by zero
    if M == N:
        return np.zeros(N, dtype=int)  # hash every block
    if M > N / 2:
        # Fewer blocks are skipped than hashed: start from all-hashed (all
        # zeros) and spread the N-M skipped blocks evenly.
        cut = np.zeros(N, dtype=int)
        q, r = divmod(N, N - M)
        indices = [q * i + min(i, r) for i in range(N - M)]
        cut[indices] = 1
    else:
        # Fewer blocks are hashed than skipped: start from all-skipped (all
        # ones) and spread the M hashed blocks evenly.
        cut = np.ones(N, dtype=int)
        q, r = divmod(N, M)
        indices = [q * i + min(i, r) for i in range(M)]
        cut[indices] = 0
    return cut


def get_offsets(blocksize, blockcount, blocks_to_hash):
    """Yield the byte offset of every block selected for hashing."""
    selection = even_select(blockcount, blocks_to_hash)
    for i in range(blockcount):
        if selection[i] == 0:
            yield blocksize * i


def get_hash(file, hashalgo, spread, maxsize):
    """Hash an evenly spread sample of `file` and return a result line."""
    h = hashlib.new(hashalgo)
    filesize = os.path.getsize(file.name)
    blocksize = h.block_size * 65535  # e.g. 64 B * 65535, roughly 4 MB for md5
    blockcount = math.ceil(filesize / blocksize)
    blocks_to_hash = math.ceil(blockcount * spread / 100)
    # Never read more than maxsize bytes per file.
    if blocks_to_hash * blocksize > maxsize:
        blocks_to_hash = math.ceil(maxsize / blocksize)
    if filesize > blocksize:
        for offset in get_offsets(blocksize, blockcount, blocks_to_hash):
            file.seek(offset)  # jump to the next selected block
            h.update(file.read(blocksize))
    else:
        # The whole file fits in a single block: hash it completely.
        h.update(file.read(blocksize))
    return "{};{};{};{};{}".format(
        h.hexdigest(), blocks_to_hash, filesize, hashalgo, file.name)


parser = argparse.ArgumentParser(
    description='Sparsely hash large files. Only a given percentage '
                'of each file is actually hashed.')
parser.add_argument('-p', metavar='N', dest='spread', type=int, default=10,
                    help='percentage of file to hash, 0 < N < 100 (default=10)')
parser.add_argument('-s', metavar='N', dest='size', type=int, default=10,
                    help='maximum amount of data to read per file in MB '
                         '(default=10)')
parser.add_argument('-c', dest='hashalgo', default='md5',
                    help='select a hash algorithm (default=md5)')
parser.add_argument('file', type=argparse.FileType('rb'), nargs='+')
args = parser.parse_args()

maxsize = args.size * 1024 * 1024  # MB -> bytes
for infile in args.file:
    print(get_hash(infile, args.hashalgo, args.spread, maxsize))
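
# A quick sanity check of the selection logic, with hypothetical values:
# for N=10 blocks and M=3 blocks to hash, the mask holds exactly three
# zeros (the hashed blocks), spread evenly across the file:
#
#   >>> even_select(10, 3)
#   array([0, 1, 1, 1, 0, 1, 1, 0, 1, 1])
#
# Hypothetical invocation (script and file names are made up); one line is
# printed per file as digest;blocks_hashed;filesize;algorithm;filename:
#
#   $ ./sparsehash.py -p 5 -s 20 -c sha256 big.iso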