From 2a9c0e625abfb64278f647b32bc95171cbd20285 Mon Sep 17 00:00:00 2001
From: Tobias Kessels
Date: Mon, 4 Feb 2019 17:30:31 +0100
Subject: [PATCH] Skip parsed files and index into a single fixed index

get_files() now loads the processed-files log and skips any file whose
recorded success rate is above the reparse threshold, as well as files
with an unknown encoding. The parsed-line counters start at zero for an
accurate rate, the rate division is guarded for empty files, and each
finished file is reported together with its rate.

create_doc() splits a ";"-joined first field into separate "username"
and "user" document fields. Documents now go into one fixed index
instead of one index per input-file hash, so re-indexed credentials
collapse onto the same _id, and main() reports how many documents each
file added.
---
 process_leak.py | 58 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/process_leak.py b/process_leak.py
index 33d4d3a..dab3155 100755
--- a/process_leak.py
+++ b/process_leak.py
@@ -78,12 +78,33 @@ def strip_badbytes(b, encoding):
 
 
 def get_files(dir):
+    files_in_log={}
+    global threshold
+    try:
+        with open(log_filename,'r') as file_log:
+            for line in file_log.readlines():
+                try:
+                    filedata=line.split(";")
+                    files_in_log[filedata[0]]=float(filedata[1])
+                except (IndexError, ValueError):
+                    print("Can't parse Line")
+                    pass
+    except OSError:
+        print("Can't open Logfile")
+        pass
+
     for (dirpath, dirnames, filenames) in walk(dir):
         for file in filenames:
             full_filename=os.path.join(dirpath, file)
+            if full_filename in files_in_log and files_in_log[full_filename] > threshold:
+                print('[~] Skipping file [Already Parsed]: %s' % full_filename)
+                continue
+
             encoding=get_file_enconding(full_filename)
             if encoding:
                 yield encoding, full_filename
+            else:
+                print('[~] Skipping file [Unknown Encoding]: %s' % full_filename)
 
 
 def get_lines(file,encoding=None):
@@ -96,8 +117,8 @@ def get_lines(file,encoding=None):
 
 def get_parsable_lines(file):
     global log_filename
-    success = 1 # initialized with 1 to preven div/0
-    failure = 1
+    success = 0  # count from zero for a true rate; empty files are guarded below
+    failure = 0
     for line in get_lines(file):
         doc = extract_email(line)
         if doc:
@@ -106,6 +127,7 @@ def get_parsable_lines(file):
         else:
             failure += 1
-    success_rate = (success / (success + failure))
+    success_rate = success / (success + failure) if (success + failure) else 0.0
+    print('[+] Done with file: {} ({})'.format(file,success_rate))
     with open(log_filename, 'a+') as file_log:
         file_log.write("{};{}\n".format(file, success_rate))
 
@@ -124,28 +146,42 @@ def create_doc(file):
             "containsUpperCase" : check_upper(cred[2]),
             "containsSpecial" : check_special(cred[2])
         }
+        username_split=cred[0].split(";")
+        if len(username_split)==2:
+            doc["username"]=username_split[0]
+            doc["user"]=username_split[1]
         id_hash=hex(mmh3.hash128(",".join((doc["user"], doc["domain"], doc["password"])), 12,signed=False) % 1000000000000000000000)
         yield id_hash, doc
 
 
 def process_file(input_file):
-    global index_prefix, doc_type_name
-    filenamehash=hex(mmh3.hash128(input_file, 12,signed=False) % 1000000000000000000000)
+    global index, doc_type_name
     for id_hash, doc in create_doc(input_file):
         yield {
-            "_index": "{}_{}".format(index_prefix, filenamehash),
+            "_index": index,
             "_type": doc_type_name,
             "_id": id_hash,
             "_source": doc
         }
 
 
-index_prefix = "leak_col1"
+index = "leak_col1"
 doc_type_name = "credential"
 log_filename = "processed_files"
+threshold = 0  # skip files already logged with a success rate above this value
 
-es = Elasticsearch()
+def main():
+    es = Elasticsearch()
+    dir=sys.argv[1]
+    for encoding, data in get_files(dir):
+        count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
+        pre=count["count"]
+        print('[*] Indexing file: %s' % data)
+        success, _ = bulk(es, process_file(data), request_timeout=60, raise_on_exception=False)
+        count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
+        post=count["count"]
+        print('[=] Added {} Documents with {}'.format(post-pre,data))
 
-for encoding, data in get_files(sys.argv[1]):
-    print('[*] Indexing file: %s' % data)
-    success, _ = bulk(es, process_file(data), request_timeout=60, raise_on_exception=False)
+
+
+main()
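For review, a minimal standalone sketch of the skip decision get_files() now
makes, assuming the log format get_parsable_lines() writes (one
"<path>;<success_rate>" line per file). load_processed, should_skip, and the
example path are illustrative, not names from the script:

def load_processed(log_path="processed_files"):
    # Parse the processed-files log into {path: success_rate}.
    rates = {}
    try:
        with open(log_path) as log:
            for line in log:
                try:
                    filedata = line.split(";")
                    rates[filedata[0]] = float(filedata[1])
                except (IndexError, ValueError):
                    pass  # malformed line, the "Can't parse Line" case
    except OSError:
        pass  # no log yet, e.g. on the first run
    return rates

def should_skip(path, rates, threshold=0):
    # Mirrors the new check in get_files(): skip only when an earlier run
    # recorded a success rate strictly above the threshold.
    return path in rates and rates[path] > threshold

print(should_skip("/data/leaks/col1/a.txt", load_processed()))  # illustrative path

With threshold = 0 this skips anything that previously yielded even one
credential; raising the threshold forces low-yield files to be parsed again.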
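The deduplication argument for moving from per-file indices to one fixed index
rests on the _id derivation in create_doc(); a sketch with made-up credential
values:

import mmh3

def credential_id(user, domain, password):
    # Same derivation as create_doc(): 128-bit MurmurHash3 of the joined
    # fields with seed 12, reduced mod 10**21 and hex-encoded.
    key = ",".join((user, domain, password))
    return hex(mmh3.hash128(key, 12, signed=False) % 1000000000000000000000)

print(credential_id("jane.doe", "example.com", "hunter2"))

Because the id depends only on the credential fields, the same credential
found in several input files now overwrites one document instead of being
indexed once per file-hash index.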
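One caveat on the before/after counting in main(): Elasticsearch counts are
near-real-time, so es.count() may not yet see documents from the preceding
bulk call and the printed delta can undercount. A sketch of the usual
workaround, assuming the index and type names from this patch:

from elasticsearch import Elasticsearch

es = Elasticsearch()
# Force a refresh so freshly bulk-indexed documents become countable.
es.indices.refresh(index="leak_col1")
print(es.count(index="leak_col1", doc_type="credential")["count"])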