#!/usr/bin/python3
"""Parse credential leak dumps (e-mail/password lines) and index them into a local Elasticsearch."""
import os
import re
import mmh3  # only needed for the commented-out murmur-hash variants below
import string
import sys
from os import walk
from chardet.universaldetector import UniversalDetector
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from multiprocessing import Pool, Lock
import multiprocessing
import hashlib
import json
import argparse

# shared by the fork()ed Pool workers to serialize log output
lock = Lock()


def log_to_file(text):
    global log_filename
    with lock:  # worker blocks here until it can obtain the lock
        with open(log_filename, 'a+') as file_log:
            file_log.write("{}\n".format(text))


def log_to_console(text):
    ps = multiprocessing.current_process()
    with lock:  # worker blocks here until it can obtain the lock
        print("[{}]:{}".format(ps.pid, text))


def get_mask(s):
    # build a hashcat-style mask (?d digit, ?l lower, ?u upper, ?s other) for the password
    mask = ""
    for c in s:
        if c.isdigit():
            mask += "?d"
        elif c.islower():
            mask += "?l"
        elif c.isupper():
            mask += "?u"
        else:
            mask += "?s"
    return mask


def check_special(s):
    for c in s:
        if c in string.punctuation or c.isspace():
            return True
    return False


def check_upper(s):
    return any(i.isupper() for i in s)


def check_lower(s):
    return any(i.islower() for i in s)


def check_digit(s):
    return any(i.isdigit() for i in s)


def get_file_encoding(file):
    # detect the file encoding from at most the first 1000 lines
    detector = UniversalDetector()
    with open(file, 'rb') as daf:
        i = 1000
        for line in daf:
            i -= 1
            detector.feed(line)
            if detector.done or i == 0:
                break
    detector.close()
    return detector.result["encoding"]


# matches "<user>@<domain><separator><password>", e.g. "alice@example.com:hunter2"
pattern = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")


def extract_email(line):
    match = pattern.search(line)
    if match:
        return (match.group(1), match.group(2), match.group(4))
    return None


def strip_badbytes(b, encoding):
    return b.decode(encoding, errors='ignore').strip()


# list all files in dir, skipping files the log marks as already parsed
def get_files(dir):
    files_in_log = {}
    global threshold
    try:
        with open(log_filename, 'r') as file_log:
            for line in file_log:
                try:
                    filedata = line.split(";")
                    files_in_log[filedata[0]] = float(filedata[1])
                except Exception:
                    log_to_console("Can't parse Line")
    except Exception:
        log_to_console("Can't open Logfile")
    for (dirpath, dirnames, filenames) in walk(dir):
        for file in filenames:
            full_filename = os.path.join(dirpath, file)
            if full_filename in files_in_log and files_in_log[full_filename] > threshold:
                log_to_console('[~] Skipping file [Already Parsed]: %s' % full_filename)
                continue
            yield full_filename


def get_lines(file, encoding=None):
    if not encoding:
        encoding = get_file_encoding(file)
    with open(file, 'rb') as f:
        return [strip_badbytes(line, encoding) for line in f]
        # generator alternative:
        # for line in f:
        #     yield strip_badbytes(line, encoding)


def get_parsable_lines(file, encoding):
    success = 0
    failure = 0
    for line in get_lines(file, encoding):
        doc = extract_email(line)
        if doc:
            success += 1
            yield doc
        else:
            failure += 1
    total = success + failure
    success_rate = success / total if total else 0.0  # guard against empty files (div/0)
    log_to_console('[+] Done parsing file: {} ({})'.format(file, success_rate))
    log_to_file("{};{}".format(file, success_rate))


def get_hash(text):
    hash_object = hashlib.md5(text.encode())
    return hash_object.hexdigest()
    # return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]


def get_user_pw_hash(text):
    return get_hash(text)
    # return hex(mmh3.hash128(text, 12, signed=False) % 1000000000000000).split("x")[1]


def create_doc(file, encoding):
    for cred in get_parsable_lines(file, encoding):
        doc = {
            "user": cred[0],
            "domain": cred[1],
            "password": cred[2][:129],  # cap the stored password length
            "file": file,
            "length": len(cred[2]),
            "passwordMask": get_mask(cred[2]),
            "containsDigits": check_digit(cred[2]),
            "containsLowerCase": check_lower(cred[2]),
            "containsUpperCase": check_upper(cred[2]),
            "containsSpecial": check_special(cred[2])
        }
        # some dumps prefix the e-mail with an extra field, e.g. "username;user@domain:pw"
        username_split = cred[0].split(";")
        if len(username_split) == 2:
            if len(username_split[0]) > 0 and len(username_split[1]) > 0:
                doc["username"] = username_split[0]
                doc["user"] = username_split[1]
        # document id = hash of user+domain+password; its first hex digit selects the target index
        id_hash = get_user_pw_hash("{}{}{}".format(doc["user"], doc["domain"], doc["password"]))
        id_domain = id_hash[:1]
        yield id_domain, id_hash, doc


def process_file(input_file, encoding):
    global index, doc_type_name
    for id_domain, id_hash, doc in create_doc(input_file, encoding):
        yield {
            "_index": "{}_{}".format(index, id_domain),
            "_type": doc_type_name,  # note: custom mapping types require Elasticsearch 6.x or older
            "_id": id_hash,
            "_source": doc
        }


def index_file(input_file):
    encoding = get_file_encoding(input_file)
    if encoding:
        es = Elasticsearch(["172.16.1.141"], http_compress=True)  # Elasticsearch host is hard-coded here
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # pre = count["count"]
        log_to_console('[*] Indexing file: {}'.format(input_file))
        try:
            success, _ = bulk(es, process_file(input_file, encoding), chunk_size=10000,
                              initial_backoff=60, max_retries=3, request_timeout=60,
                              raise_on_error=False, raise_on_exception=True)
            log_to_console('[!] Indexing done: {} [{} lines committed]'.format(input_file, success))
        except Exception as e:
            # BulkIndexError carries per-document errors; other exceptions only have a message
            reason = e.errors[0] if hasattr(e, "errors") and e.errors else e
            log_to_console('[!] Indexing failed for: {}\n[!] REASON: {}'.format(input_file, reason))
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # post = count["count"]
        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid, post - pre, input_file))
    else:
        log_to_console('[~] Skipping file [Unknown Encoding]: {}'.format(input_file))


def bench_file(input_file):
    # parse and serialize the documents without touching Elasticsearch, to measure parsing throughput
    ps = multiprocessing.current_process()
    encoding = get_file_encoding(input_file)
    if encoding:
        # es = Elasticsearch()
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # pre = count["count"]
        log_to_console('[{}:*] Benching file: {}'.format(ps.pid, input_file))
        docs = 0
        try:
            # success, _ = bulk(es, process_file(input_file, encoding), chunk_size=1000, request_timeout=60, raise_on_error=False, raise_on_exception=False)
            with open(os.devnull, 'w') as devnull:
                for doc in process_file(input_file, encoding):
                    docs += 1
                    devnull.write(json.dumps(doc))
            log_to_console('[{}:*] Benching Done: {} [processed {} docs]'.format(ps.pid, input_file, docs))
        except Exception as e:
            log_to_console('[{}:!] Benching failed for: {}\n[{}:!] REASON: {}'.format(ps.pid, input_file, ps.pid, e))
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # post = count["count"]
        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid, post - pre, input_file))
    else:
        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid, input_file))


index = ""
doc_type_name = "credential"
log_filename = "processed_files"
threshold = -1  # threshold for reparsing an already parsed file


def main():
    global index
    parser = argparse.ArgumentParser(description="Put Leakdata into local Elasticsearch")
    parser.add_argument("-p", help="how many workers (default: 4)", default=4, type=int, nargs='?')
    parser.add_argument("-i", help="index name prefix (default: leak_data)", default="leak_data")
    parser.add_argument("-b", help="don't write to ES, just benchmark parsing", action='store_true')
    parser.add_argument('folder')
    args = parser.parse_args()
    index = args.i
    workers = args.p
    dir = args.folder
    # the Pool is created after the globals are set, so fork()ed workers inherit them
    p = Pool(workers)
    if args.b:
        p.map(bench_file, get_files(dir))
    else:
        p.map(index_file, get_files(dir))


if __name__ == '__main__':
    main()