#!/usr/bin/python3
import os
import re
import string
import sys
import multiprocessing
from multiprocessing import Pool, Lock
from os import walk

import mmh3
from chardet.universaldetector import UniversalDetector
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

lock = Lock()


def log_to_file(text):
    with lock:  # worker blocks here until it can obtain the (fork-inherited) lock
        with open(log_filename, 'a+') as file_log:
            file_log.write("{}\n".format(text))


def log_to_console(text):
    with lock:  # worker blocks here until it can obtain the lock
        print(text)


def get_mask(s):
    # Build the hashcat-style mask of a password, one token per character.
    mask = ""
    for c in s:
        if c.isdigit():
            mask += "?d"
        elif c.islower():
            mask += "?l"
        elif c.isupper():
            mask += "?u"
        else:
            mask += "?s"
    return mask


def check_special(s):
    return any(c in string.punctuation or c.isspace() for c in s)


def check_upper(s):
    return any(c.isupper() for c in s)


def check_lower(s):
    return any(c.islower() for c in s)


def check_digit(s):
    return any(c.isdigit() for c in s)


def get_file_encoding(file):
    # Guess the encoding by feeding at most 1000 lines to chardet.
    detector = UniversalDetector()
    with open(file, 'rb') as daf:
        i = 1000
        for line in daf:  # iterate lazily instead of readlines() to avoid loading huge dumps
            i -= 1
            detector.feed(line)
            if detector.done or i == 0:
                break
    detector.close()
    return detector.result["encoding"]


pattern = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")


def extract_email(line):
    # Split a dump line into (user, domain, password); the separator between
    # the address and the password may be whitespace, ':' or ';'.
    match = pattern.search(line)
    if match:
        return (match.group(1), match.group(2), match.group(4))
    return None


def strip_badbytes(b, encoding):
    return b.decode(encoding, errors='ignore').strip()


def get_files(dir):
    # List all files in dir, skipping files whose logged success rate is
    # already above the reparse threshold.
    files_in_log = {}
    try:
        with open(log_filename, 'r') as file_log:
            for line in file_log:
                try:
                    # rsplit so file names that themselves contain ';' still parse
                    filename, rate = line.rsplit(";", 1)
                    files_in_log[filename] = float(rate)
                except ValueError:
                    log_to_console("Can't parse line")
    except OSError:
        log_to_console("Can't open logfile")
    for (dirpath, dirnames, filenames) in walk(dir):
        for file in filenames:
            full_filename = os.path.join(dirpath, file)
            if full_filename in files_in_log and files_in_log[full_filename] > threshold:
                log_to_console('[~] Skipping file [Already Parsed]: %s' % full_filename)
                continue
            yield full_filename


def get_lines(file, encoding=None):
    if not encoding:
        encoding = get_file_encoding(file)
    with open(file, 'rb') as f:
        for line in f:
            yield strip_badbytes(line, encoding)


def get_parsable_lines(file, encoding):
    success = 0
    failure = 0
    for line in get_lines(file, encoding):
        doc = extract_email(line)
        if doc:
            success += 1
            yield doc
        else:
            failure += 1
    total = success + failure
    success_rate = success / total if total else 0.0  # guard against div/0 on empty files
    log_to_console('[+] Done with file: {} ({})'.format(file, success_rate))
    log_to_file("{};{}".format(file, success_rate))


def get_hash(text):
    # 32-bit murmur3 hash as a hex string without the '0x' prefix.
    return format(mmh3.hash(text, 12, signed=False), 'x')


def get_user_pw_hash(text):
    # Truncated 128-bit murmur3 hash, used as the Elasticsearch document _id.
    return format(mmh3.hash128(text, 12, signed=False) % 1000000000000000, 'x')


def create_doc(file, encoding):
    for cred in get_parsable_lines(file, encoding):
        doc = {
            "user": cred[0],
            "domain": cred[1],
            "password": cred[2],
            "file": file,
            "length": len(cred[2]),
            "passwordMask": get_mask(cred[2]),
            "containsDigits": check_digit(cred[2]),
            "containsLowerCase": check_lower(cred[2]),
            "containsUpperCase": check_upper(cred[2]),
            "containsSpecial": check_special(cred[2]),
        }
        # Some dumps prefix the local part with a user name: "username;user".
        username_split = cred[0].split(";")
        if len(username_split) == 2:
            if len(username_split[0]) > 0 and len(username_split[1]) > 0:
                doc["username"] = username_split[0]
                doc["user"] = username_split[1]
        id_hash = get_user_pw_hash("{}{}".format(doc["user"], doc["password"]))
        # id_domain = get_domain_hash(cred[1])
        id_domain = id_hash[:1]  # first hex digit spreads documents over up to 16 indices
        yield id_domain, id_hash, doc
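# Illustrative sketch only (never called): what one generated document looks
# like. The address and password below are made-up placeholder values.
# For the dump line "alice@example.com:Summ3r!", extract_email() yields
# ("alice", "example.com", "Summ3r!") and create_doc() emits roughly:
#
#   {"user": "alice", "domain": "example.com", "password": "Summ3r!",
#    "file": "<file>", "length": 7, "passwordMask": "?u?l?l?l?d?l?s",
#    "containsDigits": True, "containsLowerCase": True,
#    "containsUpperCase": True, "containsSpecial": True}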
def process_file(input_file, encoding):
    # Wrap each document in the action format expected by helpers.bulk().
    for id_domain, id_hash, doc in create_doc(input_file, encoding):
        yield {
            "_index": "{}_{}".format(index, id_domain),
            "_type": doc_type_name,
            "_id": id_hash,
            "_source": doc,
        }


def index_file(input_file):
    ps = multiprocessing.current_process()
    encoding = get_file_encoding(input_file)
    if encoding:
        es = Elasticsearch()
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # pre = count["count"]
        log_to_console('[{}:*] Indexing file: {}'.format(ps.pid, input_file))
        success, _ = bulk(es, process_file(input_file, encoding),
                          request_timeout=60, raise_on_exception=False)
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # post = count["count"]
        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid, post - pre, input_file))
    else:
        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid, input_file))


index = "leak_col1"
doc_type_name = "credential"
log_filename = "processed_files"
threshold = -1  # success rate above which an already parsed file is skipped


def main():
    dir = sys.argv[1]  # directory containing the dump files to index
    # for filename in get_files(dir):
    #     index_file(filename)
    # Create the pool here, not at module level, so that worker processes
    # do not spawn pools of their own when they import this module.
    p = Pool(20)
    p.map(index_file, get_files(dir))


if __name__ == '__main__':
    main()
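# A minimal sketch of how the indexed data could be queried afterwards,
# assuming Elasticsearch's default dynamic mapping (which adds a .keyword
# subfield for strings); "example.com" is a placeholder. Commented out so
# the importer's behaviour is unchanged.
#
# es = Elasticsearch()
# res = es.search(index="leak_col1_*",
#                 body={"query": {"term": {"domain.keyword": "example.com"}}})
# for hit in res["hits"]["hits"]:
#     print(hit["_source"]["user"], hit["_source"]["passwordMask"])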