From a677c73ed675e7d8267f4407e3dfb44ebf675be8 Mon Sep 17 00:00:00 2001 From: Tobias Kessels Date: Sun, 3 Feb 2019 23:33:54 +0100 Subject: [PATCH] added some new functionality to process_leak --- process_leak.py | 164 +++++++++++++++++++++++++++--------------------- quickchardet.py | 39 ++++++++++++ 2 files changed, 132 insertions(+), 71 deletions(-) create mode 100755 quickchardet.py diff --git a/process_leak.py b/process_leak.py index 7a218ac..33d4d3a 100755 --- a/process_leak.py +++ b/process_leak.py @@ -1,129 +1,151 @@ #!/usr/bin/python3 -import sys -import chardet import os -from os import walk -from chardet.universaldetector import UniversalDetector import re import mmh3 -# from bs4 import UnicodeDammit +import string +import sys +from os import walk + +from chardet.universaldetector import UniversalDetector from elasticsearch import Elasticsearch from elasticsearch.helpers import bulk -import string def get_mask(s): - mask = "" - for c in s: - if c.isdigit(): - mask += "?d" - elif c.islower(): - mask += "?l" - elif c.isupper(): - mask += "?u" - else: - mask += "?s" - return mask + mask = "" + for c in s: + if c.isdigit(): + mask += "?d" + elif c.islower(): + mask += "?l" + elif c.isupper(): + mask += "?u" + else: + mask += "?s" + return mask + def check_special(s): - for c in s: - if c in string.punctuation or c.isspace(): - return True - return False + for c in s: + if c in string.punctuation or c.isspace(): + return True + return False + def check_upper(s): - return any(i.isupper() for i in s) + return any(i.isupper() for i in s) + def check_lower(s): - return any(i.islower() for i in s) + return any(i.islower() for i in s) + def check_digit(s): return any(i.isdigit() for i in s) -#list all files in dir + +# list all files in dir def get_file_enconding(file): detector = UniversalDetector() - with open(file,'rb') as daf: - i=1000 + with open(file, 'rb') as daf: + i = 1000 for line in daf.readlines(): - i-=1 + i -= 1 detector.feed(line) - if detector.done or i==0: + if detector.done or i == 0: break detector.close() - r=detector.result + r = detector.result return r["encoding"] -patter=re.compile("([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)") + +patter = re.compile("([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)") + def extract_email(line): global patter - match=patter.search(line) + match = patter.search(line) if match: - res=(match.group(1),match.group(2),match.group(4)) + res = (match.group(1), match.group(2), match.group(4)) return (res) else: return None -def strip_badbytes(b,encoding): + +def strip_badbytes(b, encoding): return (b.decode(encoding, errors='ignore')).strip() -def get_files(dir): - for (dirpath, dirnames, filenames) in walk(dir): - for x in filenames: - yield os.path.join(dirpath,x) -def get_lines(file): - encoding=get_file_enconding(file) +def get_files(dir): + for (dirpath, dirnames, filenames) in walk(dir): + for file in filenames: + full_filename=os.path.join(dirpath, file) + encoding=get_file_enconding(full_filename) + if encoding: + yield encoding, full_filename + + +def get_lines(file,encoding=None): + if not encoding: + encoding = get_file_enconding(file) with open(file, 'rb') as f: for line in f: - yield(strip_badbytes(line,encoding)) + yield (strip_badbytes(line, encoding)) + def get_parsable_lines(file): - success=1 #initialized with 1 to preven div/0 - failure=1 + global log_filename + success = 1 # initialized with 1 to preven div/0 + failure = 1 for line in get_lines(file): - doc=extract_email(line) + doc = extract_email(line) if doc: - success+=1 + success += 1 yield doc else: - failure+=1 - success_rate=(success/(success+failure)) - with open("processed_files",'a+') as file_log: - file_log.write("{};{}\n".format(file,success_rate)) + failure += 1 + success_rate = (success / (success + failure)) + with open(log_filename, 'a+') as file_log: + file_log.write("{};{}\n".format(file, success_rate)) def create_doc(file): for cred in get_parsable_lines(file): - doc={} - doc["user"],doc["domain"],doc["password"] = cred - doc["file"]=file - doc["length"] = len(doc["password"]) - doc["passwordMask"] = get_mask(doc["password"]) - doc["containsDigits"] = check_digit(doc["password"]) - doc["containsLowerCase"] = check_lower(doc["password"]) - doc["containsUpperCase"] = check_upper(doc["password"]) - doc["containsSpecial"] = check_special(doc["password"]) - yield doc + doc = { + "user" : cred[0], + "domain" : cred[1], + "password" : cred[2], + "file" : file, + "length" : len(cred[2]), + "passwordMask" : get_mask(cred[2]), + "containsDigits" : check_digit(cred[2]), + "containsLowerCase" : check_lower(cred[2]), + "containsUpperCase" : check_upper(cred[2]), + "containsSpecial" : check_special(cred[2]) + } + id_hash=hex(mmh3.hash128(",".join((doc["user"], doc["domain"], doc["password"])), 12,signed=False) % 1000000000000000000000) + yield id_hash, doc - -def set_data(input_file, index_name = "leak_col1", doc_type_name = "credential"): - for doc in create_doc(input_file): - id=hex(mmh3.hash128(",".join((doc["user"],doc["domain"],doc["password"])),12,signed=False)%1000000000000000000000) +def process_file(input_file): + global index_prefix, doc_type_name + filenamehash=hex(mmh3.hash128(input_file, 12,signed=False) % 1000000000000000000000) + for id_hash, doc in create_doc(input_file): yield { - "_index": index_name, - "_type": doc_type_name, - "_id": id, - "_source": doc + "_index": "{}_{}".format(index_prefix, filenamehash), + "_type": doc_type_name, + "_id": id_hash, + "_source": doc } -def load(es, input_file, **kwargs): - print('[*] Indexing file: %s' % input_file) - success, _ = bulk(es, set_data(input_file, **kwargs), request_timeout = 60, raise_on_exception = False) + +index_prefix = "leak_col1" +doc_type_name = "credential" +log_filename = "processed_files" es = Elasticsearch() -for data in get_files(sys.argv[1]): - load(es,data) + +for encoding, data in get_files(sys.argv[1]): + print('[*] Indexing file: %s' % data) + success, _ = bulk(es, process_file(data), request_timeout=60, raise_on_exception=False) diff --git a/quickchardet.py b/quickchardet.py new file mode 100755 index 0000000..9eaa26c --- /dev/null +++ b/quickchardet.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 +import chardet +from chardet import UniversalDetector +import sys +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("-l",help="list all encoding changes in file",action='store_true') +parser.add_argument("-d",help="try to decode all Lines",action='store_true') +parser.add_argument('filename') +args = parser.parse_args() + + +with open(args.filename,'rb') as infile: + det=UniversalDetector() + if args.l: + print("listing encodings of file \"{}\"".format(args.filename)) + encoding=None + for nl,line in enumerate(infile.readlines()): + det.reset() + det.feed(line) + det.close() + res=det.result + if encoding != res["encoding"]: + encoding=res["encoding"] + if args.d: + print("{}#{}#{}({})".format(nl,line.decode(res["encoding"]),res["encoding"],res["confidence"])) + else: + print("{}#{}#{}({})".format(nl,line,res["encoding"],res["confidence"])) + else: + i=1000 + for line in infile.readlines(): + i-=1 + det.feed(line) + if det.done or i==0: + break + det.close() + res=det.result + print("{}:{}({})".format(sys.argv[1],res["encoding"],res["confidence"]))