Reorganise gists folder
This commit is contained in:
272
codegrab/process_leak.py
Executable file
@@ -0,0 +1,272 @@
#!/usr/bin/python3

import os
import re
import mmh3
import string
import sys
from os import walk
from chardet.universaldetector import UniversalDetector
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from multiprocessing import Pool, Lock
import multiprocessing
import hashlib
import json
import argparse


lock = Lock()


def log_to_file(text):
    global log_filename
    with lock:  # worker process blocks here until it can obtain the lock
        with open(log_filename, 'a+') as file_log:
            file_log.write("{}\n".format(text))


def log_to_console(text):
    ps = multiprocessing.current_process()
    with lock:  # worker process blocks here until it can obtain the lock
        print("[{}]:{}".format(ps.pid, text))


def get_mask(s):
    mask = ""
    for c in s:
        if c.isdigit():
            mask += "?d"
        elif c.islower():
            mask += "?l"
        elif c.isupper():
            mask += "?u"
        else:
            mask += "?s"
    return mask
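# Illustrative example (added comment, not in the original commit): the mask uses
# hashcat-style placeholders, e.g. get_mask("Pass1!") -> "?u?l?l?l?d?s".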


def check_special(s):
    for c in s:
        if c in string.punctuation or c.isspace():
            return True
    return False


def check_upper(s):
    return any(i.isupper() for i in s)


def check_lower(s):
    return any(i.islower() for i in s)


def check_digit(s):
    return any(i.isdigit() for i in s)


# detect a file's encoding by feeding at most 1000 lines to chardet
def get_file_enconding(file):
    detector = UniversalDetector()
    with open(file, 'rb') as daf:
        i = 1000
        for line in daf.readlines():
            i -= 1
            detector.feed(line)
            if detector.done or i == 0:
                break
        detector.close()

    r = detector.result
    return r["encoding"]
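# Added note, not in the original commit: chardet may report e.g. "utf-8" or
# "ISO-8859-1", or None for data it cannot classify; callers skip files whose
# encoding comes back as None.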


# credential lines look like "<user>@<domain><sep><rest>", where <sep> is whitespace, ':' or ';'
patter = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")


def extract_email(line):
    global patter
    match = patter.search(line)
    if match:
        res = (match.group(1), match.group(2), match.group(4))
        return res
    else:
        return None
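# Illustrative example (added comment, hypothetical input):
#   extract_email("alice@example.com:hunter2") -> ("alice", "example.com", "hunter2")
#   group 3 (the separator) is intentionally dropped.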


def strip_badbytes(b, encoding):
    return b.decode(encoding, errors='ignore').strip()


def get_files(dir):
    files_in_log = {}
    global threshold
    # the log file stores one "<path>;<success_rate>" entry per processed file
    try:
        with open(log_filename, 'r') as file_log:
            for line in file_log.readlines():
                try:
                    filedata = line.split(";")
                    files_in_log[filedata[0]] = float(filedata[1])
                except Exception:
                    log_to_console("Can't parse line")
    except Exception:
        log_to_console("Can't open logfile")

    for (dirpath, dirnames, filenames) in walk(dir):
        for file in filenames:
            full_filename = os.path.join(dirpath, file)
            if full_filename in files_in_log and files_in_log[full_filename] > threshold:
                log_to_console('[~] Skipping file [Already Parsed]: %s' % full_filename)
                continue
            yield full_filename


def get_lines(file, encoding=None):
    if not encoding:
        encoding = get_file_enconding(file)
    with open(file, 'rb') as f:
        return [strip_badbytes(line, encoding) for line in f]
        # for line in f:
        #     yield (strip_badbytes(line, encoding))


def get_parsable_lines(file, encoding):
    global log_filename
    success = 0
    failure = 0
    for line in get_lines(file, encoding):
        doc = extract_email(line)
        if doc:
            success += 1
            yield doc
        else:
            failure += 1
    # guard against division by zero when a file yields no lines at all
    success_rate = success / (success + failure) if (success + failure) else 0.0
    log_to_console('[+] Done parsing file: {} ({})'.format(file, success_rate))
    log_to_file("{};{}".format(file, success_rate))


def get_hash(text):
    hash_object = hashlib.md5(text.encode())
    return hash_object.hexdigest()
    # return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]


def get_user_pw_hash(text):
    return get_hash(text)
    # return hex(mmh3.hash128(text, 12, signed=False) % 1000000000000000).split("x")[1]
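# Added note with an illustrative value: get_hash("test") returns the 32-character
# MD5 hex digest "098f6bcd4621d373cade4e832627b4f6", which becomes the document _id.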


def create_doc(file, encoding):
    for cred in get_parsable_lines(file, encoding):
        doc = {
            "user": cred[0],
            "domain": cred[1],
            "password": cred[2][:129],
            "file": file,
            "length": len(cred[2]),
            "passwordMask": get_mask(cred[2]),
            "containsDigits": check_digit(cred[2]),
            "containsLowerCase": check_lower(cred[2]),
            "containsUpperCase": check_upper(cred[2]),
            "containsSpecial": check_special(cred[2])
        }
        username_split = cred[0].split(";")
        if len(username_split) == 2:
            if len(username_split[0]) > 0 and len(username_split[1]) > 0:
                doc["username"] = username_split[0]
                doc["user"] = username_split[1]
        id_hash = get_user_pw_hash("{}{}{}".format(doc["user"], doc["domain"], doc["password"]))
        id_domain = id_hash[:1]
        yield id_domain, id_hash, doc
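# Added note: id_domain is the first hex digit of the MD5 id, so documents are
# sharded across 16 indices named "<index>_0" .. "<index>_f".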


def process_file(input_file, encoding):
    global index, doc_type_name
    for id_domain, id_hash, doc in create_doc(input_file, encoding):
        yield {
            "_index": "{}_{}".format(index, id_domain),
            "_type": doc_type_name,
            "_id": id_hash,
            "_source": doc
        }
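# Each yielded dict is a ready-made action for elasticsearch.helpers.bulk; an
# illustrative (hypothetical) example with the default "-i leak_data" suffix:
#   {"_index": "leak_data_a", "_type": "credential", "_id": "a3f...", "_source": {...}}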


def index_file(input_file):
    encoding = get_file_enconding(input_file)
    if encoding:
        es = Elasticsearch(["172.16.1.141"], http_compress=True)
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # pre = count["count"]
        log_to_console('[*] Indexing file: {}'.format(input_file))
        try:
            success, _ = bulk(es, process_file(input_file, encoding), chunk_size=10000, initial_backoff=60,
                              max_retries=3, request_timeout=60, raise_on_error=False, raise_on_exception=True)
            log_to_console('[!] Indexing done: {} [{} lines committed]'.format(input_file, success))
        except Exception as e:
            # log the exception itself; not every exception type carries .errors
            log_to_console('[!] Indexing failed for: {}\n[!] REASON: {}'.format(input_file, str(e)))
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # post = count["count"]
        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid, post - pre, input_file))
    else:
        log_to_console('[~] Skipping file [Unknown Encoding]: {}'.format(input_file))


def bench_file(input_file):
    ps = multiprocessing.current_process()
    encoding = get_file_enconding(input_file)
    devnull = open(os.devnull, 'w')
    if encoding:
        es = Elasticsearch()
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # pre = count["count"]
        log_to_console('[{}:*] Benching file: {}'.format(ps.pid, input_file))
        docs = 0
        try:
            # success, _ = bulk(es, process_file(input_file, encoding), chunk_size=1000, request_timeout=60, raise_on_error=False, raise_on_exception=False)
            for doc in process_file(input_file, encoding):
                docs += 1
                devnull.write(json.dumps(doc))

            log_to_console('[{}:*] Benching Done: {} [processed {} docs]'.format(ps.pid, input_file, docs))
        except Exception as e:
            log_to_console('[{}:!] Benching failed for: {}\n[{}:!] REASON: {}'.format(ps.pid, input_file, ps.pid, str(e)))
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # post = count["count"]
        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid, post - pre, input_file))
    else:
        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid, input_file))


index = ""
doc_type_name = "credential"
log_filename = "processed_files"
threshold = -1  # previously logged files with a success rate above this are skipped on re-runs


def main():
    global index
    parser = argparse.ArgumentParser(description="Put leak data into a local Elasticsearch")
    parser.add_argument("-p", help="how many workers (default: 4)", default=4, type=int, nargs='?')
    parser.add_argument("-i", help="index suffix", default="leak_data")
    parser.add_argument("-b", help="don't write to ES, just benchmark", action='store_true')
    parser.add_argument('folder')
    args = parser.parse_args()
    index = args.i
    workers = args.p
    dir = args.folder
    p = Pool(workers)
    if args.b:
        p.map(bench_file, get_files(dir))
    else:
        p.map(index_file, get_files(dir))


if __name__ == '__main__':
    main()
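# Example invocations (hypothetical paths, added for illustration):
#   ./process_leak.py -p 8 -i leak_data /data/leaks   # index with 8 worker processes
#   ./process_leak.py -b /data/leaks                  # benchmark parsing only, no ES writes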