Reorganise gists folder

TKE
2022-05-13 12:49:21 +02:00
parent ecd3c7fe2f
commit acd8d616dc
98 changed files with 63 additions and 20 deletions

codegrab/process_leak.py (Executable file, +272 lines)

@@ -0,0 +1,272 @@
#!/usr/bin/python3
import os
import re
import mmh3
import string
import sys
from os import walk
from chardet.universaldetector import UniversalDetector
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from multiprocessing import Pool,Lock
import multiprocessing
import hashlib
import json
import argparse
lock = Lock()
def log_to_file(text):
global log_filename
with lock: # thread blocks at this line until it can obtain lock
with open(log_filename, 'a+') as file_log:
file_log.write("{}\n".format(text))
def log_to_console(text):
ps=multiprocessing.current_process()
with lock: # thread blocks at this line until it can obtain lock
print("[{}]:{}".format(ps.pid,text))
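# Convert a password into a hashcat-style mask: ?d for digits, ?l lowercase, ?u uppercase, ?s anything else.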
def get_mask(s):
mask = ""
for c in s:
if c.isdigit():
mask += "?d"
elif c.islower():
mask += "?l"
elif c.isupper():
mask += "?u"
else:
mask += "?s"
return mask
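# Composition checks feeding the boolean fields of each indexed document.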
def check_special(s):
for c in s:
if c in string.punctuation or c.isspace():
return True
return False
def check_upper(s):
return any(i.isupper() for i in s)
def check_lower(s):
return any(i.islower() for i in s)
def check_digit(s):
return any(i.isdigit() for i in s)
# detect a file's text encoding with chardet, sampling at most 1000 lines
def get_file_encoding(file):
detector = UniversalDetector()
with open(file, 'rb') as daf:
i = 1000
        for line in daf:  # iterate lazily so large dump files are not read fully into memory
i -= 1
detector.feed(line)
if detector.done or i == 0:
break
detector.close()
r = detector.result
return r["encoding"]
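# Credential lines are expected as "user@domain<sep>password", where the domain contains a dot
# and <sep> is whitespace, ':' or ';'.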
pattern = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")
def extract_email(line):
    global pattern
    match = pattern.search(line)
if match:
res = (match.group(1), match.group(2), match.group(4))
return (res)
else:
return None
def strip_badbytes(b, encoding):
return (b.decode(encoding, errors='ignore')).strip()
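# Walk the input directory and yield files to process, skipping any file whose logged
# success rate from an earlier run already exceeds the reparse threshold.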
def get_files(dir):
files_in_log={}
global threshold
try:
with open(log_filename,'r') as file_log:
for line in file_log.readlines():
try:
filedata=line.split(";")
files_in_log[filedata[0]]=float(filedata[1])
except:
log_to_console("Can't parse Line")
pass
except:
log_to_console("Can't open Logfile")
pass
for (dirpath, dirnames, filenames) in walk(dir):
for file in filenames:
full_filename=os.path.join(dirpath, file)
if full_filename in files_in_log and files_in_log[full_filename] > threshold:
log_to_console('[~] Skipping file [Already Parsed]: %s' % full_filename)
continue
yield full_filename
def get_lines(file,encoding=None):
if not encoding:
        encoding = get_file_encoding(file)
with open(file, 'rb') as f:
return [strip_badbytes(line, encoding) for line in f]
# for line in f:
# yield (strip_badbytes(line, encoding))
def get_parsable_lines(file,encoding):
global log_filename
    success = 0
    failure = 0
for line in get_lines(file,encoding):
doc = extract_email(line)
if doc:
success += 1
yield doc
else:
failure += 1
    success_rate = success / (success + failure) if (success + failure) else 0.0
log_to_console('[+] Done parsing file: {} ({})'.format(file,success_rate))
log_to_file("{};{}".format(file, success_rate))
def get_hash(text):
hash_object = hashlib.md5(text.encode())
return hash_object.hexdigest()
# return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]
def get_user_pw_hash(text):
return get_hash(text)
# return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]
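# Document IDs are the MD5 of user+domain+password; the first hex character of that hash
# also selects one of 16 per-character index shards.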
def create_doc(file,encoding):
for cred in get_parsable_lines(file,encoding):
doc = {
"user" : cred[0],
"domain" : cred[1],
"password" : cred[2][:129],
"file" : file,
"length" : len(cred[2]),
"passwordMask" : get_mask(cred[2]),
"containsDigits" : check_digit(cred[2]),
"containsLowerCase" : check_lower(cred[2]),
"containsUpperCase" : check_upper(cred[2]),
"containsSpecial" : check_special(cred[2])
}
username_split=cred[0].split(";")
if len(username_split)==2:
if len(username_split[0]) > 0 and len(username_split[1]) > 0:
doc["username"]=username_split[0]
doc["user"]=username_split[1]
id_hash=get_user_pw_hash("{}{}{}".format(doc["user"],doc["domain"],doc["password"]))
id_domain=id_hash[:1]
yield id_domain, id_hash, doc
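# Wrap each credential document in an Elasticsearch bulk action targeting the sharded index.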
def process_file(input_file,encoding):
global index, doc_type_name
for id_domain, id_hash, doc in create_doc(input_file,encoding):
yield {
"_index": "{}_{}".format(index,id_domain),
"_type": doc_type_name,
"_id": id_hash,
"_source": doc
}
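# Index a single file via the bulk helper; files with an undetectable encoding are skipped.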
def index_file(input_file):
    encoding = get_file_encoding(input_file)
if encoding:
es = Elasticsearch(["172.16.1.141"],http_compress=True)
# count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
# pre=count["count"]
log_to_console('[*] Indexing file: {}'.format(input_file))
try:
success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, initial_backoff=60, max_retries=3, request_timeout=60, raise_on_error=False, raise_on_exception=True)
log_to_console('[!] Indexing done: {} [{} lines committed]'.format(input_file,success))
except Exception as e:
            errors = getattr(e, 'errors', None)
            log_to_console('[!] Indexing failed for: {}\n[!] REASON: {}'.format(input_file, errors[0] if errors else e))
# count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
# post=count["count"]
# log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
else:
log_to_console('[~] Skipping file [Unknown Encoding]: {}'.format(input_file))
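# Benchmark mode: run the full parsing pipeline but serialise the documents to /dev/null instead of Elasticsearch.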
def bench_file(input_file):
ps=multiprocessing.current_process()
    encoding = get_file_encoding(input_file)
devnull=open(os.devnull,'w')
if encoding:
es = Elasticsearch()
# count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
# pre=count["count"]
log_to_console('[{}:*] Benching file: {}'.format(ps.pid,input_file))
docs=0
try:
# success, _ = bulk(es, process_file(input_file,encoding), chunk_size=1000, request_timeout=60, raise_on_error=False, raise_on_exception=False)
for doc in process_file(input_file,encoding):
docs+=1
devnull.write(json.dumps(doc))
log_to_console('[{}:*] Benching Done: {} [processed {} docs]'.format(ps.pid,input_file,docs))
except Exception as e:
            log_to_console('[{}:!] Benching failed for: {}\n[{}:!] REASON: {}'.format(ps.pid, input_file, ps.pid, str(e)))
# count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
# post=count["count"]
# log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
else:
log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))
index=""
doc_type_name = "credential"
log_filename = "processed_files"
threshold = -1  # files already logged with a parse success rate above this are skipped on re-runs
def main():
global index
parser = argparse.ArgumentParser(description="Put Leakdata into local Elasticsearch")
    parser.add_argument("-p", help="number of worker processes (default: 4)", default=4, type=int, nargs='?')
    parser.add_argument("-i", help="Elasticsearch index name prefix (default: leak_data)", default="leak_data")
    parser.add_argument("-b", help="don't write to Elasticsearch, just benchmark parsing", action='store_true')
parser.add_argument('folder')
args = parser.parse_args()
index=args.i
workers=args.p
dir=args.folder
p=Pool(workers)
if args.b:
p.map(bench_file,get_files(dir))
else:
p.map(index_file,get_files(dir))
if __name__ == '__main__':
main()