#!/usr/bin/python3
import os
import re
import mmh3
import string
import sys
from os import walk
from chardet.universaldetector import UniversalDetector
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from multiprocessing import Pool, Lock
import multiprocessing
import hashlib
import json

import argparse


lock = Lock()
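

# Overview: this script walks a folder of leaked "email:password"-style dumps,
# detects each file's encoding, parses user/domain/password triples, derives
# simple password statistics (length, hashcat-style mask, character classes),
# and bulk-indexes the resulting documents into Elasticsearch. A log of
# processed files and their parse success rate is kept so finished files can
# be skipped on subsequent runs.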


def log_to_file(text):
    global log_filename
    with lock:  # block here until the shared lock is acquired
        with open(log_filename, 'a+') as file_log:
            file_log.write("{}\n".format(text))


def log_to_console(text):
    with lock:  # block here until the shared lock is acquired
        print(text)


# Build a hashcat-style mask for a password: ?d digit, ?l lower case,
# ?u upper case, ?s anything else (specials / whitespace).
def get_mask(s):
    mask = ""
    for c in s:
        if c.isdigit():
            mask += "?d"
        elif c.islower():
            mask += "?l"
        elif c.isupper():
            mask += "?u"
        else:
            mask += "?s"
    return mask
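

# Illustrative example (hypothetical password, not taken from any data set):
#   get_mask("Passw0rd!") -> "?u?l?l?l?l?d?l?l?s"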


def check_special(s):
    for c in s:
        if c in string.punctuation or c.isspace():
            return True
    return False


def check_upper(s):
    return any(i.isupper() for i in s)


def check_lower(s):
    return any(i.islower() for i in s)


def check_digit(s):
    return any(i.isdigit() for i in s)


# Detect a file's character encoding with chardet, feeding at most 1000 lines.
def get_file_encoding(file):
    detector = UniversalDetector()
    with open(file, 'rb') as daf:
        i = 1000
        for line in daf:
            i -= 1
            detector.feed(line)
            if detector.done or i == 0:
                break
        detector.close()
    r = detector.result
    return r["encoding"]


# local part @ domain, then a separator (whitespace, ':' or ';'), then the password
pattern = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")


def extract_email(line):
    global pattern
    match = pattern.search(line)
    if match:
        return (match.group(1), match.group(2), match.group(4))
    else:
        return None
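

# Illustrative example (hypothetical credential line):
#   extract_email("user@example.com:hunter2") -> ("user", "example.com", "hunter2")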


def strip_badbytes(b, encoding):
    return b.decode(encoding, errors='ignore').strip()


def get_files(dir):
    files_in_log = {}
    global threshold
    try:
        with open(log_filename, 'r') as file_log:
            for line in file_log:
                try:
                    filedata = line.split(";")
                    files_in_log[filedata[0]] = float(filedata[1])
                except Exception:
                    log_to_console("Can't parse line")
    except Exception:
        log_to_console("Can't open logfile")

    for (dirpath, dirnames, filenames) in walk(dir):
        for file in filenames:
            full_filename = os.path.join(dirpath, file)
            if full_filename in files_in_log and files_in_log[full_filename] > threshold:
                log_to_console('[~] Skipping file [Already Parsed]: %s' % full_filename)
                continue
            yield full_filename


def get_lines(file, encoding=None):
    if not encoding:
        encoding = get_file_encoding(file)
    with open(file, 'rb') as f:
        return [strip_badbytes(line, encoding) for line in f]
    # generator alternative:
    # for line in f:
    #     yield strip_badbytes(line, encoding)


def get_parsable_lines(file, encoding):
    global log_filename
    success = 0
    failure = 0
    for line in get_lines(file, encoding):
        doc = extract_email(line)
        if doc:
            success += 1
            yield doc
        else:
            failure += 1
    total = success + failure
    success_rate = success / total if total else 0.0  # avoid div/0 on empty files
    log_to_console('[+] Done with file: {} ({})'.format(file, success_rate))
    log_to_file("{};{}".format(file, success_rate))
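

# Each finished file appends "<path>;<success_rate>" to the log, e.g. (illustrative path):
#   /data/leaks/dump1.txt;0.97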


def get_hash(text):
    hash_object = hashlib.md5(text.encode())
    return hash_object.hexdigest()
    # return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]


def get_user_pw_hash(text):
    return get_hash(text)
    # return hex(mmh3.hash128(text, 12, signed=False) % 1000000000000000).split("x")[1]


def create_doc(file, encoding):
    for cred in get_parsable_lines(file, encoding):
        doc = {
            "user": cred[0],
            "domain": cred[1],
            "password": cred[2],
            "file": file,
            "length": len(cred[2]),
            "passwordMask": get_mask(cred[2]),
            "containsDigits": check_digit(cred[2]),
            "containsLowerCase": check_lower(cred[2]),
            "containsUpperCase": check_upper(cred[2]),
            "containsSpecial": check_special(cred[2])
        }
        # if the local part contains a ';', treat the first half as a separate
        # username and the second half as the actual user
        username_split = cred[0].split(";")
        if len(username_split) == 2:
            if len(username_split[0]) > 0 and len(username_split[1]) > 0:
                doc["username"] = username_split[0]
                doc["user"] = username_split[1]
        # the MD5 of user+domain+password becomes the document _id, so the same
        # credential is overwritten rather than duplicated on re-indexing; its
        # first hex digit spreads documents over 16 indices
        id_hash = get_user_pw_hash("{}{}{}".format(doc["user"], doc["domain"], doc["password"]))
        id_domain = id_hash[:1]
        yield id_domain, id_hash, doc


# Wrap each parsed credential in an action dict for elasticsearch.helpers.bulk.
def process_file(input_file, encoding):
    global index, doc_type_name
    for id_domain, id_hash, doc in create_doc(input_file, encoding):
        yield {
            "_index": "{}_{}".format(index, id_domain),
            "_type": doc_type_name,
            "_id": id_hash,
            "_source": doc
        }
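

# Shape of one emitted action (illustrative values):
#   {"_index": "leak_data_3", "_type": "credential", "_id": "<md5 hex>", "_source": {...}}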


def index_file(input_file):
    ps = multiprocessing.current_process()
    encoding = get_file_encoding(input_file)
    if encoding:
        es = Elasticsearch(["172.16.1.141"], http_compress=True)
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # pre = count["count"]
        log_to_console('[{}:*] Indexing file: {}'.format(ps.pid, input_file))
        try:
            success, _ = bulk(es, process_file(input_file, encoding), chunk_size=10000,
                              request_timeout=60, raise_on_error=True, raise_on_exception=False)
        except Exception as e:
            log_to_console('[{}:!] Indexing failed for: {}\n[{}:!] REASON: {}'.format(ps.pid, input_file, ps.pid, e))
        # count = es.count(index=index, doc_type=doc_type_name, body={"query": {"match_all": {}}})
        # post = count["count"]
        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid, post - pre, input_file))
    else:
        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid, input_file))


def bench_file(input_file):
    ps = multiprocessing.current_process()
    encoding = get_file_encoding(input_file)
    if encoding:
        es = Elasticsearch()
        log_to_console('[{}:*] Benching file: {}'.format(ps.pid, input_file))
        docs = 0
        try:
            # success, _ = bulk(es, process_file(input_file, encoding), chunk_size=1000, request_timeout=60, raise_on_error=False, raise_on_exception=False)
            with open(os.devnull, 'w') as devnull:
                for doc in process_file(input_file, encoding):
                    docs += 1
                    devnull.write(json.dumps(doc))
            log_to_console('[{}:*] Benching Done: {} [processed {} docs]'.format(ps.pid, input_file, docs))
        except Exception as e:
            log_to_console('[{}:!] Benching failed for: {}\n[{}:!] REASON: {}'.format(ps.pid, input_file, ps.pid, e))
    else:
        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid, input_file))
index=""
|
|
doc_type_name = "credential"
|
|
log_filename = "processed_files"
|
|
threshold = -1 #threshold for reparsing an already parsed file
|
|
|
|


def main():
    global index
    parser = argparse.ArgumentParser(description="Put leak data into a local Elasticsearch")
    parser.add_argument("-p", help="number of worker processes (default: 4)", default=4, type=int, nargs='?')
    parser.add_argument("-i", help="index name prefix", default="leak_data")
    parser.add_argument("-b", help="don't write to Elasticsearch, just benchmark parsing", action='store_true')
    parser.add_argument('folder')
    args = parser.parse_args()
    index = args.i
    workers = args.p
    dir = args.folder
    p = Pool(workers)
    if args.b:
        p.map(bench_file, get_files(dir))
    else:
        p.map(index_file, get_files(dir))
    p.close()
    p.join()


if __name__ == '__main__':
    main()
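

# Example invocation (script name and folder path are illustrative; flags as defined above):
#   ./leak_indexer.py -p 8 -i leak_data /data/leaks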