added some new functionality to process_leak

Tobias Kessels
2019-02-03 23:33:54 +01:00
parent 3ced576803
commit a677c73ed6
2 changed files with 132 additions and 71 deletions

process_leak

@@ -1,129 +1,151 @@
 #!/usr/bin/python3
-import sys
-import chardet
 import os
-from os import walk
-from chardet.universaldetector import UniversalDetector
 import re
 import mmh3
-# from bs4 import UnicodeDammit
+import string
+import sys
+from os import walk
+from chardet.universaldetector import UniversalDetector
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
-import string


 def get_mask(s):
     mask = ""
     for c in s:
         if c.isdigit():
             mask += "?d"
         elif c.islower():
             mask += "?l"
         elif c.isupper():
             mask += "?u"
         else:
             mask += "?s"
     return mask


 def check_special(s):
     for c in s:
         if c in string.punctuation or c.isspace():
             return True
     return False


 def check_upper(s):
     return any(i.isupper() for i in s)


 def check_lower(s):
     return any(i.islower() for i in s)


 def check_digit(s):
     return any(i.isdigit() for i in s)


-#list all files in dir
+# detect a file's encoding by feeding at most 1000 lines to chardet
 def get_file_enconding(file):
     detector = UniversalDetector()
-    with open(file,'rb') as daf:
-        i=1000
+    with open(file, 'rb') as daf:
+        i = 1000
         for line in daf.readlines():
-            i-=1
+            i -= 1
             detector.feed(line)
-            if detector.done or i==0:
+            if detector.done or i == 0:
                 break
     detector.close()
-    r=detector.result
+    r = detector.result
     return r["encoding"]


-patter=re.compile("([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")
+patter = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")


 def extract_email(line):
     global patter
-    match=patter.search(line)
+    match = patter.search(line)
     if match:
-        res=(match.group(1),match.group(2),match.group(4))
+        res = (match.group(1), match.group(2), match.group(4))
         return (res)
     else:
         return None


-def strip_badbytes(b,encoding):
+def strip_badbytes(b, encoding):
     return (b.decode(encoding, errors='ignore')).strip()


 def get_files(dir):
     for (dirpath, dirnames, filenames) in walk(dir):
-        for x in filenames:
-            yield os.path.join(dirpath,x)
+        for file in filenames:
+            full_filename = os.path.join(dirpath, file)
+            encoding = get_file_enconding(full_filename)
+            if encoding:
+                yield encoding, full_filename


-def get_lines(file):
-    encoding=get_file_enconding(file)
+def get_lines(file, encoding=None):
+    if not encoding:
+        encoding = get_file_enconding(file)
     with open(file, 'rb') as f:
         for line in f:
-            yield(strip_badbytes(line,encoding))
+            yield (strip_badbytes(line, encoding))


 def get_parsable_lines(file):
-    success=1 #initialized with 1 to preven div/0
-    failure=1
+    global log_filename
+    success = 1  # initialized with 1 to prevent div/0
+    failure = 1
     for line in get_lines(file):
-        doc=extract_email(line)
+        doc = extract_email(line)
         if doc:
-            success+=1
+            success += 1
             yield doc
         else:
-            failure+=1
-    success_rate=(success/(success+failure))
-    with open("processed_files",'a+') as file_log:
-        file_log.write("{};{}\n".format(file,success_rate))
+            failure += 1
+    success_rate = (success / (success + failure))
+    with open(log_filename, 'a+') as file_log:
+        file_log.write("{};{}\n".format(file, success_rate))


 def create_doc(file):
     for cred in get_parsable_lines(file):
-        doc={}
-        doc["user"],doc["domain"],doc["password"] = cred
-        doc["file"]=file
-        doc["length"] = len(doc["password"])
-        doc["passwordMask"] = get_mask(doc["password"])
-        doc["containsDigits"] = check_digit(doc["password"])
-        doc["containsLowerCase"] = check_lower(doc["password"])
-        doc["containsUpperCase"] = check_upper(doc["password"])
-        doc["containsSpecial"] = check_special(doc["password"])
-        yield doc
+        doc = {
+            "user": cred[0],
+            "domain": cred[1],
+            "password": cred[2],
+            "file": file,
+            "length": len(cred[2]),
+            "passwordMask": get_mask(cred[2]),
+            "containsDigits": check_digit(cred[2]),
+            "containsLowerCase": check_lower(cred[2]),
+            "containsUpperCase": check_upper(cred[2]),
+            "containsSpecial": check_special(cred[2])
+        }
+        id_hash = hex(mmh3.hash128(",".join((doc["user"], doc["domain"], doc["password"])), 12, signed=False) % 1000000000000000000000)
+        yield id_hash, doc


-def set_data(input_file, index_name = "leak_col1", doc_type_name = "credential"):
-    for doc in create_doc(input_file):
-        id=hex(mmh3.hash128(",".join((doc["user"],doc["domain"],doc["password"])),12,signed=False)%1000000000000000000000)
+def process_file(input_file):
+    global index_prefix, doc_type_name
+    filenamehash = hex(mmh3.hash128(input_file, 12, signed=False) % 1000000000000000000000)
+    for id_hash, doc in create_doc(input_file):
         yield {
-            "_index": index_name,
+            "_index": "{}_{}".format(index_prefix, filenamehash),
             "_type": doc_type_name,
-            "_id": id,
+            "_id": id_hash,
             "_source": doc
         }


-def load(es, input_file, **kwargs):
-    print('[*] Indexing file: %s' % input_file)
-    success, _ = bulk(es, set_data(input_file, **kwargs), request_timeout = 60, raise_on_exception = False)
+index_prefix = "leak_col1"
+doc_type_name = "credential"
+log_filename = "processed_files"
 es = Elasticsearch()

-for data in get_files(sys.argv[1]):
-    load(es,data)
+for encoding, data in get_files(sys.argv[1]):
+    print('[*] Indexing file: %s' % data)
+    success, _ = bulk(es, process_file(data), request_timeout=60, raise_on_exception=False)
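
A possible invocation, assuming the script from this commit is saved executable as process_leak and an Elasticsearch instance answers on the client default of localhost:9200 (the directory path below is a placeholder):

    ./process_leak /path/to/leak_dumps

Because each document _id is the mmh3 hash of (user, domain, password) and each index name is derived from a hash of the file path, re-running the import over the same dumps should update the existing documents in place rather than duplicate them.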

quickchardet.py (new executable file, 39 lines)

@@ -0,0 +1,39 @@
#!/usr/bin/python3
import argparse
from chardet.universaldetector import UniversalDetector

parser = argparse.ArgumentParser()
parser.add_argument("-l", help="list all encoding changes in file", action='store_true')
parser.add_argument("-d", help="try to decode all lines", action='store_true')
parser.add_argument('filename')
args = parser.parse_args()

with open(args.filename, 'rb') as infile:
    det = UniversalDetector()
    if args.l:
        print("listing encodings of file \"{}\"".format(args.filename))
        encoding = None
        for nl, line in enumerate(infile.readlines()):
            # run a fresh detection per line so encoding changes become visible
            det.reset()
            det.feed(line)
            det.close()
            res = det.result
            if encoding != res["encoding"]:
                encoding = res["encoding"]
                if args.d:
                    print("{}#{}#{}({})".format(nl, line.decode(res["encoding"]), res["encoding"], res["confidence"]))
                else:
                    print("{}#{}#{}({})".format(nl, line, res["encoding"], res["confidence"]))
    else:
        # detect the encoding of the whole file from at most 1000 lines
        i = 1000
        for line in infile.readlines():
            i -= 1
            det.feed(line)
            if det.done or i == 0:
                break
        det.close()
        res = det.result
        print("{}:{}({})".format(args.filename, res["encoding"], res["confidence"]))