added some new functionality to process_leak

process_leak.py | 164

@@ -1,129 +1,151 @@
 #!/usr/bin/python3
-import sys
 import chardet
 import os
-from os import walk
-from chardet.universaldetector import UniversalDetector
 import re
 import mmh3
 # from bs4 import UnicodeDammit
-import string
+import sys
+from os import walk
+
+from chardet.universaldetector import UniversalDetector
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
+
+import string
 
 def get_mask(s):
     mask = ""
     for c in s:
         if c.isdigit():
             mask += "?d"
         elif c.islower():
             mask += "?l"
         elif c.isupper():
             mask += "?u"
         else:
             mask += "?s"
     return mask
 
 
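The masks built by get_mask() follow hashcat's placeholder convention: ?d for digits, ?l for lowercase, ?u for uppercase, and ?s for everything else. A quick doctest-style check, with a made-up password:

    >>> get_mask("Pass123!")
    '?u?l?l?l?d?d?d?s'
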
 def check_special(s):
     for c in s:
         if c in string.punctuation or c.isspace():
             return True
     return False
 
 
 def check_upper(s):
     return any(i.isupper() for i in s)
 
 
 def check_lower(s):
     return any(i.islower() for i in s)
 
 
 def check_digit(s):
     return any(i.isdigit() for i in s)
 
-#list all files in dir
+
+# list all files in dir
 def get_file_enconding(file):
     detector = UniversalDetector()
-    with open(file,'rb') as daf:
-        i=1000
+    with open(file, 'rb') as daf:
+        i = 1000
         for line in daf.readlines():
-            i-=1
+            i -= 1
             detector.feed(line)
-            if detector.done or i==0:
+            if detector.done or i == 0:
                 break
     detector.close()
 
-    r=detector.result
+    r = detector.result
     return r["encoding"]
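The feed/done/close loop is chardet's documented incremental API: feed() data until the detector reports done, then close() to finalize the guess. A minimal standalone sketch (the filename is invented):

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    with open("dump.txt", "rb") as f:  # hypothetical sample file
        for line in f:
            detector.feed(line)
            if detector.done:          # confidence threshold reached
                break
    detector.close()
    print(detector.result)             # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}
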
 
-patter=re.compile("([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")
+
+patter = re.compile("([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")
 
 
 def extract_email(line):
     global patter
-    match=patter.search(line)
+    match = patter.search(line)
     if match:
-        res=(match.group(1),match.group(2),match.group(4))
+        res = (match.group(1), match.group(2), match.group(4))
         return (res)
     else:
         return None
 
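patter splits a credential line into local part, domain, separator (whitespace, ':' or ';'), and whatever follows as the password. Two doctest-style examples with invented data:

    >>> extract_email("alice@example.com:hunter2")
    ('alice', 'example.com', 'hunter2')
    >>> extract_email("no credential here") is None
    True
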
-def strip_badbytes(b,encoding):
+
+def strip_badbytes(b, encoding):
     return (b.decode(encoding, errors='ignore')).strip()
 
-def get_files(dir):
-    for (dirpath, dirnames, filenames) in walk(dir):
-        for x in filenames:
-            yield os.path.join(dirpath,x)
-
-def get_lines(file):
-    encoding=get_file_enconding(file)
+def get_files(dir):
+    for (dirpath, dirnames, filenames) in walk(dir):
+        for file in filenames:
+            full_filename=os.path.join(dirpath, file)
+            encoding=get_file_enconding(full_filename)
+            if encoding:
+                yield encoding, full_filename
+
+
+def get_lines(file,encoding=None):
+    if not encoding:
+        encoding = get_file_enconding(file)
     with open(file, 'rb') as f:
         for line in f:
-            yield(strip_badbytes(line,encoding))
+            yield (strip_badbytes(line, encoding))
 
 
 def get_parsable_lines(file):
-    success=1 #initialized with 1 to preven div/0
-    failure=1
+    global log_filename
+    success = 1  # initialized with 1 to preven div/0
+    failure = 1
     for line in get_lines(file):
-        doc=extract_email(line)
+        doc = extract_email(line)
         if doc:
-            success+=1
+            success += 1
             yield doc
         else:
-            failure+=1
-    success_rate=(success/(success+failure))
-    with open("processed_files",'a+') as file_log:
-        file_log.write("{};{}\n".format(file,success_rate))
+            failure += 1
+    success_rate = (success / (success + failure))
+    with open(log_filename, 'a+') as file_log:
+        file_log.write("{};{}\n".format(file, success_rate))
 
 
 def create_doc(file):
     for cred in get_parsable_lines(file):
-        doc={}
-        doc["user"],doc["domain"],doc["password"] = cred
-        doc["file"]=file
-        doc["length"] = len(doc["password"])
-        doc["passwordMask"] = get_mask(doc["password"])
-        doc["containsDigits"] = check_digit(doc["password"])
-        doc["containsLowerCase"] = check_lower(doc["password"])
-        doc["containsUpperCase"] = check_upper(doc["password"])
-        doc["containsSpecial"] = check_special(doc["password"])
-        yield doc
+        doc = {
+            "user" : cred[0],
+            "domain" : cred[1],
+            "password" : cred[2],
+            "file" : file,
+            "length" : len(cred[2]),
+            "passwordMask" : get_mask(cred[2]),
+            "containsDigits" : check_digit(cred[2]),
+            "containsLowerCase" : check_lower(cred[2]),
+            "containsUpperCase" : check_upper(cred[2]),
+            "containsSpecial" : check_special(cred[2])
+        }
+        id_hash=hex(mmh3.hash128(",".join((doc["user"], doc["domain"], doc["password"])), 12,signed=False) % 1000000000000000000000)
+        yield id_hash, doc
 
 
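The document id is content-derived: a 128-bit MurmurHash3 of "user,domain,password" with seed 12, reduced mod 10**21 and hex-encoded, so the same credential always maps to the same _id and re-indexing overwrites rather than duplicates. The same recipe in isolation (function name is mine):

    import mmh3

    def cred_id(user, domain, password):
        # mirror of the id_hash line in create_doc()
        key = ",".join((user, domain, password))
        return hex(mmh3.hash128(key, 12, signed=False) % 1000000000000000000000)

    # identical credentials yield identical ids
    assert cred_id("alice", "example.com", "hunter2") == \
           cred_id("alice", "example.com", "hunter2")
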
-def set_data(input_file, index_name = "leak_col1", doc_type_name = "credential"):
-    for doc in create_doc(input_file):
-        id=hex(mmh3.hash128(",".join((doc["user"],doc["domain"],doc["password"])),12,signed=False)%1000000000000000000000)
+def process_file(input_file):
+    global index_prefix, doc_type_name
+    filenamehash=hex(mmh3.hash128(input_file, 12,signed=False) % 1000000000000000000000)
+    for id_hash, doc in create_doc(input_file):
         yield {
-            "_index": index_name,
-            "_type": doc_type_name,
-            "_id": id,
-            "_source": doc
+            "_index": "{}_{}".format(index_prefix, filenamehash),
+            "_type": doc_type_name,
+            "_id": id_hash,
+            "_source": doc
         }
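Each dict that process_file() yields is an action for the Elasticsearch bulk helper: the metadata keys (_index, _type, _id) route the document, and _source carries the payload. With the naming above, one action would look roughly like this (all values invented):

    action = {
        "_index": "leak_col1_0x1a2b3c",  # index prefix + per-file mmh3 hash
        "_type": "credential",
        "_id": "0x4d5e6f",               # per-credential mmh3 hash
        "_source": {"user": "alice", "domain": "example.com",
                    "password": "hunter2"},
    }
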
 
-def load(es, input_file, **kwargs):
-    print('[*] Indexing file: %s' % input_file)
-    success, _ = bulk(es, set_data(input_file, **kwargs), request_timeout = 60, raise_on_exception = False)
-
+index_prefix = "leak_col1"
+doc_type_name = "credential"
+log_filename = "processed_files"
+
 es = Elasticsearch()
-for data in get_files(sys.argv[1]):
-    load(es,data)
+
+for encoding, data in get_files(sys.argv[1]):
+    print('[*] Indexing file: %s' % data)
+    success, _ = bulk(es, process_file(data), request_timeout=60, raise_on_exception=False)
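Run directly, the script takes the leak directory as its only argument (for example ./process_leak.py /data/leaks, path invented), indexes every file chardet can assign an encoding to, and appends per-file parse success rates to processed_files.
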
quickchardet.py | 39 (new executable file)

@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+import chardet
+from chardet import UniversalDetector
+import sys
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-l",help="list all encoding changes in file",action='store_true')
+parser.add_argument("-d",help="try to decode all Lines",action='store_true')
+parser.add_argument('filename')
+args = parser.parse_args()
+
+
+with open(args.filename,'rb') as infile:
+    det=UniversalDetector()
+    if args.l:
+        print("listing encodings of file \"{}\"".format(args.filename))
+        encoding=None
+        for nl,line in enumerate(infile.readlines()):
+            det.reset()
+            det.feed(line)
+            det.close()
+            res=det.result
+            if encoding != res["encoding"]:
+                encoding=res["encoding"]
+                if args.d:
+                    print("{}#{}#{}({})".format(nl,line.decode(res["encoding"]),res["encoding"],res["confidence"]))
+                else:
+                    print("{}#{}#{}({})".format(nl,line,res["encoding"],res["confidence"]))
+    else:
+        i=1000
+        for line in infile.readlines():
+            i-=1
+            det.feed(line)
+            if det.done or i==0:
+                break
+        det.close()
+        res=det.result
+        print("{}:{}({})".format(sys.argv[1],res["encoding"],res["confidence"]))
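quickchardet.py packages the same detection loop as a standalone tool: by default it feeds up to 1000 lines to the detector and prints one encoding guess for the whole file, -l re-runs the detector per line and reports each point where the detected encoding changes, and -d additionally decodes the lines that -l lists.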