more fixes on process_leak

added argparse-based argument parsing and fixed the Pool invocation
Tobias Kessels
2019-02-07 17:37:39 +01:00
parent 30d026cf70
commit c47b306ee5
2 changed files with 138 additions and 18 deletions

mapping (new file, 63 lines)

@@ -0,0 +1,63 @@
PUT _template/template_1
{
  "index_patterns" : ["leak*"],
  "settings" : {
    "number_of_shards" : 2,
    "number_of_replicas" : 0,
    "refresh_interval": "60s"
  },
  "mappings": {
    "credential": {
      "properties": {
        "containsDigits": {
          "type": "boolean"
        },
        "containsLowerCase": {
          "type": "boolean"
        },
        "containsSpecial": {
          "type": "boolean"
        },
        "containsUpperCase": {
          "type": "boolean"
        },
        "domain": {
          "type": "keyword",
          "ignore_above": 512,
          "norms" : false
        },
        "file": {
          "type": "keyword",
          "ignore_above": 1024,
          "norms" : false
        },
        "length": {
          "type": "short"
        },
        "password": {
          "type": "keyword",
          "norms" : false,
          "ignore_above": 512
        },
        "passwordMask": {
          "type": "keyword",
          "norms" : false,
          "ignore_above": 512
        },
        "user": {
          "type": "keyword",
          "norms" : false,
          "ignore_above": 512
        },
        "username": {
          "type": "keyword",
          "norms" : false,
          "ignore_above": 512
        }
      }
    }
  }
}

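The template above is applied with a raw PUT request (e.g. Kibana Dev Tools or curl). As a minimal sketch, it could also be registered from Python with the same elasticsearch-py client the script uses; the host and the file name template_body.json are assumptions for illustration, not part of the commit:

# Sketch: register the template above via elasticsearch-py (host and file name are placeholders)
import json
from elasticsearch import Elasticsearch

with open("template_body.json") as f:       # the JSON body of PUT _template/template_1
    template_body = json.load(f)

es = Elasticsearch(["localhost"])
es.indices.put_template(name="template_1", body=template_body)
print(list(es.indices.get_template(name="template_1")))   # sanity check: ['template_1']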

@@ -10,6 +10,13 @@ from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 from multiprocessing import Pool,Lock
 import multiprocessing
+import hashlib
+import json
+import argparse
 lock = Lock()
@@ -119,8 +126,9 @@ def get_lines(file,encoding=None):
     if not encoding:
         encoding = get_file_enconding(file)
     with open(file, 'rb') as f:
-        for line in f:
-            yield (strip_badbytes(line, encoding))
+        return [strip_badbytes(line, encoding) for line in f]
+        # for line in f:
+        #     yield (strip_badbytes(line, encoding))

 def get_parsable_lines(file,encoding):
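Worth noting about the get_lines() change (an observation, not part of the commit): returning a list materialises the whole file in memory, whereas the old generator streamed one line at a time. A minimal sketch of the two behaviours, with a stand-in strip_badbytes that is purely illustrative:

# Sketch: list vs. generator line reading (the strip_badbytes stand-in is illustrative)
def strip_badbytes(line, encoding):
    return line.decode(encoding, errors="replace").rstrip("\r\n")

def get_lines_list(path, encoding="utf-8"):
    with open(path, "rb") as f:
        return [strip_badbytes(line, encoding) for line in f]   # whole file held in memory

def get_lines_stream(path, encoding="utf-8"):
    with open(path, "rb") as f:
        for line in f:
            yield strip_badbytes(line, encoding)                # one line at a time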
@@ -140,10 +148,13 @@ def get_parsable_lines(file,encoding):
 def get_hash(text):
-    return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]
+    hash_object = hashlib.md5(text.encode())
+    return hash_object.hexdigest()
+    # return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]

 def get_user_pw_hash(text):
-    return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]
+    return get_hash(text)
+    # return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]

 def create_doc(file,encoding):
     for cred in get_parsable_lines(file,encoding):
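The effect of this hunk is that document IDs switch from MurmurHash to MD5, and (as the next hunk shows) the first hex character of the digest becomes the id_domain bucket. A self-contained sketch of the new scheme; the sample credential values are made up:

# Sketch of the new ID scheme (sample values are illustrative)
import hashlib

def get_hash(text):
    # MD5 hex digest, mirroring get_hash() above
    return hashlib.md5(text.encode()).hexdigest()

user, domain, password = "alice", "example.com", "hunter2"
id_hash = get_hash("{}{}{}".format(user, domain, password))   # 32 hex characters
id_domain = id_hash[:1]                                       # one of 16 buckets, 0-9a-f
print(id_domain, id_hash)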
@@ -164,8 +175,7 @@ def create_doc(file,encoding):
         if len(username_split[0]) > 0 and len(username_split[1]) > 0:
             doc["username"]=username_split[0]
             doc["user"]=username_split[1]
-        id_hash=get_user_pw_hash("{}{}".format(doc["user"],doc["password"]))
-        # id_domain=get_domain_hash(cred[1])
+        id_hash=get_user_pw_hash("{}{}{}".format(doc["user"],doc["domain"],doc["password"]))
         id_domain=id_hash[:1]
         yield id_domain, id_hash, doc
@@ -182,31 +192,78 @@ def process_file(input_file,encoding):
 def index_file(input_file):
     ps=multiprocessing.current_process()
     encoding=get_file_enconding(input_file)
     if encoding:
-        es = Elasticsearch()
+        es = Elasticsearch(["172.16.1.141"],http_compress=True)
         # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
         # pre=count["count"]
         log_to_console('[{}:*] Indexing file: {}'.format(ps.pid,input_file))
-        success, _ = bulk(es, process_file(input_file,encoding), request_timeout=60, raise_on_exception=False)
+        try:
+            success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, request_timeout=60, raise_on_error=True, raise_on_exception=False)
+            # es.bulk()
+        except Exception as e:
+            log_to_console('[{}:!] Indexing failed for: {}\n[{}:!] REASON: {}'.format(ps.pid,input_file,ps.pid,e))
         # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
         # post=count["count"]
         # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
     else:
         log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))
+def bench_file(input_file):
+    ps=multiprocessing.current_process()
+    encoding=get_file_enconding(input_file)
+    devnull=open(os.devnull,'w')
+    if encoding:
+        es = Elasticsearch()
+        # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
+        # pre=count["count"]
+        log_to_console('[{}:*] Benching file: {}'.format(ps.pid,input_file))
+        docs=0
+        try:
+            # success, _ = bulk(es, process_file(input_file,encoding), chunk_size=1000, request_timeout=60, raise_on_error=False, raise_on_exception=False)
+            for doc in process_file(input_file,encoding):
+                docs+=1
+                devnull.write(json.dumps(doc))
+            log_to_console('[{}:*] Benching Done: {} [processed {} docs]'.format(ps.pid,input_file,docs))
+        except Exception as e:
+            log_to_console('[{}:!] Benching failed for: {}\n[{}:!] REASON: {}'.format(ps.pid,input_file,ps.pid,e))
+        # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
+        # post=count["count"]
+        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
+    else:
+        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))

-index = "leak_col1"
+index=""
 doc_type_name = "credential"
 log_filename = "processed_files"
 threshold = -1 #threshold for reparsing an already parsed file
-p=Pool(20)
 def main():
-    dir=sys.argv[1]
-    # for filename in get_files(dir):
-    #     index_file(filename)
-    p.map(index_file,get_files(dir))
+    global index
+    parser = argparse.ArgumentParser(description="Put Leakdata into local Elasticsearch")
+    parser.add_argument("-p",help="how many workers (default:4)",default=4,type=int,nargs='?')
+    parser.add_argument("-i",help="index suffix",default="leak_data")
+    parser.add_argument("-b",help="dont write to es just benchmark",action='store_true')
+    parser.add_argument('folder')
+    args = parser.parse_args()
+    index=args.i
+    workers=args.p
+    dir=args.folder
+    p=Pool(workers)
+    if args.b:
+        p.map(bench_file,get_files(dir))
+    else:
+        p.map(index_file,get_files(dir))
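With the new argparse interface, typical invocations might look like this (the script name process_leak.py and the folder path are assumptions; the flags come from the parser above):

python process_leak.py /data/leaks                        # 4 workers (default), index suffix "leak_data"
python process_leak.py -p 8 -i leak_col1 /data/leaks      # 8 workers, custom index suffix
python process_leak.py -b /data/leaks                     # benchmark parsing only, nothing written to Elasticsearch

For reference, the bulk() call in index_file() consumes whatever process_file() yields; process_file() is not shown in this commit, but elasticsearch.helpers.bulk expects an iterable of action dicts and batches them itself (now 10000 per request). A generic sketch of that contract, with index, type, and ID values as assumptions rather than the script's actual output:

# Sketch of the action format consumed by elasticsearch.helpers.bulk (values are illustrative)
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def gen_actions():
    yield {
        "_index": "leak_example",                      # assumed index name
        "_type": "credential",                         # matches doc_type_name above
        "_id": "0123456789abcdef0123456789abcdef",     # e.g. the MD5 id_hash
        "_source": {"user": "alice", "domain": "example.com", "password": "hunter2"},
    }

es = Elasticsearch(["localhost"], http_compress=True)
success, errors = bulk(es, gen_actions(), chunk_size=10000, request_timeout=60, raise_on_exception=False)
print(success, errors)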