more fixes on process_leak
added argparse-based argument parsing and fixed the multiprocessing Pool invocation
mapping (new file, 63 lines)
@@ -0,0 +1,63 @@
+PUT _template/template_1
+{
+  "index_patterns" : ["leak*"],
+  "settings" : {
+    "number_of_shards" : 2,
+    "number_of_replicas" : 0,
+    "refresh_interval": "60s"
+  },
+
+  "mappings": {
+    "credential": {
+      "properties": {
+        "containsDigits": {
+          "type": "boolean"
+        },
+        "containsLowerCase": {
+          "type": "boolean"
+        },
+        "containsSpecial": {
+          "type": "boolean"
+        },
+        "containsUpperCase": {
+          "type": "boolean"
+        },
+        "domain": {
+          "type": "keyword",
+          "ignore_above": 512,
+          "norms" : false
+        },
+        "file": {
+          "type": "keyword",
+          "ignore_above": 1024,
+          "norms" : false
+        },
+        "length": {
+          "type": "short"
+        },
+        "password": {
+          "type": "keyword",
+          "norms" : false,
+          "ignore_above": 512
+        },
+        "passwordMask": {
+          "type": "keyword",
+          "norms" : false,
+          "ignore_above": 512
+        },
+        "user": {
+          "type": "keyword",
+          "norms" : false,
+          "ignore_above": 512
+        },
+        "username": {
+          "type": "keyword",
+          "norms" : false,
+          "ignore_above": 512
+        }
+      }
+    }
+  }
+
+
+}
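Note: the block above is written for the Kibana Dev Tools console. A minimal sketch of registering the same template from Python with the elasticsearch client instead (host address taken from the indexing change below; the single-type "credential" mapping implies a pre-7.x cluster):

    from elasticsearch import Elasticsearch

    es = Elasticsearch(["172.16.1.141"])  # assumed host, matching index_file below

    template_body = {
        "index_patterns": ["leak*"],
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 0,    # no replicas: favors bulk-load speed over redundancy
            "refresh_interval": "60s",  # refresh once a minute instead of near-real-time
        },
        "mappings": {
            "credential": {
                "properties": {
                    # keyword + norms:false + ignore_above keeps the index lean;
                    # remaining fields exactly as in the mapping file above
                    "password": {"type": "keyword", "norms": False, "ignore_above": 512},
                }
            }
        },
    }

    # registers the template for every index whose name matches leak*
    es.indices.put_template(name="template_1", body=template_body)

With the template in place, any leak* index picks up these settings automatically at creation time.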
process_leak (modified)

@@ -10,6 +10,13 @@ from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 from multiprocessing import Pool,Lock
 import multiprocessing
+import hashlib
+import json
+
+import argparse
+
+
+
 
 lock = Lock()
 
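Note: log_to_console itself is outside this hunk. A hedged sketch of what the module-level lock is presumably for, assuming the function serializes output from concurrent Pool workers (this body is an assumption, not code from the commit):

    import sys
    from multiprocessing import Lock

    lock = Lock()

    def log_to_console(msg):
        # Without the lock, lines printed by concurrent workers
        # can interleave mid-line on the shared stdout.
        with lock:
            sys.stdout.write(msg + "\n")
            sys.stdout.flush()

This relies on fork-based multiprocessing (the default on Linux), where child workers inherit the already-created lock.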
@@ -119,8 +126,9 @@ def get_lines(file,encoding=None):
     if not encoding:
         encoding = get_file_enconding(file)
     with open(file, 'rb') as f:
-        for line in f:
-            yield (strip_badbytes(line, encoding))
+        return [strip_badbytes(line, encoding) for line in f]
+        # for line in f:
+        #     yield (strip_badbytes(line, encoding))
 
 
 def get_parsable_lines(file,encoding):
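Note: the change above swaps a lazy generator for an eager list, so get_lines now reads the whole file into memory up front; in exchange the result can be iterated more than once and survives pickling. A self-contained comparison (file name invented):

    def lines_eager(path):
        # whole file in memory, reusable across passes
        with open(path, 'rb') as f:
            return [line for line in f]

    def lines_lazy(path):
        # constant memory, but exhausted after a single pass
        with open(path, 'rb') as f:
            for line in f:
                yield line

    data = lines_eager('dump.txt')   # hypothetical input file
    assert len(data) == len(data)    # a second pass still works
    gen = lines_lazy('dump.txt')
    list(gen)
    assert list(gen) == []           # the generator is empty the second time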
@@ -140,10 +148,13 @@ def get_parsable_lines(file,encoding):
 
 
 def get_hash(text):
-    return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]
+    hash_object = hashlib.md5(text.encode())
+    return hash_object.hexdigest()
+    # return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]
 
 def get_user_pw_hash(text):
-    return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]
+    return get_hash(text)
+    # return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]
 
 def create_doc(file,encoding):
     for cred in get_parsable_lines(file,encoding):
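Note: with mmh3 swapped for hashlib, both helpers now return the same 32-character md5 hex digest, dropping the third-party dependency and widening the id space from 32 bits (and roughly 50 bits for the truncated hash128) to 128 bits, which matters for collision risk across billions of credentials. A standalone sketch (the sample credential is invented):

    import hashlib

    def get_hash(text):
        return hashlib.md5(text.encode()).hexdigest()

    # Deterministic: re-indexing the same line produces the same _id,
    # so duplicates overwrite each other instead of piling up.
    id_hash = get_hash('alice' + 'example.com' + 'hunter2')
    id_domain = id_hash[:1]   # one hex char: 16 coarse buckets
    print(id_domain, id_hash)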
@@ -164,8 +175,7 @@ def create_doc(file,encoding):
         if len(username_split[0]) > 0 and len(username_split[1]) > 0:
             doc["username"]=username_split[0]
             doc["user"]=username_split[1]
-        id_hash=get_user_pw_hash("{}{}".format(doc["user"],doc["password"]))
-        # id_domain=get_domain_hash(cred[1])
+        id_hash=get_user_pw_hash("{}{}{}".format(doc["user"],doc["domain"],doc["password"]))
         id_domain=id_hash[:1]
         yield id_domain, id_hash, doc
 
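Note: folding doc["domain"] into the id material changes the dedup semantics: the same user/password pair leaked on two different domains now yields two distinct documents instead of the second overwriting the first. Illustration (all values invented):

    import hashlib

    def get_user_pw_hash(text):
        return hashlib.md5(text.encode()).hexdigest()

    old_a = get_user_pw_hash('alice' + 'hunter2')   # cred seen on gmail.com
    old_b = get_user_pw_hash('alice' + 'hunter2')   # same cred on yahoo.com
    assert old_a == old_b   # old scheme: one _id, second write clobbers the first

    new_a = get_user_pw_hash('alice' + 'gmail.com' + 'hunter2')
    new_b = get_user_pw_hash('alice' + 'yahoo.com' + 'hunter2')
    assert new_a != new_b   # new scheme: both credentials survive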
@@ -182,31 +192,78 @@ def process_file(input_file,encoding):
 
 
 
 def index_file(input_file):
     ps=multiprocessing.current_process()
     encoding=get_file_enconding(input_file)
     if encoding:
-        es = Elasticsearch()
+        es = Elasticsearch(["172.16.1.141"],http_compress=True)
         # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
         # pre=count["count"]
         log_to_console('[{}:*] Indexing file: {}'.format(ps.pid,input_file))
-        success, _ = bulk(es, process_file(input_file,encoding), request_timeout=60, raise_on_exception=False)
+        try:
+            success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, request_timeout=60, raise_on_error=True, raise_on_exception=False)
+            # es.bulk()
+        except Exception as e:
+            log_to_console('[{}:!] Indexing failed for: {}\n[{}:!] REASON: {}'.format(ps.pid,input_file,ps.pid,e))
         # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
         # post=count["count"]
         # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
     else:
         log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))
 
-index = "leak_col1"
+
+def bench_file(input_file):
+    ps=multiprocessing.current_process()
+    encoding=get_file_enconding(input_file)
+    devnull=open(os.devnull,'w')
+    if encoding:
+        es = Elasticsearch()
+        # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
+        # pre=count["count"]
+        log_to_console('[{}:*] Benching file: {}'.format(ps.pid,input_file))
+        docs=0
+        try:
+            # success, _ = bulk(es, process_file(input_file,encoding), chunk_size=1000, request_timeout=60, raise_on_error=False, raise_on_exception=False)
+            for doc in process_file(input_file,encoding):
+                docs+=1
+                devnull.write(json.dumps(doc))
+            log_to_console('[{}:*] Benching Done: {} [processed {} docs]'.format(ps.pid,input_file,docs))
+        except Exception as e:
+            log_to_console('[{}:!] Benching failed for: {}\n[{}:!] REASON: {}'.format(ps.pid,input_file,ps.pid,e))
+        # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
+        # post=count["count"]
+        # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
+    else:
+        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))
+
+
+index=""
 doc_type_name = "credential"
 log_filename = "processed_files"
 threshold = -1 #threshold for reparsing an already parsed file
-p=Pool(20)
 
 
 def main():
-    dir=sys.argv[1]
-    # for filename in get_files(dir):
-    #     index_file(filename)
-    p.map(index_file,get_files(dir))
+    global index
+    parser = argparse.ArgumentParser(description="Put Leakdata into local Elasticsearch")
+    parser.add_argument("-p",help="how many workers (default:4)",default=4,type=int,nargs='?')
+    parser.add_argument("-i",help="index suffix",default="leak_data")
+    parser.add_argument("-b",help="don't write to ES, just benchmark",action='store_true')
+    parser.add_argument('folder')
+    args = parser.parse_args()
+    index=args.i
+    workers=args.p
+    dir=args.folder
+    p=Pool(workers)
+    if args.b:
+        p.map(bench_file,get_files(dir))
+    else:
+        p.map(index_file,get_files(dir))
 
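Note: moving Pool construction out of module scope into main() is the "fixed pool invocation" from the commit message: the old module-level p=Pool(20) started twenty worker processes as a side effect of merely importing the file. process_file is also not shown in this hunk, but for the bulk helper to consume it, it has to yield one action dict per document; a hedged sketch of the expected shape (how index and id_domain combine is an assumption):

    def process_file(input_file, encoding):
        for id_domain, id_hash, doc in create_doc(input_file, encoding):
            yield {
                '_index': '{}_{}'.format(index, id_domain),  # assumed naming scheme
                '_type': doc_type_name,                      # 'credential', pre-7.x
                '_id': id_hash,
                '_source': doc,
            }

Assumed invocations, with the script saved as process_leak.py:

    python process_leak.py -p 8 -i leak_data /data/leaks   # index into ES
    python process_leak.py -p 8 -b /data/leaks             # benchmark parsing only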