diff --git a/dockerfiles/logstash/configs/bwi.conf b/dockerfiles/logstash/configs/bwi.conf
index bc14a7e..1658c66 100644
--- a/dockerfiles/logstash/configs/bwi.conf
+++ b/dockerfiles/logstash/configs/bwi.conf
@@ -9,7 +9,7 @@ input {
 #  }
   file{
     type => syslog
-    path => "/config/*"
+    path => "/config/data.csv"
     start_position => "beginning"
   }
 }
@@ -79,10 +79,18 @@ filter {
       match => { "mwg[parsedurl][host]" => "((?<mwg[parsedurl][subdomain]>[^/]+)\.)?(?<mwg[parsedurl][domain]>[^/.]+)\.(?<mwg[parsedurl][tld]>[^/.]+)" }
     }
-    if [mwg.parsedurl.domain] == "google"{
-      grok {
-        match => { "mwg[parsedurl][parameters]" => "(?<mwg[parsedurl][querystring]>q=[^&]+)" }
-      }
+    grok {
+      match => { "mwg[parsedurl][parameters]" => "q=(?<mwg[parsedurl][querystring]>[^&]+)" }
+      add_tag => "querystring"
+      tag_on_failure => "no_querystring"
+    }
+    urldecode {
+      field => "mwg[parsedurl][querystring]"
+      add_tag => "urldecoded_querystring"
+    }
+    urldecode {
+      field => "mwg[parsedurl][parameters]"
+      add_tag => "urldecoded_parameters"
     }
 
 
 # parsing all url parameters causes problems because there are too many
@@ -96,9 +104,9 @@ filter {
   }
 }
 output {
-#  elasticsearch {
-#    hosts => ["elasticsearch:9200"]
-#    index => "logstash-bwi-casenr"
-#  }
+  elasticsearch {
+    hosts => ["elasticsearch:9200"]
+    index => "logstash-testindex"
+  }
   stdout { codec => rubydebug }
 }
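Note (not part of the patch): a rough Python equivalent of what the new grok + urldecode steps do to a request's query string. The capture/field name mwg[parsedurl][querystring] is an assumption inferred from the urldecode field reference (the named-capture text was garbled in this copy); grok itself would use an Oniguruma named capture here.

# Sketch only: mirrors the grok "q=(?<...>[^&]+)" capture plus the urldecode
# filter in plain Python. The capture name is assumed, not confirmed.
import re
from urllib.parse import unquote

def extract_querystring(parameters):
    match = re.search(r"q=(?P<querystring>[^&]+)", parameters)
    if match is None:
        return None  # grok would tag the event "no_querystring" instead
    # urldecode step: percent-decode the captured value
    return unquote(match.group("querystring"))

print(extract_querystring("q=such%20begriff&hl=de"))  # -> 'such begriff'
print(extract_querystring("hl=de"))                   # -> None
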
diff --git a/process_leak.py b/process_leak.py
index 3f98057..c36249e 100755
--- a/process_leak.py
+++ b/process_leak.py
@@ -12,14 +12,12 @@ from multiprocessing import Pool,Lock
 import multiprocessing
 import hashlib
 import json
 
-
 import argparse
 
-
-
 lock = Lock()
 
+
 def log_to_file(text):
     global log_filename
     with lock: # thread blocks at this line until it can obtain lock
@@ -27,8 +25,9 @@ def log_to_file(text):
         file_log.write("{}\n".format(text))
 
 def log_to_console(text):
+    ps=multiprocessing.current_process()
     with lock: # thread blocks at this line until it can obtain lock
-        print(text)
+        print("[{}]:{}".format(ps.pid,text))
 
 
 def get_mask(s):
@@ -143,7 +142,7 @@ def get_parsable_lines(file,encoding):
         else:
             failure += 1
     success_rate = (success / (success + failure))
-    log_to_console('[+] Done with file: {} ({})'.format(file,success_rate))
+    log_to_console('[+] Done parsing file: {} ({})'.format(file,success_rate))
     log_to_file("{};{}".format(file, success_rate))
 
 
@@ -152,16 +151,18 @@ def get_hash(text):
     return hash_object.hexdigest()
     # return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]
 
+
 def get_user_pw_hash(text):
     return get_hash(text)
     # return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]
 
+
 def create_doc(file,encoding):
     for cred in get_parsable_lines(file,encoding):
         doc = {
             "user" : cred[0],
             "domain" : cred[1],
-            "password" : cred[2],
+            "password" : cred[2][:129],
             "file" : file,
             "length" : len(cred[2]),
             "passwordMask" : get_mask(cred[2]),
@@ -191,26 +192,24 @@ def process_file(input_file,encoding):
         }
 
-
-
 def index_file(input_file):
-    ps=multiprocessing.current_process()
     encoding=get_file_enconding(input_file)
     if encoding:
         es = Elasticsearch(["172.16.1.141"],http_compress=True)
         # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
         # pre=count["count"]
-        log_to_console('[{}:*] Indexing file: {}'.format(ps.pid,input_file))
+        log_to_console('[*] Indexing file: {}'.format(input_file))
         try:
-            success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, request_timeout=60, raise_on_error=True, raise_on_exception=False)
-            # es.bulk()
+            success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, initial_backoff=60, max_retries=3, request_timeout=60, raise_on_error=False, raise_on_exception=True)
+
             log_to_console('[!] Indexing done: {} [{} lines committed]'.format(input_file,success))
         except Exception as e:
-            log_to_console('[{}:!] Indexing failed for: {}\n[{}:!] REASON: {}'.format(ps.pid,input_file,e.message))
+            log_to_console('[!] Indexing failed for: {}\n[!] REASON: {}'.format(input_file,str(e)))
             # count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
             # post=count["count"]
             # log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
     else:
-        log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))
+        log_to_console('[~] Skipping file [Unknown Encoding]: {}'.format(input_file))
 
+
 def bench_file(input_file):
     ps=multiprocessing.current_process()
@@ -249,6 +248,7 @@ doc_type_name = "credential"
 log_filename = "processed_files"
 threshold = -1 #threshold for reparsing an already parsed file
 
+
 def main():
     global index
     parser = argparse.ArgumentParser(description="Put Leakdata into local Elasticsearch")
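Note (not part of the patch): on the bulk() change, elasticsearch-py's helpers retry chunks rejected with HTTP 429, waiting initial_backoff seconds before the first retry and doubling thereafter up to max_retries, and with raise_on_error=False per-document failures are returned in the second tuple element instead of raising BulkIndexError. Below is a minimal, self-contained sketch of the logging pattern this change centralizes: log_to_console() now looks up the worker's PID itself, so call sites no longer pass ps.pid. The pool size and file names are placeholders; as in the script, the Lock is shared with fork-started workers.

# Sketch only: PID-tagged, lock-serialized console logging from pool workers.
import multiprocessing
from multiprocessing import Pool, Lock

lock = Lock()

def log_to_console(text):
    ps = multiprocessing.current_process()
    with lock:  # serialize prints so worker output does not interleave
        print("[{}]:{}".format(ps.pid, text))

def index_file(input_file):
    log_to_console('[*] Indexing file: {}'.format(input_file))

if __name__ == '__main__':
    with Pool(2) as pool:
        pool.map(index_file, ['leak_a.txt', 'leak_b.txt'])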