update logstash config and process_leak
bwi.conf: now correctly extracts query strings from URL parameters when they are named "q". process_leak.py: console output now adds the PID automatically; added a length restriction to the password field.
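For illustration only (not part of the commit): a rough Python equivalent of what the corrected grok pattern and the urldecode filter below are meant to do, namely pull the value of a parameter named "q" out of the raw parameter string and URL-decode it. The function name and the simplified field handling are assumptions; the regular expression is the one used in the new pattern.

import re
from urllib.parse import unquote_plus

def extract_querystring(parameters):
    """Return the decoded value of the 'q' parameter, or None if it is absent."""
    m = re.search(r"q=([^&]+)", parameters)   # same capture as the new grok pattern
    if m is None:
        return None                           # grok would tag the event "no_querystring"
    return unquote_plus(m.group(1))           # roughly what the urldecode filter does

print(extract_querystring("q=logstash+grok+pattern&hl=en"))   # -> logstash grok pattern
print(extract_querystring("hl=en&safe=off"))                  # -> None
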
bwi.conf
@@ -9,7 +9,7 @@ input {
# }
file{
type => syslog
-path => "/config/*"
+path => "/config/data.csv"
start_position => "beginning"
}
}
@@ -79,10 +79,18 @@ filter {
match => { "mwg[parsedurl][host]" => "((?<mwg[parsedurl][subdomain]>[^/]+)\.)?(?<mwg[parsedurl][domain]>[^/.]+)\.(?<mwg[parsedurl][tld]>[^/.]+)" }
}

if [mwg.parsedurl.domain] == "google"{
grok {
-match => { "mwg[parsedurl][parameters]" => "(?<mwg[parsedurl][googlesearch]>q=[^&]+)" }
+match => { "mwg[parsedurl][parameters]" => "q=(?<mwg[parsedurl][querystring]>[^&]+)" }
add_tag => "querystring"
tag_on_failure => "no_querystring"
}
urldecode {
field => "mwg[parsedurl][querystring]"
add_tag => "urldecoded_querystring"
}
urldecode {
field => "mwg[parsedurl][parameters]"
add_tag => "urldecoded_parameters"
}

# parsing all url parameters causes problems because there are too many
@@ -96,9 +104,9 @@ filter {
}
}
output {
-# elasticsearch {
-# hosts => ["elasticsearch:9200"]
-# index => "logstash-bwi-casenr"
-# }
+elasticsearch {
+hosts => ["elasticsearch:9200"]
+index => "logstash-testindex"
+}
stdout { codec => rubydebug }
}

process_leak.py
@@ -12,14 +12,12 @@ from multiprocessing import Pool,Lock
import multiprocessing
import hashlib
import json

+import argparse




lock = Lock()


def log_to_file(text):
global log_filename
with lock: # thread blocks at this line until it can obtain lock
@@ -27,8 +25,9 @@ def log_to_file(text):
file_log.write("{}\n".format(text))

def log_to_console(text):
+ps=multiprocessing.current_process()
with lock: # thread blocks at this line until it can obtain lock
-print(text)
+print("[{}]:{}".format(ps.pid,text))


def get_mask(s):
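A minimal, self-contained sketch (an assumption, mirroring the changed log_to_console above) of how the PID prefix behaves when several pool workers share the module-level lock; the work() helper and the pool size are invented for the example, and the lock is shared via fork as in the script itself.

import multiprocessing
from multiprocessing import Pool, Lock

lock = Lock()   # inherited by fork-started workers, as in the script

def log_to_console(text):
    ps = multiprocessing.current_process()
    with lock:                                   # serialize output from concurrent workers
        print("[{}]:{}".format(ps.pid, text))    # pid is added automatically now

def work(n):
    log_to_console("processing item {}".format(n))

if __name__ == "__main__":
    with Pool(2) as pool:
        pool.map(work, range(4))   # prints lines like "[12345]:processing item 0"
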
@@ -143,7 +142,7 @@ def get_parsable_lines(file,encoding):
else:
failure += 1
success_rate = (success / (success + failure))
-log_to_console('[+] Done with file: {} ({})'.format(file,success_rate))
+log_to_console('[+] Done parsing file: {} ({})'.format(file,success_rate))
log_to_file("{};{}".format(file, success_rate))


@@ -152,16 +151,18 @@ def get_hash(text):
return hash_object.hexdigest()
# return hex(mmh3.hash(text, 12, signed=False)).split("x")[1]


def get_user_pw_hash(text):
return get_hash(text)
# return hex(mmh3.hash128(text, 12,signed=False) % 1000000000000000).split("x")[1]


def create_doc(file,encoding):
for cred in get_parsable_lines(file,encoding):
doc = {
"user" : cred[0],
"domain" : cred[1],
-"password" : cred[2],
+"password" : cred[2][:129],
"file" : file,
"length" : len(cred[2]),
"passwordMask" : get_mask(cred[2]),
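A small sketch of the effect of the new cap on the password field (simplified from create_doc above; the helper name and the reduced set of fields are assumptions): the stored value is cut to 129 characters, while "length" still records the size of the original credential.

def build_doc(user, domain, password, filename):
    return {
        "user": user,
        "domain": domain,
        "password": password[:129],   # new length restriction on the stored password
        "file": filename,
        "length": len(password),      # original length is still recorded
    }

doc = build_doc("alice", "example.org", "x" * 500, "dump.csv")
print(len(doc["password"]), doc["length"])   # -> 129 500
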
@@ -191,26 +192,24 @@ def process_file(input_file,encoding):
}




def index_file(input_file):
ps=multiprocessing.current_process()
encoding=get_file_enconding(input_file)
if encoding:
es = Elasticsearch(["172.16.1.141"],http_compress=True)
# count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
# pre=count["count"]
-log_to_console('[{}:*] Indexing file: {}'.format(ps.pid,input_file))
+log_to_console('[*] Indexing file: {}'.format(input_file))
try:
-success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, request_timeout=60, raise_on_error=True, raise_on_exception=False)
# es.bulk()
+success, _ = bulk(es, process_file(input_file,encoding), chunk_size=10000, initial_backoff=60, max_retries=3, request_timeout=60, raise_on_error=False, raise_on_exception=True)
log_to_console('[!] Indexing done: {} [{} lines committed]'.format(input_file,success))
except Exception as e:
-log_to_console('[{}:!] Indexing failed for: {}\n[{}:!] REASON: {}'.format(ps.pid,input_file,e.message))
+log_to_console('[!] Indexing failed for: {}\n[!] REASON:{}'.format(input_file,str((e.errors[0]))))
# count = es.count(index=index, doc_type=doc_type_name, body={ "query": {"match_all" : { }}})
# post=count["count"]
# log_to_console('[{}:=] Added {} Documents with {}'.format(ps.pid,post-pre,input_file))
else:
-log_to_console('[{}:~] Skipping file [Unknown Encoding]: {}'.format(ps.pid,input_file))
+log_to_console('[~] Skipping file [Unknown Encoding]: {}'.format(input_file))


def bench_file(input_file):
ps=multiprocessing.current_process()
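A sketch of the retry-enabled bulk call introduced above, using the elasticsearch-py 7.x-style helpers as the script does; the index name and the docs() generator are placeholders for the script's process_file() output, and the host is the one already hard-coded in the script. Rejected chunks are retried up to three times with a 60-second initial backoff, and per-document errors are returned rather than raised.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def docs():
    # stand-in for process_file(input_file, encoding)
    yield {"_index": "leak-test", "_source": {"user": "alice", "domain": "example.org"}}

es = Elasticsearch(["172.16.1.141"], http_compress=True)
try:
    success, errors = bulk(
        es, docs(),
        chunk_size=10000,
        request_timeout=60,
        max_retries=3,            # retry chunks the cluster rejects (HTTP 429)
        initial_backoff=60,       # seconds to wait before the first retry
        raise_on_error=False,     # collect per-document errors instead of raising
        raise_on_exception=True,  # still raise on transport-level failures
    )
    print("[!] committed {} documents, {} document errors".format(success, len(errors)))
except Exception as e:            # connection errors, timeouts, ...
    print("[!] bulk indexing failed: {}".format(e))
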
@@ -249,6 +248,7 @@ doc_type_name = "credential"
log_filename = "processed_files"
threshold = -1 #threshold for reparsing an already parsed file


def main():
global index
+parser = argparse.ArgumentParser(description="Put Leakdata into local Elasticsearch")