From a677c73ed675e7d8267f4407e3dfb44ebf675be8 Mon Sep 17 00:00:00 2001
From: Tobias Kessels <tabledevil@gmail.com>
Date: Sun, 3 Feb 2019 23:33:54 +0100
Subject: [PATCH] added some new functionality to process_leak

---
 process_leak.py | 164 +++++++++++++++++++++++++++---------------------
 quickchardet.py |  39 ++++++++++++
 2 files changed, 132 insertions(+), 71 deletions(-)
 create mode 100755 quickchardet.py

diff --git a/process_leak.py b/process_leak.py
index 7a218ac..33d4d3a 100755
--- a/process_leak.py
+++ b/process_leak.py
@@ -1,129 +1,151 @@
 #!/usr/bin/python3
-import sys
-import chardet
 import os
-from os import walk
-from chardet.universaldetector import UniversalDetector
 import re
 import mmh3
-# from bs4 import UnicodeDammit
+import string
+import sys
+from os import walk
+
+from chardet.universaldetector import UniversalDetector
 from elasticsearch import Elasticsearch
 from elasticsearch.helpers import bulk
 
-import string
 
 def get_mask(s):
-	mask = ""
-	for c in s:
-		if c.isdigit():
-			mask += "?d"
-		elif c.islower():
-			mask += "?l"
-		elif c.isupper():
-			mask += "?u"
-		else:
-			mask += "?s"
-	return mask
+    """Return a mask with one class token per character of s: ?d, ?l, ?u or ?s."""
+    def token(c):
+        if c.isdigit():
+            return "?d"
+        if c.islower():
+            return "?l"
+        if c.isupper():
+            return "?u"
+        return "?s"
+
+    return "".join(token(c) for c in s)
+
 
 def check_special(s):
-	for c in s:
-		if c in string.punctuation or c.isspace():
-			return True
-	return False
+    """Return True if s contains any punctuation or whitespace character."""
+    return any(
+        c in string.punctuation or c.isspace() for c in s
+    )
+
 
 def check_upper(s):
-	return any(i.isupper() for i in s)
+    return any(ch.isupper() for ch in s)  # True if at least one character is uppercase
+
 
 def check_lower(s):
-	return any(i.islower() for i in s)
+    return any(ch.islower() for ch in s)  # True if at least one character is lowercase
+
 
 def check_digit(s):
     return any(i.isdigit() for i in s)
 
-#list all files in dir
+
+# detect the character encoding of a file
 def get_file_enconding(file):
     detector = UniversalDetector()
-    with open(file,'rb') as daf:
-        i=1000
+    with open(file, 'rb') as daf:
+        i = 1000
         for line in daf.readlines():
-            i-=1
+            i -= 1
             detector.feed(line)
-            if detector.done or i==0:
+            if detector.done or i == 0:
                 break
         detector.close()
 
-        r=detector.result
+        r = detector.result
         return r["encoding"]
 
-patter=re.compile("([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")
+
+patter = re.compile(r"([^@]+)@([^@]+\.[^@]+)(\s|:|;)(.*)")  # user@domain<sep>rest; raw string keeps \. and \s as regex escapes
+
 
 def extract_email(line):
     global patter
-    match=patter.search(line)
+    match = patter.search(line)
     if match:
-        res=(match.group(1),match.group(2),match.group(4))
+        res = (match.group(1), match.group(2), match.group(4))
         return (res)
     else:
         return None
 
-def strip_badbytes(b,encoding):
+
+def strip_badbytes(b, encoding):
     return (b.decode(encoding, errors='ignore')).strip()
 
-def get_files(dir):
- for (dirpath, dirnames, filenames) in walk(dir):
-  for x in filenames:
-   yield os.path.join(dirpath,x)
 
-def get_lines(file):
-    encoding=get_file_enconding(file)
+def get_files(dir):
+    """Walk dir and yield (encoding, path) for every file whose encoding was detected."""
+    for dirpath, _dirnames, filenames in walk(dir):
+        for path in (os.path.join(dirpath, name) for name in filenames):
+            detected = get_file_enconding(path)
+            if detected:
+                yield detected, path
+
+
+def get_lines(file, encoding=None):
+    """Yield each line of file, decoded and stripped; the encoding is detected when none is given."""
+    encoding = encoding or get_file_enconding(file)
     with open(file, 'rb') as f:
         for line in f:
-            yield(strip_badbytes(line,encoding))
+            yield strip_badbytes(line, encoding)
+
 
 def get_parsable_lines(file):
-    success=1 #initialized with 1 to preven div/0
-    failure=1
+    """Yield every tuple extract_email parses from file; once exhausted, append "file;rate" to log_filename."""
+    parsed = 1  # initialized with 1 to prevent div/0
+    skipped = 1
     for line in get_lines(file):
-        doc=extract_email(line)
+        doc = extract_email(line)
         if doc:
-            success+=1
+            parsed += 1
             yield doc
         else:
-            failure+=1
-    success_rate=(success/(success+failure))
-    with open("processed_files",'a+') as file_log:
-        file_log.write("{};{}\n".format(file,success_rate))
+            skipped += 1
+    rate = parsed / (parsed + skipped)
+    with open(log_filename, 'a+') as log:
+        log.write("{};{}\n".format(file, rate))
 
 
 def create_doc(file):
+    """Yield (id_hash, doc) per credential in file; id_hash is derived from "user,domain,password"."""
     for cred in get_parsable_lines(file):
-        doc={}
-        doc["user"],doc["domain"],doc["password"] = cred
-        doc["file"]=file
-        doc["length"] = len(doc["password"])
-        doc["passwordMask"] = get_mask(doc["password"])
-        doc["containsDigits"] = check_digit(doc["password"])
-        doc["containsLowerCase"] = check_lower(doc["password"])
-        doc["containsUpperCase"] = check_upper(doc["password"])
-        doc["containsSpecial"] = check_special(doc["password"])
-        yield doc
+        user, domain, password = cred
+        doc = {
+            "user": user, "domain": domain, "password": password,
+            "file": file,
+            "length": len(password),
+            "passwordMask": get_mask(password),
+            "containsDigits": check_digit(password),
+            "containsLowerCase": check_lower(password),
+            "containsUpperCase": check_upper(password),
+            "containsSpecial": check_special(password),
+        }
+        digest = mmh3.hash128(",".join((user, domain, password)), 12, signed=False)
+        yield hex(digest % 1000000000000000000000), doc
 
 
-
-def set_data(input_file, index_name = "leak_col1", doc_type_name = "credential"):
-    for doc in create_doc(input_file):
-        id=hex(mmh3.hash128(",".join((doc["user"],doc["domain"],doc["password"])),12,signed=False)%1000000000000000000000)
+def process_file(input_file):
+    """Yield one bulk action per credential in input_file; the index name embeds a hash of the path."""
+    file_digest = hex(mmh3.hash128(input_file, 12, signed=False) % 1000000000000000000000)
+    for id_hash, doc in create_doc(input_file):
         yield {
-        "_index": index_name,
-        "_type": doc_type_name,
-        "_id": id,
-        "_source": doc
+            "_index": "{}_{}".format(index_prefix, file_digest),
+            "_type": doc_type_name,
+            "_id": id_hash,
+            "_source": doc,
         }
 
-def load(es, input_file, **kwargs):
-	print('[*] Indexing file: %s' % input_file)
-	success, _ = bulk(es, set_data(input_file, **kwargs), request_timeout = 60, raise_on_exception = False)
+
+index_prefix = "leak_col1"  # process_file names each index "<index_prefix>_<hash of the file path>"
+doc_type_name = "credential"  # used as "_type" of every bulk action
+log_filename = "processed_files"  # get_parsable_lines appends "<file>;<success rate>" lines here
 
 es = Elasticsearch()
-for data in get_files(sys.argv[1]):
-    load(es,data)
+
+for encoding, data in get_files(sys.argv[1]):  # NOTE: encoding is unused here; get_lines() detects it again
+    print('[*] Indexing file: %s' % data)
+    success, _ = bulk(es, process_file(data), request_timeout=60, raise_on_exception=False)
diff --git a/quickchardet.py b/quickchardet.py
new file mode 100755
index 0000000..9eaa26c
--- /dev/null
+++ b/quickchardet.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+"""Print the encoding chardet detects for a file, or with -l every line where it changes."""
+import chardet
+from chardet import UniversalDetector
+import sys
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-l", help="list all encoding changes in file", action='store_true')
+parser.add_argument("-d", help="try to decode all Lines", action='store_true')
+parser.add_argument('filename')
+args = parser.parse_args()
+
+# Binary mode: the detector is fed the raw bytes of each line.
+with open(args.filename, 'rb') as infile:
+    det = UniversalDetector()
+    if args.l:
+        print("listing encodings of file \"{}\"".format(args.filename))
+        encoding = None
+        for nl, line in enumerate(infile):  # stream the file instead of loading it with readlines()
+            det.reset()
+            det.feed(line)
+            det.close()
+            res = det.result
+            if encoding != res["encoding"]:
+                encoding = res["encoding"]
+                # chardet reports encoding None when it cannot decide; fall back so -d cannot crash.
+                shown = line.decode(encoding or "ascii", errors="replace") if args.d else line
+                print("{}#{}#{}({})".format(nl, shown, encoding, res["confidence"]))
+    else:
+        i = 1000  # give up after 1000 lines if the detector is still undecided
+        for line in infile:
+            i -= 1
+            det.feed(line)
+            if det.done or i == 0:
+                break
+        det.close()
+        res = det.result
+        print("{}:{}({})".format(args.filename, res["encoding"], res["confidence"]))