#!/usr/bin/python3
"""Split a large text file into smaller chunks while keeping lines intact.

Typical use: cutting a big log file into pieces (500 MB by default) that
can be uploaded into Splunk, without ever splitting a line in the middle.
"""
# Provenance: added as tools/split_linewise.py in commit
# 1380c7df75d985f2c33306af8531da4a1c3bde31 ("Add split_linewise.py")
# by Tobias Kessels, Thu, 1 Aug 2024 09:44:57 +0200.
import os
import sys
import argparse


def split_lines(input_file, chunk_size_mb=500):
    """
    Splits a file into chunks of size chunk_size_mb but preserves lines.

    Chunks are written next to the input file and are named
    ``<input name>_chunkNNN<original extension>``. A single line that is
    larger than the chunk size is written to a chunk of its own instead of
    being cut. An empty input file produces no chunk files.

    Args:
        input_file (str): Path to the file that will be split into chunks.
        chunk_size_mb (int, optional): Size of each chunk in MB. Defaults to 500.

    Raises:
        ValueError: If chunk_size_mb is not greater than 0.
        OSError: If the input file cannot be read or a chunk cannot be written.
    """
    if chunk_size_mb <= 0:
        # A non-positive limit would put every single line into its own file.
        raise ValueError("chunk_size_mb must be greater than 0")

    chunk_size_bytes = chunk_size_mb * 1024 * 1024  # Convert MB to bytes
    # splitext always returns a (root, ext) pair; ext is '' if there is none.
    base_name, file_extension = os.path.splitext(input_file)

    current_chunk = 1
    current_size = 0
    output_file = None

    # Binary mode keeps the data byte-exact: no newline translation, no
    # dependency on the locale encoding, and len(line) is the on-disk size.
    with open(input_file, 'rb') as infile:
        try:
            for line in infile:
                line_size = len(line)
                # Open new file if none exists or next line exceeds chunk size
                if output_file is None or current_size + line_size > chunk_size_bytes:
                    if output_file is not None:
                        output_file.close()
                    output_filename = f"{base_name}_chunk{current_chunk:03d}{file_extension}"
                    output_file = open(output_filename, 'wb')
                    print(f"Created {output_filename}")
                    current_chunk += 1
                    current_size = 0
                output_file.write(line)
                current_size += line_size
        finally:
            # Make sure the last chunk is closed even if reading/writing fails.
            if output_file is not None:
                output_file.close()


def parse_arguments(argv=None):
    """
    Parses command line arguments.

    Args:
        argv (list[str], optional): Arguments to parse. Defaults to None,
            in which case argparse reads them from sys.argv.

    Returns:
        argparse.Namespace: The arguments passed to the command line.
    """
    parser = argparse.ArgumentParser(description='Split a text file into smaller chunks but keep lines intakt.')
    parser.add_argument('input_file', type=str, help='The path to the input file to be split.')
    parser.add_argument('--chunk_size_mb', type=int, default=500, help='Maximum chunk size in MB (default: 500).')
    args = parser.parse_args(argv)
    if args.chunk_size_mb <= 0:
        # Report bad input as a usage error instead of a traceback.
        parser.error('--chunk_size_mb must be greater than 0')
    return args


def main(argv=None):
    """Command line entry point: parse the arguments and split the file."""
    args = parse_arguments(argv)
    split_lines(args.input_file, args.chunk_size_mb)


if __name__ == '__main__':
    main()