Add split_linewise.py

split_linewise.py splits large text files into smaller chunks (e.g. 500 MB) for uploading into Splunk, while making sure each line stays intact.
This commit is contained in:
Tobias Kessels
2024-08-01 09:44:57 +02:00
parent a9ccc8cd27
commit 1380c7df75

61
tools/split_linewise.py Executable file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/python3
import os
import sys
import argparse
def split_lines(input_file, chunk_size_mb=500):
    """
    Splits a file into chunks of at most chunk_size_mb but preserves lines.

    Chunks are written next to the input file and are named
    ``<input name>_chunk<NNN><input extension>`` with a 1-based, zero-padded
    counter. A single line that is larger than the chunk size is never cut;
    it is written to a chunk of its own. An empty input produces no chunks.

    The data is processed as raw bytes so that the size accounting matches
    what is written to disk exactly, line endings are kept unchanged, and
    input that is not valid text in the platform encoding does not abort
    the split. Lines are delimited by the newline byte (``\\n``).

    Args:
        input_file (str): Path to the file that will be split into chunks.
        chunk_size_mb (int or float, optional): Maximum size of each chunk
            in MB. Defaults to 500.

    Raises:
        ValueError: If chunk_size_mb is not greater than zero.
        OSError: If the input cannot be read or a chunk cannot be written.
    """
    if chunk_size_mb <= 0:
        raise ValueError(f"chunk_size_mb must be greater than 0, got {chunk_size_mb!r}")

    chunk_size_bytes = chunk_size_mb * 1024 * 1024  # Convert MB to bytes

    # splitext always returns a (root, ext) pair; ext is '' when there is none.
    base_name, file_extension = os.path.splitext(input_file)

    current_chunk = 0
    current_size = 0
    output_file = None

    try:
        with open(input_file, 'rb') as infile:
            for line in infile:
                line_size = len(line)

                # Open a new chunk if none exists or the next line would
                # push the current chunk over the size limit.
                if output_file is None or current_size + line_size > chunk_size_bytes:
                    if output_file is not None:
                        output_file.close()
                    current_chunk += 1
                    output_filename = f"{base_name}_chunk{current_chunk:03d}{file_extension}"
                    output_file = open(output_filename, 'wb')
                    print(f"Created {output_filename}")
                    current_size = 0

                output_file.write(line)
                current_size += line_size
    finally:
        # Close the last chunk even when reading or writing failed midway.
        if output_file is not None:
            output_file.close()
def _positive_int(value):
    """Argparse type callable: parse value as an integer greater than zero."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number <= 0:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_arguments():
    """
    Parses command line arguments.

    A chunk size of zero or less is rejected with a regular argparse usage
    error, because such a size would put every input line in its own file.

    Returns:
        argparse.Namespace: The arguments passed to the command line, with
            the attributes ``input_file`` (str) and ``chunk_size_mb`` (int).
    """
    parser = argparse.ArgumentParser(description='Split a text file into smaller chunks but keep lines intact.')
    parser.add_argument('input_file', type=str, help='The path to the input file to be split.')
    parser.add_argument('--chunk_size_mb', type=_positive_int, default=500, help='Maximum chunk size in MB (default: 500).')
    return parser.parse_args()
def main():
    """Script entry point: read the command line and split the given file."""
    cli = parse_arguments()
    split_lines(input_file=cli.input_file, chunk_size_mb=cli.chunk_size_mb)
# Run the command line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()