split_linewise.py splits large text files into smaller chunks (e.g. 500 MB) for uploading into Splunk, while making sure each line stays intact
62 lines
2.1 KiB
Python
Executable File
62 lines
2.1 KiB
Python
Executable File
#!/usr/bin/python3
|
|
import os
|
|
import sys
|
|
import argparse
|
|
|
|
|
|
def split_lines(input_file, chunk_size_mb=500):
    """
    Splits a file into chunks of at most chunk_size_mb but preserves lines.

    Chunks are written next to the input file as ``<root>_chunk001<ext>``,
    ``<root>_chunk002<ext>``, ... where ``<root>`` and ``<ext>`` come from
    ``os.path.splitext(input_file)``. The name of every chunk is printed
    when it is created.

    The file is processed as raw bytes and lines are delimited by ``\\n``,
    so concatenating the chunks in order reproduces the input byte for byte
    (line endings and text encoding are left untouched). A line is never
    split: a single line larger than the limit is written alone into a chunk
    that exceeds the limit. An empty input file produces no chunks.

    Args:
        input_file (str): Path to the file that will be split into chunks.
        chunk_size_mb (int, optional): Maximum size of each chunk in MB.
            Defaults to 500.

    Raises:
        ValueError: If chunk_size_mb is not greater than zero.
    """
    if chunk_size_mb <= 0:
        # With a non-positive limit every line would exceed it and end up
        # in its own file, which is never what the caller wants.
        raise ValueError("chunk_size_mb must be greater than zero")

    chunk_size_bytes = chunk_size_mb * 1024 * 1024  # Convert MB to bytes
    current_chunk = 1
    current_size = 0
    output_file = None

    # splitext always returns a (root, ext) pair; ext is '' if there is none.
    base_name, file_extension = os.path.splitext(input_file)

    try:
        # Binary mode: sizes are exact on-disk byte counts and no newline or
        # encoding translation can alter the data.
        with open(input_file, 'rb') as infile:
            for line in infile:
                line_size = len(line)
                # Open new file if none exists or next line exceeds chunk size
                if output_file is None or current_size + line_size > chunk_size_bytes:
                    if output_file is not None:
                        output_file.close()
                    output_filename = f"{base_name}_chunk{current_chunk:03d}{file_extension}"
                    print(f"Created {output_filename}")
                    output_file = open(output_filename, 'wb')
                    current_chunk += 1
                    current_size = 0
                output_file.write(line)
                current_size += line_size
    finally:
        # Close the last chunk even when reading or writing failed midway.
        if output_file is not None:
            output_file.close()
|
|
|
|
def parse_arguments(argv=None):
    """
    Parses command line arguments.

    Args:
        argv (list[str], optional): Argument strings to parse. Defaults to
            None, in which case argparse reads them from sys.argv[1:].

    Returns:
        argparse.Namespace: The arguments passed to the command line, with
            the attributes ``input_file`` (str) and ``chunk_size_mb`` (int).
    """
    def positive_int(text):
        # A chunk size of zero or less is never a meaningful limit, so
        # reject it here with a regular argparse usage error.
        value = int(text)
        if value <= 0:
            raise argparse.ArgumentTypeError(f"must be a positive integer, got {text!r}")
        return value

    parser = argparse.ArgumentParser(description='Split a text file into smaller chunks but keep lines intakt.')
    parser.add_argument('input_file', type=str, help='The path to the input file to be split.')
    parser.add_argument('--chunk_size_mb', type=positive_int, default=500, help='Maximum chunk size in MB (default: 500).')
    return parser.parse_args(argv)
|
|
|
|
def main():
    """Script entry point: read the command line and split the given file."""
    options = parse_arguments()
    source_path, size_mb = options.input_file, options.chunk_size_mb
    split_lines(source_path, size_mb)
|
|
|
|
# Run the splitter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|