Add split_linewise.py
split_linewise.py splits large text files into smaller chunks (e.g. 500 MB) for uploading into Splunk, while making sure each line stays intact.
This commit is contained in:
61
tools/split_linewise.py
Executable file
61
tools/split_linewise.py
Executable file
@@ -0,0 +1,61 @@
|
||||
#!/usr/bin/python3
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
def split_lines(input_file, chunk_size_mb=500):
    """
    Splits a file into chunks of at most chunk_size_mb but preserves lines.

    The chunks are written next to the input file and are named
    ``<root>_chunk<NNN><ext>``, where ``<root>`` and ``<ext>`` come from
    ``os.path.splitext(input_file)`` and ``NNN`` is a zero-padded counter
    starting at 001. A single line that is larger than the chunk size is
    never broken up; it ends up in a chunk of its own that exceeds the limit.
    An empty input file produces no chunks.

    The file is processed as raw bytes: lines end at ``\\n``, line endings
    are copied unchanged, chunk sizes are exact on-disk sizes and input that
    is not valid UTF-8 does not abort the split.

    Args:
        input_file (str): Path to the file that will be split into chunks.
        chunk_size_mb (int, optional): Maximum size of each chunk in MB
            (1 MB = 1024 * 1024 bytes). Defaults to 500.

    Raises:
        ValueError: If chunk_size_mb is not greater than zero.
        OSError: If the input file cannot be read or a chunk cannot be written.
    """
    # A limit of zero or less would silently create one file per line.
    if chunk_size_mb <= 0:
        raise ValueError(f"chunk_size_mb must be greater than zero, got {chunk_size_mb!r}")

    chunk_size_bytes = chunk_size_mb * 1024 * 1024  # Convert MB to bytes

    # splitext always returns a (root, ext) pair; ext is '' when there is none
    base_name, file_extension = os.path.splitext(input_file)

    current_chunk = 1
    current_size = 0
    output_file = None

    # Binary mode keeps the byte count exact and avoids decode errors
    with open(input_file, 'rb') as infile:
        try:
            for line in infile:
                line_size = len(line)
                # Open new file if none exists or next line exceeds chunk size
                if output_file is None or current_size + line_size > chunk_size_bytes:
                    if output_file is not None:
                        output_file.close()
                    output_filename = f"{base_name}_chunk{current_chunk:03d}{file_extension}"
                    output_file = open(output_filename, 'wb')
                    print(f"Created {output_filename}")
                    current_chunk += 1
                    current_size = 0
                output_file.write(line)
                current_size += line_size
        finally:
            # Close the last chunk even if reading or writing failed midway
            if output_file is not None:
                output_file.close()
|
||||
|
||||
def parse_arguments(argv=None):
    """
    Parses command line arguments.

    Args:
        argv (list[str], optional): Argument strings to parse instead of the
            process command line. Defaults to None, in which case argparse
            reads sys.argv[1:].

    Returns:
        argparse.Namespace: The parsed arguments with the attributes
            input_file (str) and chunk_size_mb (int, at least 1).
    """
    def positive_int(value):
        """Convert value to int and reject anything smaller than 1."""
        number = int(value)
        if number < 1:
            raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
        return number

    parser = argparse.ArgumentParser(description='Split a text file into smaller chunks but keep lines intact.')
    parser.add_argument('input_file', type=str, help='The path to the input file to be split.')
    parser.add_argument('--chunk_size_mb', type=positive_int, default=500, help='Maximum chunk size in MB (default: 500).')
    return parser.parse_args(argv)
|
||||
|
||||
def main():
    """
    Script entry point.

    Reads the command line options and splits the requested input file
    using the requested chunk size.
    """
    options = parse_arguments()
    split_lines(input_file=options.input_file, chunk_size_mb=options.chunk_size_mb)
|
||||
|
||||
# Run the command line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user