Add split_linewise.py
split_linewise.py splits large text files into smaller chunks (e.g. 500 MB) for uploading into Splunk, while making sure each line stays intact.
This commit is contained in:
61
tools/split_linewise.py
Executable file
61
tools/split_linewise.py
Executable file
@@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def split_lines(input_file, chunk_size_mb=500):
    """
    Splits a file into chunks of at most chunk_size_mb but preserves lines.

    Chunks are written next to the input file and are named
    ``<input stem>_chunk<NNN><input extension>`` with NNN starting at 001.
    The file is processed in binary mode, so the bytes of every line
    (including its line ending) are copied unchanged and the size accounting
    matches what ends up on disk, independent of the file's text encoding.
    Lines are delimited by the newline byte (``\\n``).

    A single line that is larger than the chunk size is never cut; it is
    written to a chunk of its own, which then exceeds the size limit.
    An empty input file produces no chunk files.

    Args:
        input_file (str): Path to the file that will be split into chunks.
        chunk_size_mb (int, optional): Size of each chunk in MB. Defaults to 500.
    """
    chunk_size_bytes = chunk_size_mb * 1024 * 1024  # Convert MB to bytes
    # splitext always returns a (root, ext) pair; ext is '' when there is none.
    base_name, file_extension = os.path.splitext(input_file)
    current_chunk = 0
    current_size = 0
    output_file = None

    # Binary mode: no decoding errors on non-UTF-8 data, no newline
    # translation, and len(line) is the exact number of bytes written.
    with open(input_file, 'rb') as infile:
        try:
            for line in infile:
                line_size = len(line)
                # Open new file if none exists or next line exceeds chunk size
                if output_file is None or current_size + line_size > chunk_size_bytes:
                    if output_file is not None:
                        output_file.close()
                    current_chunk += 1
                    output_filename = f"{base_name}_chunk{current_chunk:03d}{file_extension}"
                    output_file = open(output_filename, 'wb')
                    print(f"Created {output_filename}")
                    current_size = 0
                output_file.write(line)
                current_size += line_size
        finally:
            # Close the last chunk even if reading or writing failed midway.
            if output_file is not None:
                output_file.close()
|
||||||
|
|
||||||
|
def parse_arguments(argv=None):
    """
    Parses command line arguments.

    Args:
        argv (list[str], optional): Argument strings to parse instead of the
            process command line. Defaults to None, in which case argparse
            reads the arguments from sys.argv[1:].

    Returns:
        argparse.Namespace: The parsed arguments, with the attributes
        ``input_file`` (str) and ``chunk_size_mb`` (int, default 500).
    """
    parser = argparse.ArgumentParser(description='Split a text file into smaller chunks but keep lines intakt.')
    parser.add_argument('input_file', type=str, help='The path to the input file to be split.')
    parser.add_argument('--chunk_size_mb', type=int, default=500, help='Maximum chunk size in MB (default: 500).')
    return parser.parse_args(argv)
|
||||||
|
|
||||||
|
def main():
    """
    Script entry point: reads the command line and splits the given file.
    """
    cli_options = parse_arguments()
    source_path = cli_options.input_file
    size_limit_mb = cli_options.chunk_size_mb
    split_lines(source_path, size_limit_mb)
|
||||||
|
|
||||||
|
# Run the splitter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user