I want to read a large .txt file (c.2.5GB) in chunks and then perform some operations before loading into the database.
The file has only 2 columns (the column delimiter is ¬) and values are qualified with double quotes. Values in the second column can span multiple lines (sample below). I considered using this answer, but the problem is that it may process incomplete records, since it splits on a preset chunk size rather than on record boundaries. Can someone please help? I've included the sample data and code below.
Sample Data (Sample_load_file.txt)
"LINE_ID"¬"LINE_TEXT"
"C1111-G00-BC222"¬"this line is
split into
multiple lines
% All needs to be read into 1 line
% Currently that's not happening
"
"C22-f0-333"¬"2nd row. This line is
split into
multiple lines
% All needs to be read into 1 line
% Currently that's not happening
  *******************************************************************
  This line also includes the column delimiter within text qualifier
  *******************************************************************
  # !¬!¬!¬|
"
Code
import os
from datetime import datetime

import pandas as pd

from dbconnection import DBConnection
# Absolute path of the ¬-delimited extract file to load.
path = r'C:\Sample_load_file.txt'
# Connection to the target database (server/database names redacted).
db = DBConnection(server ='XXXX', database='XXXX')
def read_in_chunks(file_object, chunk_size=1024):
    """Lazily yield successive pieces of *file_object*.

    Reads at most ``chunk_size`` characters at a time (default 1 KiB)
    so the whole file never has to sit in memory at once.  Iteration
    stops when the file is exhausted.
    """
    chunk = file_object.read(chunk_size)
    while chunk:
        yield chunk
        chunk = file_object.read(chunk_size)
        
def process_chunk(data):
    """Parse one chunk of the ¬-delimited, quote-qualified extract and
    append the rows to the target SQL table.

    The original signature ``def process_chunk(data=piece)`` raised a
    NameError at definition time because ``piece`` does not exist yet;
    the parameter now has no default.

    NOTE(review): this assumes every chunk begins with the header row and
    that no record is split across a chunk boundary — confirm with the
    caller, since raw ``read()`` chunks do not respect record boundaries.
    """
    # Records are separated by '"\n"' (closing quote, newline, opening
    # quote) so that newlines *inside* a quoted value do not split a record.
    rows = data.split('"\n"')

    # Columns are separated by '"¬"'; a bare ¬ inside quoted text survives.
    rows = [row.split('"¬"') for row in rows]

    # Strip the remaining text-qualifier double quotes.
    rows = [[field.replace('"', '') for field in row] for field_row in () or rows for field in () or [field_row]] if False else [
        [field.replace('"', '') for field in row] for row in rows
    ]

    df = pd.DataFrame(rows)

    # Promote the first row to the header and drop it from the data.
    df.columns = df.iloc[0]
    df = df.iloc[1:].reset_index(drop=True)

    # If the header failed to split (one combined column), split it now.
    # Guarded with a membership test instead of a bare ``except:`` — the
    # original also ran ``del df[...]`` *outside* the try, which raised an
    # uncaught KeyError whenever the header had split normally.
    combined = 'LINE_ID¬LINE_TEXT'
    if combined in df.columns:
        df[['LINE_ID', 'LINE_TEXT']] = df[combined].str.split('¬', expand=True)
        del df[combined]

    # Audit metadata.
    df['loaded_by'] = 'XXXX'
    df['file_line_number'] = range(2, len(df) + 2)
    # pd.datetime was removed in pandas 1.0 — use the stdlib datetime.
    df['load_date'] = datetime.now()
    df['source_file'] = path
    df['loading_script'] = r'Load_Extracts.ipynb'

    # Append this chunk's rows to the target table.
    df.to_sql('SQL_table_name', db.engine, schema='dbo', index=False, if_exists='append')
    
# Stream the extract through the chunked reader; each piece is parsed and
# appended to the database.  The original called ``process_data``, which is
# not defined anywhere in this file — the defined function is ``process_chunk``.
with open(path) as f:
    for piece in read_in_chunks(f):
        process_chunk(piece)