I currently have about 1,500 .bin.txt files that I am analysing with Excel Power Query; however, loading the data is very slow (15+ minutes), so I decided to write a short Python script to combine all the bin files first and then read the combined file in Power Query.
I have the following script, but concatenating all these files takes over 20 minutes, and the full dataset will be 10 or more times larger than these 1,500 files. Is there any way to speed this up?
import os
import tkinter as tk
from tkinter import filedialog

import pandas as pd

def combineBinFiles():
    root = tk.Tk()
    root.withdraw()
    #Get bin files folder
    folder_selected = filedialog.askdirectory()
    print(folder_selected)
    os.chdir(folder_selected)
    files = os.listdir(folder_selected)
    print(files)
    df = pd.DataFrame()
    temp_df = pd.DataFrame(columns= ['Timestamp', 'Wind speed', 'Own consumption'])
    for file in files:
        if file.endswith('.bin.txt'):
            print("Reading file: " + file)
            #Format file based on space delimiter
            temp_df = pd.read_csv(file, delimiter=" ", header=0)
            #Extract date from column name
            date = temp_df.columns[0] 
            #Concatenate date to the beginning of each timestamp before adding it to the dataframe
            temp_df[date] = temp_df[date].apply(lambda x: date + ' ' + x)
            df = pd.concat([df, temp_df], axis=0, ignore_index=True)
            
    df.to_csv('combinedBinFile.csv', index=False)
    print(df)
combineBinFiles()
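From what I can tell, most of the time goes into the pd.concat inside the loop, which re-copies the growing dataframe on every iteration. A minimal sketch of the usual fix, collecting the per-file frames in a list and concatenating once at the end (the threaded version further down already does this; the helper name here is just a placeholder):

import os

import pandas as pd

def combine_bin_files_once(folder):
    """Collect one dataframe per file, then concatenate a single time."""
    frames = []
    for file in os.listdir(folder):
        if file.endswith('.bin.txt'):
            temp_df = pd.read_csv(os.path.join(folder, file), delimiter=" ", header=0)
            date = temp_df.columns[0]
            temp_df[date] = temp_df[date].apply(lambda x: date + ' ' + x)
            frames.append(temp_df)
    # One concat over the whole list instead of one concat per file
    return pd.concat(frames, axis=0, ignore_index=True)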
The files are formatted as: Timestamp, Wind speed, and Consumption. Each file has the date as the header of the timestamp column, and the rest of that column holds only the time of day (without the date), so in the code I concatenate the date to each time value before adding it to the overall dataframe.
Edit: Bin files example; each file looks like this, just with different data and dates.
14_07_2023  .WindSpeed  .Power  
17 50 00 006    10,53   0   
17 50 00 016    10,53   0   
17 50 00 026    10,53   0   
17 50 00 036    10,53   0   
17 50 00 046    10,53   0   
17 50 00 056    10,53   0   
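The per-row apply for the date prefix could presumably also be replaced with plain Series string concatenation, which pandas vectorizes. A rough sketch (the helper name is hypothetical; it assumes the first column holds the time strings as text):

def prefix_date(temp_df):
    """Prepend the date (taken from the first column's header) to every time value."""
    date = temp_df.columns[0]
    # String broadcasting over the whole column instead of a Python-level apply
    temp_df[date] = date + ' ' + temp_df[date]
    return temp_df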
Modified code with multithreading and chunked CSV writing:
import queue
import threading

def worker(q, df_list):
    while not q.empty():
        file = q.get()
        if file.endswith('.bin.txt'):
            print("Reading file: " + file)
            temp_df = pd.read_csv(file, delimiter="\t", header=0, engine='python')
            date = temp_df.columns[0]
            #Concatenate date to the beginning of each timestamp
            temp_df[date] = temp_df[date].apply(lambda x: date + ' ' + x)
            #Delete the last (empty) column:
            temp_df = temp_df.iloc[:, :-1]
            temp_df.columns = ['Timestamp', 'Wind speed', 'Own consumption']
            df_list.append(temp_df)
        q.task_done()
def combineBinFilesThreaded():
    root = tk.Tk()
    root.withdraw()
    folder_selected = filedialog.askdirectory()
    print(folder_selected)
    os.chdir(folder_selected)
    files = os.listdir(folder_selected)
    print(files)
    df_list = []
    q = queue.Queue()
    # Put the files in the queue before starting the workers,
    # so no worker sees an empty queue and exits early
    for file in files:
        q.put(file)
    # Create 4 worker threads
    for i in range(4):
        t = threading.Thread(target=worker, args=(q, df_list))
        t.daemon = True
        t.start()
    # Wait for all the tasks in the queue to be processed
    q.join()
    print("Done joining")
    # Combine all the dataframes in one concat call
    df = pd.concat(df_list, axis=0, ignore_index=True)
    print("Done concatenating")
    chunksize = 100000
    # Remove an existing combinedBinFile so a rerun doesn't append to itself
    if os.path.exists('combinedBinFile.csv'):
        os.remove('combinedBinFile.csv')

    for i in range(0, len(df), chunksize):
        print("Writing chunk: " + str(i))
        # Only write the header with the first chunk
        df.iloc[i:i+chunksize].to_csv('combinedBinFile.csv', index=False, mode='a',
                                      header=(i == 0))
    print("Done writing")

combineBinFilesThreaded()