I have a working code with a for-loop that I would like to optimise in terms of speed (the project has been rejected because it's slow).
This is my situation: I have an Excel file from which I import a table with thousands of rows and a few columns into a Pandas DataFrame. The first column is a monotonic increasing series of timestamps, with 15 minutes frequency.
I have to use this table to calculate few extra columns, append them to the original table and save the resulting DataFrame into a new Excel file.
What is slowing the code is the core calculation of the extra columns. Here is a snippet of the working code:
import pandas as pd
from datetime import timedelta as td
cons_prod = pd.read_csv("sample.csv", index_col=0, parse_dates=True)
soc_dct = {}  # State of charge (kW)
charge_dct = {}  # Charge (kW)
discharge_dct = {}  # Discharge (kW)
acc_dct = {}  # Auto-consumption NEW (kW)
lst_dct = {}  # Lost injection due to battery efficiency (kW)
inj_dct = {}  # Injection NEW (kW)
gridcons_dct = {}  # Grid Consumption NEW (kW)
agg_dct = {}  # Additional Auto-consumption through battery (kW)
battery_parameters = {
    "power": 50,
    "energy": 130,
    "efficiency": 0.9,
    "minsoc": 0.1,
    "soct0": 65.0,
}
bp_energy = battery_parameters["energy"]
bp_power = battery_parameters["power"]
soct0 = 0.5 * bp_energy
for t in cons_prod.index:
    L = cons_prod.loc[t, "Injection (kW)"]
    m = cons_prod.loc[t, "Grid Consumption (kW)"]
    k = cons_prod.loc[t, "Auto-consumption (kW)"]
    f = cons_prod.loc[t, "Consumption (kW)"]
    if t == cons_prod.index[0]:
        # State of charge (kW)
        soc_dct.setdefault(t, soct0)
        # Charge (kW)
        charge_dct.setdefault(
            t,
            min(
                L,
                (bp_energy - soc_dct[t]) * 4,
                bp_power,
            )
            * battery_parameters["efficiency"]
            if L >= 0
            else 0,
        )
        # Discharge (kW)
        discharge_dct.setdefault(
            t,
            -min(
                m,
                4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
                bp_power,
            )
            if m >= 0
            else 0,
        )
        # Auto-consumption NEW (kW)
        acc_dct.setdefault(t, k - discharge_dct[t])
        # Lost injection due to battery efficiency (kW)
        lst_dct.setdefault(
            t,
            (charge_dct[t] / battery_parameters["efficiency"]) - charge_dct[t],
        )
        # Injection NEW (kW)
        inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
        # Grid Consumption NEW (kW)
        gridcons_dct.setdefault(t, f - acc_dct[t])
        # Additional Auto-consumption through battery (kW)
        agg_dct.setdefault(t, acc_dct[t] - k)
    else:
        # State of charge (kW)
        soc_dct.setdefault(
            t,
            soc_dct[t - td(minutes=15)]
            + (charge_dct[t - td(minutes=15)] + discharge_dct[t - td(minutes=15)]) / 4,
        )
        # Charge (kW)
        charge_dct.setdefault(
            t,
            min(
                L,
                (bp_energy - soc_dct[t]) * 4,
                bp_power,
            )
            * battery_parameters["efficiency"]
            if L >= 0
            else 0,
        )
        # Discharge (kW)
        discharge_dct.setdefault(
            t,
            -min(
                m,
                4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
                bp_power,
            )
            if m >= 0
            else 0,
        )
        # Auto-consumption NEW (kW)
        acc_dct.setdefault(t, k - discharge_dct[t])
        # Lost injection due to battery efficiency (kW)
        lst_dct.setdefault(
            t, charge_dct[t] / battery_parameters["efficiency"] - charge_dct[t]
        )
        # Injection NEW (kW)
        inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
        # Grid Consumption NEW (kW)
        gridcons_dct.setdefault(t, f - acc_dct[t])
        # Additional Auto-consumption through battery (kW)
        agg_dct.setdefault(t, acc_dct[t] - k)
# Creating a DataFrame with all the values
output_df = pd.DataFrame(
    data=[
        soc_dct,
        charge_dct,
        discharge_dct,
        acc_dct,
        lst_dct,
        inj_dct,
        gridcons_dct,
        agg_dct,
    ]
).T
output_df.columns = [
    "State of charge (kW)",
    "Charge (kW)",
    "Discharge (kW)",
    "Auto-consumption NEW (kW)",
    "Lost injection due to battery efficiency (kW)",
    "Injection NEW (kW)",
    "Grid Consumption NEW (kW)",
    "Additional Auto-consumption through battery (kW)",
]
charge_dct = {}  # Charge (kW)
discharge_dct = {}  # Discharge (kW)
acc_dct = {}  # Auto-consumption NEW (kW)
lst_dct = {}  # Lost injection due to battery efficiency (kW)
inj_dct = {}  # Injection NEW (kW)
gridcons_dct = {}  # Grid Consumption NEW (kW)
agg_dct = {}  # Additional Auto-consumption through battery (kW)
for t in cons_prod.index:
    L = cons_prod.loc[t, "Injection (kW)"]
    m = cons_prod.loc[t, "Grid Consumption (kW)"]
    k = cons_prod.loc[t, "Auto-consumption (kW)"]
    f = cons_prod.loc[t, "Consumption (kW)"]
    if t == cons_prod.index[0]:
        # State of charge (kW)
        soc_dct.setdefault(t, soct0)
        # Charge (kW)
        charge_dct.setdefault(
            t,
            min(
                L,
                (bp_energy - soc_dct[t]) * 4,
                bp_power,
            )
            * battery_parameters["efficiency"]
            if L >= 0
            else 0,
        )
        # Discharge (kW)
        discharge_dct.setdefault(
            t,
            -min(
                m,
                4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
                bp_power,
            )
            if m >= 0
            else 0,
        )
        # Auto-consumption NEW (kW)
        acc_dct.setdefault(t, k - discharge_dct[t])
        # Lost injection due to battery efficiency (kW)
        lst_dct.setdefault(
            t,
            (charge_dct[t] / battery_parameters["efficiency"]) - charge_dct[t],
        )
        # Injection NEW (kW)
        inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
        # Grid Consumption NEW (kW)
        gridcons_dct.setdefault(t, f - acc_dct[t])
        # Additional Auto-consumption through battery (kW)
        agg_dct.setdefault(t, acc_dct[t] - k)
    else:
        # State of charge (kW)
        soc_dct.setdefault(
            t,
            soc_dct[t - td(minutes=15)]
            + (charge_dct[t - td(minutes=15)] + discharge_dct[t - td(minutes=15)]) / 4,
        )
        # Charge (kW)
        charge_dct.setdefault(
            t,
            min(
                L,
                (bp_energy - soc_dct[t]) * 4,
                bp_power,
            )
            * battery_parameters["efficiency"]
            if L >= 0
            else 0,
        )
        # Discharge (kW)
        discharge_dct.setdefault(
            t,
            -min(
                m,
                4 * soc_dct[t] - 4 * (battery_parameters["minsoc"] * bp_energy),
                bp_power,
            )
            if m >= 0
            else 0,
        )
        # Auto-consumption NEW (kW)
        acc_dct.setdefault(t, k - discharge_dct[t])
        # Lost injection due to battery efficiency (kW)
        lst_dct.setdefault(
            t, charge_dct[t] / battery_parameters["efficiency"] - charge_dct[t]
        )
        # Injection NEW (kW)
        inj_dct.setdefault(t, L - charge_dct[t] - lst_dct[t])
        # Grid Consumption NEW (kW)
        gridcons_dct.setdefault(t, f - acc_dct[t])
        # Additional Auto-consumption through battery (kW)
        agg_dct.setdefault(t, acc_dct[t] - k)
# Creating a DataFrame with all the values
output_df = pd.DataFrame(
    data=[
        soc_dct,
        charge_dct,
        discharge_dct,
        acc_dct,
        lst_dct,
        inj_dct,
        gridcons_dct,
        agg_dct,
    ]
).T
output_df.columns = [
    "State of charge (kW)",
    "Charge (kW)",
    "Discharge (kW)",
    "Auto-consumption NEW (kW)",
    "Lost injection due to battery efficiency (kW)",
    "Injection NEW (kW)",
    "Grid Consumption NEW (kW)",
    "Additional Auto-consumption through battery (kW)",
]
cons_prod is the table imported into a DataFrame.
As you can see, we have two situations: when t == cons_prod.index[0] (i.e., the first item of the timestamps), the calculations use the values at the same t. However, from the second timestamp on, some of the calculations refer to the previous value (in here referred to the 15 minutes before using the index t - td(minutes=15)).
These are the reasons why I am struggling to move away from the for-loop.
A few explanations to possible questions
- Q: Why are you using dictionaries? A: Because I figured that they fill faster than other data types, and I can use them to create a DataFrame later on.
- Q: Are the timestamps coherent, e.g. do they have missing values? A: No missing values, as there is a previous function I wrote to make sure that the timestamps are filled completely.
- Q: The calculations do not currently refer to a previous row in a DataFrame, but in a dictionary! Why the misleading title? A: This is the best solution I could come up with so far, but I am wondering whether my not-so-complete knowledge of Pandas is hiding a simpler, faster solution.
Hope the framework is clear.
Thank you in advance!
EDIT: as per request, added a 100-lines sample of cons_prod and modified the previous code to meet the requirements of a MRE.
EDIT 2: I have tried moving from dictionaries to a Pandas lookup, trying to optimise as much as possible. This is the code I came up with:
from time import time as tt
cp = cons_prod.copy(deep=True)
# Initialise the columns filling them with zeroes
cp["State of charge (kW)"] = 0
cp["Charge (kW)"] = 0
cp["Discharge (kW)"] = 0
# Storing the position of the columns in variables
cp_soc = cp.columns.get_loc("State of charge (kW)")
cp_charge = cp.columns.get_loc("Charge (kW)")
cp_discharge = cp.columns.get_loc("Discharge (kW)")
cp_inj = cp.columns.get_loc("Injection (kW)")
cp_gridcons = cp.columns.get_loc("Grid Consumption (kW)")
# Storing the values of the battery dictionary lookups in variables
bp_energy = dct_bp["energy"]
bp_power = dct_bp["power"]
bp_efficiency = dct_bp["efficiency"]
bp_soct0 = dct_bp["soct0"]
bp_minsoc = dct_bp["minsoc"]
start1 = tt()  # Measuring time
for row in cp.itertuples(name=None):  # Using itertuples to gain some speed
    L = cp.loc[row[0], "Injection (kW)"]
    m = cp.loc[row[0], "Grid Consumption (kW)"]
    k = cp.loc[row[0], "Auto-consumption (kW)"]
    f = cp.loc[row[0], "Consumption (kW)"]
    if row[0] == cp.index[0]:
        cp.iloc[0, cp_soc] = bp_soct0
        cp.iloc[0, cp_charge] = float(
            min(L, (bp_energy - bp_soct0) * 4, bp_power) * bp_efficiency
            if L >= 0
            else 0,
        )
        cp.iloc[0, cp_discharge] = float(
            -min(
                m,
                4 * bp_soct0 - 4 * (bp_minsoc * bp_energy),
                bp_power,
            )
            if m >= 0
            else 0
        )
    else:
        t = pd.Index(cp.index).get_loc(row[0])
        cp.iloc[t, cp_soc] = float(
            cp.iloc[t - 1, cp_soc]
            + (cp.iloc[t - 1, cp_charge] + cp.iloc[t - 1, cp_discharge]) / 4
        )
        cp.iloc[t, cp_charge] = float(
            min(L, (bp_energy - cp.iloc[t, cp_soc]) * 4, bp_power) * bp_efficiency
            if L >= 0
            else 0,
        )
        cp.iloc[t, cp_discharge] = float(
            -min(
                m,
                4 * cp.iloc[t, cp_soc] - 4 * (dct_bp["minsoc"] * bp_energy),
                bp_power,
            )
            if m >= 0
            else 0
        )
end1 = tt() - start1
print(f"Pandas lookup took {end1:.2f} seconds")
With this code, I got an average of 42 seconds per complete task, while I used to have <20 seconds with dictionaries.
 
     
     
     
    