I have a large program where I am trying to read approximately 30,000 lines of data from a .csv and process them. I know that I can use the chunksize functionality of pandas to do this, but I don't think I am using it effectively. I have attempted some other solutions to no avail.
A simplified version of my code:
import itertools
import pandas as pd

all_combos = []  # appending all combos into a list
alpha_path = r'alpha_out.csv'
chunksize = 100  # read the .csv in chunks to prevent memory excess
tfr = pd.read_csv(alpha_path, chunksize=chunksize, iterator=True)
alpha_out = pd.concat(tfr, ignore_index=True)  # concatenating rebuilds the full DataFrame in memory
alpha_no_x3 = alpha_out.values.tolist()  # formatted to a list of rows
# mass shifts (Da) for each PTM, with the residues they apply to
Hydroxylation = 16  # K, N, P
Carboxylation = 44  # K, D, E
Phosphorylation = 80  # S, T, Y
Acetylation = 42  # K, X @ N-term
Lactylation = 71  # K
Formylation = 28  # K, X @ N-term
Methylation = 14  # K, R, X @ C-term
Dimethylation = 28  # K, R
Trimethylation = 42  # K
Sulfonation = 80  # Y, T, S
Citrullination = 31  # R
Nitrosylation = 47  # Y
Butyrylation = 70  # K
Crotonylation = 68  # K
Glutarylation = 114  # K
Hydroxybutyrylation = 87  # K
Malonylation = 125  # K
Succinylation = 100  # K
Glu_to_PyroGlu = 17  # Q, E
Amidation = -1  # X @ C-term
Deamidation = 1  # N, Q
Oxidation_or_Hydroxylation = 16  # W, H, M
Sodium_adduct = 22  # D, E, X @ C-term
Dihydroxy = 32  # M
S_carbamoylmethylcysteine_cyclization = 40  # C @ N-term
Carbamylation = 43  # K, X @ N-term
Ethanolation = 44  # C
Beta_methylthiolation = 46  # C
Iodoacetamide_derivative = 57  # C
Iodoacetic_acid_derivative = 58  # C
Acrylamide_adduct = 71  # C
N_isopropylcarboxamidomethyl = 99  # C
S_pyridylethylation = 105  # C
Hexose = 162  # S, T
N_Acetylhexosamine = 203  # N
Myristoylation = 210  # K, C, G
Biotinylation = 226  # K, X @ N-term
no_mod = 0  # allows for no modification to be present

for row in alpha_no_x3:  # each row is a list of sequence strings
    #lysine combinations
    for seq in row:  # renamed so the inner loop no longer shadows the outer loop variable
        k_instances = seq.count('K')
        print(k_instances)  # debug: number of lysines in this sequence
        k_modifications = [
            Hydroxylation, Carboxylation, Acetylation, Lactylation, Formylation,
            Methylation, Dimethylation, Trimethylation, Butyrylation, Crotonylation,
            Glutarylation, Hydroxybutyrylation, Malonylation, Succinylation,
            Sodium_adduct, Carbamylation, Myristoylation, Biotinylation,
            no_mod,
        ]
        k_combinations = itertools.combinations_with_replacement(k_modifications, k_instances)
        k_comb_sum_list = [sum(x) for x in k_combinations]  # one total mass shift per combination
   
        ptm_list = []
        if k_instances != 0:
            ptm_list.append(k_comb_sum_list)  # makes sure each AA is accounted for in mass

        combos = itertools.product(*ptm_list)
        combos_comb_sum_list = [sum(x) for x in combos]  # total mass shift per combination
        all_combos.append(combos_comb_sum_list)
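
To show the shape of what the inner step produces: with a cut-down, made-up modification list (not my real table, just for illustration) and a sequence containing two lysines, the combination sums come out like this:

import itertools

mods = [42, 14, 0]  # e.g. Acetylation, Methylation, no_mod
combos = itertools.combinations_with_replacement(mods, 2)  # two lysines
print([sum(c) for c in combos])
# [84, 56, 42, 28, 14, 0]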
This is the explanation I have been consulting: "Lazy Method for Reading Big File in Python?"
If I can work out where to nest my processing loop inside the chunked reads, I think that would get me there.
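
What I think I need is something like the sketch below: do the per-row work inside the chunk loop instead of concatenating everything first, so only `chunksize` rows of the CSV are in memory at a time. `process_row` is a hypothetical stand-in for the combination logic above (I've collapsed the ptm_list/product step into the lysine sums, since only lysine modifications are considered here), and the three-element modification list is a placeholder for the full one.

import itertools
import pandas as pd

def process_row(row, k_modifications):
    # stand-in for the per-sequence combination logic above
    results = []
    for seq in row:
        k_instances = seq.count('K')
        combos = itertools.combinations_with_replacement(k_modifications, k_instances)
        results.append([sum(c) for c in combos])
    return results

all_combos = []
k_modifications = [42, 14, 0]  # placeholder; the full list from above goes here

# processing is nested inside the chunk loop, so only `chunksize`
# rows of the CSV are held in memory at once
for chunk in pd.read_csv(r'alpha_out.csv', chunksize=100):
    for row in chunk.values.tolist():
        all_combos.extend(process_row(row, k_modifications))

Is this the right structure, or am I missing something about how chunksize is meant to be used? (I realize the combination lists themselves can still grow large, but the CSV reading is what I'm asking about.)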
