After searching and experimenting with different packages and measuring the time each one needed to calculate the scores, I found NLTK's corpus_bleu and PyRouge the most efficient. Just keep in mind that each of my records has multiple hypotheses, which is why I first average the scores within each record and then average those per-record means over all records.
This is how I did it for BLEU:
import numpy as np
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

cc = SmoothingFunction()
reference = [[i.split() for i in ref]]  # `ref`: the list of reference verses (plain strings)

def find_my_bleu(text, w):
    # score one generated text against the shared reference list
    candidates_ = [text.split()]
    return corpus_bleu(reference, candidates_, weights=w,
                       smoothing_function=cc.method4)
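For example, with made-up verses (the strings here are purely illustrative):

ref = ['the sun sets over the sea', 'the sun goes down behind the waves']
reference = [[i.split() for i in ref]]
print(find_my_bleu('the sun sets behind the waves', (1, 0, 0, 0)))  # unigram BLEU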
def get_final_bleu(output_df):
    print('Started calculating the bleu scores...')
    # individual (non-cumulative) n-gram weights: bleu_n looks only at n-grams of order n
    weights = {'bleu_1': (1, 0, 0, 0), 'bleu_2': (0, 1, 0, 0), 'bleu_3': (0, 0, 1, 0)}
    for col, w in weights.items():
        output_df[col] = output_df['final_predicted_verses'].apply(
            lambda x, w=w: [find_my_bleu(t, w) for t in x])
    print('Now the average score...')
    for col in weights:
        output_df[col + '_mean'] = output_df[col].apply(np.mean)
        print('mean ' + col + ' score: ', output_df[col + '_mean'].mean())
    return output_df
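A quick end-to-end check with a made-up dataframe (the column name matches the one my data uses):

import pandas as pd

df = pd.DataFrame({'final_predicted_verses': [
    ['the sun sets behind the waves', 'the moon rises over the sea'],
    ['the sea glows under the setting sun'],
]})
df = get_final_bleu(df)  # adds the bleu_* columns and prints the averages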
For ROUGE:
from rouge_metric import PyRouge  # pip install rouge-metric

rouge = PyRouge(rouge_n=(1, 2), rouge_l=True, rouge_w=False, rouge_s=False, rouge_su=False)

# `reference_rouge` is the tokenized reference: one list of tokens per sentence
def find_my_rouge(text):
    hypotheses = [[text.split()]]  # one hypothesis made of a single sentence
    return rouge.evaluate_tokenized(hypotheses, [[reference_rouge]])
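The averaging functions below expect a `rouge` column holding one score dict per generated text. Something along these lines, mirroring the BLEU columns above, can fill it (`reference_rouge` is an assumed pre-tokenized reference):

reference_rouge = [r.split() for r in ref]  # tokenize each reference sentence
output_df['rouge'] = output_df['final_predicted_verses'].apply(
    lambda x: [find_my_rouge(t) for t in x])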
Then, to take the mean over all hypotheses of a record:
def get_short_rouge(list_dicts):
    """Average the ROUGE scores of all generated texts for one record."""
    length = len(list_dicts)
    return {m: {k: sum(d[m][k] for d in list_dicts) / length
                for k in ('r', 'p', 'f')}
            for m in ('rouge-1', 'rouge-2', 'rouge-l')}
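For instance, with two made-up score dicts:

scores = [{'rouge-1': {'r': 0.5, 'p': 0.4, 'f': 0.44},
           'rouge-2': {'r': 0.2, 'p': 0.1, 'f': 0.13},
           'rouge-l': {'r': 0.4, 'p': 0.3, 'f': 0.34}},
          {'rouge-1': {'r': 0.3, 'p': 0.2, 'f': 0.24},
           'rouge-2': {'r': 0.1, 'p': 0.1, 'f': 0.10},
           'rouge-l': {'r': 0.2, 'p': 0.1, 'f': 0.14}}]
print(get_short_rouge(scores))  # element-wise means of the two dicts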
def get_overall_rouge_mean(output_df):
    print('Started getting the overall rouge of each record...')
    output_df['rouge_mean'] = output_df['rouge'].apply(get_short_rouge)
    print('Started getting the overall rouge of all records...')
    # the per-record means have the same dict shape, so get_short_rouge can be reused
    print('overall rouge scores: ')
    print(get_short_rouge(list(output_df['rouge_mean'])))
    return output_df
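With all of that in place, the whole evaluation comes down to two calls (assuming the `rouge` column was filled as sketched above):

output_df = get_final_bleu(output_df)
output_df = get_overall_rouge_mean(output_df)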
I hope it helps anyone who's had this problem.