All,
I am using pandas `groupby.apply` to run my own custom function on each group. However, I have noticed that the function is very slow. Can someone help me convert this code so that it runs on Spark DataFrames?
Here is a simple example for people to use:
import pandas as pd
import operator
# Toy data set: one row per recording, keyed by instrument and "ser".
df = pd.DataFrame(
    dict(
        Instruments=['A', 'B', 'A', 'B', 'A', 'C', 'C', 'B'],
        Sers=['Wind', 'Tool', 'Wind', 'Wind', 'Tool', 'Tool', 'Tool', 'Wind'],
        Sounds=[42, 21, 34, 56, 43, 61, 24, 23],
    )
)
def get_stats(data_frame):
    """Summarise the ``Sounds`` of one group, broken down by ``Sers``.

    Rows whose ``Sounds`` lie above the frame's 99th percentile are dropped
    first. The ``Sers`` with the largest share of the remaining rows becomes
    the reference, and every distinct ``Sers`` is compared against it.

    Args:
        data_frame: Frame providing the ``Sers`` and ``Sounds`` columns.

    Returns:
        A ``pd.Series`` with five entries: ``Cools`` (list of distinct
        ``Sers``), ``Chosen_Sers`` (the reference ``Sers``),
        ``Average_Sounds_99_Percent`` (mean ``Sounds`` per ``Sers``),
        ``Mean_Ratios`` (reference mean divided by each mean) and
        ``Percent_Differences`` (absolute difference over the midpoint of
        the two means, rounded to 2 decimals and scaled by 100).
    """
    sounds = data_frame.Sounds
    trimmed = data_frame[sounds <= sounds.quantile(.99)]

    def mean_sound(label):
        # Average Sounds over the trimmed rows carrying this Sers label.
        return trimmed.loc[trimmed["Sers"] == label].Sounds.mean()

    # Fraction of trimmed rows per Sers; the largest fraction is the reference.
    shares = (trimmed.Sers.value_counts() / trimmed.shape[0]).to_dict()
    reference_sers = max(shares.items(), key=lambda pair: pair[1])[0]
    reference_mean = mean_sound(reference_sers)

    labels = list(trimmed.Sers.unique())
    per_label_means = [mean_sound(label) for label in labels]
    reference_ratios = [reference_mean / value for value in per_label_means]
    percent_diffs = [
        float(
            round(
                abs(reference_mean - value) / ((reference_mean + value) / 2),
                2,
            )
            * 100
        )
        for value in per_label_means
    ]

    return pd.Series({
        'Cools': labels,
        'Chosen_Sers': reference_sers,
        'Average_Sounds_99_Percent': per_label_means,
        'Mean_Ratios': reference_ratios,
        'Percent_Differences': percent_diffs
    })
I call the function as follows in pandas:
# Split df by its 'Instruments' column and call get_stats on each group's sub-frame.
df.groupby('Instruments').apply(get_stats)
 
     
     
    