I have the following code which I use to loop through row groups in a parquet metadata file to find the maximum values for columns i,j,k across the whole file. As far as I know I have to find the max value in each row group.
I am looking for:
- how to write it with at least two fewer levels of nesting
- in fewer lines in general
I tried to use a dictionary lambda combo as a switch statement in place of some of the if statements, and eliminate at least two levels of nesting, but I couldn't figure out how to do the greater than evaluation without nesting further.
import pyarrow.parquet as pq
def main():
    metafile = r'D:\my_parquet_meta_file.metadata'
    meta = pq.read_metadata(metafile)
    max_i = 0
    max_j = 0
    max_k = 0
    for grp in range(0, meta.num_row_groups):
        for col in range(0, meta.num_columns):
            # locate columns i,j,k
            if meta.row_group(grp).column(col).path_in_schema in ['i', 'j', 'k']:
                if meta.row_group(grp).column(col).path_in_schema == 'i':
                    if meta.row_group(grp).column(col).statistics.max > max_i:
                        max_i = meta.row_group(grp).column(col).statistics.max
                if meta.row_group(grp).column(col).path_in_schema == 'j':
                    if meta.row_group(grp).column(col).statistics.max > max_j:
                        max_j = meta.row_group(grp).column(col).statistics.max
                if meta.row_group(grp).column(col).path_in_schema == 'k':
                    if meta.row_group(grp).column(col).statistics.max > max_k:
                        max_k = meta.row_group(grp).column(col).statistics.max
    print('max i: ' + str(max_i), 'max j: ' + str(max_j), 'max k: ' + str(max_k))
if __name__ == '__main__':
    main()
 
     
    