I am facing a memory issue with an R script of mine that I cannot understand. In short, the script takes as input a list of paths to data tables and the script builds one big data table (about 10k rows and 2k columns) by aggregating iteratively each input table.
I tried to run it locally on macOS (Mojave, Memory: 16 GB 2133 MHz LPDDR3), but I run out of memory before the script ends. However, the complete aggregated table should not exceed 500 MB, and I don't have many intermediate variables that would fill up the memory.
I used the pryr and profvis R packages to profile the memory usage along the script. Here is some of the output:
-INFO MEMORY: total memory usage is 223.99 MB
-INFO: adding DNA Mutation counts per pathway from table count_by_pathway_cpad_kegg_DNA_non_synonymous_tcga.tsv (8974,42) ... done!
-INFO: adding DNA Mutation counts per pathway from table count_by_pathway_sanchez_vega_DNA_non_synonymous_tcga.tsv (8974,12) ... done!
-INFO: adding DNA Mutation counts per pathway from table count_by_pathway_msigdb_hallmarks_DNA_non_synonymous_tcga.tsv (8974,52) ... done!
-INFO MEMORY: size of dfs is 81.79 MB
-INFO MEMORY: total memory usage is 253.44 MB
-INFO: adding MUT counts total from table count_total_DNA_all_tcga.tsv (8974,3) ... done!
-INFO MEMORY: size of dfs is 81.97 MB
-INFO MEMORY: total memory usage is 260.74 MB
-INFO: adding DNA Alteration counts per gene from table count_by_gene_DNA_annotated_tcga.tsv (3630,373) ... done!
-INFO MEMORY: size of dfs is 130.17 MB
-INFO MEMORY: total memory usage is 322.26 MB
-INFO: adding DNA Alteration counts per pathway from table count_by_pathway_cpad_kegg_DNA_annotated_tcga.tsv (3630,123) ... done!
-INFO: adding DNA Alteration counts per pathway from table count_by_pathway_sanchez_vega_DNA_annotated_tcga.tsv (3630,24) ... done!
-INFO: adding DNA Alteration counts per pathway from table count_by_pathway_msigdb_hallmarks_DNA_annotated_tcga.tsv (3630,137) ... done!
-INFO MEMORY: size of dfs is 166.64 MB
-INFO MEMORY: total memory usage is 400.82 MB
Here is the code for the last function I called
# Add the per-pathway DNA alteration count tables listed in
# args$dna_alt_counts_pathway to dfs (args and dfs are defined elsewhere in the script).
dfs <- add_counts(args$dna_alt_counts_pathway, dfs, agg="pathway", evt_type="Alteration", data_type="DNA", cohort=args$cohort)
#' Append count (and 0/1 status) columns from a set of count tables to `dfs`.
#'
#' Each file in `filepaths` is read with `load_table()`, keyed on a sample or
#' subject identifier column, and left-joined onto `dfs$dat`. One metadata row
#' per added column is appended to `dfs$cov`. Unless `agg` is "total", a
#' "status" copy of the table (numeric columns turned into 0/1 via
#' `as.integer(as.logical(x))`) is joined and described as well.
#'
#' Tables are loaded one at a time inside the loop rather than all upfront, so
#' only one input table is held alongside `dfs` at any moment.
#'
#' @param filepaths Paths of the count tables. File names are parsed with
#'   regular expressions built from `agg`, `data_type` and `cohort` to derive
#'   the `Class_Lvl_3` label and the column-name suffix.
#' @param dfs List with elements `dat` (joined table) and `cov` (covariate
#'   metadata table).
#' @param agg Aggregation level; "pathway" triggers extraction of the pathway
#'   database name from the file name, "total" skips the status columns.
#' @param data_type Data type label; "RNA" selects the `Sample_Id_RNA_T` join
#'   key, anything else `Sample_Id_DNA_T`, when the table is keyed on a plain
#'   sample identifier.
#' @param evt_type Event type label used in `Class_Lvl_2`.
#' @param cohort Cohort name expected at the end of the file name stem.
#' @return `dfs` with updated `dat` and `cov`; returned unchanged when
#'   `filepaths` is empty.
add_counts <- function(filepaths, dfs, agg="gene", data_type="DNA", evt_type="Alteration", cohort="prism"){
  n_files <- length(filepaths)
  if (n_files == 0) return(dfs)

  if (agg %in% c("pathway")){
    pattern_1 <- paste0("(?<=count_by_", agg, "_)[a-z0-9A-Z\\_\\-]+(?=_", data_type, ")")
    level_names_1 <- vapply(filepaths, function(s) str_extract(s, pattern_1), character(1),
                            USE.NAMES=FALSE)
  } else {
    # One label per file, so that level_names_1[[i]] is valid for every i.
    level_names_1 <- rep(agg, n_files)
  }
  pattern_2 <- paste0("(?<=_", data_type, "_)[a-z0-9A-Z\\_\\-]+(?=_", cohort, ")")
  level_names_2 <- vapply(filepaths, function(s) str_extract(s, pattern_2), character(1),
                          USE.NAMES=FALSE)

  col_tsb <- "Tumor_Sample_Barcode"
  col_nsb <- "Matched_Norm_Sample_Barcode"
  col_psb <- "Sample_Id_DNA_P"
  # Status columns are only joined when agg is not "total"; decide once.
  add_status <- !agg %in% c("total")

  for (i in seq_along(filepaths)){
    # Load lazily: only the table being processed is kept in memory.
    df_dat_cnt <- load_table(filepaths[[i]])
    level_1 <- level_names_1[[i]]
    level_2 <- level_names_2[[i]]

    dat_size <- paste0("(", nrow(df_dat_cnt), ",", ncol(df_dat_cnt), ")")
    cat(paste("-INFO: adding", data_type, evt_type, "counts per", agg, "from table", basename(filepaths[[i]]), 
              dat_size, "..."))

    # Tumor/normal pairs are merged into a single pair identifier.
    if (all(c(col_tsb, col_nsb) %in% colnames(df_dat_cnt))){
      df_dat_cnt <- df_dat_cnt %>% unite(!!col_psb, all_of(c(col_tsb, col_nsb)), sep="_vs_")
    }

    # First available identifier column, in order of preference.
    col_row <- intersect(c(col_psb, "Tumor_Sample_Barcode", "Sample_Id", "Subject_Id"), colnames(df_dat_cnt))[1]
    if (is.na(col_row)){
      stop("no sample or subject identifier column found in ", basename(filepaths[[i]]),
           call.=FALSE)
    }
    df_dat_cnt <- df_dat_cnt %>% column_to_rownames(var=col_row)
    plot_names <- colnames(df_dat_cnt)

    # Identifiers other than Subject_Id or the pair id are joined on the
    # tumor sample id column matching the data type.
    if (!col_row %in% c("Subject_Id", col_psb)){
      if (data_type=="RNA"){
        col_row <- "Sample_Id_RNA_T"
      } else {
        col_row <- "Sample_Id_DNA_T"
      }
    }

    if (add_status){
      # Built before the count columns are renamed; skipped entirely for
      # "total" tables, where it would be discarded.
      df_dat_sts <- df_dat_cnt %>% mutate_if(is.numeric, function(x) as.integer(as.logical(x)))
      colnames(df_dat_sts) <- paste0(colnames(df_dat_sts), "_", data_type, "_status_", level_2)
      df_cov_sts <- data.frame(Covariate=colnames(df_dat_sts), Plot_Name=plot_names) %>% 
        mutate(Nature="Binary", Class_Lvl_1=data_type,
               Class_Lvl_2=paste0(evt_type, "_Status_", str_to_title(agg)), 
               Class_Lvl_3=paste0(level_1, "_", level_2))
      df_dat_sts <- df_dat_sts %>% as.data.frame() %>% rownames_to_column(var=col_row)
    }

    colnames(df_dat_cnt) <- paste0(colnames(df_dat_cnt), "_", data_type, "_count_", level_2)
    df_cov_cnt <- data.frame(Covariate=colnames(df_dat_cnt), Plot_Name=plot_names) %>% 
      mutate(Nature="Continuous", Class_Lvl_1=data_type,
             Class_Lvl_2=paste0(evt_type, "_Counts_", str_to_title(agg)), 
             Class_Lvl_3=paste0(level_1, "_", level_2))
    df_dat_cnt <- df_dat_cnt %>% as.data.frame() %>% rownames_to_column(var=col_row)

    dfs$dat <- left_join(dfs$dat, df_dat_cnt, by=col_row)
    dfs$cov <- bind_rows(dfs$cov, df_cov_cnt)

    if (add_status){
      dfs$dat <- left_join(dfs$dat, df_dat_sts, by=col_row)
      dfs$cov <- bind_rows(dfs$cov, df_cov_sts)
    }
    cat(" done!\n")
  }
  print_size_object(dfs)
  print_total_memory()
  dfs
}
#' Print the size of an object in MB, or in GB when it exceeds 1e9 bytes.
#'
#' The label in the message is the expression passed as `obj`, as written by
#' the caller. The size comes from `object_size()`.
#'
#' @param obj Object to measure.
print_size_object <- function(obj){
  label <- deparse(substitute(obj))
  n_bytes <- object_size(obj)
  use_gb <- n_bytes > 1e9
  unit_size <- if (use_gb) "GB" else "MB"
  scaled <- if (use_gb) round(n_bytes/1e9, 2) else round(n_bytes/1e6, 2)
  cat(paste("-INFO MEMORY: size of", label, "is", scaled, unit_size, "\n"))
}
#' Print the value reported by `mem_used()` in MB, or in GB when it exceeds
#' 1e9 bytes.
print_total_memory <- function(){
  n_bytes <- mem_used()
  use_gb <- n_bytes > 1e9
  unit_size <- if (use_gb) "GB" else "MB"
  scaled <- if (use_gb) round(n_bytes/1e9, 2) else round(n_bytes/1e6, 2)
  cat(paste("-INFO MEMORY: total memory usage is", scaled, unit_size, "\n"))
}
After running this line, the activity monitor reports the following statistics for the R process running
However, running mem_used from R console returns
266 MB
What is the reason for having an RSS above 3 GB when R objects occupy only 266 MB? And why is the VSZ (virtual memory size) so huge? During the execution of
dfs <- add_counts(args$dna_alt_counts_pathway, dfs, agg="pathway", evt_type="Alteration", data_type="DNA", cohort=args$cohort)
the VSZ increased from about 10 GB to more than 30 GB, and it does not seem to decrease much after the execution has finished.
Thank you very much for your help, I can't get my head around this issue!
Best, Yoann

