Introduction
I've been writing a program that processes some Raw data and then passes it through several statistical processes. All in all it uses several instances of "lapply".
For example: in one part of the script I use a function known as Maxstat (note the function only takes data either as data.frame or data.table - will not accept a matrix).
**
- REPRODUCIBLE SAMPLE below - just copy all sample data, and two parts of the code
**
Sample data:
The first two columns are neccesary. This is the large data sample seen below --> please read on after the sample
Genedata <- structure(list(time = c(120, 120, 28.96, 119.21, 59.53, 68.81, 
82.29, 110.82, 65.88, 84.13, 16.47, 89.75, 76.07, 67.82, 52.24, 
64.1, 55.13, 57.79, 50.1, 43.79, 30.27, 3.64, 36.59, 20.02, 118.58, 
55.33, 120, 120, 120, 106.94, 11.7, 28.79, 26.82, 52.5, 24.03, 
88.99, 102.44, 33.73, 85.28, 26.53, 37.31, 86.43, 55.92, 70.65, 
76.04, 79.13, 83.99, 80.25, 40.6, 95.37, 31.06, 37.31, 22.02, 
63.25, 34.09, 52.14, 66.04, 59.96, 47.86, 58.45, 45.99, 60.42, 
37.67, 15.71, 39.25, 49.87, 25.24, 44.97, 35.9, 26.66, 36.78, 
41.52, 22.22, 33.2, 39.68, 25.61, 83.99, 15.05, 8.38, 18.18, 
27.15, 24.26, 105.17, 11.76, 70.45, 95.07, 112.33, 27.51, 45.92, 
54.04, 103.98, 6.11, 99.51, 20.44, 76.6, 10.02, 41.45, 96.26, 
85.61, 78.87, 22.25, 74.53, 59.07, 47.8, 5.68, 79.39, 74.07, 
50.76, 67.82, 70.84, 50.59, 34.58, 38.72, 54.9, 67.92, 58.45, 
59.34, 54.54, 19.03, 26.26, 52.86, 32.05, 55.95, 56.67, 50.43, 
4.24, 49.11, 49.57, 50.49, 63.05, 49.24, 0.92, 31.36, 40.3, 116.64, 
31.92, 19.98, 24.62, 18.54, 120, 17.62, 5.32, 2.36, 5.72, 15.28, 
95.07, 4.96, 28.89, 3.25, 0.92, 18.77, 57.73, 14.39, 5.12, 4.99, 
33.17, 50.53, 5.72, 8.02, 34.02, 1.44, 36.95, 60.75, 37.44, 23.07, 
19.85, 31.85, 8.61, 42.27, 15.25, 14.56, 9.96, 8.94, 32.67, 2.07, 
22.78, 22.35), cens = c(0L, 0L, NA, 0L, 0L, 1L, 0L, 0L, NA, 0L, 
NA, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, NA, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, NA, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, NA, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 
0L, 1L, 0L, NA, 1L, 1L, 1L, 0L, 0L, 0L, NA, 0L, NA, NA, 1L, 0L, 
0L, 1L, 1L, 0L, 0L, 1L, NA, NA, 0L, 0L, 0L, 0L, 1L, NA, 0L, 1L, 
0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, NA, 0L, NA, 1L, 1L, 0L, 1L, 0L, 1L, 
1L, NA, 1L, 0L, 1L, 1L, 0L)), .Names = c("time", "cens"), class = "data.frame", row.names = c(NA, 
-177L))
To generate a larger set -> First define the sample above as Genedata
then the additional code (to change variable/column length, just change 1000 to 5000 or 50000 etc.
    data <- replicate(1000, abs(rnorm(177)))
    Genedata <- data.frame(Genedata, data)
library(data.table)
    Genedata2 <- data.table(Genedata)
Then to calculate
    #Calculating Maxstat
    library(maxstat)
    maxstat.genes <- as.list( names(Genedata) )[-c(1:3)] #generate a list of column numbers
system.time(maxstat.estimate <-lapply( maxstat.genes, function (x)
{maxstat.test(Surv(time, cens) ~ get (x), data=Genedata2, smethod="LogRank",
pmethod="Lau94")}))
I originally planned on using the for statement, as i'm familiar with it having a background in python. However, research has told me that using apply should be faster.
I also originally used data.frame, but switched to data.table as someone suggested this should be better for performance.
However, it seems there is still a significant problem with processing large datasets. Below is an explanation of why.
Running 25 variables
   user  system elapsed 
   0.04    0.00    0.05  
Running 1000 variables
  user  system elapsed 
   6.54    0.00    6.57 
Running 5000 variables
  user  system elapsed 
 132.89    0.00  133.02 
As you can see, the processing speed is not linear but rather begins to fall off at a staggering rate. Given that the dataset is 43000 variables long, I can only guess that it would take on the order of hours. Interestingly, if i manually divided the tables into small sets, and ran them individually, it would be faster.
I think it has something to do with copying increasingly larger data, but I' not sure how this could be fixed. I already tried the data.table and lapply. Any suggestions?
Update 1: Added sample data. Note to reproduce a larger dataset. You can copy columns 4 onwards and paste. Rows can also be copied down. Should work as far as i know.
Update 2: Rprof on 1000 variables
> Rprof(NULL)
> summaryRprof()
$by.self
                        self.time self.pct total.time total.pct
terms.formula                0.50    15.82       0.50     15.82
eval                         0.34    10.76       1.92     60.76
Surv                         0.12     3.80       0.20      6.33
.deparseOpts                 0.12     3.80       0.18      5.70
deparse                      0.08     2.53       0.32     10.13
na.omit.data.frame           0.08     2.53       0.30      9.49
lapply                       0.06     1.90       3.16    100.00
maxstat.test                 0.06     1.90       3.16    100.00
maxstat.test.data.frame      0.06     1.90       3.10     98.10
model.frame.default          0.06     1.90       1.68     53.16
cmaxstat                     0.06     1.90       0.38     12.03
match.arg                    0.06     1.90       0.30      9.49
ifelse                       0.06     1.90       0.06      1.90
is.matrix                    0.06     1.90       0.06      1.90
length                       0.06     1.90       0.06      1.90
match                        0.06     1.90       0.06      1.90
pmatch                       0.06     1.90       0.06      1.90
FUN                          0.04     1.27       3.16    100.00
maxstat                      0.04     1.27       0.90     28.48
terms                        0.04     1.27       0.62     19.62
[.data.frame                 0.04     1.27       0.20      6.33
order                        0.04     1.27       0.08      2.53
[.formula                    0.04     1.27       0.06      1.90
[.Surv                       0.04     1.27       0.06      1.90
attr.all.equal               0.04     1.27       0.06      1.90
makepredictcall              0.04     1.27       0.06      1.90
unique                       0.04     1.27       0.06      1.90
unlist                       0.04     1.27       0.06      1.90
<Anonymous>                  0.04     1.27       0.04      1.27
is.data.frame                0.04     1.27       0.04      1.27
names                        0.04     1.27       0.04      1.27
NextMethod                   0.04     1.27       0.04      1.27
sum                          0.04     1.27       0.04      1.27
model.frame                  0.02     0.63       1.70     53.80
na.omit                      0.02     0.63       0.32     10.13
[                            0.02     0.63       0.30      9.49
cscores                      0.02     0.63       0.28      8.86
cscores.Surv                 0.02     0.63       0.26      8.23
all.equal                    0.02     0.63       0.14      4.43
all.equal.numeric            0.02     0.63       0.12      3.80
paste                        0.02     0.63       0.12      3.80
[[                           0.02     0.63       0.08      2.53
sort                         0.02     0.63       0.08      2.53
[[.data.frame                0.02     0.63       0.06      1.90
sort.int                     0.02     0.63       0.06      1.90
as.list                      0.02     0.63       0.04      1.27
as.vector                    0.02     0.63       0.04      1.27
get                          0.02     0.63       0.04      1.27
-                            0.02     0.63       0.02      0.63
$<-                          0.02     0.63       0.02      0.63
/                            0.02     0.63       0.02      0.63
anyDuplicated                0.02     0.63       0.02      0.63
as.environment               0.02     0.63       0.02      0.63
as.list.data.frame           0.02     0.63       0.02      0.63
attr                         0.02     0.63       0.02      0.63
attr<-                       0.02     0.63       0.02      0.63
c                            0.02     0.63       0.02      0.63
cbind                        0.02     0.63       0.02      0.63
data.class                   0.02     0.63       0.02      0.63
is.array                     0.02     0.63       0.02      0.63
makepredictcall.default      0.02     0.63       0.02      0.63
mode                         0.02     0.63       0.02      0.63
pLausen94                    0.02     0.63       0.02      0.63
sys.function                 0.02     0.63       0.02      0.63
$by.total
                        total.time total.pct self.time self.pct
lapply                        3.16    100.00      0.06     1.90
maxstat.test                  3.16    100.00      0.06     1.90
FUN                           3.16    100.00      0.04     1.27
maxstat.test.data.frame       3.10     98.10      0.06     1.90
eval                          1.92     60.76      0.34    10.76
model.frame                   1.70     53.80      0.02     0.63
model.frame.default           1.68     53.16      0.06     1.90
maxstat                       0.90     28.48      0.04     1.27
do.call                       0.90     28.48      0.00     0.00
terms                         0.62     19.62      0.04     1.27
terms.formula                 0.50     15.82      0.50    15.82
cmaxstat                      0.38     12.03      0.06     1.90
sapply                        0.34     10.76      0.00     0.00
deparse                       0.32     10.13      0.08     2.53
na.omit                       0.32     10.13      0.02     0.63
na.omit.data.frame            0.30      9.49      0.08     2.53
match.arg                     0.30      9.49      0.06     1.90
[                             0.30      9.49      0.02     0.63
cscores                       0.28      8.86      0.02     0.63
cscores.Surv                  0.26      8.23      0.02     0.63
Surv                          0.20      6.33      0.12     3.80
[.data.frame                  0.20      6.33      0.04     1.27
.deparseOpts                  0.18      5.70      0.12     3.80
all.equal                     0.14      4.43      0.02     0.63
all.equal.numeric             0.12      3.80      0.02     0.63
paste                         0.12      3.80      0.02     0.63
order                         0.08      2.53      0.04     1.27
[[                            0.08      2.53      0.02     0.63
sort                          0.08      2.53      0.02     0.63
ifelse                        0.06      1.90      0.06     1.90
is.matrix                     0.06      1.90      0.06     1.90
length                        0.06      1.90      0.06     1.90
match                         0.06      1.90      0.06     1.90
pmatch                        0.06      1.90      0.06     1.90
[.formula                     0.06      1.90      0.04     1.27
[.Surv                        0.06      1.90      0.04     1.27
attr.all.equal                0.06      1.90      0.04     1.27
makepredictcall               0.06      1.90      0.04     1.27
unique                        0.06      1.90      0.04     1.27
unlist                        0.06      1.90      0.04     1.27
[[.data.frame                 0.06      1.90      0.02     0.63
sort.int                      0.06      1.90      0.02     0.63
%in%                          0.06      1.90      0.00     0.00
simplify2array                0.06      1.90      0.00     0.00
sort.default                  0.06      1.90      0.00     0.00
<Anonymous>                   0.04      1.27      0.04     1.27
is.data.frame                 0.04      1.27      0.04     1.27
names                         0.04      1.27      0.04     1.27
NextMethod                    0.04      1.27      0.04     1.27
sum                           0.04      1.27      0.04     1.27
as.list                       0.04      1.27      0.02     0.63
as.vector                     0.04      1.27      0.02     0.63
get                           0.04      1.27      0.02     0.63
-                             0.02      0.63      0.02     0.63
$<-                           0.02      0.63      0.02     0.63
/                             0.02      0.63      0.02     0.63
anyDuplicated                 0.02      0.63      0.02     0.63
as.environment                0.02      0.63      0.02     0.63
as.list.data.frame            0.02      0.63      0.02     0.63
attr                          0.02      0.63      0.02     0.63
attr<-                        0.02      0.63      0.02     0.63
c                             0.02      0.63      0.02     0.63
cbind                         0.02      0.63      0.02     0.63
data.class                    0.02      0.63      0.02     0.63
is.array                      0.02      0.63      0.02     0.63
makepredictcall.default       0.02      0.63      0.02     0.63
mode                          0.02      0.63      0.02     0.63
pLausen94                     0.02      0.63      0.02     0.63
sys.function                  0.02      0.63      0.02     0.63
formals                       0.02      0.63      0.00     0.00
is.na                         0.02      0.63      0.00     0.00
is.na.Surv                    0.02      0.63      0.00     0.00
rowSums                       0.02      0.63      0.00     0.00
$sample.interval
[1] 0.02
$sampling.time
[1] 3.16
Update 3 - Rprof on 5000 variables - as you can see its not linear increase
> Rprof(NULL)
> summaryRprof()
$by.self
                        self.time self.pct total.time total.pct
eval                        11.28    32.83      23.40     68.10
terms.formula               11.00    32.01      11.08     32.25
model.frame.default          0.68     1.98      22.38     65.13
deparse                      0.68     1.98       1.70      4.95
cmaxstat                     0.46     1.34       2.38      6.93
.deparseOpts                 0.44     1.28       0.74      2.15
na.omit.data.frame           0.40     1.16       1.74      5.06
[.data.frame                 0.36     1.05       1.06      3.08
maxstat.test.data.frame      0.32     0.93      34.16     99.42
pmatch                       0.32     0.93       0.34      0.99
match                        0.28     0.81       0.60      1.75
NextMethod                   0.28     0.81       0.28      0.81
na.omit                      0.26     0.76       2.00      5.82
Surv                         0.26     0.76       0.54      1.57
get                          0.26     0.76       0.28      0.81
unique                       0.24     0.70       0.46      1.34
makepredictcall              0.24     0.70       0.36      1.05
order                        0.24     0.70       0.34      0.99
match.arg                    0.22     0.64       1.26      3.67
[.Surv                       0.22     0.64       0.40      1.16
lapply                       0.20     0.58      34.36    100.00
paste                        0.20     0.58       0.90      2.62
maxstat                      0.18     0.52       4.50     13.10
[                            0.18     0.52       1.54      4.48
cscores                      0.18     0.52       1.26      3.67
$<-                          0.18     0.52       0.18      0.52
terms                        0.16     0.47      11.46     33.35
sort.int                     0.16     0.47       0.46      1.34
cbind                        0.16     0.47       0.16      0.47
is.matrix                    0.16     0.47       0.16      0.47
length                       0.16     0.47       0.16      0.47
[[.data.frame                0.14     0.41       0.34      0.99
FUN                          0.12     0.35      34.26     99.71
all.equal.numeric            0.12     0.35       0.24      0.70
c                            0.12     0.35       0.12      0.35
ifelse                       0.12     0.35       0.12      0.35
is.data.frame                0.12     0.35       0.12      0.35
makepredictcall.default      0.12     0.35       0.12      0.35
names                        0.12     0.35       0.12      0.35
cscores.Surv                 0.10     0.29       1.08      3.14
%in%                         0.10     0.29       0.70      2.04
all.equal                    0.10     0.29       0.40      1.16
mode                         0.10     0.29       0.34      0.99
unlist                       0.10     0.29       0.20      0.58
unique.default               0.10     0.29       0.16      0.47
as.list                      0.10     0.29       0.10      0.29
dim                          0.10     0.29       0.10      0.29
is.factor                    0.10     0.29       0.10      0.29
parent.frame                 0.10     0.29       0.10      0.29
model.frame                  0.08     0.23      22.46     65.37
is.na                        0.08     0.23       0.18      0.52
duplicated                   0.08     0.23       0.12      0.35
maxstat.test                 0.06     0.17      34.22     99.59
sapply                       0.06     0.17       1.72      5.01
sort                         0.06     0.17       0.52      1.51
pLausen94                    0.06     0.17       0.22      0.64
anyDuplicated                0.06     0.17       0.10      0.29
which                        0.06     0.17       0.10      0.29
ncol                         0.06     0.17       0.08      0.23
rowSums                      0.06     0.17       0.08      0.23
*                            0.06     0.17       0.06      0.17
/                            0.06     0.17       0.06      0.17
environment                  0.06     0.17       0.06      0.17
is.ordered                   0.06     0.17       0.06      0.17
list                         0.06     0.17       0.06      0.17
do.call                      0.04     0.12       4.56     13.27
[[                           0.04     0.12       0.38      1.11
attr.all.equal               0.04     0.12       0.10      0.29
formals                      0.04     0.12       0.08      0.23
!                            0.04     0.12       0.04      0.12
.Call                        0.04     0.12       0.04      0.12
<Anonymous>                  0.04     0.12       0.04      0.12
anyDuplicated.default        0.04     0.12       0.04      0.12
duplicated.default           0.04     0.12       0.04      0.12
floor                        0.04     0.12       0.04      0.12
is.array                     0.04     0.12       0.04      0.12
match.fun                    0.04     0.12       0.04      0.12
options                      0.04     0.12       0.04      0.12
pnorm                        0.04     0.12       0.04      0.12
seq.default                  0.04     0.12       0.04      0.12
sys.call                     0.04     0.12       0.04      0.12
simplify2array               0.02     0.06       0.52      1.51
[.formula                    0.02     0.06       0.14      0.41
is.na.Surv                   0.02     0.06       0.10      0.29
sys.function                 0.02     0.06       0.04      0.12
&                            0.02     0.06       0.02      0.06
.row_names_info              0.02     0.06       0.02      0.06
^                            0.02     0.06       0.02      0.06
<                            0.02     0.06       0.02      0.06
<=                           0.02     0.06       0.02      0.06
==                           0.02     0.06       0.02      0.06
as.environment               0.02     0.06       0.02      0.06
attr<-                       0.02     0.06       0.02      0.06
baseenv                      0.02     0.06       0.02      0.06
data.class                   0.02     0.06       0.02      0.06
is.numeric                   0.02     0.06       0.02      0.06
sqrt                         0.02     0.06       0.02      0.06
sum                          0.02     0.06       0.02      0.06
sys.parent                   0.02     0.06       0.02      0.06
$by.total
                        total.time total.pct self.time self.pct
lapply                       34.36    100.00      0.20     0.58
FUN                          34.26     99.71      0.12     0.35
maxstat.test                 34.22     99.59      0.06     0.17
maxstat.test.data.frame      34.16     99.42      0.32     0.93
eval                         23.40     68.10     11.28    32.83
model.frame                  22.46     65.37      0.08     0.23
model.frame.default          22.38     65.13      0.68     1.98
terms                        11.46     33.35      0.16     0.47
terms.formula                11.08     32.25     11.00    32.01
do.call                       4.56     13.27      0.04     0.12
maxstat                       4.50     13.10      0.18     0.52
cmaxstat                      2.38      6.93      0.46     1.34
na.omit                       2.00      5.82      0.26     0.76
na.omit.data.frame            1.74      5.06      0.40     1.16
sapply                        1.72      5.01      0.06     0.17
deparse                       1.70      4.95      0.68     1.98
[                             1.54      4.48      0.18     0.52
match.arg                     1.26      3.67      0.22     0.64
cscores                       1.26      3.67      0.18     0.52
cscores.Surv                  1.08      3.14      0.10     0.29
[.data.frame                  1.06      3.08      0.36     1.05
paste                         0.90      2.62      0.20     0.58
.deparseOpts                  0.74      2.15      0.44     1.28
%in%                          0.70      2.04      0.10     0.29
match                         0.60      1.75      0.28     0.81
Surv                          0.54      1.57      0.26     0.76
sort                          0.52      1.51      0.06     0.17
simplify2array                0.52      1.51      0.02     0.06
unique                        0.46      1.34      0.24     0.70
sort.int                      0.46      1.34      0.16     0.47
sort.default                  0.46      1.34      0.00     0.00
[.Surv                        0.40      1.16      0.22     0.64
all.equal                     0.40      1.16      0.10     0.29
[[                            0.38      1.11      0.04     0.12
makepredictcall               0.36      1.05      0.24     0.70
pmatch                        0.34      0.99      0.32     0.93
order                         0.34      0.99      0.24     0.70
[[.data.frame                 0.34      0.99      0.14     0.41
mode                          0.34      0.99      0.10     0.29
NextMethod                    0.28      0.81      0.28     0.81
get                           0.28      0.81      0.26     0.76
all.equal.numeric             0.24      0.70      0.12     0.35
pLausen94                     0.22      0.64      0.06     0.17
unlist                        0.20      0.58      0.10     0.29
$<-                           0.18      0.52      0.18     0.52
is.na                         0.18      0.52      0.08     0.23
cbind                         0.16      0.47      0.16     0.47
is.matrix                     0.16      0.47      0.16     0.47
length                        0.16      0.47      0.16     0.47
unique.default                0.16      0.47      0.10     0.29
[.formula                     0.14      0.41      0.02     0.06
c                             0.12      0.35      0.12     0.35
ifelse                        0.12      0.35      0.12     0.35
is.data.frame                 0.12      0.35      0.12     0.35
makepredictcall.default       0.12      0.35      0.12     0.35
names                         0.12      0.35      0.12     0.35
duplicated                    0.12      0.35      0.08     0.23
as.list                       0.10      0.29      0.10     0.29
dim                           0.10      0.29      0.10     0.29
is.factor                     0.10      0.29      0.10     0.29
parent.frame                  0.10      0.29      0.10     0.29
anyDuplicated                 0.10      0.29      0.06     0.17
which                         0.10      0.29      0.06     0.17
attr.all.equal                0.10      0.29      0.04     0.12
is.na.Surv                    0.10      0.29      0.02     0.06
ncol                          0.08      0.23      0.06     0.17
rowSums                       0.08      0.23      0.06     0.17
formals                       0.08      0.23      0.04     0.12
as.vector                     0.08      0.23      0.00     0.00
*                             0.06      0.17      0.06     0.17
/                             0.06      0.17      0.06     0.17
environment                   0.06      0.17      0.06     0.17
is.ordered                    0.06      0.17      0.06     0.17
list                          0.06      0.17      0.06     0.17
nrow                          0.06      0.17      0.00     0.00
!                             0.04      0.12      0.04     0.12
.Call                         0.04      0.12      0.04     0.12
<Anonymous>                   0.04      0.12      0.04     0.12
anyDuplicated.default         0.04      0.12      0.04     0.12
duplicated.default            0.04      0.12      0.04     0.12
floor                         0.04      0.12      0.04     0.12
is.array                      0.04      0.12      0.04     0.12
match.fun                     0.04      0.12      0.04     0.12
options                       0.04      0.12      0.04     0.12
pnorm                         0.04      0.12      0.04     0.12
seq.default                   0.04      0.12      0.04     0.12
sys.call                      0.04      0.12      0.04     0.12
sys.function                  0.04      0.12      0.02     0.06
getOption                     0.04      0.12      0.00     0.00
irank                         0.04      0.12      0.00     0.00
seq                           0.04      0.12      0.00     0.00
&                             0.02      0.06      0.02     0.06
.row_names_info               0.02      0.06      0.02     0.06
^                             0.02      0.06      0.02     0.06
<                             0.02      0.06      0.02     0.06
<=                            0.02      0.06      0.02     0.06
==                            0.02      0.06      0.02     0.06
as.environment                0.02      0.06      0.02     0.06
attr<-                        0.02      0.06      0.02     0.06
baseenv                       0.02      0.06      0.02     0.06
data.class                    0.02      0.06      0.02     0.06
is.numeric                    0.02      0.06      0.02     0.06
sqrt                          0.02      0.06      0.02     0.06
sum                           0.02      0.06      0.02     0.06
sys.parent                    0.02      0.06      0.02     0.06
match.call                    0.02      0.06      0.00     0.00
$sample.interval
[1] 0.02
$sampling.time
[1] 34.36
 
     
     
    