Ching-Chuan Chen's Blogger

Statistics, Machine Learning and Programming

0%

plyr::ddply vs data.table::rbindlist

最近遇到在計算functinoal data的cross-covariance surface的時候

發現plyr::ddply裡面的list_to_dataframe有點慢

反而利用plyr::dlply加上data.tablerbindlist可以快上不少

而且plyr::ddply消耗的記憶體相對起rbindlist高上不少

會發現這些都要感謝rstudio新出的套件profvis提供了良好的performance視覺化

其中profvis可以在github找到

下面是Benchmark的R script:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
library(plyr)
library(data.table)
library(pipeR)

N <- 2000
nt <- 100
p <- 4
dataDT <- data.table(subId = rep(1:(N/nt), p, each = nt), variable = rep(1:p, each = N),
timePnt = rep(seq(0, 10, length.out = nt), p*N/nt), value = rnorm(N*p))

getRawCrCov1 <- function(demeanDataDT){
# geneerate the all combinations of t1,t2 and varaibles
baseDT <- demeanDataDT[ , .(t1 = rep(timePnt, length(timePnt)), t2 = rep(timePnt, each=length(timePnt)),
value.var1 = rep(value, length(timePnt))), by = .(variable, subId)]
# calculation of raw cross-covariance
rawCrCovDT <- do.call("dlply", list(demeanDataDT, "variable", function(df){
merge(baseDT[variable >= df$variable[1]], df, suffixes = c("1", "2"),
by.x = c("subId", "t2"), by.y = c("subId", "timePnt"))
})) %>>% rbindlist %>>% setnames("value", "value.var2") %>>%
`[`(j = .(sse = sum(value.var1 * value.var2), cnt = .N), by = .(variable1, variable2, t1, t2)) %>>%
setorder(variable1, variable2, t1, t2) %>>% `[`(j = weight := 1)
return(rawCrCovDT)
}

getRawCrCov2 <- function(demeanDataDT){
# geneerate the all combinations of t1,t2 and varaibles
baseDT <- demeanDataDT[ , .(t1 = rep(timePnt, length(timePnt)), t2 = rep(timePnt, each=length(timePnt)),
value.var1 = rep(value, length(timePnt))), by = .(variable, subId)]
# calculation of raw cross-covariance
rawCrCovDT <- do.call("ddply", list(demeanDataDT, "variable", function(df){
merge(baseDT[variable >= df$variable[1]], df, suffixes = c("1", "2"),
by.x = c("subId", "t2"), by.y = c("subId", "timePnt"))
})) %>>% setDT %>>% `[`(j = variable := NULL) %>>% setnames("value", "value.var2") %>>%
`[`(j = .(sse = sum(value.var1 * value.var2), cnt = .N), by = .(variable1, variable2, t1, t2)) %>>%
setorder(variable1, variable2, t1, t2) %>>% `[`(j = weight := 1)
return(rawCrCovDT)
}

getRawCrCov3 <- function(demeanDataDT){
# geneerate the all combinations of t1,t2 and varaibles
baseDT <- demeanDataDT[ , .(t1 = rep(timePnt, length(timePnt)), t2 = rep(timePnt, each=length(timePnt)),
value.var1 = rep(value, length(timePnt))), by = .(variable, subId)]
# calculation of raw cross-covariance
rawCrCovDT <- do.call("llply", list(split(demeanDataDT, by = "variable"), function(df){
merge(baseDT[variable >= df$variable[1]], df, suffixes = c("1", "2"),
by.x = c("subId", "t2"), by.y = c("subId", "timePnt"))
})) %>>% rbindlist %>>% setnames("value", "value.var2") %>>%
`[`(j = .(sse = sum(value.var1 * value.var2), cnt = .N), by = .(variable1, variable2, t1, t2)) %>>%
setorder(variable1, variable2, t1, t2) %>>% `[`(j = weight := 1)
return(rawCrCovDT)
}

getRawCrCov4 <- function(demeanDataDT){
# geneerate the all combinations of t1,t2 and varaibles
baseDT <- demeanDataDT[ , .(t1 = rep(timePnt, length(timePnt)), t2 = rep(timePnt, each=length(timePnt)),
value.var1 = rep(value, length(timePnt))), by = .(variable, subId)]
# set the keys of data.table
setkey(baseDT, "subId", "t2")
setkey(demeanDataDT, "subId", "timePnt")
# calculation of raw cross-covariance
rawCrCovDT <- do.call("llply", list(split(demeanDataDT, by = "variable"), function(df){
merge(baseDT[variable >= df$variable[1]], df, suffixes = c("1", "2"),
by.x = c("subId", "t2"), by.y = c("subId", "timePnt"))
})) %>>% rbindlist %>>% setnames("value", "value.var2") %>>%
`[`(j = .(sse = sum(value.var1 * value.var2), cnt = .N), by = .(variable1, variable2, t1, t2)) %>>%
setorder(variable1, variable2, t1, t2) %>>% `[`(j = weight := 1)
return(rawCrCovDT)
}

x1 <- getRawCrCov1(dataDT)
x2 <- getRawCrCov2(dataDT)
x3 <- getRawCrCov3(dataDT)
x4 <- getRawCrCov4(dataDT)
all.equal(x1, x2) # TRUE
all.equal(x1, x3) # TRUE
all.equal(x1, x4) # TRUE

library(microbenchmark)
microbenchmark(rbindlist = getRawCrCov1(dataDT), ddply = getRawCrCov2(dataDT),
split.DT = getRawCrCov3(dataDT), setkey.first = getRawCrCov4(dataDT), times = 50L)
# Unit: milliseconds
# expr min lq mean median uq max neval
# rbindlist 657.1571 700.8676 726.3146 716.2397 741.4616 911.0223 50
# ddply 2951.0253 3196.2281 3292.1848 3283.4185 3415.7735 3638.6447 50
# split.DT 653.3183 699.5276 732.4667 720.5936 747.7091 1016.1446 50
# setkey.first 496.7661 542.6954 562.4172 554.0295 584.7101 701.3526 50

速度整整差了近5倍(3347 / 709 ~= 4.72)

因此,建議以後plyr系列,盡量避開*dply系列的函數

用到plyr:::list_to_dataframe這個函數的效能都不好

盡量去使用data.table::rbindlist

dlply + rbindlist跟用split.data.table + llply + rbindlist

其實最後兩者速度差不多,時間並沒有太大的區別 (726ms vs 732ms)

其實我覺得可以直接都統一用split.data.table + llply + rbindlist

統一減少使用*dply或是d*ply

而在迴圈中做merge前,建議全部都先setkey

這樣拆分之後的data.table,還是有key

merge時就不用再建key了

而這一點也是透過profvis這個套件發現

多看幾次profvis套件出來的結果,可以對每個程式怎麼運作更加了解

就能因時因地制宜了,程式自然會更有效率