Ching-Chuan Chen's Blogger

Statistics, Machine Learning and Programming

0%

Tables combining

A benchmark for combine the two data.frames in R.

There are two tables:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
library(data.table)
library(plyr)
library(dplyr)
library(magrittr)
library(reshape2)
ref_m = data.table(
"gene_a" = c("A", "B", "C"),
"Chromosome" = c("1", "X", "2"),
"gene_start" = c(25000, 1000, 0),
"gene_end" = c(50000, 2000, 800)
) %>% tbl_dt(FALSE)
ref_m
# gene_a Chromosome gene_start gene_end
# 1 A 1 25000 50000
# 2 B X 1000 2000
# 3 C 2 0 800

dat = data.table(
"Probe_b" = c("a1", "a2", "a3", "a4", "a5"),
"Chromosome" = c("2", "4", "1", "X", "1"),
"Chr_s" = as.integer(c(175, 600, 23575, 1010, 30000)),
"Chr_e" = as.integer(c(200, 625, 23600, 1035, 30025)),
stringsAsFactors = FALSE
) %>% tbl_dt(FALSE)
dat
# Probe_b Chromosome Chr_s Chr_e
# 1 a1 2 175 200
# 2 a2 4 600 625
# 3 a3 1 23575 23600
# 4 a4 X 1010 1035
# 5 a5 1 30000 30025

# we want combine these two tables like following by referring the Chromosome and checking that the range of Chr_s and Chr_e is fallen into range of gene_start and gene_end .
# gene_a match
# 1 A a5
# 2 B a4
# 3 C a1

## first method
combine_f = function(dat, ref_m){
check_f = function(v, ref_m){
loc <- match(v$Chromosome, ref_m$Chromosome, nomatch = 0)
if(length(loc) != 0 && loc > 0) # 避免loc出現integer(0)的情況
{
if(v$Chr_s >= ref_m[loc,]$gene_s && v$Chr_e <= ref_m[loc,]$gene_e)
return(as.character(ref_m[loc,]$gene_a))
else
return(NA)
}
else
return(NA)
}
result = rep(NA, nrow(dat))
aaply(1:nrow(dat), 1, function(v) check_f(dat[v,], ref_m)) %>%
{tapply(as.character(dat$Probe_b), ., c)}
}
combine_f(dat, ref_m)
# A B C
# "a5" "a4" "a1"

## second method
combine_f2 = function(dat, ref_m){
loc_ref_v = match(dat$Chromosome, ref_m$Chromosome)
loc_dat_v = loc_ref_v > 0
loc_dat_v = loc_dat_v[!is.na(loc_dat_v)]
loc_ref_v = loc_ref_v[loc_dat_v]
loc = dat$Chr_s[loc_dat_v] >= ref_m$gene_start[loc_ref_v] &
dat$Chr_e[loc_dat_v] <= ref_m$gene_end[loc_ref_v]
tapply(dat$Probe_b[loc_dat_v][loc], ref_m$gene_a[loc_ref_v[loc]], c)
}
combine_f2(dat, ref_m)
# A B C
# "a5" "a4" "a1"

## third method
combine_f3 = function(dat, ref_m){
merge(dat, ref_m, by="Chromosome") %>%
filter((Chr_s > gene_start) & (Chr_e < gene_end)) %>%
select(one_of(c("gene_a", "Probe_b"))) %>%
dlply(.(gene_a), function(x) x$Probe_b)
}
combine_f3(dat, ref_m)

## test the performance
N = 5000; k = 20
ref_m = data.table(
gene_a = LETTERS[1:k],
Chromosome = sample(c(as.character(1:25), "X"), k),
gene_start = sample(seq(0, 50000, 200), k)) %>%
mutate(gene_end = gene_start+sample(seq(600, 15000, 200), k)) %>%
tbl_dt(FALSE)
dat = data.table(
Probe_b = paste0("a", 1:N),
Chromosome = sample(c(as.character(1:25), "X"), N, replace=TRUE),
Chr_s = as.integer(sample(seq(0, 50000, 5), N, replace=TRUE))) %>%
mutate(Chr_e = Chr_s + sample(seq(1000, 10000, 100), N,
replace=TRUE)) %>% tbl_dt(FALSE)

all.equal( combine_f(dat, ref_m), combine_f2(dat, ref_m))
# TRUE
all.equal( combine_f2(dat, ref_m), combine_f3(dat, ref_m), check.attributes =FALSE)
# TRUE

library(rbenchmark)
benchmark(combine_f(dat, ref_m), combine_f2(dat, ref_m),
combine_f3(dat, ref_m), replications = 20,
columns = c("test", "replications", "elapsed", "relative"),
order = "relative")
# test replications elapsed relative
# 2 combine_f2(dat, ref_m) 20 0.11 1.000
# 3 combine_f3(dat, ref_m) 20 0.38 3.455
# 1 combine_f(dat, ref_m) 20 406.35 3694.091

The second method has the best performance, but it is a hard coding. Therefore, I recommend the third method.