1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
| library(data.table) library(plyr) library(dplyr) library(magrittr) library(reshape2) ref_m = data.table( "gene_a" = c("A", "B", "C"), "Chromosome" = c("1", "X", "2"), "gene_start" = c(25000, 1000, 0), "gene_end" = c(50000, 2000, 800) ) %>% tbl_dt(FALSE) ref_m
dat = data.table( "Probe_b" = c("a1", "a2", "a3", "a4", "a5"), "Chromosome" = c("2", "4", "1", "X", "1"), "Chr_s" = as.integer(c(175, 600, 23575, 1010, 30000)), "Chr_e" = as.integer(c(200, 625, 23600, 1035, 30025)), stringsAsFactors = FALSE ) %>% tbl_dt(FALSE) dat
combine_f = function(dat, ref_m){ check_f = function(v, ref_m){ loc <- match(v$Chromosome, ref_m$Chromosome, nomatch = 0) if(length(loc) != 0 && loc > 0) { if(v$Chr_s >= ref_m[loc,]$gene_s && v$Chr_e <= ref_m[loc,]$gene_e) return(as.character(ref_m[loc,]$gene_a)) else return(NA) } else return(NA) } result = rep(NA, nrow(dat)) aaply(1:nrow(dat), 1, function(v) check_f(dat[v,], ref_m)) %>% {tapply(as.character(dat$Probe_b), ., c)} } combine_f(dat, ref_m)
combine_f2 = function(dat, ref_m){ loc_ref_v = match(dat$Chromosome, ref_m$Chromosome) loc_dat_v = loc_ref_v > 0 loc_dat_v = loc_dat_v[!is.na(loc_dat_v)] loc_ref_v = loc_ref_v[loc_dat_v] loc = dat$Chr_s[loc_dat_v] >= ref_m$gene_start[loc_ref_v] & dat$Chr_e[loc_dat_v] <= ref_m$gene_end[loc_ref_v] tapply(dat$Probe_b[loc_dat_v][loc], ref_m$gene_a[loc_ref_v[loc]], c) } combine_f2(dat, ref_m)
combine_f3 = function(dat, ref_m){ merge(dat, ref_m, by="Chromosome") %>% filter((Chr_s > gene_start) & (Chr_e < gene_end)) %>% select(one_of(c("gene_a", "Probe_b"))) %>% dlply(.(gene_a), function(x) x$Probe_b) } combine_f3(dat, ref_m)
N = 5000; k = 20 ref_m = data.table( gene_a = LETTERS[1:k], Chromosome = sample(c(as.character(1:25), "X"), k), gene_start = sample(seq(0, 50000, 200), k)) %>% mutate(gene_end = gene_start+sample(seq(600, 15000, 200), k)) %>% tbl_dt(FALSE) dat = data.table( Probe_b = paste0("a", 1:N), Chromosome = sample(c(as.character(1:25), "X"), N, replace=TRUE), Chr_s = as.integer(sample(seq(0, 50000, 5), N, replace=TRUE))) %>% mutate(Chr_e = Chr_s + sample(seq(1000, 10000, 100), N, replace=TRUE)) %>% tbl_dt(FALSE)
all.equal( combine_f(dat, ref_m), combine_f2(dat, ref_m))
all.equal( combine_f2(dat, ref_m), combine_f3(dat, ref_m), check.attributes =FALSE)
library(rbenchmark) benchmark(combine_f(dat, ref_m), combine_f2(dat, ref_m), combine_f3(dat, ref_m), replications = 20, columns = c("test", "replications", "elapsed", "relative"), order = "relative")
|