Ching-Chuan Chen's Blogger

Statistics, Machine Learning and Programming

0%

Taking average every k periods

We have repeated-measures data and want to take the average of every 3 consecutive periods for each subject. Here is the code to do it.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library(plyr)
library(dplyr)
library(data.table)
library(magrittr)
# Simulate repeated measurements: every id gets between 2 and 15 rows of
# standard-normal values (row counts are drawn with sample()).
n <- 200
dat <- mdply(
  data.table(id = 1:n, len = sample(2:15, n, replace = TRUE)),
  function(id, len) data.table(id = rep(id, len), values = rnorm(len))
)
dat <- select(dat, c(id, values))
# Average every k consecutive rows within each id and time the computation.
k <- 3
start_time <- Sys.time()
result <- dat %>%
  group_by(id) %>%
  # newgroup numbers the blocks of k rows inside an id: 1,1,1,2,2,2,...
  mutate(newgroup = rep(seq_len(ceiling(length(values) / k)),
                        each = k, length.out = length(values))) %>%
  group_by(id, newgroup) %>%
  summarise(mean(values))
difftime(Sys.time(), start_time)

library(dplyr)
library(data.table)
library(magrittr)
# data generation

#' Simulate a long-format repeated-measures data set
#'
#' Draws, for every patient, a number of observations with
#' `sample(max_obs_time, N_patient, replace = TRUE)`, numbers the observations
#' 1, 2, ... within each patient, and attaches `n_vars` columns of
#' standard-normal noise.
#'
#' @param N_patient Number of patients; ids run from 1 to `N_patient`.
#' @param max_obs_time Upper bound passed to `sample()` for the number of
#'   observations per patient.
#' @param n_vars Number of noise columns to generate.
#' @return The data.table returned by `setnames()` (invisibly), with columns
#'   `id`, `obs_times`, and `V1` ... `V<n_vars>`; one row per observation.
dat_gen_f <- function(N_patient, max_obs_time, n_vars) {
  n_obs <- sample(max_obs_time, N_patient, replace = TRUE)
  ids <- rep(seq_len(N_patient), times = n_obs)
  # sequence() concatenates 1..n_obs[i] for every patient. Unlike
  # sapply(n_obs, seq, from = 1) it never simplifies to a matrix when all
  # patients happen to get the same number of observations.
  obs_times <- sequence(n_obs)
  # Explicit nrow keeps the shape well defined even when n_vars is 0.
  noise <- matrix(rnorm(length(ids) * n_vars),
                  nrow = length(ids), ncol = n_vars)
  dat <- data.table(cbind(ids, obs_times, noise, deparse.level = 0))
  # sprintf() yields character(0) for n_vars = 0 (paste0 would yield "V").
  setnames(dat, c("id", "obs_times", sprintf("V%d", seq_len(n_vars))))
}

#' Average every k consecutive observations within each id
#'
#' Rows are blocked by their position inside each `id` (rows 1..k, k+1..2k,
#' ...); the last block of an id may hold fewer than `k` rows. Every column
#' other than `id` and `obs_times` is averaged per block.
#'
#' @param dat A data.table (or data.frame) with an `id` column, optionally an
#'   `obs_times` column (dropped before averaging), and numeric columns to
#'   average. The input is not modified.
#' @param k Positive whole number: how many consecutive rows form one block.
#' @return A data.table with columns `id`, `times_combine` (a label
#'   "<first>-<last>" giving the row positions covered by the block), and the
#'   per-block mean of every remaining column.
mean_dat_f <- function(dat, k) {
  stopifnot("id" %in% names(dat), is.numeric(k), length(k) == 1L, k >= 1)
  k <- as.integer(k)
  keep_cols <- setdiff(names(dat), "obs_times")
  value_cols <- setdiff(keep_cols, "id")
  # Column subsetting returns a new table, so the := calls below only add
  # columns to this working copy.
  work <- as.data.table(dat)[, keep_cols, with = FALSE]
  # Block index of each row within its id: 1,1,...,2,2,...
  work[, newgroup := (seq_len(.N) - 1L) %/% k + 1L, by = "id"]
  # Actual block size (k for full blocks, the remainder for a trailing one).
  work[, n_combine := .N, by = c("id", "newgroup")]
  # Label bounds are derived from k, so they are right for any block size.
  work[, times_combine := paste((newgroup - 1L) * k + 1L,
                                (newgroup - 1L) * k + n_combine, sep = "-")]
  work[, lapply(.SD, mean), by = c("id", "times_combine"),
       .SDcols = value_cols]
}

# Benchmark 1: dat_gen_f(30000, 20, 15), then averaging with k = 3.
# The "Time difference" comments are the timings recorded by the author.
start_time <- Sys.time()
dat <- dat_gen_f(30000, 20, 15)
difftime(Sys.time(), start_time)
# Time difference of 1.503086 secs
start_time <- Sys.time()
result <- mean_dat_f(dat, 3)
difftime(Sys.time(), start_time)
# Time difference of 4.236243 secs

# Benchmark 2: dat_gen_f(13820, 15, 1), then averaging with k = 3.
start_time <- Sys.time()
dat <- dat_gen_f(13820, 15, 1)
difftime(Sys.time(), start_time)
# Time difference of 0.4750271 secs
start_time <- Sys.time()
result <- mean_dat_f(dat, 3)
difftime(Sys.time(), start_time)
# Time difference of 1.848106 secs