# Benchmark inputs. f() reads a, b and d; e is not referenced by f() and is
# not exported below.
a <- rnorm(1e3)
b <- rnorm(1e4)
d <- 0.5
e <- rnorm(1e5) # unused variable

# f is not a good method to get that result, it is just for benchmark.
#
# Accumulates (mean(a) - mean(b)) * d * i for i = 1, ..., x and returns the
# total. The means are recomputed on every iteration; this is left as is
# because f only serves as a benchmark workload.
#
# x: number of iterations (a non-negative whole number).
# Returns a single numeric value; 0 when x is 0.
f <- function(x) {
  total <- 0
  # seq_len() yields an empty sequence for x = 0, whereas seq(1, x) would
  # iterate over c(1, 0).
  for (i in seq_len(x)) {
    total <- total + (mean(a) - mean(b)) * d * i
  }
  total
}

# Export the globals read by f() to the workers.
# NOTE(review): cl is presumably a cluster object created elsewhere; the two
# calls below look like alternative export mechanisms -- confirm both are
# meant to run.
sfExport("a", "b", "d")
clusterExport(cl, c("a", "b", "d"))
# Evaluates f(i) for i = 1, ..., 1000 in a plain for loop.
#
# x: not used by the body.
# Returns a numeric vector of length 1000 whose i-th element is f(i).
# f is expected to be defined elsewhere in the calling environment.
g1 <- function(x) {
  n <- 1000L
  # Preallocate all n slots so the vector is never grown inside the loop.
  out1 <- numeric(n)
  for (i in seq_len(n)) {
    out1[[i]] <- f(i)
  }
  out1
}
# Find the main difference between the two variants by profiling them.
# NOTE(review): g3 and g5 are not defined in this part of the file.
library(profvis)
profvis(g3()) # using clusterApply
profvis(g5()) # using clusterApplyLB

# clusterApplyLB is a load balancing version of clusterApply. If the length p
# of seq is not greater than the number of nodes n, then a job is sent to p
# nodes. Otherwise the first n jobs are placed in order on the n nodes. When
# the first job completes, the next job is placed on the node that has become
# free; this continues until all jobs are complete. Using clusterApplyLB can
# result in better cluster utilization than using clusterApply, but increased
# communication can reduce performance. Furthermore, the node that executes a
# particular job is non-deterministic.
# Benchmark inputs. f() reads a, b and d; e is not referenced by f() and is
# not exported below.
a <- rnorm(1e3)
b <- rnorm(1e4)
d <- 0.5
e <- rnorm(1e5) # unused variable

# f is not a good method to get that result, it is just for benchmark.
#
# Accumulates (mean(a) - mean(b)) * d * i for i = 1, ..., x and returns the
# total. The means are recomputed on every iteration; this is left as is
# because f only serves as a benchmark workload.
#
# x: number of iterations (a non-negative whole number).
# Returns a single numeric value; 0 when x is 0.
f <- function(x) {
  total <- 0
  # seq_len() yields an empty sequence for x = 0, whereas seq(1, x) would
  # iterate over c(1, 0).
  for (i in seq_len(x)) {
    total <- total + (mean(a) - mean(b)) * d * i
  }
  total
}

# Export the globals read by f() to the workers.
parallelExport("a", "b", "d")