1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
library(data.table)
library(plyr)
library(dplyr)
library(magrittr)

# Synthetic fixed-width input: four 19-character records repeated 100000
# times (400000 lines), read into a one-column data.table (column V1).
#
# Record layout, as 0-based offsets:
#   [0, 3)   id        3 digits
#   [3, 9)   gender    "female", or "male" right-padded with two spaces
#   [9, 11)  age       2 digits
#   [11, 19) birthday  8 digits (yyyymmdd)
#
# The "male" records must keep BOTH padding spaces. The regular expressions
# used later require `male\s{2}`, and the offsets c(0, 3, 9, 11, 19) assume
# every record is exactly 19 characters wide.
dat <- fread(
  paste0(
    rep(
      paste0(
        "001female2019920404\n",
        "002male  3019920505\n",
        "003male  4019920606\n",
        "004female5019920707\n"
      ),
      100000
    ),
    collapse = ""
  ),
  sep = "\n", sep2 = "", header = FALSE
)
# Approach 1: base R regular expressions (regexec + regmatches), timed.
tt <- proc.time()

# One capture group per field; regexec() records the match positions of the
# full match and of every group for each line.
dat_regex <- regexec(
  "([0-9]{3})(female|male\\s{2})([0-9]{2})([0-9]{8})",
  text = dat[["V1"]]
)

# regmatches() returns one character vector per line (full match followed by
# the captured groups). Stack them into a matrix, then drop the first column
# (the full match) so only the four fields remain.
dat_split2 <- regmatches(dat[["V1"]], dat_regex) %>%
  do.call(rbind, .) %>%
  data.table() %>%
  select(2:ncol(.)) %>%
  setnames(c("id", "gender", "age", "birthday"))

proc.time() - tt
library(Rcpp)
library(inline)

# Approach 2: fixed-width splitting in C++, compiled on the fly with Rcpp.
#
# dat_split_f(strings, loc)
#   strings  character vector of records
#   loc      numeric vector of 0-based cut offsets; field k spans from
#            loc[k] up to (not including) loc[k + 1]
# Returns a character matrix with length(strings) rows and length(loc)
# columns. Only the first length(loc) - 1 columns are written; the last
# column is left as empty strings.
sourceCpp(code = '
#include <Rcpp.h>
using namespace Rcpp;

// Cut each input string at the offsets given in loc.
// The result has one column per entry of loc, so the final column is
// never assigned and stays empty.
// [[Rcpp::export]]
CharacterMatrix dat_split_f(std::vector< std::string > strings, NumericVector loc) {
  const int n_rows = strings.size();
  const int n_cols = loc.size();
  CharacterMatrix output(n_rows, n_cols);

  for (int row = 0; row < n_rows; ++row) {
    const std::string& line = strings[row];
    // Consecutive offsets delimit one field: start at loc[col] and take
    // loc[col + 1] - loc[col] characters.
    for (int col = 0; col + 1 < n_cols; ++col) {
      output(row, col) = line.substr(loc[col], loc[col + 1] - loc[col]);
    }
  }
  return output;
}')
# Time the Rcpp splitter on the full data set.
tt <- proc.time()

# dat_split_f() returns one column per offset (five here), but only the first
# four are filled, so keep columns 1:4 and name them.
dat_split <- dat_split_f(dat[[1]], c(0, 3, 9, 11, 19)) %>%
  data.table() %>%
  select(1:4) %>%
  setnames(c("id", "gender", "age", "birthday"))

proc.time() - tt
library(rJava)

# Approach 3: delegate the splitting to a Java class through rJava.
# Start the JVM, add the directory holding the compiled class to the class
# path, and create one instance of the `regex_java` class.
# NOTE(review): the class path is a machine-specific absolute Windows path.
.jinit()
.jaddClassPath("D:\\Program\\R\\some_code\\regex_java")
regex_java <- .jnew("regex_java")
# Java variant A: `string_split_1d` returns a flat String[] (rJava signature
# "[S"), which is reshaped here into four columns.
tt <- proc.time()

# matrix(ncol = 4) fills column by column, so the Java side is presumably
# returning all ids first, then all genders, etc. The Java source is not shown
# here; verify against it.
output <- .jcall(
  regex_java, "[S", "string_split_1d",
  dat[["V1"]], as.integer(c(0, 3, 9, 11, 19))
) %>%
  matrix(ncol = 4) %>%
  data.table() %>%
  setnames(c("id", "gender", "age", "birthday"))

proc.time() - tt
# Java variant B: `string_split` returns a String[][]; simplify = TRUE asks
# rJava to convert the array of arrays into a native R structure before it is
# wrapped in a data.table.
tt <- proc.time()

output2 <- .jcall(
  regex_java, "[[Ljava/lang/String;", "string_split",
  dat[["V1"]], as.integer(c(0, 3, 9, 11, 19)),
  simplify = TRUE
) %>%
  data.table() %>%
  setnames(c("id", "gender", "age", "birthday"))

proc.time() - tt
# Java variant C: regular-expression matching done on the Java side.
tt <- proc.time()

pattern <- "(\\d{3})(female|male\\s{2})(\\d{2})(\\d{8})"

# Count the top-level parenthesised groups in `pattern` with a recursive PCRE
# expression that matches balanced parentheses; the count is passed to Java
# as the number of fields per record.
size_recognize <- as.integer(length(
  gregexpr("(\\((?>[^()]+|(?R))*\\))", pattern, perl = TRUE)[[1]]
))

# The Java method returns a flat String[]; byrow = TRUE lays it out one record
# per row, four fields each.
output3 <- .jcall(
  regex_java, "[S", "regex_java",
  dat[["V1"]], pattern, size_recognize
) %>%
  matrix(ncol = 4, byrow = TRUE) %>%
  data.table() %>%
  setnames(c("id", "gender", "age", "birthday"))

proc.time() - tt
# Cross-check: every approach should produce the same table as the Java 1-D
# split. Each call yields TRUE or a description of the differences.
all.equal(target = output, current = output2)
all.equal(target = output, current = dat_split)
all.equal(target = output, current = dat_split2)
all.equal(target = output, current = output3)
|