1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
| library(data.table) library(dplyr) dat = lapply(paste0("List_", LETTERS[LETTERS!="L" & LETTERS!="R" & LETTERS!="S" & LETTERS!="Y"], ".csv"), read.csv) dat2 = rbindlist(dat) region_v = c("臺北市","臺中市","基隆市","臺南市","高雄市","新北市","宜蘭縣","桃園縣","嘉義市","新竹縣","苗栗縣","南投縣","彰化縣","新竹市","雲林縣","嘉義縣","屏東縣","花蓮縣","臺東縣","金門縣","澎湖縣","連江縣") region = unlist(lapply(1:length(dat), function(i) rep(region_v[i], nrow(dat[[i]])))) dat2 = data.table(dat2, region = region) cols = c(1:3, 5:7, 9:15, 20:21, 24) dat2[, (cols):=lapply(.SD, as.character),.SDcols=cols] setnames(dat2, "交易標的", "trading_target") dat2 = filter(dat2, grepl("建物", trading_target))
setnames(dat2, "交易年月", "trading_ym") dat2 = mutate(dat2, year.trading = substr(trading_ym, 0, nchar(trading_ym)-2)) dat2 = mutate(dat2, month.trading = substr(trading_ym, nchar(as.character(trading_ym))-1, nchar(trading_ym))) set(dat2, i = which(dat2$month.trading >= 13), j = "month.trading", value = NA) set(dat2, i = which(dat2$month.trading == 0), j = "month.trading", value = NA) set(dat2, i = which(is.na(dat2$month.trading)), j = "month.trading", value = "6") set(dat2, j = "year.trading", value = as.integer(dat2$year.trading)) set(dat2, j = "month.trading", value = as.integer(dat2$month.trading))
setnames(dat2, "交易筆棟數", "trading_detail") dat2 = mutate(dat2, n.land = as.integer(substr(trading_detail,3,3))) dat2 = mutate(dat2, n.building = as.integer(substr(trading_detail,6,6))) dat2 = mutate(dat2, n.parking_lot = as.integer(substr(trading_detail,9,9)))
setnames(dat2, "主要建材", "building_materials") dat2 = mutate(dat2, building_materials = as.character(building_materials)) set(dat2, i = which(dat2$building_materials==""), j = "building_materials", value = NA) dat2 = mutate(dat2, CRC = grepl("混凝土", building_materials))
setnames(dat2, "建築完成年月", "construction_ym") dat2 = mutate(dat2, year.construction = as.numeric(rep(NA, nrow(dat2)))) dat2 = mutate(dat2, month.construction = as.numeric(rep(NA, nrow(dat2))))
dat2[nchar(construction_ym)==2 & !is.na(construction_ym), year.construction := as.integer(construction_ym)]
dat2[nchar(construction_ym)==4 & substr(construction_ym, 1, 2) != 10 & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 2))] dat2[nchar(construction_ym)==4 & substr(construction_ym, 1, 2) != 10 & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 3, 4))] dat2[nchar(construction_ym)==4 & substr(construction_ym, 1, 2) == 10 & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 3))] dat2[nchar(construction_ym)==4 & substr(construction_ym, 1, 2) == 10 & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 4, 4))]
dat2[nchar(construction_ym)==5 & (substr(dat2$construction_ym, 1, 1) == 1 | substr(dat2$construction_ym, 1, 1) == 0) & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 3))] dat2[nchar(construction_ym)==5 & (substr(dat2$construction_ym, 1, 1) == 1 | substr(dat2$construction_ym, 1, 1) == 0) & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 4, 5))] dat2[nchar(construction_ym)==5 & substr(dat2$construction_ym, 1, 1) != 1 & substr(dat2$construction_ym, 1, 1) != 0 & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 2))] dat2[nchar(construction_ym)==5 & substr(dat2$construction_ym, 1, 1) != 1 & substr(dat2$construction_ym, 1, 1) != 0 & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 3, 3))]
dat2[nchar(construction_ym)==6 & (substr(dat2$construction_ym, 1, 1) == 1 | substr(dat2$construction_ym, 1, 1) == 0) & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 3))] dat2[nchar(construction_ym)==6 & (substr(dat2$construction_ym, 1, 1) == 1 | substr(dat2$construction_ym, 1, 1) == 0) & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 4, 5))] dat2[nchar(construction_ym)==6 & substr(dat2$construction_ym, 1, 1) != 1 & substr(dat2$construction_ym, 1, 1) != 0 & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 2))] dat2[nchar(construction_ym)==6 & substr(dat2$construction_ym, 1, 1) != 1 & substr(dat2$construction_ym, 1, 1) != 0 & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 3, 4))]
dat2[nchar(construction_ym)==7 & (substr(dat2$construction_ym, 1, 1) == 1 | substr(dat2$construction_ym, 1, 1) == 0) & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 3))] dat2[nchar(construction_ym)==7 & (substr(dat2$construction_ym, 1, 1) == 1 | substr(dat2$construction_ym, 1, 1) == 0) & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 4, 5))] dat2[nchar(construction_ym)==7 & substr(dat2$construction_ym, 1, 1) != 1 & substr(dat2$construction_ym, 1, 1) != 0 & !is.na(construction_ym), year.construction := as.integer(substr(construction_ym, 1, 2))] dat2[nchar(construction_ym)==7 & substr(dat2$construction_ym, 1, 1) != 1 & substr(dat2$construction_ym, 1, 1) != 0 & !is.na(construction_ym), month.construction := as.integer(substr(construction_ym, 4, 5))]
set(dat2, i = which(dat2$year.construction >= 104), j = "year.construction", value = NA) set(dat2, i = which(dat2$month.construction >= 13), j = "month.construction", value = NA) set(dat2, i = which(dat2$month.construction == 0), j = "month.construction", value = NA) set(dat2, i = which(is.na(dat2$month.construction)), j = "month.construction", value = 6)
dat2 = mutate(dat2, trading = year.trading *12 + month.trading) dat2 = mutate(dat2, construction = year.construction *12 + month.construction) dat2 = mutate(dat2, age_house = trading - construction)
dat3 = as.data.frame(select(dat2, c(23,29,4,5,39,11,12,40,16:21,25,32:35))) dat4 = dat3[setdiff(1:nrow(dat3), which(is.na(dat3), arr.ind = TRUE)[,1]),] names(dat4) = c("Y", paste0("V", 1:(ncol(dat4)-1))) dat4$V1 = as.factor(dat4$V1) dat4$V3 = as.factor(dat4$V3) dat4$V5 = as.numeric(dat4$V5) dat4$V6 = as.factor(dat4$V6) dat4$V12 = as.factor(dat4$V12) dat4$V13 = as.factor(dat4$V13) dat4$V18 = as.factor(dat4$V18) dat4 = dat4[dat4$Y!=0,] dat4 = dat4[setdiff(1:nrow(dat4), which(is.na(dat4), arr.ind = TRUE)[,1]),]
lm.fit <- lm(log(Y+0.1)~.,data=dat4) summary(lm.fit)
dat4.sub<-dat4[-which(rownames(dat4)%in%c(323747, 80936, 222834)),] lm.fit <- lm(log(Y+0.1)~.,data=dat4.sub) summary(lm.fit)
library(MASS) lm.AIC <- stepAIC(lm.fit)
fold = 10 cv_index_f = function(n, fold = 10){ fold_n = floor(n / fold) rem = n - fold_n * fold size = rep(fold_n, fold) if(rem > 0) size[1:rem] = fold_n + 1 cv_index = unlist(sapply(1:fold, function(i) rep(i, size[i]))) cv_index = sample(cv_index, length(cv_index)) return(cv_index) } index = cv_index_f(nrow(dat4.sub), fold)
library(snowfall) n_rep = 10 sfInit(TRUE, 8) sfExport("dat4.sub") pred.error = sapply(1:n_rep , function(i){ index = cv_index_f(nrow(dat4.sub), fold) sfExport("index") lm.CV <- sfSapply(1:fold, function(v){ dat4.train = dat4.sub[index != v,] dat4.test = dat4.sub[index == v,] lm.fit.train = lm(log(Y+0.1)~.,data = dat4.train) sum((log(dat4.test$Y+0.1) - predict(lm.fit.train, dat4.test))^2)/nrow(dat4.test) }) mean(lm.CV) }) sfStop() mean(pred.error)
pred.error
library(grpreg) X.m <- model.matrix(log(Y+0.1)~. , data=dat4.sub)[,-1] group.v <- substring(colnames(X.m),1,2) group.v[41:49] <- substring(colnames(X.m),1,3)[41:49] group.v <- as.numeric(factor(group.v, levels=paste("V",c(1:18),sep=""))) out.glasso <- cv.grpreg(X.m, log(dat4.sub$Y+0.1), group=group.v) plot(out.glasso) out.glasso$lambda.min
out.glasso$cve[out.glasso$min]
coef(out.glasso)
|