투빅스 9&10기 3주차 Ensemble - 10기 정윤호

by UNOVATE posted Aug 08, 2018
?

단축키

Prev이전 문서

Next다음 문서

ESC닫기

+ - Up Down Comment Print
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
### Clear the workspace.
### NOTE(review): rm(list = ls()) and setwd() make scripts non-portable;
### prefer an RStudio project or the `here` package in reusable code.
rm(list = ls())

### Working directory containing apt_train.csv / apt_test.csv
setwd("/Users/yunho/2bigs/w3/ensemble/apt_train_test")

### Packages: install anything missing, then attach everything.
### requireNamespace() checks availability without attaching (the original
### repeated `if(!require(x)) install.packages(...); library(x)` nine times).
pkgs <- c(
  "caret", "e1071", "dplyr", "lime", "mice", "mlbench",
  "caretEnsemble", "gbm",
  "hydroGOF"  # provides mse() used for model comparison below
)
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
 
### Load the train/test data
apt_train <- read.csv('apt_train.csv')
head(apt_train)
str(apt_train)
## BUG FIX: the original line was `apt_train<-apt_train[-1# X열 제거` —
## the closing `]` was missing, so the script did not parse. Dropping the
## first column removes the "X" row-index artifact left by write.csv.
apt_train <- apt_train[-1]  # drop the "X" index column
apt_test <- read.csv('apt_test.csv')
head(apt_test)
str(apt_test)
 
### Missing values: drop any column with more than 20% NA; impute the rest
### with mice (random-forest method).
28663 * 0.2  # apt_train cutoff: 5733 rows (5732.6)
2973 * 0.2   # apt_test cutoff: 595 rows (594.6)
colSums(is.na(apt_train))
colSums(is.na(apt_test))
# Columns over the 20% threshold (dropped below):
# building_coverage_ratio, commute_dmc, commute_seongsu, commute_yongsan,
# commute_chungmuro, floor_area_ratio, floor_max, floor_min, parking_inside,
# parking_outside, parking_rate, permission_date, slope

## High-NA columns shared by train and test
drop_cols <- c(
  "building_coverage_ratio", "commute_dmc", "commute_seongsu",
  "commute_yongsan", "commute_chungmuro", "floor_area_ratio",
  "floor_max", "floor_min", "parking_inside", "parking_outside",
  "parking_rate", "permission_date", "slope"
)

## Train: remove the high-NA columns
apt_train2 <- apt_train[, !names(apt_train) %in% drop_cols]
colSums(is.na(apt_train2))

## Train: impute the remaining NAs (price is excluded from imputation)
mc <- mice(apt_train2[, !names(apt_train2) %in% "price"], method = "rf")
miceOutput <- complete(mc)
apt_train3 <- cbind(miceOutput, price = apt_train2$price)

## Test: remove the same high-NA columns
apt_test2 <- apt_test[, !names(apt_test) %in% drop_cols]
colSums(is.na(apt_test2))

## Test: impute the remaining NAs (no price column here)
mc <- mice(apt_test2, method = "rf")
apt_test3 <- complete(mc)
# find_na(apt_train, rate = TRUE)

# write.csv(apt_train, 'apt_train_mice.csv', row.names = F)
# write.csv(apt_test, 'apt_test_mice.csv', row.names = F)
# apt_train3 <- read.csv('apt_train_mice.csv')
# apt_test3 <- read.csv('apt_test_mice.csv')
 
### Dummy-encode the categorical variables:
### asile_type, earthquake, heat_source, heat_type.
### heat_source has mismatched factor levels between train and test, so the
### "OIL" level is added to the test factor before encoding.
summary(apt_train3$heat_source)
summary(apt_test3$heat_source)
apt_test3$heat_source <- factor(apt_test3$heat_source,
                                levels = c(levels(apt_test3$heat_source), "OIL"))
summary(apt_test3$heat_source)

## Train dummies
asile_type_dm <- data.frame(predict(dummyVars('~ asile_type', apt_train3), apt_train3))
earthquake_dm <- data.frame(predict(dummyVars('~ earthquake', apt_train3), apt_train3))
heat_source_dm <- data.frame(predict(dummyVars('~ heat_source', apt_train3), apt_train3))
heat_type_dm <- data.frame(predict(dummyVars('~ heat_type', apt_train3), apt_train3))

## BUG FIX: the original computed the factor-free frame and then immediately
## overwrote it with cbind(apt_train3, ...), so the raw factor columns stayed
## in apt_train4 next to their dummies; its drop list also used the wrong
## names ('asile_type_dm', 'earthquake_dm'). Drop the raw factor columns
## first, then append the dummy columns to the reduced frame.
apt_train4 <- apt_train3[!colnames(apt_train3) %in%
                           c('asile_type', 'earthquake', 'heat_source', 'heat_type')]
apt_train4 <- cbind(apt_train4, asile_type_dm, earthquake_dm, heat_source_dm, heat_type_dm)

## Test dummies (same treatment)
asile_type_dm <- data.frame(predict(dummyVars('~ asile_type', apt_test3), apt_test3))
earthquake_dm <- data.frame(predict(dummyVars('~ earthquake', apt_test3), apt_test3))
heat_source_dm <- data.frame(predict(dummyVars('~ heat_source', apt_test3), apt_test3))
heat_type_dm <- data.frame(predict(dummyVars('~ heat_type', apt_test3), apt_test3))

apt_test4 <- apt_test3[!colnames(apt_test3) %in%
                         c('asile_type', 'earthquake', 'heat_source', 'heat_type')]
apt_test4 <- cbind(apt_test4, asile_type_dm, earthquake_dm, heat_source_dm, heat_type_dm)
str(apt_train4)
str(apt_test4)
 
### STACKING function
# Strategy: (1) split the training data and compare rf / xgboost / gbm by
# holdout MSE to find the best model family; (2) refit all three on the full
# training data, predict the real test set, and average those predictions;
# (3) fit the winning model family on the test set with the averaged
# prediction as target, and return that model's predictions ("stacking").
# NOTE(review): step (3) trains the meta-model on the *test* data against its
# own averaged predictions — this is not conventional stacking (no held-out
# meta-features) and risks simply reproducing the average; confirm intent.
STACKING = function(data_train,data_test){ # any target can be used in place of price by renaming
  ### Grid search spaces for each learner
  rf.grid = expand.grid(
    .mtry = c(1,3,5)
  )
  xgb.grid = expand.grid(
    nrounds = c(300,500),
    eta = c(0.03,0.05),
    gamma = c(3,5),
    max_depth = c(4,6),
    min_child_weight = c(6,8),
    colsample_bytree = c(0.3,0.5),
    subsample = c(0.2,0.6)
  )
  gbm.grid = expand.grid(
    shrinkage = c(0.1,0.3),
    interaction.depth = c(3,6,9),
    n.minobsinnode = c(5,10,15),
    # NOTE(review): 100 in this sequence looks like a typo for 1000 — confirm
    n.trees = c(500,100,1500)
  )
  
  ### Split the training data 70/30 so the three learners can be compared
  ### on held-out prediction accuracy.
  idx = createDataPartition(data_train$price, p = 0.7, list=F)
  train_model = data_train[idx,]
  test_model = data_train[-idx,]
  
  # 5-fold CV with grid search; shared by every train() call below
  control = trainControl(method='cv', search='grid', number=5,verbose = TRUE)
  rf.model <- train(
    price ~ .,
    data = train_model,
    tuneGrid = rf.grid,
    trControl = control,
    method = 'rf'
  )
  xgb.model <- train(
    price ~ .,
    data = train_model,
    tuneGrid = xgb.grid,
    trControl = control,
    method = 'xgbTree'
  )
  gbm.model <- train(
    price ~ .,
    data = train_model,
    tuneGrid = gbm.grid,
    trControl = control,
    method = 'gbm'
  )
  pred.rf <- predict(rf.model,subset(test_model,select=-c(price))) # rf holdout predictions
  pred.xgb <- predict(xgb.model,subset(test_model,select=-c(price))) # xgb holdout predictions
  pred.gbm <- predict(gbm.model,subset(test_model,select=-c(price))) # gbm holdout predictions
  # Regression target, so confusionMatrix (factors only) does not apply;
  # compare the three learners by MSE instead (hydroGOF::mse(sim, obs)).
  #mser.rf <- mse(pred.rf,subset(test_model,select=c(price)))
  mser.rf<-mse(pred.rf,test_model$price)
  #mser.xgb <- mse(pred.xgb,subset(test_model,select=c(price)))
  mser.xgb<-mse(pred.xgb,test_model$price)
  #mser.gbm <- mse(pred.gbm,subset(test_model,select=c(price)))
  mser.gbm<-mse(pred.gbm,test_model$price)
  
  ### Stacking stage: refit each learner on the FULL training data,
  ### predict the real test set, and average the three predictions.
  rf.model2 <- train(
    price ~ ., 
    data = data_train,
    tuneGrid = rf.grid,
    trControl = control,
    method = 'rf'
  )
  xgb.model2 <- train(
    price ~ .,
    data = data_train,
    tuneGrid = xgb.grid,
    trControl = control,
    method = 'xgbTree'
  )
  gbm.model2 <- train(
    price ~ .,
    data = data_train,
    tuneGrid = gbm.grid,
    trControl = control,
    method = 'gbm'
  )
  pred.rf2 <- predict(rf.model2,data_test)
  pred.xgb2 <- predict(xgb.model2,data_test)
  pred.gbm2 <- predict(gbm.model2,data_test)
  pred.targ=cbind(pred.rf2,pred.xgb2,pred.gbm2)
  avg.targ=rowMeans(pred.targ)              # ensemble average = meta-target
  data_test2=cbind(data_test,avg.targ)
  
  # Fit the meta-model with whichever learner had the lowest holdout MSE
  # (ties resolved in favor of rf, then xgb, by branch order).
  if(min(mser.rf,mser.xgb,mser.gbm)==mser.rf){# rf had the lowest MSE (best holdout performance)
    rf.model3 <- train(
      avg.targ ~ .,
      data = data_test2,
      tuneGrid = rf.grid,
      trControl = control,
      method = 'rf'
    )
    pred = predict(rf.model3,subset(data_test2,select=-c(avg.targ)))
  }
  else if(min(mser.rf,mser.xgb,mser.gbm)==mser.xgb){# xgb had the lowest MSE (best holdout performance)
    xgb.model3 <- train(
      avg.targ ~ .,
      data = data_test2,
      tuneGrid = xgb.grid,
      trControl = control,
      method = 'xgbTree'
    )
    pred = predict(xgb.model3,subset(data_test2,select=-c(avg.targ)))
  }
  else if(min(mser.rf,mser.xgb,mser.gbm)==mser.gbm){# gbm had the lowest MSE (best holdout performance)
    gbm.model3 <- train(
      avg.targ ~ .,
      data = data_test2,
      tuneGrid = gbm.grid,
      trControl = control,
      method = 'gbm'
    )
    pred = predict(gbm.model3,subset(data_test2,select=-c(avg.targ)))
  }
  # Returns a numeric vector of predicted prices for data_test
  return(pred)
}
 
### Produce and save the final predictions
pred <- STACKING(data_train = apt_train4, data_test = apt_test4)
apt_test$price <- pred
str(apt_test)
write.csv(apt_test, 'apt_pred.csv', row.names = FALSE)
cs

Articles

5 6 7 8 9 10 11 12 13 14

나눔글꼴 설치 안내


이 PC에는 나눔글꼴이 설치되어 있지 않습니다.

이 사이트를 나눔글꼴로 보기 위해서는
나눔글꼴을 설치해야 합니다.

설치 취소

Designed by sketchbooks.co.kr / sketchbook5 board skin

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5