### Problem 2-1 ###
# Split the BreastCancer data from the mlbench package 7:3 into train/test
# sets and fit a logistic regression for prediction.
# For a reliable result, repeat the random split 50 times and report the
# mean Accuracy.

Sys.setlocale(category = "LC_CTYPE", locale = "ko_KR.UTF-8")
rm(list = ls())
getwd()
setwd("/Users/sungjinpark/Desktop/투빅스/week1/logistic")

# install.packages("mlbench")
library(mice)    # missing-value imputation
library(caTools) # data splitting
library(caret)   # data splitting
library(e1071)
library(mlbench)
library(ROCR)

data(BreastCancer)
str(BreastCancer)
head(BreastCancer)
?BreastCancer
# [,1]  Id              Sample code number
# [,2]  Cl.thickness    Clump Thickness
# [,3]  Cell.size       Uniformity of Cell Size
# [,4]  Cell.shape      Uniformity of Cell Shape
# [,5]  Marg.adhesion   Marginal Adhesion
# [,6]  Epith.c.size    Single Epithelial Cell Size
# [,7]  Bare.nuclei     Bare Nuclei
# [,8]  Bl.cromatin     Bland Chromatin
# [,9]  Normal.nucleoli Normal Nucleoli
# [,10] Mitoses         Mitoses
# [,11] Class           Class

# 1. Preprocessing
# 1-1. Check the levels of the response
levels(BreastCancer$Class) # two levels: "benign", "malignant"

# 1-2. Check for missing values
summary(BreastCancer)     # 16 NAs found in the Bare.nuclei variable
mean(is.na(BreastCancer)) # about 0.2% of all values -- a small fraction, but still worth handling

# 1-3. Impute missing values and drop the Id variable
dataset_impute <- mice(BreastCancer[, 2:10], print = FALSE)
BreastCancer <- cbind(BreastCancer[, 11, drop = FALSE],
                      mice::complete(dataset_impute, 1))
summary(BreastCancer)
str(BreastCancer)

# 1-4. Recode the response (malignant = 1, benign = 0)
BreastCancer$Class <- as.character(BreastCancer$Class)
BreastCancer$Class[BreastCancer$Class == "malignant"] <- 1
BreastCancer$Class[BreastCancer$Class == "benign"] <- 0
BreastCancer$Class <- as.factor(BreastCancer$Class)
str(BreastCancer$Class)

# 1-5. Explore and preprocess the predictors
# Convert the ordinal predictors to numeric (a subjective choice)
BreastCancer[, c(2:6)] <- sapply(BreastCancer[, c(2:6)], as.character)
BreastCancer[, c(2:6)] <- sapply(BreastCancer[, c(2:6)], as.numeric)
# str(BreastCancer)

# Chi-square tests of independence among the remaining factor predictors
# Every pair turns out to be dependent
chisq.test(BreastCancer$Bare.nuclei, BreastCancer$Normal.nucleoli)
chisq.test(BreastCancer$Bare.nuclei, BreastCancer$Mitoses)
chisq.test(BreastCancer$Normal.nucleoli, BreastCancer$Mitoses)
chisq.test(BreastCancer$Bl.cromatin, BreastCancer$Bare.nuclei)
chisq.test(BreastCancer$Bl.cromatin, BreastCancer$Normal.nucleoli)
chisq.test(BreastCancer$Bl.cromatin, BreastCancer$Mitoses)

# Check the distributions of the factor predictors
plot(BreastCancer$Bare.nuclei)
plot(BreastCancer$Bl.cromatin)
plot(BreastCancer$Normal.nucleoli)
plot(BreastCancer$Mitoses)

# The factor predictors are clearly associated with one another.
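# Optional sketch (not part of the original workflow): the chi-square tests
# above only give p-values, so a rough effect size helps to see how strong
# the pairwise associations really are. Cramer's V is computed here in plain
# base R from the same contingency tables; no extra packages are assumed.
cramers_v <- function(x, y) {
  tab <- table(x, y)
  chi <- suppressWarnings(chisq.test(tab)$statistic) # warnings: small expected counts
  sqrt(as.numeric(chi) / (sum(tab) * (min(dim(tab)) - 1)))
}
factor_cols <- c("Bare.nuclei", "Bl.cromatin", "Normal.nucleoli", "Mitoses")
fac_pairs <- combn(factor_cols, 2)
data.frame(var1 = fac_pairs[1, ],
           var2 = fac_pairs[2, ],
           cramers_v = apply(fac_pairs, 2, function(p)
             round(cramers_v(BreastCancer[[p[1]]], BreastCancer[[p[2]]]), 3)))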
# With that in mind, first fit the model keeping the factor predictors as they are.
fit.full <- glm(Class ~ ., data = BreastCancer, family = binomial(link = 'logit'))
summary(fit.full)
# Each remaining factor has 10 levels (roughly 40 dummy terms in total),
# which strains the fit. Decided to convert the factor predictors to numeric
# across the board.

# Convert the remaining factors so that all 9 predictors are numeric
BreastCancer[, c(7:10)] <- sapply(BreastCancer[, c(7:10)], as.character)
BreastCancer[, c(7:10)] <- sapply(BreastCancer[, c(7:10)], as.numeric)
str(BreastCancer)

# Rename the variables
names(BreastCancer) <- c("Y", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9")
str(BreastCancer)

# Check the response
table(BreastCancer$Y)
#   0   1   -> possible class imbalance; upsample later
# 458 241

# 2. Model fitting
# 2.1 GLM
fit.full <- glm(Y ~ ., data = BreastCancer, family = binomial(link = 'logit'))
summary(fit.full)
# AIC: 142.94
# X1, X6, X7, X4, etc. are significant
# Large gap between the null and residual deviance -> keep going with this fit
# 900.53 vs 122.94

# 2.2 Stepwise selection
# After dropping X2 and X5: AIC = 139.27
reduced.model <- step(fit.full)
summary(reduced.model)

# 2.3 Confusion matrix for the reduced model (X2, X5 removed)
pred.glm1 <- as.numeric(predict(reduced.model, BreastCancer, type = "response") > 0.5)
confusionMatrix(as.factor(pred.glm1), as.factor(BreastCancer$Y))
table(pred.glm1)
# High accuracy, but the class ratio is imbalanced -> upsampling needed
# Accuracy : 0.9671
#           Reference
# Prediction   0   1
#          0 447  12
#          1  11 229
# Sensitivity : 0.9760
# Specificity : 0.9502

# 2.4 Upsampling
table(BreastCancer$Y)
BC_up <- upSample(subset(BreastCancer, select = -Y), BreastCancer$Y)
summary(BC_up)
table(BC_up$Class)
model.Ups <- glm(Class ~ . - X2 - X5, BC_up, family = binomial)
pred.Ups <- as.numeric(predict(model.Ups, BC_up, type = "response") > 0.5)
confusionMatrix(as.factor(pred.Ups), as.factor(BC_up$Class))
table(pred.Ups)
# Accuracy : 0.9694
#           Reference
# Prediction   0   1
#          0 445  15
#          1  13 443
# Sensitivity : 0.9716
# Specificity : 0.9672

# 3. Train/test splitting
# Split the data into train/test sets, upsample the training set,
# and repeat with 50 random splits (X2 and X5 removed, then upsampling)
overall <- 0
for (i in 1:50) {
  index <- createDataPartition(BreastCancer$Y, p = 0.7, list = F)
  BreastCancertrain <- BreastCancer[index, ]
  BreastCancertest  <- BreastCancer[-index, ]
  train_up <- upSample(subset(BreastCancertrain, select = -Y), BreastCancertrain$Y)
  model.glm1 <- glm(Class ~ . - X2 - X5, train_up, family = binomial)
  pred.glm1 <- as.numeric(predict(model.glm1, BreastCancertest, type = "response") > 0.5)
  ac <- confusionMatrix(as.factor(pred.glm1), as.factor(BreastCancertest$Y)) # per-split confusion matrix
  overall_accuracy <- sum(BreastCancertest$Y == pred.glm1) / length(BreastCancertest$Y)
  overall <- overall + overall_accuracy
}
table(pred.glm1)
overall / 50 # mean accuracy over the 50 random splits: about 0.9630
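# Optional sketch, not part of the assignment: ROCR is loaded above but never
# used. Using the objects left over from the last loop iteration (model.glm1,
# BreastCancertest), an ROC curve and AUC give a threshold-free look at the
# same model.
prob.last <- predict(model.glm1, BreastCancertest, type = "response")
rocr.pred <- prediction(prob.last, BreastCancertest$Y)
plot(performance(rocr.pred, "tpr", "fpr"))  # ROC curve on the last test split
performance(rocr.pred, "auc")@y.values[[1]] # AUC on the last test split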

### Problem 2-2 ###
# Load psub.RData and split it 7:3 into train/test sets.
# Using only the AGEP, SEX, COW, PINCP and SCHL variables, create a two-level
# variable bachdeg (bachelor's degree or higher vs. not), fit a logistic
# regression on the remaining variables excluding SCHL, predict, and build a
# confusion matrix.
# COW: class of worker, SCHL: level of education,
# PINCP: personal income, AGEP: age

Sys.setlocale(category = "LC_CTYPE", locale = "ko_KR.UTF-8")
rm(list = ls())
getwd()
setwd("/Users/sungjinpark/Desktop/투빅스/week1/logistic")

# 1. Load the data
load("/Users/sungjinpark/Desktop/투빅스/week1/logistic/psub.RData")
str(psub)

# 2. Select the variables
selected <- c("AGEP", "SEX", "COW", "PINCP", "SCHL")
psub.se <- psub[, selected]
str(psub.se)

levels(psub.se$COW)  # check the levels - class of worker
# [1] "Employee of a private for-profit"
# [2] "Federal government employee"
# [3] "Local government employee"
# [4] "Private not-for-profit employee"
# [5] "Self-employed incorporated"
# [6] "Self-employed not incorporated"
# [7] "State government employee"

levels(psub.se$SCHL) # check the levels - education
# ---------------------------------#
# "no high school diploma"
# "Regular high school diploma"      below a bachelor's degree -> 0
# "some college credit, no degree"
# "GED or alternative credential"
# ---------------------------------#
# "Associate's degree"
# "Bachelor's degree"
# "Doctorate degree"                 bachelor's degree or higher -> 1
# "Master's degree"                  (Associate's degree is included in this group)
# "Professional degree"
# ---------------------------------#

# Recode: holds a bachelor's degree or higher?
psub.se$SCHL <- as.character(psub.se$SCHL)
psub.se$SCHL[psub.se$SCHL == "no high school diploma"] <- 0
psub.se$SCHL[psub.se$SCHL == "GED or alternative credential"] <- 0
psub.se$SCHL[psub.se$SCHL == "Regular high school diploma"] <- 0
psub.se$SCHL[psub.se$SCHL == "some college credit, no degree"] <- 0
psub.se$SCHL[psub.se$SCHL == "Associate's degree"] <- 1
psub.se$SCHL[psub.se$SCHL == "Bachelor's degree"] <- 1
psub.se$SCHL[psub.se$SCHL == "Doctorate degree"] <- 1
psub.se$SCHL[psub.se$SCHL == "Master's degree"] <- 1
psub.se$SCHL[psub.se$SCHL == "Professional degree"] <- 1
psub.se$SCHL <- as.numeric(psub.se$SCHL)
str(psub.se)

# Create bachdeg, then drop SCHL
psub.se$bachdeg <- psub.se$SCHL
psub.new <- subset(psub.se, select = -c(SCHL))
str(psub.new)
hist(psub.new$bachdeg) # no serious class imbalance

# Fit the logistic regression
# The problem statement fixes the predictors, so no further variable
# selection is done.
fit.full <- glm(bachdeg ~ ., data = psub.new, family = binomial(link = 'logit'))
summary(fit.full)
# Null deviance:     1666.6 on 1223 degrees of freedom
# Residual deviance: 1348.6 on 1214 degrees of freedom
# AIC: 1368.6

# Split into train/test sets
library(caret)
library(e1071)
index <- createDataPartition(psub.new$bachdeg, p = 0.7, list = F)
train <- psub.new[index, ]
test  <- psub.new[-index, ]

# Build the confusion matrix
model.glm1 <- glm(bachdeg ~ ., train, family = binomial)
pred.glm1 <- as.numeric(predict(model.glm1, test, type = "response") > 0.5)
confusionMatrix(as.factor(pred.glm1), as.factor(test$bachdeg))
table(pred.glm1)
# Accuracy : 0.7439
#           Reference
# Prediction   0   1
#          0 177  70
#          1  24  96
# Sensitivity : 0.8806
# Specificity : 0.5783
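# Optional sketch, not part of the assignment: specificity is fairly low at
# the default 0.5 cutoff (0.5783 above), so it may be worth sweeping a few
# cutoffs on the test set to see the accuracy / sensitivity / specificity
# trade-off. Sensitivity and specificity below follow caret's convention with
# "0" as the positive class, matching the confusion matrix above.
prob.test <- predict(model.glm1, test, type = "response")
sapply(seq(0.3, 0.7, by = 0.1), function(ct) {
  pred <- as.numeric(prob.test > ct)
  c(cutoff      = ct,
    accuracy    = mean(pred == test$bachdeg),
    sensitivity = mean(pred[test$bachdeg == 0] == 0), # recall on class 0
    specificity = mean(pred[test$bachdeg == 1] == 1)) # recall on class 1
})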