투빅스 9&10기 1주차 Logistic Regression - 10기 박성진

by chad4545 posted Jul 25, 2018
?

단축키

Prev이전 문서

Next다음 문서

ESC닫기

+ - Up Down Comment Print
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
 
###  Problem 2-1   ###
# Split mlbench's BreastCancer data 7:3 into train/test sets and
# fit a logistic regression to predict the class.
# For a stable estimate, repeat the random split 50 times and
# report the mean Accuracy.
 
 
Sys.setlocale(category = "LC_CTYPE", locale = "ko_KR.UTF-8")
# NOTE(review): rm(list=ls()) and setwd() in scripts are discouraged;
# kept as-is to preserve the original script's behavior.
rm(list=ls())
getwd()
setwd("/Users/sungjinpark/Desktop/투빅스/week1/logistic")
#install.packages("mlbench")
library(mice) # missing-value imputation
library(caTools) # data splitting
library(caret) # data splitting / createDataPartition, upSample
library(e1071)
library(mlbench)
library(ROCR)
data(BreastCancer)
str(BreastCancer)
head(BreastCancer)
?BreastCancer
# Column reference for BreastCancer:
# [,1]    Id    Sample code number
# [,2]    Cl.thickness    Clump Thickness
# [,3]    Cell.size    Uniformity of Cell Size
# [,4]    Cell.shape    Uniformity of Cell Shape
# [,5]    Marg.adhesion    Marginal Adhesion
# [,6]    Epith.c.size    Single Epithelial Cell Size
# [,7]    Bare.nuclei    Bare Nuclei
# [,8]    Bl.cromatin    Bland Chromatin
# [,9]    Normal.nucleoli    Normal Nucleoli
# [,10]    Mitoses    Mitoses
# [,11]    Class    Class
 
# 1. Data preprocessing
# 1-1. Check the levels of the response variable
levels(BreastCancer$Class) # two levels: "benign", "malignant"
 
# 1-2. Check for missing values
summary(BreastCancer) # 16 NAs found in the Bare.nuclei variable
mean(is.na(BreastCancer)) # a very small fraction of all cells, but still worth imputing
 
# 1-3. Impute missing values (mice) and drop the Id column:
# impute using predictors only (columns 2:10), then rebind the Class column
dataset_impute <- mice(BreastCancer[,2:10],  print = FALSE)
BreastCancer <- cbind(BreastCancer[,11, drop = FALSE], mice::complete(dataset_impute, 1))
summary(BreastCancer) 
str(BreastCancer)
 
# 1-4. Recode the response into a 0/1 factor
# (malignant -> 1, benign -> 0).
# Fixed: the "]" closing brackets were lost in the page extraction,
# which made these lines syntax errors.
BreastCancer$Class <- as.character(BreastCancer$Class)
BreastCancer$Class[BreastCancer$Class=="malignant"] <- 1
BreastCancer$Class[BreastCancer$Class=="benign"] <- 0
BreastCancer$Class <- as.factor(BreastCancer$Class)
str(BreastCancer$Class)
 
# 1-4. Explore and preprocess the predictors
# Convert the ordered-factor predictors (columns 2:6) to numeric (a judgment call)
BreastCancer[, c(2:6)] <- sapply(BreastCancer[, c(2:6)], as.character)
BreastCancer[, c(2:6)] <- sapply(BreastCancer[, c(2:6)], as.numeric)
#str(BreastCancer)
# Chi-squared independence tests between the categorical predictors.
# Result: all pairs are dependent on each other.
chisq.test(BreastCancer$Bare.nuclei,BreastCancer$Normal.nucleoli)
chisq.test(BreastCancer$Bare.nuclei,BreastCancer$Mitoses)
chisq.test(BreastCancer$Normal.nucleoli,BreastCancer$Mitoses)
chisq.test(BreastCancer$Bl.cromatin,BreastCancer$Bare.nuclei)
chisq.test(BreastCancer$Bl.cromatin,BreastCancer$Normal.nucleoli)
chisq.test(BreastCancer$Bl.cromatin,BreastCancer$Mitoses)
 
# Look at the distribution of each categorical predictor
plot(BreastCancer$Bare.nuclei)
plot(BreastCancer$Bl.cromatin)
plot(BreastCancer$Normal.nucleoli)
plot(BreastCancer$Mitoses)
 
# The categorical predictors are mutually dependent.
# With that in mind, first fit the model keeping them as factors.
 
fit.full <- glm(Class~., data = BreastCancer, family = binomial(link = 'logit'))
summary(fit.full)
 
# Each categorical variable has 10 levels (40 dummy columns in total),
# which makes the fit unstable.
# Decision: convert the remaining factor predictors to numeric as well,
# so that all 9 explanatory variables are numeric.
 
BreastCancer[, c(7:10)] <- sapply(BreastCancer[, c(7:10)], as.character)
BreastCancer[, c(7:10)] <- sapply(BreastCancer[, c(7:10)], as.numeric)
str(BreastCancer)
 
# Rename the columns (response first, then X1..X9)
names(BreastCancer) <- c("Y","X1","X2","X3","X4","X5","X6","X7","X8","X9")
str(BreastCancer)
# Check the response distribution
table(BreastCancer$Y)  
 
#  0   1    -> suspected class imbalance; handle later with upsampling
# 458 241 
 
 
 
# 2. Model fitting
 
# 2.1  GLM on all predictors
fit.full <- glm(Y~., data = BreastCancer, family = binomial(link = 'logit'))
summary(fit.full)
 
# AIC: 142.94
# X1, X6, X7, X4 etc. are significant predictors.
 
# Large gap between null and residual deviance -> keep going with this model
# 900.53  vs 122.94
 
# 2.2 Stepwise selection
# Removes X2 and X5; resulting AIC = 139.27
reduced.model=step(fit.full) 
summary(reduced.model)
 
# 2.3 Confusion matrix for the selected model (X2, X5 removed).
# Threshold the predicted probabilities at 0.5.
# Fixed: a ")" was lost in the page extraction on the predict() call,
# which made the line a syntax error.
pred.glm1 = as.numeric(predict(reduced.model, BreastCancer, type = "response") > 0.5)
confusionMatrix(as.factor(pred.glm1),as.factor(BreastCancer$Y))
table(pred.glm1)
 
 
# High accuracy, but the class distribution is imbalanced -> upsample later
# Accuracy : 0.9671
#             Reference
# Prediction   0   1
#           0 447  12
#           1  11 229
# Sensitivity : 0.9760        
# Specificity : 0.9502
 
# 2.4 Upsampling: balance the classes by resampling the minority class,
# then refit the reduced model (without X2, X5) on the balanced data.
 
table(BreastCancer$Y)
BC_up = upSample(subset(BreastCancer, select=-Y), BreastCancer$Y)
summary(BC_up)
table(BC_up$Class)
 
model.Ups = glm(Class~.-X2 -X5,BC_up, family = binomial)
# Fixed: a ")" was lost in the page extraction on the predict() call;
# threshold the predicted probabilities at 0.5.
pred.Ups = as.numeric(predict(model.Ups, BC_up, type = "response") > 0.5)
confusionMatrix(as.factor(pred.Ups),as.factor(BC_up$Class))
table(pred.Ups)
#         Accuracy : 0.694
 
#             Reference
# Prediction   0   1
#             0 445  15
#             1  13 443
#       Sensitivity : 0.9716          
#       Specificity : 0.9672
 
 
# 3. Repeated train/test evaluation
# Split the data 7:3, upsample the training set, fit the reduced
# glm (X2, X5 removed), and accumulate test-set accuracy.
# Repeat the random split 50 times and average.
# Fixed: the original computed `ac <- confusionMatrix(...)` inside the
# loop 50 times without ever using it; that wasted work is removed.
overall <- 0
for (i in 1:50) {
  index <- createDataPartition(BreastCancer$Y, p = 0.7, list = F)
  BreastCancertrain <- BreastCancer[index,]
  BreastCancertest <- BreastCancer[-index,]
  
  # balance classes on the training set only (test set stays untouched)
  train_up = upSample(subset(BreastCancertrain, select=-Y), BreastCancertrain$Y)
  
  model.glm1 <- glm(Class~. -X2 -X5 ,train_up, family = binomial)
  pred.glm1 <- as.numeric(predict(model.glm1, BreastCancertest, type = "response")>0.5)
  overall_accuracy <- sum(BreastCancertest$Y == pred.glm1) / length(BreastCancertest$Y)
  overall <- overall + overall_accuracy
}
 
table(pred.glm1)
# mean accuracy over the 50 splits
(overall / 50)
# 0.9630
 
 
 
 
 
###  Problem 2-2 ###
# Load the psub.Rdata data and split it 7:3 into train/test sets.
# Using only the AGEP, SEX, COW, PINCP, SCHL variables, create a
# binary variable `bachdeg` (holds a bachelor's degree or higher vs. not),
# fit a logistic regression on the remaining variables (SCHL excluded),
# predict, and build a confusion matrix.
 
# COW: class of worker,      SCHL: level of education, 
# PINCP: personal income,    AGEP : age
 
Sys.setlocale(category = "LC_CTYPE", locale = "ko_KR.UTF-8")
# NOTE(review): rm(list=ls()) and setwd() in scripts are discouraged;
# kept as-is to preserve the original script's behavior.
rm(list=ls())
getwd()
setwd("/Users/sungjinpark/Desktop/투빅스/week1/logistic")
 
 
 
# 1. Load the data
load("/Users/sungjinpark/Desktop/투빅스/week1/logistic/psub.RData")
 
str(psub)
# 2. Select the variables of interest.
# Fixed: the commas inside c() were lost in the page extraction
# (c("AGEP""SEX"...) is a syntax error).
selected <- c("AGEP", "SEX", "COW", "PINCP", "SCHL")
psub.se <- psub[,selected]
str(psub.se)
levels(psub.se$COW) # check levels - class of worker
# [1] "Employee of a private for-profit"
# [2] "Federal government employee"     
# [3] "Local government employee"       
# [4] "Private not-for-profit employee" 
# [5] "Self-employed incorporated"      
# [6] "Self-employed not incorporated"  
# [7] "State government employee"
 
 
levels(psub.se$SCHL) # check levels - education
# ---------------------------------#
# "no high school diploma"
# "Regular high school diploma"        below bachelor's degree
# "some college credit, no degree"  
# "GED or alternative credential"
# ---------------------------------#
# "Associate's degree"            
# "Bachelor's degree"
# "Doctorate degree"            bachelor's degree or higher
# "Master's degree"               
# "Professional degree"
# ---------------------------------#
 
# Recode SCHL into a bachelor's-degree-or-higher indicator (1/0).
# Fixed: the "]" closing brackets were lost in the page extraction,
# which made every assignment below a syntax error.
# NOTE(review): the author groups "Associate's degree" with the
# degree-holder class (1) even though it is technically below a
# bachelor's degree; behavior kept as the author intended.
psub.se$SCHL <- as.character(psub.se$SCHL)
psub.se$SCHL[psub.se$SCHL=="no high school diploma"] <- 0
psub.se$SCHL[psub.se$SCHL=="GED or alternative credential"] <- 0
psub.se$SCHL[psub.se$SCHL=="Regular high school diploma"] <- 0
psub.se$SCHL[psub.se$SCHL=="some college credit, no degree"] <- 0
psub.se$SCHL[psub.se$SCHL=="Associate's degree"] <- 1
psub.se$SCHL[psub.se$SCHL=="Bachelor's degree"] <- 1
psub.se$SCHL[psub.se$SCHL=="Doctorate degree"] <- 1
psub.se$SCHL[psub.se$SCHL=="Master's degree"] <- 1
psub.se$SCHL[psub.se$SCHL=="Professional degree"] <- 1
psub.se$SCHL <- as.numeric(psub.se$SCHL)
str(psub.se)
 
# Create bachdeg from SCHL, then drop SCHL.
# Fixed: the original `psub.se["bachdeg"<- NA` lost its "]" in the page
# extraction (syntax error); the NA pre-initialization was also redundant
# since the column is immediately overwritten, so a single assignment suffices.
psub.se$bachdeg <- psub.se$SCHL
psub.new <- subset(psub.se, select = -c(SCHL))
str(psub.new)
hist(psub.new$bachdeg)  # classes look reasonably balanced; no upsampling needed
 
# Fit the logistic regression.
# The problem statement fixes the predictor set, so no additional
# variable selection is performed.
fit.full <- glm(bachdeg~., data = psub.new, family = binomial(link = 'logit'))
summary(fit.full)
 
# Null deviance: 1666.6  on 1223  degrees of freedom
# Residual deviance: 1348.6  on 1214  degrees of freedom
# AIC: 1368.6
 
 
 
# Split into train/test sets (7:3, stratified on the response)
library(caret)
library(e1071)
index <- createDataPartition(psub.new$bachdeg, p = 0.7, list = F)
train <- psub.new[index,]
test <- psub.new[-index,]
 
# Fit on the training set and build the confusion matrix on the test set.
# Fixed: a ")" was lost in the page extraction on the predict() call;
# threshold the predicted probabilities at 0.5.
 
model.glm1 <- glm(bachdeg~., train, family = binomial)
pred.glm1 <- as.numeric(predict(model.glm1, test, type = "response") > 0.5)
confusionMatrix(as.factor(pred.glm1), as.factor(test$bachdeg))
table(pred.glm1)
 
# Accuracy : 0.7439
 
#             Reference
# Prediction   0   1
#           0 177  70
#           1  24  96
#          
#         
# Sensitivity : 0.8806         
# Specificity : 0.5783
 
 
 
 
 
# (removed stray "cs" token left behind by the code-viewer extraction)

Articles

5 6 7 8 9 10 11 12 13 14

나눔글꼴 설치 안내


이 PC에는 나눔글꼴이 설치되어 있지 않습니다.

이 사이트를 나눔글꼴로 보기 위해서는
나눔글꼴을 설치해야 합니다.

설치 취소

Designed by sketchbooks.co.kr / sketchbook5 board skin

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5

Sketchbook5, 스케치북5