# Stepwise selection (both directions) between the intercept-only and full models
data.step<-step(data.con,scope=list(upper=data.lm,lower=data.con),direction="both")
data.step.out<-step(data.con.out,scope=list(upper=data.lm.out,lower=data.con.out),direction="both")
# 95% prediction intervals on the test set, plotted against the observed values
fit_value_2<-predict(data.step.out,data_test[,-8],interval='prediction',level=0.95)
plot(data_test[,7],ylim=c(0,1.2),ylab='ppm',main='Regression Model (outliers removed)')
-----------------------------------------------------------------------------------------------------------------------------------------------
rm(list=ls())
library(ROCR) # 'ROCR' package for drawing ROC curves
# Load the data
adult_train<-read.csv(file="C:/toBigs/6주차/train_adult.csv",stringsAsFactors = F)
adult_test<-read.csv(file="C:/toBigs/6주차/test_adult.csv",stringsAsFactors=F)
# Explore the data
head(adult_train)
dim(adult_train) # 48842 rows / 16 columns
summary(adult_train)
str(adult_train)
# Remove rows containing NA values
aa <- is.na(adult_train)
index<-sapply(1:nrow(adult_train),function(i) any(aa[i,]))
adult_train_2<-adult_train[!index,]
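# (Sketch, not part of the original flow) the same NA filtering can be done in one call with
# complete.cases(); adult_train_cc is a throwaway name used only for this check.
adult_train_cc <- adult_train[complete.cases(adult_train), ]
identical(dim(adult_train_cc), dim(adult_train_2)) # expected TRUE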
summary(adult_train_2)
# Drop fnlwgt, which only carries a sampling weight,
# and education.num, which is judged redundant because it is just a numeric encoding of education
adult_train2<-adult_train_2[,-c(1,2,5,7)]
head(adult_train2)
str(adult_train2)
# Preprocess the data for the decision tree and naive Bayes
## Bin age into 10s, 20s, 30s, 40s, other
adult_train2$age[which(adult_train2$age<20)] <- '10s'
adult_train2$age[which(adult_train2$age>=20&adult_train2$age<30)] <- '20s'
adult_train2$age[which(adult_train2$age>=30&adult_train2$age<40)] <- '30s'
adult_train2$age[which(adult_train2$age>=40&adult_train2$age<50)] <- '40s'
adult_train2$age[which(adult_train2$age>=50)] <- 'other'
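# (Sketch) an equivalent one-step binning with cut(); demo on a throwaway copy, it does not
# modify adult_train2 and assumes the same breakpoints as the recodes above.
age_binned_demo <- cut(adult_train_2$age,
                       breaks = c(-Inf, 20, 30, 40, 50, Inf),
                       labels = c('10s', '20s', '30s', '40s', 'other'),
                       right = FALSE)
table(age_binned_demo)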
## North America accounts for 19,373 rows, the vast majority, so recode native.country as North America vs. everything else
adult_train2$native.country[which(adult_train2$native.country=='Canada')] <- 'North_Ame'
adult_train2$native.country[which(adult_train2$native.country=='United-States')] <- 'North_Ame'
adult_train2$native.country[which(adult_train2$native.country!='North_Ame')] <- 'etc'
## Collapse the 1st-12th grade values of the education column into HS-undergrad
adult_train2$education[which(adult_train2$education=='10th')] <- 'HS-undergrad'
adult_train2$education[which(adult_train2$education=='11th')] <- 'HS-undergrad'
adult_train2$education[which(adult_train2$education=='12th')] <- 'HS-undergrad'
adult_train2$education[which(adult_train2$education=='1st-4th')] <- 'HS-undergrad'
adult_train2$education[which(adult_train2$education=='5th-6th')] <- 'HS-undergrad'
adult_train2$education[which(adult_train2$education=='7th-8th')] <- 'HS-undergrad'
adult_train2$education[which(adult_train2$education=='9th')] <- 'HS-undergrad'
## capital.loss - 20,149 rows are 0, so recode as 0 vs. greater than 0
adult_train2$capital.loss[which(adult_train2$capital.loss>0)]<-'capital.loss>0'
adult_train2$capital.loss[which(adult_train2$capital.loss=='0')]<-'capital.loss=0'
## capital.gain - 19,379 rows are 0, so recode as 0 vs. greater than 0
adult_train2$capital.gain[which(adult_train2$capital.gain>0)]<-'capital.gain>0'
adult_train2$capital.gain[which(adult_train2$capital.gain=='0')]<-'capital.gain=0'
## hours.per.week - split at 40 hours
adult_train2$hours.per.week[which(adult_train2$hours.per.week<=40)]<-'hours<=40'
adult_train2$hours.per.week[which(adult_train2$hours.per.week!='hours<=40')]<-'hours>40'
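# (Sketch) each of the binary recodes above (capital.gain, capital.loss, hours.per.week) follows
# the same pattern and could be written as a single ifelse(); demo on a throwaway copy.
hours_demo <- ifelse(adult_train_2$hours.per.week <= 40, 'hours<=40', 'hours>40')
table(hours_demo)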
## Apply the same preprocessing to the test set
aaa <- is.na(adult_test)
index<-sapply(1:nrow(adult_test),function(i) any(aaa[i,]))
adult_test_2<-adult_test[!index,]
adult_test2<-adult_test_2[,-c(1,2,5,7)]
adult_test2$age[which(adult_test2$age<20)] <- '10s'
adult_test2$age[which(adult_test2$age>=20&adult_test2$age<30)] <- '20s'
adult_test2$age[which(adult_test2$age>=30&adult_test2$age<40)] <- '30s'
adult_test2$age[which(adult_test2$age>=40&adult_test2$age<50)] <- '40s'
adult_test2$age[which(adult_test2$age>=50)] <- 'other'
adult_test2$native.country[which(adult_test2$native.country=='Canada')] <- 'North_Ame'
adult_test2$native.country[which(adult_test2$native.country=='United-States')] <- 'North_Ame'
adult_test2$native.country[which(adult_test2$native.country!='North_Ame')] <- 'etc'
adult_test2$education[which(adult_test2$education=='10th')] <- 'HS-undergrad'
adult_test2$education[which(adult_test2$education=='11th')] <- 'HS-undergrad'
adult_test2$education[which(adult_test2$education=='12th')] <- 'HS-undergrad'
adult_test2$education[which(adult_test2$education=='1st-4th')] <- 'HS-undergrad'
adult_test2$education[which(adult_test2$education=='5th-6th')] <- 'HS-undergrad'
adult_test2$education[which(adult_test2$education=='7th-8th')] <- 'HS-undergrad'
adult_test2$education[which(adult_test2$education=='9th')] <- 'HS-undergrad'
adult_test2$capital.loss[which(adult_test2$capital.loss>0)]<-'capital.loss>0'
adult_test2$capital.loss[which(adult_test2$capital.loss=='0')]<-'capital.loss=0'
adult_test2$capital.gain[which(adult_test2$capital.gain>0)]<-'capital.gain>0'
adult_test2$capital.gain[which(adult_test2$capital.gain=='0')]<-'capital.gain=0'
adult_test2$hours.per.week[which(adult_test2$hours.per.week<=40)]<-'hours<=40'
adult_test2$hours.per.week[which(adult_test2$hours.per.week!='hours<=40')]<-'hours>40'
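# (Sketch) the train/test preprocessing above is duplicated line for line; it could be wrapped in
# a single helper applied to both data frames. preprocess_adult() is a hypothetical name, and the
# body below only covers the age and hours.per.week recodes as an illustration.
preprocess_adult <- function(df) {
  df$age <- as.character(cut(df$age, breaks = c(-Inf, 20, 30, 40, 50, Inf),
                             labels = c('10s', '20s', '30s', '40s', 'other'), right = FALSE))
  df$hours.per.week <- ifelse(df$hours.per.week <= 40, 'hours<=40', 'hours>40')
  df
}
# e.g. preprocess_adult(adult_train_2[, -c(1, 2, 5, 7)]) on the raw (post-NA-removal) data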
# Check the preprocessed data for the decision tree
str(adult_train2)
str(adult_test2)
# Fit the decision tree
library(rpart)
library(rpart.plot)
library(rattle)
library(RColorBrewer)
adult_rp<-rpart(income~., adult_train2)
rattle::fancyRpartPlot(adult_rp)
printcp(adult_rp)
# The optimal complexity parameter (cp) is 0.01
adult_rp$cptable[which.min(adult_rp$cptable[,"xerror"]),1]
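# (Sketch) the tree could be pruned at the cp with the lowest cross-validated error; since that
# cp equals the default 0.01 here, the pruned tree matches the one plotted above.
best_cp <- adult_rp$cptable[which.min(adult_rp$cptable[, "xerror"]), "CP"]
adult_rp_pruned <- prune(adult_rp, cp = best_cp)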
dt_tb<-table(predict(adult_rp,adult_test2[,-13],type='class'),adult_test2[,13])
dt_tb # accuracy 7403/9001 = 0.822 (82%)
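# (Sketch) accuracy computed from the confusion matrix instead of by hand;
# this should reproduce the 0.822 quoted above.
sum(diag(dt_tb)) / sum(dt_tb)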
# Naive Bayes on the same data
# Convert character columns to factors for naive Bayes
adult_train2_nb <- adult_train2
adult_test2_nb <- adult_test2
for(i in 1:13){
  adult_train2_nb[,i] <- as.factor(adult_train2[,i])
  adult_test2_nb[,i] <- as.factor(adult_test2[,i])
}
# Fit naive Bayes
library(e1071)
library(MASS)
adult_nb<-e1071::naiveBayes(income~.,data=adult_train2_nb, laplace = 0)
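# (Sketch) laplace = 0 means no smoothing; a small positive value (e.g. laplace = 1) would avoid
# zero conditional probabilities for factor levels unseen in training. Alternative call only;
# adult_nb_smoothed is a throwaway object not used below.
adult_nb_smoothed <- e1071::naiveBayes(income~., data = adult_train2_nb, laplace = 1)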
adult_nb
adult_nb$apriori
adult_nb$tables
adult_nb$levels
adult_nb$call
summary(adult_nb)
nb_tb<-table(predict(adult_nb,adult_test2[,-13]),adult_test2[,13])
nb_tb # accuracy 7269/9001 = 0.808 (81%)
# Preprocess the data for logistic regression
# Restore the continuous variables to their original values
# Recode the target income as large = 1, small = 0
adult_train3<-adult_train2
adult_test3<-adult_test2
adult_train3$age <-adult_train_2$age
adult_train3$capital.gain <-adult_train_2$capital.gain
adult_train3$capital.loss <-adult_train_2$capital.loss
adult_train3$hours.per.week <-adult_train_2$hours.per.week
adult_train3$income[which(adult_train3$income=='large')] <- 1
adult_train3$income[which(adult_train3$income=='small')] <- 0
adult_train3$income<-as.numeric(adult_train3$income)
adult_test3$age <-adult_test_2$age
adult_test3$capital.gain <-adult_test_2$capital.gain
adult_test3$capital.loss <-adult_test_2$capital.loss
adult_test3$hours.per.week <-adult_test_2$hours.per.week
adult_test3$income[which(adult_test3$income=='large')] <- 1
adult_test3$income[which(adult_test3$income=='small')] <- 0
adult_test3$income<-as.numeric(adult_test3$income)
# Check the data for logistic regression
str(adult_train3)
str(adult_test3)
# Fit the logistic regression
adult.glm<-glm(income~.,data=adult_train3,family='binomial')
adult.con<-glm(income~1,data=adult_train3,family='binomial')
adult.step<-step(adult.con,scope=list(upper=adult.glm,lower=adult.con),direction="both")
summary(adult.step)
result<-predict(adult.step,adult_test3[,-13],type='response')
result<-round(result)
table(result,adult_test3[,13]) # accuracy 7645/9001 = 0.85 (85%)
# Accuracy summary
## decision tree 82%, naive Bayes 81%, logistic regression 85%
x11()
par(mfrow=c(1,3))
# Compare the three models with ROC curves
# ROC curve - decision tree
dt_target<-predict(adult_rp,adult_test2[,-13],type='class')
# Convert the predicted and actual labels to numeric to draw the ROC curve
# small = 0, large = 1
dt_target<-as.character(dt_target)
dt_target[which(dt_target=='small')] <- 0
dt_target[which(dt_target!='0')] <- 1
dt_target<-as.numeric(dt_target)
dt_label<-adult_test2[,13]
dt_label<-as.character(dt_label)
dt_label[which(dt_label=='small')] <- 0
dt_label[which(dt_label!='0')] <- 1
dt_label<-as.numeric(dt_label)
# Draw the ROC curve using the prediction and performance functions
dt_pred<-prediction(dt_target,dt_label)
dt_perf<-performance(dt_pred,'tpr','fpr')
plot(dt_perf,col="green",lwd=2,main="ROC Curve for Decision Tree")
abline(a=0,b=1,lwd=2,lty=2,col="gray")
# Compute the AUC (Area Under the Curve)
dt_auc <- performance(dt_pred,"auc")
dt_auc <- unlist(slot(dt_auc, "y.values"))
dt_auc<-round(dt_auc, digits = 2)
# Add the AUC value to the plot
dt_auct <- paste(c("AUC = "),dt_auc,sep="")
legend(0.3,0.6,dt_auct,border="white",cex=1.7,box.col = "white")
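# (Sketch, alternative to the 0/1 class labels used above) building the curve from the predicted
# probability of the 'large' class traces a smoother ROC; 'large' is assumed to be an income
# level, matching the recoding above. Not part of the original flow.
dt_prob <- predict(adult_rp, adult_test2[, -13], type = 'prob')[, 'large']
dt_perf_prob <- performance(prediction(dt_prob, dt_label), 'tpr', 'fpr')
# plot(dt_perf_prob) would draw this alternative curve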
# ROC curve - naive Bayes
# Same procedure as above
nb_target<-predict(adult_nb,adult_test2[,-13])
nb_target<-as.character(nb_target)
nb_target[which(nb_target=='small')] <- 0
nb_target[which(nb_target!='0')] <- 1
nb_target<-as.numeric(nb_target)
label<-adult_test2[,13]
label<-as.character(label)
label[which(label=='small')] <- 0
label[which(label!='0')] <- 1
label<-as.numeric(label)
nb_pred<-prediction(nb_target,label)
nb_perf<-performance(nb_pred,'tpr','fpr')
plot(nb_perf,col="green",lwd=2,main="ROC Curve for Naive")
abline(a=0,b=1,lwd=2,lty=2,col="gray")
# Compute the AUC
nb_auc <- performance(nb_pred,"auc")
nb_auc <- unlist(slot(nb_auc, "y.values"))
nb_auc<-round(nb_auc, digits = 2)
nb_auct <- paste(c("AUC = "),nb_auc,sep="")
legend(0.3,0.6,nb_auct,border="white",cex=1.7,box.col = "white")
# ROC curve - logistic regression
# The logistic predictions are already coded 0/1, so no conversion is needed
glm_pred<-prediction(result,adult_test3[,13])
glm_perf<-performance(glm_pred,'tpr','fpr')
plot(glm_perf,col="green",lwd=2,main="ROC Curve for Logistic")
abline(a=0,b=1,lwd=2,lty=2,col="gray")
# Compute the AUC
glm_auc <- performance(glm_pred,"auc")
glm_auc <- unlist(slot(glm_auc, "y.values"))
glm_auc<-round(glm_auc, digits = 2)
glm_auct <- paste(c("AUC = "),glm_auc,sep="")
legend(0.3,0.6,glm_auct,border="white",cex=1.7,box.col = "white")
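# (Sketch) the logistic ROC curve above is built from the rounded 0/1 predictions; using the raw
# predicted probabilities instead traces the full curve. Alternative only, not the original flow.
glm_prob <- predict(adult.step, adult_test3[, -13], type = 'response')
glm_perf_prob <- performance(prediction(glm_prob, adult_test3[, 13]), 'tpr', 'fpr')
# plot(glm_perf_prob) for comparison with the curve above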