# Week02 Assignment: Naive Bayes code.
# * This is Python code, not R.
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# Read the training data from the assignment directory.
path = "~/NaiveBayes/"
file = pd.read_csv(path + 'train.csv')

# First look at the data: column summary, leading rows, and the number of
# missing values per column. The last two expressions only show output when
# evaluated interactively (e.g. as the last line of a notebook cell).
file.info()
file.head()
file.isna().sum()

# Let pandas display up to 50 columns before truncating wide frames.
pd.set_option("display.max_columns", 50)
# Annotated heatmap of the pairwise correlations between Survived and the
# numeric columns SibSp, Parch, Age and Fare.
cmap = sns.cubehelix_palette(as_cmap=True, light=1, dark=0.3)
graph = sns.heatmap(
    file[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(),
    cmap=cmap,
    fmt="0.2f",
    annot=True,
)
graph
plt.show()
# Overlay the Age density of rows with Survived == 0 (red) and Survived == 1
# (blue) on the same axes, using only rows where Age is present.
# NOTE(review): newer seaborn releases deprecate `shade=` in favour of
# `fill=` - confirm against the installed version before switching.
graph = sns.kdeplot(
    file.loc[(file["Survived"] == 0) & file["Age"].notnull(), "Age"],
    color="Red",
    shade=True,
)
graph = sns.kdeplot(
    file.loc[(file["Survived"] == 1) & file["Age"].notnull(), "Age"],
    ax=graph,
    color="Blue",
    shade=True,
)
graph.set_xlabel("Age")
graph.set_ylabel("Frequency")
# From here on `graph` refers to the legend object, not the axes.
graph = graph.legend(["Not Survived", "Survived"])
graph
plt.show()
# Plot the distribution of the Fare column.
# NOTE(review): `distplot` is deprecated in newer seaborn releases - confirm
# against the installed version before replacing it.
graph = sns.distplot(file["Fare"], color="orange")
graph
plt.show()

# The distribution is heavily skewed, so a log transform looks preferable.
# Values that are not strictly positive (including missing values, which fail
# the `> 0` comparison) are mapped to 0 instead of being passed to np.log.
file["Fare"] = file["Fare"].map(lambda fare: np.log(fare) if fare > 0 else 0)
# Mean of Survived per Sex and per Pclass (only displayed when run
# interactively).
file[["Sex", "Survived"]].groupby("Sex").mean()
file[["Pclass", "Survived"]].groupby("Pclass").mean()

# Inspect the rows whose Embarked value is missing, together with the mean
# Fare and the Pclass summary of each Embarked group, to pick a fill value.
file[file["Embarked"].isnull()]
file["Fare"].groupby(file["Embarked"]).mean()
file["Pclass"].groupby(file["Embarked"]).describe()

# Fill missing Embarked values with 'C'. The result is assigned back to the
# frame instead of calling fillna(inplace=True) on the selected column:
# the in-place call operates on an intermediate object, which pandas
# deprecates and which leaves `file` unchanged under Copy-on-Write.
file["Embarked"] = file["Embarked"].fillna("C")
# Bar chart of Survived against Embarked.
graph = sns.catplot(
    data=file,
    x="Embarked",
    y="Survived",
    kind="bar",
    height=6,
    palette="coolwarm",
)
graph = graph.set_ylabels("survival probability")
graph
plt.show()
# Passengers who boarded at port C clearly appear to have a higher survival
# probability.
# Replace the string categories with integer codes.
file["Sex"] = file["Sex"].map({"male": 0, "female": 1})
# astype(int) requires every Embarked value to be one of S / C / Q; any other
# value would map to NaN and make the cast fail.
file["Embarked"] = file["Embarked"].map({"S": 0, "C": 1, "Q": 2}).astype(int)
# Extract the title from Name: the run of letters that follows a space and is
# terminated by a period. A raw string keeps `\.` intact for the regex engine
# and avoids Python's invalid-escape-sequence warning.
file["Title"] = file["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
file["Title"].value_counts()
file["Title"].isnull().sum()

# Collapse the rare titles into a single 'etc' bucket.
file["Title"] = file["Title"].replace(
    ["Dr", "Rev", "Major", "Col", "Don", "Jonkheer", "Capt", "Countess", "Lady", "Dona"],
    "etc",
)

# Integer code per title group: 0 = Mr/Sir, 1 = Miss/Mrs/Mlle/Ms/Mme,
# 2 = Master, 3 = everything else.
title_mapping = {
    "Mr": 0,
    "Miss": 1,
    "Mrs": 1,
    "Master": 2,
    "Mlle": 1,
    "Ms": 1,
    "Sir": 0,
    "Mme": 1,
    "etc": 3,
}
# A title missing from the mapping (or a name with no title at all) maps to
# NaN, which astype(int) cannot convert; send those rows to the 'etc' code.
file["Title"] = (
    file["Title"].map(title_mapping).fillna(title_mapping["etc"]).astype(int)
)
# Impute missing ages. For each row without an Age, use the median Age of the
# rows sharing its SibSp, Parch and Pclass values; when that group has no
# known age at all, fall back to the overall median.
# Rows are addressed by index label throughout (assumes unique labels, as
# produced by read_csv's default index).
index = list(file.index[file["Age"].isnull()])
for i in index:
    # Recomputed on every pass on purpose: ages filled earlier in the loop
    # take part in the fallback median, matching a sequential fill.
    age_median = file["Age"].median()
    age_pred = file.loc[
        (file["SibSp"] == file.loc[i, "SibSp"])
        & (file["Parch"] == file.loc[i, "Parch"])
        & (file["Pclass"] == file.loc[i, "Pclass"]),
        "Age",
    ].median()
    # Write through file.loc rather than file['Age'].iloc[i]: the chained
    # form may assign into a temporary object and leave `file` untouched.
    file.loc[i, "Age"] = age_median if np.isnan(age_pred) else age_pred
# Combine SibSp and Parch into a single family-size feature and plot
# Survived against it.
file["nb_family"] = file["SibSp"] + file["Parch"]
graph = sns.catplot(data=file, x="nb_family", y="Survived", kind="point")
graph = graph.set_ylabels("Survival Probability")
graph
plt.show()

# Drop the columns that are not used as features from here on, including the
# two that were just folded into nb_family.
file.drop(
    columns=["PassengerId", "Name", "Ticket", "SibSp", "Parch"],
    inplace=True,
)
# Keep only the first character of Cabin; rows without a cabin get the
# placeholder letter "N".
file["Cabin"] = file["Cabin"].str.slice(stop=1).fillna("N")
file["Cabin"].value_counts()

# Bar chart of Survived per cabin letter, with the placeholder "N" last.
graph = sns.catplot(
    data=file,
    x="Cabin",
    y="Survived",
    kind="bar",
    order=["A", "B", "C", "D", "E", "F", "G", "T", "N"],
    palette="coolwarm",
)
graph = graph.set_ylabels("Survival Probability")
graph
plt.show()

# Replace each cabin letter with a fixed numeric code between 0 and 1.
file["Cabin"] = file["Cabin"].map(
    {
        "A": 0,
        "B": 0.1,
        "C": 0.2,
        "D": 0.3,
        "E": 0.4,
        "F": 0.5,
        "G": 0.6,
        "T": 0.7,
        "N": 1.0,
    }
)
# Cut Age into six quantile-based bins labelled A1..A6.
bins = pd.qcut(
    file["Age"],
    q=6,
    precision=2,
    labels=["A1", "A2", "A3", "A4", "A5", "A6"],
)
# Store the bins as evenly spaced values from 0.0 to 1.0, then discard the
# raw Age column.
file["binned_Age"] = bins.map(
    {"A1": 0.0, "A2": 0.2, "A3": 0.4, "A4": 0.6, "A5": 0.8, "A6": 1.0}
)
file = file.drop(columns="Age")
# Training and checking the results
# Target: the Survived column as a NumPy array. Features: every other column.
y = np.array(file["Survived"])
x = file.drop(columns="Survived")
def calculate_score(state_list, features=None, labels=None, test_size=0.1):
    """Score Gaussian Naive Bayes over several random train/test splits.

    The data is split once per entry of ``state_list`` (used as the
    ``random_state`` of ``train_test_split``). For every split a fresh
    ``GaussianNB`` is fitted on the training part and its ``score`` is
    recorded for both the training and the test part. The means of the two
    score lists are printed; the per-split scores are returned.

    Args:
        state_list: Iterable of ``random_state`` values, one per split.
        features: Feature matrix to split. Defaults to the module-level ``x``.
        labels: Target values to split. Defaults to the module-level ``y``.
        test_size: Passed through to ``train_test_split``; 0.1 by default.

    Returns:
        A ``(train_scores, test_scores)`` tuple of lists holding one score
        per entry of ``state_list``, in order.
    """
    # Fall back to the script's global data so existing calls keep working.
    if features is None:
        features = x
    if labels is None:
        labels = y

    train_scores = []
    test_scores = []
    for state in state_list:
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, labels, test_size=test_size, random_state=state
        )
        nb = GaussianNB()
        nb.fit(X_train, Y_train)
        train_scores.append(nb.score(X_train, Y_train))
        test_scores.append(nb.score(X_test, Y_test))

    # np.mean([]) emits a RuntimeWarning; report NaN directly when no split
    # was requested.
    mean_train = np.mean(train_scores) if train_scores else float("nan")
    mean_test = np.mean(test_scores) if test_scores else float("nan")
    print("Mean of train scores: ", mean_train)
    print("Mean of test scores: ", mean_test)
    return train_scores, test_scores
# Evaluate over 20 different splits (random_state 0 through 19).
train_scores, test_scores = calculate_score(state_list=list(range(20)))
# Mean of train scores: 0.7828339575530587
# Mean of test scores: 0.8016666666666667