Python code for the ToBigs Week03 assignment on Support Vector Machines.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
[1st: Ffires dataset]
path = "C:/Users/YY/Desktop/TB/Week03/SVM/"
file = pd.read_csv(os.path.join(path, 'Ffires.csv'))
file.info()
file.head()
file.isnull().sum()
pd.set_option('display.max_columns', 50)
# Fire area is heavily right-skewed, which motivates the log view below.
sns.kdeplot(file["area"], color="darkblue")  # equivalent to the deprecated distplot(kde=True, hist=False)
plt.show()
# Keep a log-scaled copy for plotting first, then binarize the target:
# fires burning more than 5 ha become the positive class.
file['log_area'] = file['area'].map(lambda i: np.log(i) if i > 5 else 0)
file['area'] = file['area'].map(lambda i: 1 if i > 5 else 0)
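As an aside, np.log1p is a common alternative to the hard cut above, since it maps area == 0 to 0 exactly while still compressing the heavy right tail (a sketch, not the transform used in this assignment):
# Illustrative only: log1p(0) == 0, so no special-casing of small areas is needed.
print(np.log1p(pd.Series([0.0, 0.5, 5.0, 50.0, 500.0])))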
def edit_data(file, col_list):
    """
    Arguments: file -- dataset
               col_list -- names of continuous columns to scale
    Return: file with the listed columns scaled to [0, 1]
    """
    scaler = MinMaxScaler()
    for col in col_list:
        # Standardize, then min-max scale into [0, 1].
        file[col] = (file[col] - np.mean(file[col], axis=0)) / np.std(file[col])
        file[col] = scaler.fit_transform(file[col].values.reshape(-1, 1)).ravel()
    return file
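A side note on edit_data: min-max scaling is invariant under positive affine maps, so the z-score step does not change the final values; MinMaxScaler alone would yield the same output. A minimal check:
# Minimal check: (x - min) / (max - min) is unchanged by any positive affine transform.
rng = np.random.default_rng(0)
x = rng.normal(size=(20, 1))
z = (x - x.mean()) / x.std()
mm = MinMaxScaler()
print(np.allclose(mm.fit_transform(x), mm.fit_transform(z)))  # True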
col_list = ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']
file = edit_data(file=file, col_list=col_list)
sns.catplot(x="X", y="log_area", data=file, orient="v", kind="violin")
plt.show()
sns.catplot(x="Y", y="log_area", data=file, orient="v", kind="violin")
plt.show()
scaler = MinMaxScaler()
file['X'] = scaler.fit_transform(file['X'].values.astype('float64').reshape(-1, 1))
file['Y'] = scaler.fit_transform(file['Y'].values.astype('float64').reshape(-1, 1))
sns.catplot(x="month", y='area', kind="bar", palette="ch:.25", data=file,
order=['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'])
plt.show()
sns.catplot(x="day", y='area', kind="bar", palette="ch:.25", data=file,
order=['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
plt.show()
month_mapping = {'jan':0, 'feb':1/11, 'mar':2/11, 'apr':3/11, 'may':4/11, 'jun':5/11,
'jul':6/11, 'aug':7/11, 'sep':8/11, 'oct':9/11, 'nov':10/11, 'dec':1}
day_mapping = {'mon':0, 'tue':1/6, 'wed':2/6, 'thu':3/6, 'fri':4/6, 'sat':5/6, 'sun':1}
file['month'] = file['month'].map(month_mapping)
file['day'] = file['day'].map(day_mapping)
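The even spacing above puts dec and jan (and sun and mon) at opposite ends of the scale. A common alternative, sketched here as an assumption rather than the encoding used in this assignment, is cyclical sin/cos encoding, which keeps the wrap-around:
# Hedged alternative: cyclical encoding keeps dec adjacent to jan.
months = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
month_idx = pd.Series(['jan', 'jun', 'dec']).map({m: i for i, m in enumerate(months)})
print(np.sin(2 * np.pi * month_idx / 12), np.cos(2 * np.pi * month_idx / 12))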
file.head()
   X  Y     month       day      FFMC       DMC        DC       ISI      temp
0  7  5  0.181818  0.666667  0.870968  0.086492  0.101325  0.090909  0.192926
1  7  4  0.818182  0.166667  0.927742  0.118194  0.775419  0.119430  0.508039
2  7  4  0.818182  0.833333  0.927742  0.146795  0.796294  0.119430  0.398714
3  8  6  0.181818  0.666667  0.941935  0.110958  0.081623  0.160428  0.196141
4  8  6  0.181818  1.000000  0.910968  0.172984  0.110590  0.171123  0.295820

         RH      wind     rain  area  log_area
0  0.423529  0.700000  0.00000     0       0.0
1  0.211765  0.055556  0.00000     0       0.0
2  0.211765  0.100000  0.00000     0       0.0
3  0.964706  0.400000  0.03125     0       0.0
4  0.988235  0.155556  0.00000     0       0.0
X = file.drop(['area', 'log_area'], axis=1).values
Y = file['area'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)
print("Length of X_train: ", len(X_train))
print("Length of X_test: ", len(X_test))
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
svm = SVC(kernel='rbf')
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, Y_train)
grid.best_params_
grid.score(X_train, Y_train)
grid.score(X_test, Y_test)
Best Parameters: {'C': 10, 'gamma': 0.1}
Train Accuracy: 73.37%
Test Accuracy: 72.12%
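For the RBF kernel K(x, x') = exp(-gamma * ||x - x'||^2), C trades margin width against training errors and gamma sets how far a single sample's influence reaches. The per-combination CV scores can be read off the fitted grid (a small sketch using the grid object above):
# Inspect mean CV accuracy per (C, gamma) combination.
cv_results = pd.DataFrame(grid.cv_results_)
print(cv_results[['param_C', 'param_gamma', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False).head())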
[2nd: SN_ad dataset]
path = "C:/Users/YY/Desktop/TB/Week03/SVM/"
sn = pd.read_csv(os.path.join(path, 'SN_ad.csv'))
sn.info()
sn.head()
sn.isnull().sum()
sn.drop('User ID', axis=1, inplace=True)
sn['Gender'] = sn['Gender'].map({'Male':0, 'Female':1})
sn = sn.rename(columns={"EstimatedSalary": "ES"})
def edit_data(sn):
    """
    Arguments: sn -- dataset with 'Age' and 'ES' columns
    Return: (Age, ES) -- both columns scaled to [0, 1]
    """
    # Standardize, then min-max scale into [0, 1] (same two-step recipe as above).
    sn['Age'] = (sn['Age'] - np.mean(sn['Age'], axis=0)) / np.std(sn['Age'])
    sn['ES'] = (sn['ES'] - np.mean(sn['ES'], axis=0)) / np.std(sn['ES'])
    scaler = MinMaxScaler()
    Age = scaler.fit_transform(sn['Age'].values.reshape(-1, 1)).ravel()
    ES = scaler.fit_transform(sn['ES'].values.reshape(-1, 1)).ravel()
    return Age, ES
Age, ES = edit_data(sn)
sn['Age'] = Age
sn['ES'] = ES
sn.head(5)
   Gender       Age        ES  Purchased
0       0  0.023810  0.029630          0
1       0  0.404762  0.037037          0
2       1  0.190476  0.207407          0
3       1  0.214286  0.311111          0
4       0  0.023810  0.451852          0
cmap = sns.cubehelix_palette(n_colors=3, start=0, rot=0.2, light=0.9, dark=0.2, as_cmap=True)
sns.heatmap(sn[['Age', 'ES']].corr(), annot=True, fmt="0.2f", cmap=cmap)
plt.show()
sns.countplot(x='Purchased', hue='Gender', data=sn, palette=sns.color_palette("Paired", 2))
plt.show()
sns.histplot(sn['Age'], bins=10, kde=True)  # distplot is deprecated; rug drawn separately
sns.rugplot(sn['Age'])
plt.show()
sns.catplot(x="Purchased", y="Age", kind="boxen", data=sn)
plt.show()
sns.histplot(sn['ES'], bins=20, kde=True)  # distplot is deprecated; rug drawn separately
sns.rugplot(sn['ES'])
plt.show()
sns.catplot(x="Purchased", y="ES", kind="boxen", data=sn)
plt.show()
X = sn[['Gender', 'Age', 'ES']].values
Y = sn['Purchased'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)
print("Length of X_train: ", len(X_train))
print("Length of X_test: ", len(X_test))
sn['Purchased'].value_counts()
0 257
1 143
Name: Purchased, dtype: int64
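Since stratify=Y was passed to train_test_split, both splits should preserve this roughly 64:36 class ratio; a quick illustrative check:
# Illustrative check: stratification preserves the Purchased class ratio.
print(pd.Series(Y_train).value_counts(normalize=True))
print(pd.Series(Y_test).value_counts(normalize=True))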
param_grid = {'C':[0.01, 0.1, 1, 10, 100]}  # gamma is ignored by the linear kernel, so it is dropped here
svm = SVC(kernel='linear', random_state=0)
grid1 = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy')
grid1.fit(X_train, Y_train)
param_grid = {'C':[0.01, 0.1, 1, 10, 100], 'gamma':[0.01, 0.1, 1, 10, 100]}
svm = SVC(kernel='rbf', random_state=0)
grid2 = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy')
grid2.fit(X_train, Y_train)
param_grid = {'C':[0.01, 0.1, 1, 10, 100], 'gamma':[0.01, 0.1, 1, 10, 100]}
svm = SVC(kernel='sigmoid', random_state=0)
grid3 = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy')
grid3.fit(X_train, Y_train)
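The three grids are fitted but never compared; one way to line them up is by best cross-validated accuracy and held-out accuracy (a sketch using the objects above):
# Compare kernels by best CV accuracy and held-out test accuracy.
for name, g in [('linear', grid1), ('rbf', grid2), ('sigmoid', grid3)]:
    print(f"{name:>7}: CV acc = {g.best_score_:.4f}, "
          f"test acc = {g.score(X_test, Y_test):.4f}, params = {g.best_params_}")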