Assignment Code: NLP Basic
import os
import re
import numpy as np
import pandas as pd
from konlpy.tag import Twitter  # renamed to Okt in recent konlpy releases
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# Base directory holding the tweet CSV and the stopword list.
PATH = "C:/Users/YY/Desktop/TB/Week05/NLP/"

def load_data(file_path):
    # Read the tweet CSV, drop unused columns, and rename the text column.
    file = pd.read_csv(os.path.join(PATH, file_path), encoding='utf-8', index_col=0)
    file.drop(['from', 'Date'], axis=1, inplace=True)
    file.rename(columns={'x': 'contents'}, inplace=True)
    print("loading done")
    return file

def make_stopwords(file_path):
    # Read the stopword list line by line and strip trailing newlines.
    lines = []
    with open(os.path.join(PATH, file_path), 'r', encoding='utf-8') as f:
        for line in f:
            lines.append(line)
    stopwords = set(re.sub('\n', '', word) for word in lines)
    print(list(stopwords)[0:10])
    print("making stopwords done")
    return stopwords

def remove_id(file):
    # Strip @mentions (and an optional trailing colon) from each tweet.
    pattern = re.compile(r'.@+[A-Za-z0-9_]*:*')
    tweets = [re.sub(pattern, ' ', sentence) for sentence in list(file['contents'])]
    print("removing id done")
    return tweets

class TweetTokenizer:
    def __init__(self, stopwords):
        self.twitter = Twitter()
        self.stopwords = stopwords

    def nominalize(self, tweets, start, end):
        # Extract nouns from each tweet and drop stopwords.
        nouns = []
        for tweet in tweets[start:end]:
            nouns.append(' '.join([noun for noun in self.twitter.nouns(str(tweet))
                                   if noun not in self.stopwords]))
        # print(len(nouns))
        # document = ' '.join(nouns)
        print("tokenizing done")
        return nouns

def embedding_clustering(nouns, file):
    # Bag-of-words embedding over the extracted nouns, then k-means clustering.
    vect = CountVectorizer(min_df=0.001, encoding='utf-8', max_features=50, ngram_range=(1, 1))
    bow = vect.fit_transform(nouns)
    print("Vocabulary size: ", len(vect.vocabulary_))
    X = bow.toarray()
    print("X shape: ", X.shape)
    # vect.get_feature_names()  # renamed to get_feature_names_out() in newer scikit-learn
    # Integer label for each keyword, used to compare against the cluster assignments.
    keyword_to_label = {'문재인': 0, '남북정상회담': 1, '지방선거': 2, '자유한국당': 3, '안철수': 4,
                        '더불어민주당': 5, '미투': 6, '바른미래당': 7, '보수': 8, '서울시장': 9,
                        '진보': 10, '박원순': 11, '김문수': 12}
    Y = np.array(file['Keyword'].map(keyword_to_label)).astype(int).reshape(-1, 1)
    kmeans = KMeans(n_clusters=13)
    kmeans.fit(X)
    pred = kmeans.predict(X).reshape(-1, 1)
    result = np.concatenate([pred, Y], axis=1)
    print(pd.Series(pred.reshape(-1)).value_counts())
    print(pd.Series(Y.reshape(-1)).value_counts())
    return result

def main():
    file = load_data('tweet.csv')
    stopwords = make_stopwords('korean_stopwords.txt')
    tweets = remove_id(file)
    tokenizer = TweetTokenizer(stopwords)
    nouns = tokenizer.nominalize(tweets, 0, 118570)
    result = embedding_clustering(nouns, file)
    return result

if __name__ == '__main__':
    main()
# Intermediate results recorded via print
print(list(stopwords)[0:10])
['이와같다면', '자', '그에 따르는', '영차', '얼마만큼', '양자', '막론하고', '아무도', '근거로', '이용하여']
print("사전 길이: ", len(vect.vocabulary_))
사전 길이: 50
print("X shape: ", X.shape)
X shape: (118570, 50)
print(pd.Series(pred.reshape(-1, )).value_counts())
7 25403
12 24670
1 11498
6 11219
10 10595
2 8117
0 8080
4 5044
11 3448
3 3084
8 3009
5 2612
9 1791
dtype: int64
print(pd.Series(Y.reshape(-1, )).value_counts())
0 39300
1 17885
2 13530
3 10447
4 9834
5 7228
6 5391
7 4375
8 3602
9 2962
10 2381
11 1311
12 324
dtype: int64
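
Note that the two frequency tables above cannot be compared row by row: k-means assigns cluster indices arbitrarily, so cluster 7 has no inherent relation to keyword 7. A label-invariant measure gives a clearer picture of how well clusters and keywords agree. A minimal sketch, assuming pred and Y are the arrays built inside embedding_clustering:

from sklearn.metrics import adjusted_rand_score, homogeneity_score

# pred, Y: the (n, 1) arrays produced in embedding_clustering above (assumed available here).
labels_pred = pred.reshape(-1)
labels_true = Y.reshape(-1)

# Both scores ignore how the cluster indices are numbered:
# values near 0 mean the clustering is close to random with respect to the keywords,
# values near 1 mean clusters and keywords largely coincide.
print("ARI: ", adjusted_rand_score(labels_true, labels_pred))
print("Homogeneity: ", homogeneity_score(labels_true, labels_pred))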
Inspecting the actual result array, it is hard to say that the Keywords were predicted correctly. Since most of the tweets dealt with political topics, a simple 1-gram embedding does not appear to be enough to obtain the desired result.
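
One possible follow-up, not part of the original assignment, is to loosen the embedding: keep the same pipeline but allow bigrams and a larger vocabulary. A minimal sketch reusing nouns from above; the parameter values are illustrative, not tuned:

# Hypothetical variation: bigrams and a larger feature set than the 50 terms used above.
vect2 = CountVectorizer(min_df=0.001, encoding='utf-8', max_features=1000, ngram_range=(1, 2))
X2 = vect2.fit_transform(nouns)   # kept sparse; KMeans accepts sparse input directly
kmeans2 = KMeans(n_clusters=13)
pred2 = kmeans2.fit_predict(X2)
print(pd.Series(pred2).value_counts())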