본문 바로가기
딥러닝/자연어처리

[word2vec] 카카오 댓글데이터로 word2vec 임베딩 해보기

by 달죽 2020. 11. 10.
반응형
# Korean stopwords (particles / fillers) to drop after tokenization.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','\n']
# Set copy for O(1) membership tests inside the filtering comprehension.
_stopword_set = set(stopwords)


# Tokenize every comment with Okt (stemmed morphs) and strip stopwords.
okt = Okt()
tokenized_data = []
for sentence in contents:
    tokens = okt.morphs(sentence, stem=True)  # morphological tokenization
    tokens = [word for word in tokens if word not in _stopword_set]
    tokenized_data.append(tokens)


# Inspect the review-length distribution (max / mean, then a histogram).
print('리뷰의 최대 길이 :', max(len(l) for l in tokenized_data))
print('리뷰의 평균 길이 :', sum(map(len, tokenized_data)) / len(tokenized_data))
plt.hist([len(s) for s in tokenized_data], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
#-*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import os
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt


# Directory holding one crawled comment file per article.
# NOTE(review): the original shadowed the builtin __file__ with this path;
# use a plain variable so introspection/tracebacks keep working.
data_dir = 'C:/Users/user/Documents/카카오/035720'

titles = []
contents = []


print(data_dir)
for file in os.listdir(data_dir):
    # Only .txt files contribute; appending the title *after* this check
    # keeps titles and contents in lockstep (the original appended titles
    # for every directory entry, desynchronizing the two lists).
    if not file.lower().endswith(".txt"):
        continue
    fpath = os.path.join(data_dir, file)
    try:
        # assumes the crawl saved UTF-8 — TODO confirm (Windows tools
        # often write cp949 for Korean text).
        with open(fpath, 'r', encoding='utf-8') as f:
            content_str = f.read()
    except (OSError, UnicodeDecodeError) as e:
        # Skip unreadable files, but leave a trace instead of failing silently.
        print('skipping', fpath, ':', e)
        continue
    titles.append(file.split('_')[-1][:-4])  # drop "<prefix>_" and ".txt"
    contents.append(content_str.strip('\n').replace(".", ""))

print(len(titles), len(contents))

 

from gensim.models import Word2Vec

# CBOW (sg=0) embedding over the tokenized comments: 100-dim vectors,
# 5-word window, words seen fewer than 5 times are pruned.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4.0 renamed
# it to `vector_size=` — confirm the installed version.
model = Word2Vec(sentences=tokenized_data, size=100, window=5,
                 min_count=5, workers=4, sg=0)

# (vocabulary size, embedding dimension), e.g. (23655, 100).
# Printed explicitly — a bare expression is a no-op outside a notebook.
print(model.wv.vectors.shape)

# Probe the embedding with a few domain words.  Guard each lookup:
# most_similar() raises KeyError for words pruned by min_count, which
# would otherwise abort the whole script on a single missing word.
for probe in ("카카오", "가즈아", "화이자", "ㅠㅠ", "본전"):
    if probe in model.wv:
        print(model.wv.most_similar(probe))
    else:
        print(probe, ': not in vocabulary')

 

 

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Korean-capable font for the scatter labels (matplotlib's default font
# cannot render Hangul glyphs).
path = 'C:\\Users\\user\\Downloads\\NEXONLv1GothicLight.ttf'
fontprop = fm.FontProperties(fname=path, size=8)

# Keep the minus sign from rendering as a missing-glyph box.
mpl.rcParams['axes.unicode_minus'] = False
from sklearn.manifold import TSNE


# NOTE(review): model.wv.vocab is the gensim 3.x API; gensim >= 4.0 uses
# model.wv.index_to_key — confirm the installed version.
vocab = list(model.wv.vocab)
tsne = TSNE(n_components=2)
print(len(vocab))

# Keep only words longer than two characters.  The original while/try
# loop incremented i before its first use (so vocab[0] was always
# skipped) and relied on a bare except catching IndexError to stop,
# which would also swallow any real error; a comprehension covers every
# word and cannot hide failures.
vocab_1 = [word for word in vocab if len(word) > 2]
print(len(vocab_1))


# Embedding matrix for the filtered words.  model.wv[...] is the
# non-deprecated spelling of model[...].
X = model.wv[vocab_1]
# Visualize only the first 100 words to keep the plot readable.
X_tsne = tsne.fit_transform(X[:100, :])

df = pd.DataFrame(X_tsne, index=vocab_1[:100], columns=['x', 'y'])
print(df.shape)

fig = plt.figure()
fig.set_size_inches(20, 10)
ax = fig.add_subplot(1, 1, 1)

ax.scatter(df['x'], df['y'])

# Label each projected point with its word using the Hangul font.
for word, pos in df.iterrows():
    ax.annotate(word, pos, fontsize=12, fontproperties=fontprop)
plt.show()

# Printed explicitly — a bare expression is a no-op outside a notebook.
print(df.head(10))

 

더 깊은 분석은 따로 진행하는 걸로 ^^

활용할 수 있는 방안이 많은 것 같다. 

 

참고 : programmers.co.kr/learn/courses/21/lessons/1698

 

반응형

댓글