# 실습 코드 (hands-on practice code)
# Third-party dependencies: tabular data loading and Korean morphological
# analysis.
import pandas as pd
from kiwipiepy import Kiwi

# Read the 'patents.xlsx' workbook (looked up relative to the working
# directory) into a DataFrame.
df = pd.read_excel('patents.xlsx')

# One shared analyzer instance; extract_nouns() below reads this global.
kiwi = Kiwi()
def extract_nouns(text):
    """Yield the surface form of each general or proper noun in *text*.

    The text is tokenized with the module-level ``kiwi`` analyzer. Only
    tokens whose tag is ``NNG`` or ``NNP`` are kept, and their ``form``
    values are yielded lazily in tokenization order.
    """
    noun_tags = ('NNG', 'NNP')
    tokens = kiwi.tokenize(text)
    yield from (tok.form for tok in tokens if tok.tag in noun_tags)
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the abstracts, keeping at most 100 vocabulary terms.
# token_pattern=None: the default regex is never consulted once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning when fitting.
cv = CountVectorizer(
    max_features=100,
    tokenizer=extract_nouns,
    token_pattern=None,
)
dtm = cv.fit_transform(df['abstract'])

# Summing the document-term matrix over axis 0 gives one total per
# vocabulary term, aligned with get_feature_names_out().
word_count = pd.DataFrame({
    '단어': cv.get_feature_names_out(),
    '빈도': dtm.sum(axis=0).flat,
})

# term -> total count mapping, used later as the word-cloud frequencies.
count_dic = dict(zip(word_count.단어, word_count.빈도))

# Bare expression: the sorted copy is not assigned, so word_count itself
# stays unsorted. Presumably intended for display as the last line of a
# notebook cell; in a plain script this line has no visible effect.
word_count.sort_values('빈도', ascending=False)
import numpy as np
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud

# Load the mask image as an array. The same array is handed to WordCloud
# as its mask and to ImageColorGenerator as its color source.
mask = np.asarray(Image.open('mask.png'))

# Configure the cloud and lay out the words from the frequency mapping in
# one chained expression (fit_words returns the WordCloud itself).
wc = WordCloud(
    font_path='온글잎 누카.ttf',
    background_color='white',
    mask=mask,
).fit_words(count_dic)

# Recolor the laid-out words using colors taken from the mask image.
color_func = ImageColorGenerator(mask)
cloud = wc.recolor(color_func=color_func)

# Bare expression: produces the PIL image of the cloud; the value is not
# stored (presumably meant for notebook display).
cloud.to_image()