[텍스트 분석] kiwi :: 마인드스케일

설치

!pip install kiwipiepy

형태소 분석

from kiwipiepy import Kiwi
# Create a Kiwi morphological analyzer with its default arguments.
kiwi = Kiwi()

# Sample sentence to analyze.
text = '오늘은 자연어 처리를 배우기 좋은 날이다.'
# tokenize() yields Token objects exposing form, tag, start and len.
result = kiwi.tokenize(text)
result

[Token(form='오늘', tag='NNG', start=0, len=2),
 Token(form='은', tag='JX', start=2, len=1),
 Token(form='자연어', tag='NNP', start=4, len=3),
 Token(form='처리', tag='NNG', start=8, len=2),
 Token(form='를', tag='JKO', start=10, len=1),
 Token(form='배우', tag='VV', start=12, len=2),
 Token(form='기', tag='ETN', start=14, len=1),
 Token(form='좋', tag='VA', start=16, len=1),
 Token(form='은', tag='ETM', start=17, len=1),
 Token(form='날', tag='NNG', start=19, len=1),
 Token(form='이', tag='VCP', start=20, len=1),
 Token(form='다', tag='EF', start=21, len=1),
 Token(form='.', tag='SF', start=22, len=1)]

명사 추출

def extract_noun(text):
    """Lazily yield the surface form of every noun token in *text*.

    The text is analyzed with the module-level ``kiwi`` instance, and only
    tokens tagged ``NNG`` (general noun) or ``NNP`` (proper noun) are
    emitted, in the order they occur.
    """
    noun_tags = ('NNG', 'NNP')
    yield from (
        morph.form
        for morph in kiwi.tokenize(text)
        if morph.tag in noun_tags
    )


# Materialize the generator to display the nouns found in a sample sentence.
list(extract_noun('어제는 홍차를 마시고, 오늘은 커피를 마셨다.'))

['어제', '홍차', '오늘', '커피']

한국어 문서 단어 행렬

데이터

import pandas as pd
# Load the news articles; the '본문' column is used as the document text below.
df = pd.read_csv('news_ai.csv')

사용자 단어

# Rebind the global analyzer (extract_noun reads this name) and register
# '인공지능' as a user word tagged NNG so it can be extracted as a single noun.
kiwi = Kiwi()
kiwi.add_user_word('인공지능', 'NNG')

True

문서 단어 행렬

from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix over the article bodies, keeping at most
# the 100 most frequent nouns produced by extract_noun.
# token_pattern=None: the default regex is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit the
# "'token_pattern' will not be used" UserWarning.
cv = CountVectorizer(
    max_features=100,        # maximum number of words (by frequency)
    tokenizer=extract_noun,  # tokenization method
    token_pattern=None)      # unused with a custom tokenizer

dtm = cv.fit_transform(df['본문'])

C:\Users\eupho\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn(

# Sum every vocabulary column of the document-term matrix to get the total
# count per word, and pair those totals with the vectorizer's feature names.
word_count = pd.DataFrame({
    '단어': cv.get_feature_names_out(),
    '빈도': dtm.sum(axis=0).flat
})

# Show the ten most frequent words.
word_count.sort_values('빈도', ascending=False).head(10)

	단어	빈도
16	기술	279
66	인공지능	229
2	개발	150
11	교육	146
19	대표	125
77	제공	118
42	서비스	116
72	전형	109
17	기업	106
36	사업	103

준단어 토큰화

from kiwipiepy.sw_tokenizer import SwTokenizer, SwTokenizerConfig

# Train a subword tokenizer on the article bodies with a target vocabulary
# of 3000 entries and the default configuration, saving it to save_path.
tokenizer = SwTokenizer.train(
    save_path='ai_news_subword.json',
    texts=df['본문'],
    vocab_size=3000,
    config=SwTokenizerConfig())

Tokenizing: 100%|███████████████████████████████████████████████████████████████████| 102/102 [00:00<00:00, 385.11it/s]
Reducing #1:   1%|▎                                     | 7/1000 [00:00<00:05, 188.99it/s, vocab_size=3000, loss=-6.44]

Iteration: 0 VocabSize: 5636 Loss: -6.7434
Iteration: 1 VocabSize: 5001 Loss: -6.7030
Iteration: 2 VocabSize: 4515 Loss: -6.6663
Iteration: 3 VocabSize: 4070 Loss: -6.6091
Iteration: 4 VocabSize: 3673 Loss: -6.5501
Iteration: 5 VocabSize: 3311 Loss: -6.4947
Iteration: 6 VocabSize: 3008 Loss: -6.4368
Iteration: 7 VocabSize: 3000 Loss: -6.4354
Finished. Iteration: 7 VocabSize: 3000 Loss: -6.4354

# Encode a sentence into subword token ids.
codes = tokenizer.encode('자연어 처리가 재밌다')

# Map each id back to its vocabulary entry to inspect the segmentation.
[tokenizer.id2vocab[i] for i in codes]

['자연', '##어', '처', '##리', '가/J', '재', '##미', '##ᆻ', '다/E']

근대 한국어 준단어 토큰화

다운로드: https://github.com/ByungjunKim/ModernKoreanSubword

# Load a tokenizer from 'ModernKoreanSubword.json' (presumably the file from
# the repository linked above), backed by a fresh Kiwi instance.
tokenizer = SwTokenizer('ModernKoreanSubword.json', kiwi=Kiwi())

# Encode an unspaced mixed Hanja/Hangul sentence and list the vocabulary
# entry for each resulting id.
codes = tokenizer.encode('外來語音을標記할때에새로운符號를新制하지말고朝鮮在來의字母만가지고하자는것이다')
[tokenizer.id2vocab[i] for i in codes]

['外',
 '##來',
 '##語',
 '##音',
 '을/J',
 '##標',
 '##記',
 '하/V',
 'ᆯ/E',
 '##때',
 '에/J',
 '새롭/V-I',
 '은/E',
 '##符',
 '##號',
 '를/J',
 '##新',
 '##制',
 '하/V',
 '지/E',
 '말/V',
 '고/E',
 '##朝鮮',
 '##在來',
 '의/J',
 '##字',
 '##母',
 '만/J',
 '가지/V',
 '고/E',
 '하/V',
 '자는/E',
 '##것',
 '이/V',
 '다/E']