!pip install transformers kobert-transformers Korpora
from Korpora import KoreanChatbotKorpus
data = KoreanChatbotKorpus()
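To get a feel for the corpus before fine-tuning, here is a small illustrative peek at one sample; the `.text`/`.pair` field names follow Korpora's chatbot corpus, and only `.text` is used in the cells below.

# Sketch: inspect one question/answer pair from the chatbot corpus (field names assumed from Korpora)
sample = data.train[0]
print(sample.text)   # question side; this is what gets masked below
print(sample.pair)   # paired answer (not used in this masked-LM example)
print(len(data.train), 'examples in data.train')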
from kobert_transformers import get_tokenizer
tokenizer = get_tokenizer()  # KoBERT SentencePiece tokenizer
from transformers import TFBertForMaskedLM
# from_pt=True converts the PyTorch KoBERT checkpoint to TensorFlow weights
model = TFBertForMaskedLM.from_pretrained('monologg/kobert', from_pt=True)
from transformers import FillMaskPipeline
pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer)  # fill-in-the-[MASK] pipeline
pipeline('[MASK] 하루!')  # ask the pretrained-only model to fill the blank
[{'score': 0.023531658574938774,
'sequence': '[CLS]넨 하루![SEP]',
'token': 5706,
'token_str': '넨'},
{'score': 0.008909070864319801,
'sequence': '[CLS]샌 하루![SEP]',
'token': 6539,
'token_str': '샌'},
{'score': 0.007057589013129473,
'sequence': '[CLS] 김정 하루![SEP]',
'token': 1330,
'token_str': '▁김정'},
{'score': 0.006253202445805073,
'sequence': '[CLS]자마자 하루![SEP]',
'token': 7160,
'token_str': '자마자'},
{'score': 0.006053244695067406,
'sequence': '[CLS]짠 하루![SEP]',
'token': 7364,
'token_str': '짠'}]
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data.train, test_size=0.2, random_state=1234)
train_texts = [item.text for item in train_data]
test_texts = [item.text for item in test_data]
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
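A quick look at one encoded example (a sketch only, assuming the tokenizer's standard `decode`; nothing here changes the data) shows the `[CLS] ... [SEP]` layout that the masking loop below relies on.

# Sketch: decode one padded example to see where [CLS] and [SEP] sit
ids = train_encodings['input_ids'][0]
print(ids[:12])
print(tokenizer.decode(ids[:ids.index(tokenizer.sep_token_id) + 1]))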
import random
import numpy as np
# For each example, mask one random token (excluding [CLS] and [SEP]) and build a
# label vector that is -100 everywhere (ignored by the loss) except at the masked
# position, which keeps the original token id.
train_labels = []
for input_ids in train_encodings['input_ids']:
    n = input_ids.index(tokenizer.sep_token_id)  # position of [SEP]
    i = random.randint(1, n-1)                   # random position between [CLS] and [SEP]
    labels = np.ones_like(input_ids) * -100
    labels[i] = input_ids[i]                     # remember the original token
    input_ids[i] = tokenizer.mask_token_id       # replace it with [MASK] in the input
    train_labels.append(labels)
test_labels = []
for input_ids in test_encodings['input_ids']:
    n = input_ids.index(tokenizer.sep_token_id)
    i = random.randint(1, n-1)
    labels = np.ones_like(input_ids) * -100
    labels[i] = input_ids[i]
    input_ids[i] = tokenizer.mask_token_id
    test_labels.append(labels)
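As a sanity check (illustrative only, not part of the training pipeline), each label row should hold the original token id at exactly one position, and the corresponding input position should now contain `[MASK]`.

# Sketch: verify the masking for the first training example
ids0 = train_encodings['input_ids'][0]
lab0 = train_labels[0]
mask_pos = ids0.index(tokenizer.mask_token_id)
assert (lab0 != -100).sum() == 1 and lab0[mask_pos] != -100
print('masked position:', mask_pos,
      'original token:', tokenizer.convert_ids_to_tokens([int(lab0[mask_pos])]))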
import tensorflow as tf
# Pair each encoding dict (input_ids, attention_mask, ...) with its label vector
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))
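Pulling a single batch (again just a sketch) confirms that the features are a dict of `(batch, seq_len)` tensors and that the labels share that shape, with `-100` everywhere except the masked positions.

# Sketch: inspect one batch from the training dataset
for features, labels in train_dataset.batch(2).take(1):
    print({k: v.shape for k, v in features.items()})
    print(labels.shape, 'non-ignored label positions:',
          int(tf.reduce_sum(tf.cast(labels != -100, tf.int32))))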
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# model.compute_loss applies the masked-LM cross-entropy and ignores the -100 labels;
# note that newer transformers releases rename this method to hf_compute_loss.
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.evaluate(test_dataset.batch(32))  # baseline before fine-tuning; dataset is already batched
74/74 [==============================] - 6s 81ms/step - loss: 10.3061 - accuracy: 1.5660e-05
[10.306079864501953, 1.566048013046384e-05]
model.fit(train_dataset.batch(32), epochs=1)
pipeline('[MASK] 하루!')  # same query as before, now answered by the fine-tuned model
[{'score': 0.20509740710258484,
'sequence': '[CLS] 첫 하루![SEP]',
'token': 4481,
'token_str': '▁첫'},
{'score': 0.061551231890916824,
'sequence': '[CLS] 이 하루![SEP]',
'token': 3647,
'token_str': '▁이'},
{'score': 0.054259803146123886,
'sequence': '[CLS] 마지막 하루![SEP]',
'token': 1919,
'token_str': '▁마지막'},
{'score': 0.046807821840047836,
'sequence': '[CLS] 내 하루![SEP]',
'token': 1434,
'token_str': '▁내'},
{'score': 0.031148234382271767,
'sequence': '[CLS] 오늘 하루![SEP]',
'token': 3419,
'token_str': '▁오늘'}]
model.evaluate(test_dataset.batch(32))  # after one epoch of fine-tuning
74/74 [==============================] - 6s 81ms/step - loss: 3.7584 - accuracy: 0.0130
[3.75840425491333, 0.013013859279453754]
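To keep the fine-tuned weights for later reuse, the standard `save_pretrained` call can be added; the directory name below is purely illustrative.

# Sketch: persist the fine-tuned masked-LM weights (directory name is arbitrary)
model.save_pretrained('kobert_chatbot_mlm')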