!pip install transformers kobert-transformers Korpora
from Korpora import KoreanChatbotKorpus
data = KoreanChatbotKorpus()  # loads the question–answer corpus from songys/Chatbot_data
# A quick look at the data: each example holds a question (.text) and an answer (.pair), e.g.
data.train[0].text  # '12시 땡!' ("It's 12 o'clock!")
data.train[1].pair  # '위로해 드립니다.' ("I offer my consolation.")
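For reference, the corpus is small enough to fine-tune quickly: per the songys/Chatbot_data repository it holds 11,823 question–answer pairs, so the 80/20 split used below leaves roughly 9,458 pairs for training and 2,365 for testing.

len(data.train)  # 11823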
from kobert_transformers import get_tokenizer
tokenizer = get_tokenizer()
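The KoBERT tokenizer is SentencePiece-based, so it splits text into subword pieces (pieces that begin a word are prefixed with '▁'). A quick illustrative check on one of the sample questions:

tokenizer.tokenize('12시 땡!')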
from transformers import TFBertForNextSentencePrediction
model = TFBertForNextSentencePrediction.from_pretrained('monologg/kobert', from_pt=True)  # from_pt=True converts the PyTorch-only checkpoint to TensorFlow
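The next-sentence-prediction head is a two-way classifier over the pooled [CLS] representation, so the model emits two logits per input pair. A minimal sanity check, reusing the sample strings above (on older transformers versions the output is a plain tuple rather than an object with a .logits attribute):

enc = tokenizer('12시 땡!', '위로해 드립니다.', return_tensors='tf')
out = model(enc)
out.logits.shape  # (1, 2): one logit per class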
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data.train, test_size=0.2, random_state=1234)
import random

def make_nsp_examples(items):
    # For each question, flip a coin: label 1 keeps the real answer,
    # label 0 substitutes the answer from a randomly chosen other example.
    # (Note this is the reverse of BERT's pretraining convention, where
    # 0 means "sentence B follows sentence A"; it stays self-consistent
    # here because the head is fine-tuned on these labels.)
    texts, pairs, labels = [], [], []
    for item in items:
        texts.append(item.text)
        label = random.randint(0, 1)
        labels.append(label)
        if label == 1:
            pairs.append(item.pair)
        else:
            fake = random.choice(items)
            pairs.append(fake.pair)
    return texts, pairs, labels

train_texts, train_pairs, train_labels = make_nsp_examples(train_data)
test_texts, test_pairs, test_labels = make_nsp_examples(test_data)
train_encodings = tokenizer(train_texts, train_pairs, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, test_pairs, padding=True, truncation=True)
train_encodings.keys()
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
train_encodings['input_ids'][0]       # token ids for the first pair: [CLS] question [SEP] answer [SEP]
train_encodings['token_type_ids'][0]  # segment ids: 0 for the question, 1 for the answer
train_encodings['attention_mask'][0]  # 1 for real tokens, 0 for padding
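Decoding the ids back into text makes this layout easy to see (a quick check using the tokenizer's standard decode method):

tokenizer.decode(train_encodings['input_ids'][0])  # '[CLS] question [SEP] answer [SEP]' followed by '[PAD]' tokens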
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
dict(train_encodings),
train_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
dict(test_encodings),
test_labels
))
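One optional refinement not used in the run recorded below: shuffling the training examples before batching, so each epoch sees the pairs in a different order.

train_dataset = train_dataset.shuffle(len(train_labels))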
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The model outputs raw logits, so the loss must be built with from_logits=True;
# the string alias 'sparse_categorical_crossentropy' would treat those logits as probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.evaluate(test_dataset.batch(32))  # the dataset is already batched, so no batch_size argument is needed
74/74 [==============================] - 9s 121ms/step - loss: 2.4674 - accuracy: 0.5755
[2.467373847961426, 0.5754756927490234]
model.fit(train_dataset.batch(32), epochs=1)
296/296 [==============================] - 100s 339ms/step - loss: 0.7384 - accuracy: 0.4986
<tensorflow.python.keras.callbacks.History at 0x7f54a4376160>
model.evaluate(test_dataset.batch(32))
74/74 [==============================] - 9s 128ms/step - loss: 0.6931 - accuracy: 0.5027
[0.6931469440460205, 0.502748429775238]
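A loss of ln 2 ≈ 0.6931 is exactly the cross-entropy of a uniform 50/50 prediction (−ln 0.5 = ln 2), so together with accuracy near 0.5 it means the model is assigning equal probability to both classes on every pair, i.e. it is stuck at chance level rather than having learned the real-versus-random distinction.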
pred_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings))  # inputs only, no labels
prediction = model.predict(pred_dataset.batch(32))
tf.math.argmax(prediction[0], axis=-1).numpy()  # predicted class per pair; prediction[0] holds the logits
array([0, 0, 0, ..., 0, 0, 0])
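To score these predictions against the held-out labels, a simple accuracy computation (illustrative; numpy is assumed to be available):

import numpy as np

preds = tf.math.argmax(prediction[0], axis=-1).numpy()
print('accuracy:', np.mean(preds == np.array(test_labels)))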