!pip install transformers kobert-transformers Korpora
from Korpora import KoreanChatbotKorpus
# Korean chatbot Q/A corpus: 11,823 question-answer pairs, each labeled
# 0 (everyday), 1 (breakup/negative), or 2 (love/positive). Downloads on first use.
data = KoreanChatbotKorpus()
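# Illustrative look at the corpus size and label balance; this assumes
# data.train behaves like a list of records with a .label field, as the
# split/encoding code later in this section also does.
from collections import Counter
print(len(data.train))
print(Counter(item.label for item in data.train))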
data.train[0].text    # the question
'12시 땡!'
data.train[0].pair    # the paired answer
'하루가 또 가네요.'
data.train[0].label   # the class id
0
from kobert_transformers import get_tokenizer
tokenizer = get_tokenizer()  # KoBERT's SentencePiece tokenizer, matching the monologg/kobert checkpoint
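# Quick sanity check: Korean text is split into subword pieces and mapped to
# ids from the 8,002-token vocabulary. The sample sentence is illustrative;
# the exact pieces depend on the KoBERT SentencePiece model.
print(tokenizer.tokenize('오늘 기분이 좋아요'))
print(tokenizer('오늘 기분이 좋아요')['input_ids'])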
from transformers import TFBertForSequenceClassification, BertConfig
# monologg/kobert architecture settings, plus a 3-way classification head
setting = {
    "attention_probs_dropout_prob": 0.1,
    "gradient_checkpointing": False,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-12,
    "max_position_embeddings": 512,
    "model_type": "bert",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "pad_token_id": 1,
    "type_vocab_size": 2,
    "vocab_size": 8002,
    "num_labels": 3   # three sentiment classes in the chatbot corpus
}
config = BertConfig(**setting)
# Load the PyTorch checkpoint into a TF model (from_pt=True). The classification
# head is not in the checkpoint and is freshly initialized, so transformers will
# warn about newly created weights.
model = TFBertForSequenceClassification.from_pretrained('monologg/kobert', config=config, from_pt=True)
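# The head's width comes from num_labels in the config; checking it is a cheap
# way to confirm the model was built for three classes.
print(model.config.num_labels)  # 3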
from transformers import TextClassificationPipeline
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)
# Before fine-tuning, the head is random, so classifying a sentence yields a
# near-chance score (~1/3 across the three labels):
[{'label': 'LABEL_0', 'score': 0.3773807883262634}]
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data.train, test_size=0.2, random_state=1234)
len(train_data)
9458
len(test_data)
2365
train_texts = [item.text for item in train_data]
train_labels = [item.label for item in train_data]
test_texts = [item.text for item in test_data]
test_labels = [item.label for item in test_data]
# Tokenize to id sequences; padding=True pads each list to its longest sequence,
# truncation=True caps inputs at the model's maximum length.
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
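# The encodings behave like dicts of equal-length lists, which is the shape
# tf.data.Dataset.from_tensor_slices expects below.
print(train_encodings.keys())  # input_ids, token_type_ids, attention_mask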
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))
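# Peek at one batch to confirm the (features dict, label) structure the model
# consumes; tensor shapes depend on the padded sequence length.
for features, labels in train_dataset.batch(2).take(1):
    print({k: v.shape for k, v in features.items()}, labels.numpy())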
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# model.compute_loss selects the loss matching the task head (sparse
# cross-entropy over the three logits here).
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
# Baseline evaluation before any fine-tuning. Batching is applied on the
# dataset itself, so Keras's batch_size argument is dropped.
model.evaluate(test_dataset.batch(32))
74/74 [==============================] - 5s 73ms/step - loss: 1.0564 - accuracy: 0.4643
[1.0563859939575195, 0.46427062153816223]
# One epoch of fine-tuning; shuffling the dataset before batching would be a
# reasonable addition when training for more epochs.
model.fit(train_dataset.batch(32), epochs=1)
296/296 [==============================] - 67s 225ms/step - loss: 0.5619 - accuracy: 0.7763
<tensorflow.python.keras.callbacks.History at 0x7ff8ba9b2630>
model.evaluate(test_dataset.batch(32))
74/74 [==============================] - 5s 71ms/step - loss: 0.4092 - accuracy: 0.8465
[0.40923404693603516, 0.8465116024017334]
# Re-running the earlier pipeline call: the pipeline holds a reference to the
# fine-tuned model, so the same input is now classified with high confidence.
[{'label': 'LABEL_0', 'score': 0.9609343409538269}]
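# Mapping the generic LABEL_i names back to the corpus classes, then saving the
# fine-tuned weights. The sample sentences and output path are illustrative.
id2label = {'LABEL_0': 'everyday', 'LABEL_1': 'breakup', 'LABEL_2': 'love'}
for sentence in ['오늘 뭐 먹을까?', '다시는 못 볼 것 같아']:
    pred = pipeline(sentence)[0]
    print(sentence, '->', id2label[pred['label']], round(pred['score'], 3))

model.save_pretrained('kobert-chatbot-classifier')
tokenizer.save_pretrained('kobert-chatbot-classifier')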