Sentence Classification with BERT
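This example fine-tunes the pretrained KoBERT model on the Korean chatbot corpus, classifying each question sentence into one of three topic classes.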

# Install Hugging Face Transformers, the KoBERT wrapper, and the Korpora corpus loader
!pip install transformers kobert-transformers Korpora
from Korpora import KoreanChatbotKorpus
# Download and load the Korean chatbot corpus
data = KoreanChatbotKorpus()
# Inspect the first training example
item = data.train[0]
item.text
'12시 땡!'
item.pair
'하루가 또 가네요.'
item.label
0
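Each example pairs a question (text) with an answer (pair) and a topic label. The corpus uses three labels (0: everyday chat, 1: breakup, 2: love), which is why num_labels is set to 3 in the configuration below.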
from kobert_transformers import get_tokenizer
# SentencePiece tokenizer matching the KoBERT vocabulary
tokenizer = get_tokenizer()
from transformers import TFBertForSequenceClassification, BertConfig
# KoBERT's published configuration, plus num_labels=3 for the three topic classes
setting = {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": False,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "type_vocab_size": 2,
  "vocab_size": 8002,
  "num_labels": 3
}
config = BertConfig(**setting)
# monologg/kobert is distributed as a PyTorch checkpoint, so from_pt=True
# converts the weights; the classification head is freshly initialized
model = TFBertForSequenceClassification.from_pretrained('monologg/kobert', config=config, from_pt=True)
from transformers import TextClassificationPipeline
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)
pipeline('오늘은 즐거운 하루!')
[{'label': 'LABEL_0', 'score': 0.3773807883262634}]
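Because the classification head has not been trained yet, the score is barely above chance (1/3 for three classes).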
from sklearn.model_selection import train_test_split
# Hold out 20% of the corpus for evaluation
train_data, test_data = train_test_split(data.train, test_size=0.2, random_state=1234)
len(train_data)
9458
len(test_data)
2365
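The 11,823 questions are split into 9,458 for training and 2,365 for testing.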
train_texts = [item.text for item in train_data]
train_labels = [item.label for item in train_data]

test_texts = [item.text for item in test_data]
test_labels = [item.label for item in test_data]
train_encodings = tokenizer(train_texts, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
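Here padding=True pads every sentence to the longest one in the list, and truncation=True caps overly long inputs at the model's maximum length, so all encodings share the same shape, which from_tensor_slices below requires.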
import tensorflow as tf
# Pair the encoded inputs (as a dict of tensors) with the labels
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# model.compute_loss lets the model compute its own cross-entropy loss from the labels
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
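Note that loss=model.compute_loss follows the TensorFlow fine-tuning recipe of older transformers releases; recent versions have removed that method. A minimal sketch of the equivalent compile call with an explicit loss, assuming a recent transformers version where the model outputs logits:

# Equivalent compile with an explicit loss (assumption: newer transformers)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])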
# Accuracy before fine-tuning; the dataset is already batched,
# so no batch_size argument is needed
model.evaluate(test_dataset.batch(32))
74/74 [==============================] - 5s 73ms/step - loss: 1.0564 - accuracy: 0.4643
[1.0563859939575195, 0.46427062153816223]
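With a randomly initialized classification head, test accuracy is only about 46%.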
# Fine-tune for a single epoch
model.fit(train_dataset.batch(32), epochs=1)
296/296 [==============================] - 67s 225ms/step - loss: 0.5619 - accuracy: 0.7763
<tensorflow.python.keras.callbacks.History at 0x7ff8ba9b2630>
# Accuracy after fine-tuning
model.evaluate(test_dataset.batch(32))
74/74 [==============================] - 5s 71ms/step - loss: 0.4092 - accuracy: 0.8465
[0.40923404693603516, 0.8465116024017334]
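A single epoch of fine-tuning lifts test accuracy from about 46% to about 85%.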
pipeline('오늘 수업은 여기까지!')
[{'label': 'LABEL_0', 'score': 0.9609343409538269}]
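The model now assigns the everyday-chat class (LABEL_0) with high confidence. As a finishing touch, the class indices can be given readable names so the pipeline reports them instead of LABEL_0, and the fine-tuned model can be saved for reuse. A minimal sketch, assuming the label meanings above and an arbitrary output directory name:

# Map class indices to readable names (label meanings assumed from the corpus)
model.config.id2label = {0: '일상', 1: '이별', 2: '사랑'}
model.config.label2id = {v: k for k, v in model.config.id2label.items()}
# Persist the fine-tuned weights and the tokenizer together
model.save_pretrained('kobert-chatbot-classifier')
tokenizer.save_pretrained('kobert-chatbot-classifier')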