
Fine-Tuning on Chatbot Data

# Install Hugging Face Transformers and the Korpora corpus loader
!pip install transformers Korpora

# Load the Korean chatbot Q&A corpus
from Korpora import KoreanChatbotKorpus
chatbot_corpus = KoreanChatbotKorpus()
sample = chatbot_corpus.train[0]
sample.text
'12시 땡!'
sample.pair
'하루가 또 가네요.'
sample.label
0
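Each record pairs a question (`text`) with a reply (`pair`) and a topic label (in the underlying Chatbot_data corpus, 0 is everyday chit-chat, 1 is breakup, 2 is love); only the question and reply are used below. A quick look at how many pairs are available for fine-tuning:

# Number of Q/A pairs in the training split
len(chatbot_corpus.train)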
# Mount Google Drive to access the pretrained KoGPT2 files
from google.colab import drive
drive.mount('/gdrive')
Mounted at /gdrive
# Load the KoGPT2 SentencePiece tokenizer
from sentencepiece import SentencePieceProcessor
tokenizer = SentencePieceProcessor(model_file='/gdrive/My Drive/kogpt2/kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
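Before building the training data, it is worth checking that the tokenizer round-trips Korean text and that the special token ids used by the generator below are defined. A quick sanity check (the example sentence is arbitrary):

# Encode a sample sentence to ids and decode it back
ids = tokenizer.encode_as_ids('Q: 12시 땡! A: 하루가 또 가네요.')
print(ids)
print(tokenizer.decode_ids(ids))

# Special token ids used when building and padding the dataset
print(tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id())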
import tensorflow as tf

def data_generator():
    # Yield each Q/A pair as one token-id sequence: <bos> Q: ... A: ... <eos>
    for sample in chatbot_corpus.train:
        bos = [tokenizer.bos_id()]
        body = tokenizer.encode_as_ids(f'Q: {sample.text} A: {sample.pair}')
        eos = [tokenizer.eos_id()]
        yield bos + body + eos

# Batch 32 sequences at a time, padding each batch to its longest sequence with the pad id
dataset = tf.data.Dataset.from_generator(data_generator, output_types=tf.int32)
dataset = dataset.padded_batch(32, padded_shapes=(None,), padding_values=tokenizer.pad_id())
for batch in dataset:
    print(batch)
    break
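Decoding the first row of a batch back into text is a quick way to confirm that the Q:/A: prompt format survived tokenization (a quick check, not part of training):

# Decode the first sequence in the batch back to text
tokenizer.decode_ids(batch[0].numpy().tolist())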
# Load the pretrained KoGPT2 weights (a PyTorch checkpoint) into the TF model
from transformers import TFGPT2LMHeadModel
model = TFGPT2LMHeadModel.from_pretrained('/gdrive/My Drive/kogpt2/kogpt2_transformers', from_pt=True)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.9.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.3.attn.bias', 'transformer.h.7.attn.bias', 'transformer.h.4.attn.bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.6.attn.bias', 'transformer.h.9.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.8.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.0.attn.bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.10.attn.bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.5.attn.bias', 'transformer.h.2.attn.bias', 'lm_head.weight', 'transformer.h.11.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
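These warnings are expected: the `attn.bias` and `attn.masked_bias` entries are fixed causal-mask buffers that the PyTorch implementation stores but the TF model rebuilds on its own, and `lm_head.weight` is tied to the input embedding matrix, so no trainable weights are actually missing.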
# Adam with a small learning rate, typical for fine-tuning a pretrained LM
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
from tqdm.notebook import tqdm

total = len(chatbot_corpus.train) // 32 + 1  # approximate number of batches per epoch
for batch in tqdm(dataset, total=total):
    with tf.GradientTape() as tape:
        # Forward pass: passing labels=batch makes the model shift the targets
        # internally and return the language-modeling loss as the first output.
        # Note that padding tokens are not masked out of this loss.
        result = model(batch, labels=batch)
        loss = result[0]
        mean_loss = tf.reduce_mean(loss)

    # Backpropagate and update the model weights
    grads = tape.gradient(mean_loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
# Save the fine-tuned model (config + weights) back to Google Drive
model.save_pretrained('/gdrive/My Drive/kogpt2/kogpt2_chatbot')
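With the fine-tuned weights saved, the chatbot can be tried out by generating a reply for a new question. A minimal generation sketch, assuming the tokenizer loaded above is still in scope; the prompt must follow the same `Q: ... A:` pattern used during training, and the question string and generation settings are illustrative:

# Reload the fine-tuned model from Drive
chatbot = TFGPT2LMHeadModel.from_pretrained('/gdrive/My Drive/kogpt2/kogpt2_chatbot')

def reply(question, max_length=64):
    # Build the same "<bos> Q: ... A:" prompt used during fine-tuning
    ids = [tokenizer.bos_id()] + tokenizer.encode_as_ids(f'Q: {question} A:')
    input_ids = tf.constant([ids])
    # Greedy decoding until the <eos> token (or max_length) is reached
    output = chatbot.generate(
        input_ids,
        max_length=max_length,
        eos_token_id=tokenizer.eos_id(),
        pad_token_id=tokenizer.pad_id())
    return tokenizer.decode_ids(output[0].numpy().tolist())

reply('오늘 너무 힘들어')

Note that the decoded string includes the prompt itself, so the answer is everything after the final `A:`.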