!pip install transformers Korpora
from google.colab import drive
drive.mount('/gdrive')
Mounted at /gdrive
from Korpora import KoreanChatbotKorpus
corpus = KoreanChatbotKorpus()
from sklearn.model_selection import train_test_split
x_train, x_test = train_test_split(corpus.train, test_size=0.2, random_state=42)
x_train[0].text
'엄청 로맨틱해'
x_train[0].label
2
import tensorflow as tf
from sentencepiece import SentencePieceProcessor
tokenizer = SentencePieceProcessor(model_file='/gdrive/My Drive/kogpt2/kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
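As a quick sanity check that the SentencePiece model loaded correctly, a sentence can be encoded and decoded back (the input here is only illustrative):
ids = tokenizer.encode_as_ids('엄청 로맨틱해')
tokenizer.decode_ids(ids)                # should reproduce the input sentence
tokenizer.bos_id(), tokenizer.eos_id()   # special-token ids used below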
def serialize_example(sample):
    # Wrap the encoded utterance with BOS/EOS and pack it, together with
    # its label, into a tf.train.Example for TFRecord serialization.
    bos = [tokenizer.bos_id()]
    body = tokenizer.encode_as_ids(sample.text)
    eos = [tokenizer.eos_id()]
    input_tokens = bos + body + eos
    features = {
        'input': tf.train.Feature(int64_list=tf.train.Int64List(value=input_tokens)),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[sample.label])),
    }
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example.SerializeToString()
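To see what gets written, one record can be parsed back from its bytes; this is a minimal inspection sketch using the protobuf FromString parser, not part of the pipeline itself:
raw = serialize_example(x_train[0])
parsed = tf.train.Example.FromString(raw)                # decode the serialized proto
parsed.features.feature['input'].int64_list.value[:5]    # first few token ids
parsed.features.feature['label'].int64_list.value        # the label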
# Stream serialized examples through tf.data instead of materializing
# the whole list in memory.
dataset = tf.data.Dataset.from_generator(
    lambda: map(serialize_example, x_train), output_types=tf.string)
filename = '/gdrive/My Drive/kogpt2/x_train_token.tfrecord'
# tf.data.experimental.TFRecordWriter consumes a dataset of serialized
# strings and writes each element as one record.
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(dataset)
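To verify the written file, the records can be read back with tf.data; the feature spec below is a sketch that mirrors serialize_example, treating 'input' as variable-length:
feature_spec = {
    'input': tf.io.VarLenFeature(tf.int64),
    'label': tf.io.FixedLenFeature([1], tf.int64),
}

def parse_example(raw):
    parsed = tf.io.parse_single_example(raw, feature_spec)
    return tf.sparse.to_dense(parsed['input']), parsed['label']

for tokens, label in tf.data.TFRecordDataset(filename).map(parse_example).take(1):
    print(tokens.numpy(), label.numpy())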