
Saving Tokenized Data as TFRecord

!pip install transformers sentencepiece Korpora
from google.colab import drive
drive.mount('/gdrive')
Mounted at /gdrive
from Korpora import KoreanChatbotKorpus
corpus = KoreanChatbotKorpus()
from sklearn.model_selection import train_test_split
x_train, x_test = train_test_split(corpus.train, test_size=0.2, random_state=42)
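To sanity-check the 80/20 split, you can compare the sizes of the two partitions:

len(x_train), len(x_test)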
sample = x_train[0]
sample.text
'엄청 로맨틱해'
sample.label
2
import tensorflow as tf
from sentencepiece import SentencePieceProcessor
tokenizer = SentencePieceProcessor(model_file='/gdrive/My Drive/kogpt2/kogpt2_news_wiki_ko_cased_818bfa919d.spiece')
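Before serializing the whole set, it helps to confirm the tokenizer behaves as expected. A minimal check (the exact ids depend on this particular SentencePiece model):

tokenizer.encode_as_ids(sample.text)    # subword ids for '엄청 로맨틱해'
tokenizer.bos_id(), tokenizer.eos_id()  # special ids that will wrap each sequence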
def serialize_example(sample):
    # Wrap the subword ids with BOS/EOS so each example is a complete sequence
    bos = [tokenizer.bos_id()]
    body = tokenizer.encode_as_ids(sample.text)
    eos = [tokenizer.eos_id()]
    input_tokens = bos + body + eos

    # Store the variable-length token ids and the label as int64 features
    features = {
        'input': tf.train.Feature(int64_list=tf.train.Int64List(value=input_tokens)),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[sample.label])),
    }

    # Build the Example proto and serialize it to a byte string for TFRecord
    example = tf.train.Example(features=tf.train.Features(feature=features))

    return example.SerializeToString()
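As a quick round-trip check (a sketch, not part of the writing pipeline itself), a serialized string can be parsed back into a tf.train.Example to confirm the features were stored correctly:

serialized = serialize_example(sample)
parsed = tf.train.Example.FromString(serialized)
parsed.features.feature['label'].int64_list.value  # should hold the original label, 2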
# Lazily map each sample through serialize_example so the full set is never held in memory
dataset = tf.data.Dataset.from_generator(lambda: map(serialize_example, x_train), output_types=tf.string)
filename = '/gdrive/My Drive/kogpt2/x_train_token.tfrecord'
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(dataset)
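To consume the file later, it can be read back with tf.data.TFRecordDataset. A minimal sketch, assuming a feature spec that mirrors what was written above ('input' is variable-length, so it comes back as a sparse tensor):

feature_spec = {
    'input': tf.io.VarLenFeature(tf.int64),
    'label': tf.io.FixedLenFeature([1], tf.int64),
}

def parse(record):
    parsed = tf.io.parse_single_example(record, feature_spec)
    # Densify the variable-length token ids before feeding them to a model
    return tf.sparse.to_dense(parsed['input']), parsed['label']

loaded = tf.data.TFRecordDataset(filename).map(parse)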