!pip install transformers
from google.colab import drive
drive.mount('/gdrive')
Mounted at /gdrive
import tensorflow as tf

# TFRecord of pre-tokenized training examples stored on Google Drive.
filenames = ['/gdrive/My Drive/kogpt2/x_train_token.tfrecord']
raw_dataset = tf.data.TFRecordDataset(filenames)
feature_description = {
    'input': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
parsed_example = raw_dataset.map(lambda x: tf.io.parse_single_example(x, feature_description))
sample = next(iter(parsed_example))
from transformers import TFGPT2Model
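# Load the KoGPT2 checkpoint saved under kogpt2_transformers; from_pt=True converts the PyTorch weights to TensorFlow.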
model = TFGPT2Model.from_pretrained('/gdrive/My Drive/kogpt2/kogpt2_transformers', from_pt=True)
%%time
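# Sanity check: time a single example through the model before batching everything.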
x = tf.expand_dims(sample['input'], 0)
result = model(x)
BATCH_SIZE = 64
# Pad every batch to its longest sequence, filling with token id 3 (the pad id used here).
pad_dataset = parsed_example.map(lambda x: x['input']) \
    .padded_batch(BATCH_SIZE, padded_shapes=(None,), padding_values=tf.constant(3, dtype='int64'))
# Index of the last real token per example (length - 1); tf.shape() is used because
# Python's len() is not defined for tensors of dynamic shape inside map().
len_ds = parsed_example.map(lambda x: tf.shape(x['input'])[0] - 1).batch(BATCH_SIZE)
def make_embedding():
    # Run each padded batch through KoGPT2 and keep, for every example,
    # the hidden state of its last real (non-padding) token.
    for tokens, lengths in tf.data.Dataset.zip((pad_dataset, len_ds)):
        result = model(tokens)  # result[0]: last hidden states, shape (batch, seq_len, hidden)
        for i, n in enumerate(lengths.numpy()):
            yield tf.io.serialize_tensor(tf.squeeze(tf.gather_nd(result[0], [(i, n)])))
embed_dataset = tf.data.Dataset.from_generator(make_embedding, output_types=tf.string)
%%time
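# Stream the serialized embedding vectors into a new TFRecord file.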
filename = '/gdrive/My Drive/kogpt2/x_train_embed.tfrecord'
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(embed_dataset)
CPU times: user 20.7 s, sys: 1.7 s, total: 22.4 s
Wall time: 21.6 s
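To reuse the saved vectors later, each record can be parsed back into a tensor. A minimal sketch, assuming the embeddings were written as the model's default float32 hidden states (768-dimensional for the base KoGPT2 config):

# Read the embedding TFRecord back; every record is one serialized float32 vector.
embed_raw = tf.data.TFRecordDataset('/gdrive/My Drive/kogpt2/x_train_embed.tfrecord')
embed_vectors = embed_raw.map(lambda s: tf.io.parse_tensor(s, out_type=tf.float32))
print(next(iter(embed_vectors)).shape)  # e.g. (768,) under the assumptions above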