!pip install transformers
from google.colab import drive
drive.mount('/gdrive')
Mounted at /gdrive
import tensorflow as tf

# TFRecord of pre-tokenized training examples stored on Google Drive.
filenames = ['/gdrive/My Drive/kogpt2/x_train_token.tfrecord']
raw_dataset = tf.data.TFRecordDataset(filenames)
feature_description = {
    'input': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
parsed_example = raw_dataset.map(lambda x: tf.io.parse_single_example(x, feature_description))
sample = next(iter(parsed_example))
from transformers import TFGPT2Model
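# Load the KoGPT2 checkpoint saved under kogpt2_transformers; from_pt=True converts the PyTorch weights to TensorFlow.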
model = TFGPT2Model.from_pretrained('/gdrive/My Drive/kogpt2/kogpt2_transformers', from_pt=True)
%%time
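# Sanity check: time a single example through the model before batching everything.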
x = tf.expand_dims(sample['input'], 0)
result = model(x)
BATCH_SIZE = 64
# Pad every batch to its longest sequence, filling with token id 3 (the pad id used here).
pad_dataset = parsed_example.map(lambda x: x['input']) \
    .padded_batch(BATCH_SIZE, padded_shapes=(None,), padding_values=tf.constant(3, dtype='int64'))
# Index of the last real token per example (length - 1); tf.shape() is used because
# Python's len() is not defined for tensors of dynamic shape inside map().
len_ds = parsed_example.map(lambda x: tf.shape(x['input'])[0] - 1).batch(BATCH_SIZE)
def make_embedding():
    # Run each padded batch through KoGPT2 and keep, for every example,
    # the hidden state of its last real (non-padding) token.
    for tokens, lengths in tf.data.Dataset.zip((pad_dataset, len_ds)):
        result = model(tokens)  # result[0]: last hidden states, shape (batch, seq_len, hidden)
        for i, n in enumerate(lengths.numpy()):
            yield tf.io.serialize_tensor(tf.squeeze(tf.gather_nd(result[0], [(i, n)])))
embed_dataset = tf.data.Dataset.from_generator(make_embedding, output_types=tf.string)
%%time
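# Stream the serialized embedding vectors into a new TFRecord file.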
filename = '/gdrive/My Drive/kogpt2/x_train_embed.tfrecord'
writer = tf.data.experimental.TFRecordWriter(filename)
writer.write(embed_dataset)
CPU times: user 20.7 s, sys: 1.7 s, total: 22.4 s
Wall time: 21.6 s
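To reuse the saved vectors later, each record can be parsed back into a tensor. A minimal sketch, assuming the embeddings were written as the model's default float32 hidden states (768-dimensional for the base KoGPT2 config):

# Read the embedding TFRecord back; every record is one serialized float32 vector.
embed_raw = tf.data.TFRecordDataset('/gdrive/My Drive/kogpt2/x_train_embed.tfrecord')
embed_vectors = embed_raw.map(lambda s: tf.io.parse_tensor(s, out_type=tf.float32))
print(next(iter(embed_vectors)).shape)  # e.g. (768,) under the assumptions above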