언어 모형으로 다음 토큰 예측

import tensorflow as tf
from transformers import TFAutoModelForCausalLM, AutoTokenizer

모형 다운로드

# Load the pretrained tokenizer and the TensorFlow causal-LM model from the
# same checkpoint so that token ids and model vocabulary match.
checkpoint = "xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForCausalLM.from_pretrained(checkpoint)

어휘 수

# Vocabulary size reported by the tokenizer (bare expression, displayed when
# run as a notebook cell).
tokenizer.vocab_size

단어 목록

# Token -> id mapping from the tokenizer, plus its inverse (id -> token) so
# predicted ids can be turned back into token strings.
vocab = tokenizer.get_vocab()
id2word = dict(zip(vocab.values(), vocab.keys()))

문장

# Prompt whose continuation the model is asked to predict. This is a plain
# string literal: the former `f` prefix had no placeholders to interpolate.
sequence = "Once upon a time, there was "

토큰화

# Encode the prompt WITHOUT special tokens. By default the XLNet tokenizer
# appends <sep> and <cls> (ids 4 and 3) to the sequence, so the last position
# fed to the model would be <cls> rather than the final word of the prompt,
# and logits read at position -1 would not follow the text.
# Outputs recorded with the old call must be regenerated after this change.
# NOTE(review): XLNet is a permutation LM; the Hugging Face XLNet docs predict
# a token using `perm_mask`/`target_mapping` -- consider that approach if the
# predictions still look off.
input_ids = tokenizer.encode(
    sequence, add_special_tokens=False, return_tensors="tf"
)

토큰 아이디

# Bare expression: displays the encoded token-id tensor when run as a
# notebook cell.
input_ids

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[1977,  975,   24,   92,   19,  105,   30,    4,    3]],
      dtype=int32)>

토큰화된 문장을 모형에 입력

# Run a forward pass of the language model on the encoded token ids.
result = model(input_ids)

로짓

# The first element of the model output is taken as the logits; it is indexed
# below with three axes (batch, position, vocabulary).
logits = result[0]

다음 토큰의 로짓

# Keep only the logits at the last sequence position, for every batch entry.
# NOTE(review): if the encoded input ends with special tokens, position -1 is
# a special token rather than the last word of the prompt -- confirm.
next_token_logits = logits[:, -1, :]

# Shape check: one row per batch entry, one column per vocabulary entry.
next_token_logits.shape

TensorShape([1, 32000])

다음 토큰의 확률

# Normalize the logits over the vocabulary axis (the last one) into a
# probability distribution.
tf.nn.softmax(next_token_logits, axis=-1)

<tf.Tensor: shape=(1, 32000), dtype=float32, numpy=
array([[1.5795988e-05, 7.4948890e-13, 1.0546671e-12, ..., 1.7906471e-09,
        4.4336779e-09, 1.8470767e-12]], dtype=float32)>

로짓이 최대인 토큰 아이디 10개 (소프트맥스는 순서를 보존하므로 확률이 최대인 토큰과 같다)

# Ten largest logits. Softmax is monotonic, so these are also the ten most
# probable tokens; `top.indices` holds their vocabulary ids.
top = tf.math.top_k(next_token_logits, k=10)

출력

# Print the token string for each of the top-ranked ids of the first (only)
# batch entry, one per line, best first.
top_token_ids = top.indices[0].numpy().tolist()
for token_id in top_token_ids:
    print(id2word[token_id])

▁or
▁
,
d
.
▁ever
▁a
▁just
▁was
▁and