transformers 깁스 추출
import random
import tensorflow as tf
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
from transformers.generation_tf_utils import tf_top_k_top_p_filtering
모형 로딩
# Load the tokenizer and the masked-LM model from the same checkpoint so the
# ids produced by the tokenizer index the vocabulary the model was trained on.
checkpoint = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint)
마스크 토큰
# Display the string the tokenizer uses as its mask token.
tokenizer.mask_token
'<mask>'
초기화
# Starting state for the sampler: a short prompt followed by six mask tokens
# that the sampling steps will progressively overwrite.
prompt = 'I like this movie <mask><mask><mask><mask><mask><mask>'
input_ids = tokenizer.encode(prompt)
# Decode straight back to inspect the special tokens added by the tokenizer.
tokenizer.decode(input_ids)
'<s>I like this movie<mask><mask><mask><mask><mask><mask></s>'
길이
# Total number of token ids in the encoded prompt; this count includes the
# leading <s> and trailing </s> added by the tokenizer.
SEQ_LEN = len(input_ids)
SEQ_LEN
12
무작위로 위치 하나를 골라 마스킹
# Draw one index uniformly from 1 .. SEQ_LEN - 2 (inclusive), which leaves the
# first and last ids (<s> and </s>) untouched, and overwrite it with the mask.
i = random.randrange(1, SEQ_LEN - 1)
input_ids[i] = tokenizer.mask_token_id
문장을 모형에 입력
# Wrap the id list in an outer list so the model receives a batch of one
# sequence, then run a forward pass.
result = model(tf.convert_to_tensor([input_ids]))
로짓
# The first element of the model output is treated as the logits tensor.
# NOTE(review): presumably shaped (batch, sequence, vocabulary) - confirm
# against the transformers masked-LM output documentation.
logits = result[0]
앞에서 마스킹한 위치의 로짓을 선택
# Keep only the slice at the masked position i; the batch axis is preserved,
# so the result is a 2-D tensor.
logits = logits[:, i, :]
로짓에 따라 무작위로 토큰을 고름
# Draw a single token id at random, using the logits as the (unnormalised)
# log-probabilities of the categorical distribution.
token_id = tf.random.categorical(logits, num_samples=1)
고른 토큰의 번호를 마스킹된 위치에 대입
# token_id is 2-D with a single entry; pull it out and write it over the mask.
input_ids[i] = token_id.numpy()[0,0]
문장 확인
# Decode the updated ids to inspect the sentence after one sampling step.
tokenizer.decode(input_ids)
'<s>I like this movie<mask><mask><mask>!!!!<mask><mask></s>'
위의 과정을 20회 반복
# Run 20 single-position resampling steps, printing the sentence after each.
for _ in range(20):
    # Pick one position uniformly at random, never the first or last id
    # (<s> and </s>), and replace it with the mask token.
    i = random.randint(1, SEQ_LEN - 2)
    input_ids[i] = tokenizer.mask_token_id

    # Forward pass on a batch of one sequence; keep only the logits at the
    # masked position (the batch axis is preserved).
    result = model(tf.convert_to_tensor([input_ids]))
    logits = result[0]
    logits = logits[:, i, :]

    # Sample the replacement token from the distribution given by the logits.
    token_id = tf.random.categorical(logits, num_samples=1)

    # Store a plain Python int so input_ids stays a homogeneous list of ints
    # rather than accumulating NumPy scalars.
    input_ids[i] = int(token_id.numpy()[0, 0])
    print(tokenizer.decode(input_ids))
<s>I like this movie<mask><mask><mask><mask><mask>!]</s> <s>Just like this movie<mask><mask><mask><mask><mask>!]</s> <s>Just like this movie<mask><mask><<mask><mask>!]</s> <s> indeed like this movie<mask><mask><<mask><mask>!]</s> <s> indeed like this movie<mask><mask>サ<mask><mask>!]</s> <s> indeed like this movie<mask><mask>サ<mask><mask>�</s> <s> indeed like this movie<mask><mask>�<mask><mask>�</s> <s> indeed like this movie<mask> 😊<mask><mask>�</s> <s> indeed like this movie<mask> 😊<mask><mask> Join</s> <s> indeed like this movie awesome 😊<mask><mask> Join</s> <s> indeed like this sounds awesome 😊<mask><mask> Join</s> <s> indeed like this sounds awesome 😊<mask><mask> Join</s> <s> indeed like this sounds great 😊<mask><mask> Join</s> <s> indeed YES this sounds great 😊<mask><mask> Join</s> <s> indeed YES this looks great 😊<mask><mask> Join</s> <s> indeed YES this looks great 😊 �<mask> Join</s> <s>� YES this looks great 😊 �<mask> Join</s> <s>� YES Angela looks great 😊 �<mask> Join</s> <s>� YES Angela looks amazing 😊 �<mask> Join</s> <s>� YES Angela IS amazing 😊 �<mask> Join</s>
top-k를 적용하여 20회 반복
# Restart from the original prompt so this run does not continue from the
# ids left behind by earlier sampling.
input_ids = tokenizer.encode('I like this movie <mask><mask><mask><mask><mask><mask>')

# Run 20 single-position resampling steps with top-k filtering, printing the
# sentence after each.
for _ in range(20):
    # Pick one position uniformly at random, never the first or last id
    # (<s> and </s>), and replace it with the mask token.
    i = random.randint(1, SEQ_LEN - 2)
    input_ids[i] = tokenizer.mask_token_id

    # Forward pass on a batch of one sequence; keep only the logits at the
    # masked position (the batch axis is preserved).
    result = model(tf.convert_to_tensor([input_ids]))
    logits = result[0]
    logits = logits[:, i, :]

    # top-k: restrict sampling to the 50 highest-scoring candidate tokens.
    logits = tf_top_k_top_p_filtering(logits, top_k=50)
    token_id = tf.random.categorical(logits, num_samples=1)

    # Store a plain Python int so input_ids stays a homogeneous list of ints
    # rather than accumulating NumPy scalars.
    input_ids[i] = int(token_id.numpy()[0, 0])
    print(tokenizer.decode(input_ids))
<s>I like this movie<mask>!!!!<mask><mask><mask><mask></s> <s>I enjoyed this movie<mask>!!!!<mask><mask><mask><mask></s> <s>I enjoyed this movie!!!!!!!!!!!!<mask><mask><mask><mask></s> <s>I loved this movie!!!!!!!!!!!!<mask><mask><mask><mask></s> <s>I loved this movie!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>I loved this!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>I loved everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>I loved everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>I loved everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>God loved everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask><mask></s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask>!!!</s> <s>God bless everyone!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask>!!</s> <s>God bless!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<mask>!!</s>
top-p를 적용하여 20회 반복
# Restart from the original prompt so this run does not continue from the
# ids left behind by earlier sampling.
input_ids = tokenizer.encode('I like this movie <mask><mask><mask><mask><mask><mask>')

# Run 20 single-position resampling steps with top-p (nucleus) filtering,
# printing the sentence after each.
for _ in range(20):
    # Pick one position uniformly at random, never the first or last id
    # (<s> and </s>), and replace it with the mask token.
    i = random.randint(1, SEQ_LEN - 2)
    input_ids[i] = tokenizer.mask_token_id

    # Forward pass on a batch of one sequence; keep only the logits at the
    # masked position (the batch axis is preserved).
    result = model(tf.convert_to_tensor([input_ids]))
    logits = result[0]
    logits = logits[:, i, :]

    # top-p: restrict sampling to the most probable tokens whose cumulative
    # probability reaches 0.9.
    logits = tf_top_k_top_p_filtering(logits, top_p=0.9)
    token_id = tf.random.categorical(logits, num_samples=1)

    # Store a plain Python int so input_ids stays a homogeneous list of ints
    # rather than accumulating NumPy scalars.
    input_ids[i] = int(token_id.numpy()[0, 0])
    print(tokenizer.decode(input_ids))
<s>I like this movie<mask>like<mask><mask><mask><mask></s> <s>I like this movie<mask>like Dark<mask><mask><mask></s> <s>More like this movie<mask>like Dark<mask><mask><mask></s> <s>More about this movie<mask>like Dark<mask><mask><mask></s> <s>More about this movie<mask>like Dark Twisted<mask><mask></s> <s>More about this movie<mask>Pretty Dark Twisted<mask><mask></s> <s>More about this movie<mask>Pretty Dark Twisted<mask><mask></s> <s>More about this movie<mask>Pretty Dark Twisted<mask><mask></s> <s>More about this story<mask>Pretty Dark Twisted<mask><mask></s> <s>More about this story<mask>Pretty Dark Twisted<mask><mask></s> <s>More about this story<mask>Pretty Dark Side<mask><mask></s> <s>More about this story<mask> A Dark Side<mask><mask></s> <s>More about this story<mask> A Dark Side<mask> Subscribe</s> <s>More about this story<mask> A Dark Side<mask> News</s> <s>More about this story: A Dark Side<mask> News</s> <s>More about this post: A Dark Side<mask> News</s> <s>More about this gem: A Dark Side<mask> News</s> <s>More on this gem: A Dark Side<mask> News</s> <s>More on this gem: A Dark Side<mask> News</s> <s>More on this gem: A Dark Side<mask> News</s>