import torch

from fastai2.text.all import *
from transformers import AutoModelWithLMHead, AutoTokenizer

from fastai_transformers_utils.generated_lm import GeneratedLM, GenerateArgs


# all_slow

Used on Huggingface's Model

# load pretrained model and vocab
lm = AutoModelWithLMHead.from_pretrained('distilgpt2')
lm.eval()
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
RobertaTokenizerFast has an issue when working on mask language modeling where it introduces an extra encoded space before the mask token. See https://github.com/huggingface/transformers/pull/2778 for more information.
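
Note that GPT-2 checkpoints ship without a padding token, which is what triggers the pad_token warnings during generation below. A quick check (illustrative, using the objects loaded above):

print(tokenizer.pad_token, lm.config.pad_token_id)  # both None for distilgpt2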
num_returns = 3
sentence = 'The dog'
tgt = torch.tensor([tokenizer.encode(sentence)] * num_returns)

generate_args = GenerateArgs(
    max_length=20,          # stop once the sequence (prompt included) reaches 20 tokens
    do_sample=True,         # sample from the distribution rather than decode greedily
    num_beams=5,            # beam-search width
    temperature=1.5,        # >1.0 flattens the distribution for more varied samples
    top_k=50,               # restrict sampling to the 50 most likely tokens
    top_p=1.0,              # nucleus sampling effectively disabled at 1.0
    repetition_penalty=1.0, # 1.0 = no penalty on repeated tokens
    length_penalty=1.0,     # neutral length penalty for beam search
)
generated_lm = GeneratedLM(lm, tokenizer.vocab_size, lm.config.pad_token_id, [lm.config.eos_token_ids], True)
numeric_result = generated_lm.generate(tgt, generate_args)

for i in range(num_returns):
    result = tokenizer.decode(list(numeric_result[i]), skip_special_tokens=True)
    print(result)
Using pad_token, but it is not set yet.
Disabled padding because no padding token set (pad_token: None, pad_token_id: None).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
The dog!!!!!!!!!!!!!!!!!!
The dog is extremely popular and often gets the respect to it.SOULIER:!
The dog you're chasing? That's my favorite dog!!!!!!!!!
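
The warnings above also spell out the fix: register a pad token and resize the embedding matrix. A minimal sketch (not run here; note that adding '<PAD>' grows the vocabulary by one):

tokenizer.pad_token = '<PAD>'                    # register a padding token with the tokenizer
lm.resize_token_embeddings(len(tokenizer))       # grow the model's embedding matrix to match
lm.config.pad_token_id = tokenizer.pad_token_id  # tell generation which id to pad with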

Used on Fastai2 AWD_LSTM

# load pretrained model and vocab
path = untar_data(URLs.WT103_FWD)
vocab = list(path.glob('*.pkl'))[0].load()
model_weights = torch.load(list(path.glob('*.pth'))[0], map_location = lambda storage,loc: storage)
model = get_language_model(AWD_LSTM, len(vocab))
load_ignore_keys(model, model_weights)
<All keys matched successfully>
# Tokenize and Numericalize
tokenizer = Tokenizer(SpacyTokenizer())
numericalizer = Numericalize(vocab=vocab)
pipe = Pipeline([tokenizer, numericalizer], True)

num_returns = 2
sentence = 'The dog'
tgt = torch.stack([pipe(sentence)] * num_returns, dim=0)
tgt
tensor([[   2,    5,    9, 2235],
        [   2,    5,    9, 2235]])
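Pipeline.decode reverses both transforms, so the numericalized prompt can be round-tripped as a sanity check (illustrative; compare with the generated output below):

pipe.decode(list(tgt[0]))  # -> 'xxbos xxmaj the dog'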
# Generate and Decode
generate_args = GenerateArgs(
    max_length=20,
    do_sample=True,
    num_beams=5,
    temperature=1.5,
    top_k=50,
    top_p=1.0,
    repetition_penalty=1.0,
    length_penalty=1.0,
)
generated_lm = GeneratedLM(model, len(vocab), awd_lstm_lm_config['pad_token'], [3], False)
numeric_result = generated_lm.generate(tgt, generate_args)

for i in range(num_returns):
    result = pipe.decode(list(numeric_result[i]))
    print(result)
xxbos xxmaj the dog - like actors of the 1960s films ; he said there were films in the xxeos
xxbos xxmaj the dog show is performed during the day . xxunk is considered as one and they should xxeos
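
For reference, the xx tokens are fastai's special tokens: xxbos marks the beginning of a stream, xxmaj flags that the next word was capitalized in the original text, xxunk stands in for out-of-vocabulary words, and xxeos marks the end of a stream.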