import torch

from fastai2.text.all import *
from transformers import AutoModelWithLMHead, AutoTokenizer

from fastai_transformers_utils.generated_lm import GeneratedLM, GenerateArgs


# all_slow

Used on Huggingface's Model

# load pretrained model and vocab
lm = AutoModelWithLMHead.from_pretrained('distilgpt2')
lm.eval()
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
RobertaTokenizerFast has an issue when working on mask language modeling where it introduces an extra encoded space before the mask token. See https://github.com/huggingface/transformers/pull/2778 for more information.
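
Note that GPT-2 checkpoints ship without a padding token, which is what triggers the pad_token warnings during generation below. A quick check (illustrative, using the objects loaded above):

print(tokenizer.pad_token, lm.config.pad_token_id)  # both None for distilgpt2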
num_returns = 3
sentence = 'The dog'
tgt = torch.tensor([tokenizer.encode(sentence)] * num_returns)

generate_args = GenerateArgs(
    max_length=20,          # stop once the sequence (prompt included) reaches 20 tokens
    do_sample=True,         # sample from the distribution rather than decode greedily
    num_beams=5,            # beam-search width
    temperature=1.5,        # >1.0 flattens the distribution for more varied samples
    top_k=50,               # restrict sampling to the 50 most likely tokens
    top_p=1.0,              # nucleus sampling effectively disabled at 1.0
    repetition_penalty=1.0, # 1.0 = no penalty on repeated tokens
    length_penalty=1.0,     # neutral length penalty for beam search
)
generated_lm = GeneratedLM(lm, tokenizer.vocab_size, lm.config.pad_token_id, [lm.config.eos_token_ids], True)
numeric_result = generated_lm.generate(tgt, generate_args)

for i in range(num_returns):
    result = tokenizer.decode(list(numeric_result[i]), skip_special_tokens=True)
    print(result)
Using pad_token, but it is not set yet.
Disabled padding because no padding token set (pad_token: None, pad_token_id: None).
To remove this error, you can add a new pad token and then resize model embedding:
	tokenizer.pad_token = '<PAD>'
	model.resize_token_embeddings(len(tokenizer))
The dog!!!!!!!!!!!!!!!!!!
The dog is extremely popular and often gets the respect to it.SOULIER:!
The dog you're chasing? That's my favorite dog!!!!!!!!!
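
The warnings above also spell out the fix: register a pad token and resize the embedding matrix. A minimal sketch (not run here; note that adding '<PAD>' grows the vocabulary by one):

tokenizer.pad_token = '<PAD>'                    # register a padding token with the tokenizer
lm.resize_token_embeddings(len(tokenizer))       # grow the model's embedding matrix to match
lm.config.pad_token_id = tokenizer.pad_token_id  # tell generation which id to pad with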

Used on Fastai2 AWD_LSTM

# load pretrained model and vocab
path = untar_data(URLs.WT103_FWD)
vocab = list(path.glob('*.pkl'))[0].load()
model_weights = torch.load(list(path.glob('*.pth'))[0], map_location = lambda storage,loc: storage)
model = get_language_model(AWD_LSTM, len(vocab))
load_ignore_keys(model, model_weights)
<All keys matched successfully>
# Tokenize and Numericalize
tokenizer = Tokenizer(SpacyTokenizer())
numericalizer = Numericalize(vocab=vocab)
pipe = Pipeline([tokenizer, numericalizer], True)

num_returns = 2
sentence = 'The dog'
tgt = torch.stack([pipe(sentence)] * num_returns, dim=0)
tgt
tensor([[   2,    5,    9, 2235],
        [   2,    5,    9, 2235]])
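Pipeline.decode reverses both transforms, so the numericalized prompt can be round-tripped as a sanity check (illustrative; compare with the generated output below):

pipe.decode(list(tgt[0]))  # -> 'xxbos xxmaj the dog'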
# Generate and Decode
generate_args = GenerateArgs(
    max_length=20,
    do_sample=True,
    num_beams=5,
    temperature=1.5,
    top_k=50,
    top_p=1.0,
    repetition_penalty=1.0,
    length_penalty=1.0,
)
generated_lm = GeneratedLM(model, len(vocab), awd_lstm_lm_config['pad_token'], [3], False)
numeric_result = generated_lm.generate(tgt, generate_args)

for i in range(num_returns):
    result = pipe.decode(list(numeric_result[i]))
    print(result)
xxbos xxmaj the dog - like actors of the 1960s films ; he said there were films in the xxeos
xxbos xxmaj the dog show is performed during the day . xxunk is considered as one and they should xxeos
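
For reference, the xx tokens are fastai's special tokens: xxbos marks the beginning of a stream, xxmaj flags that the next word was capitalized in the original text, xxunk stands in for out-of-vocabulary words, and xxeos marks the end of a stream.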