Numericalize and Padding

TransformersNumericalize

class TransformersNumericalize[source]

TransformersNumericalize(tokenizer:PreTrainedTokenizer) :: Transform

Delegates (__call__, decode, setup) to (encodes, decodes, setups) if split_idx matches

from transformers import AutoTokenizer
from fastai.text.all import TensorText
from fastcore.test import test_eq

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tok_list = ['[CLS]', 'this', 'is', 'a', 'test', '[SEP]']
num_list = TensorText([ 101, 2023, 2003, 1037, 3231,  102])
transformersNumericalizer = TransformersNumericalize(tokenizer)

test_eq(transformersNumericalizer.encodes(tok_list), num_list)
test_eq(transformersNumericalizer.decodes(num_list), '[CLS] this is a test [SEP]')
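Under the hood, a transform like this only needs to wrap two tokenizer calls: convert_tokens_to_ids for encoding and decode for decoding. Below is a minimal sketch written against fastcore's Transform and the Hugging Face tokenizer API; the name NumericalizeSketch and its body are illustrative assumptions, not the library's actual source.

from fastcore.transform import Transform
from fastai.text.all import TensorText

class NumericalizeSketch(Transform):
    "Map already-tokenized text to vocabulary ids and back (illustrative sketch)"
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, toks):
        # look each token up in the tokenizer's vocabulary
        return TensorText(self.tokenizer.convert_tokens_to_ids(toks))
    def decodes(self, ids):
        # turn ids back into a readable string, keeping special tokens
        return self.tokenizer.decode(ids)

num = NumericalizeSketch(tokenizer)
test_eq(num.encodes(tok_list), num_list)
test_eq(num.decodes(num_list), '[CLS] this is a test [SEP]')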

Pad2Max

class Pad2Max[source]

Pad2Max(max_len, pad_idx) :: Transform

Pad a rank-one tensor with pad_idx up to max_len; if the original length exceeds max_len, truncate it

import torch

pad2max = Pad2Max(10, 1)

num_list = torch.tensor([ 101, 2023, 2003, 1037, 3231,  102])
padded_num_list = torch.tensor([ 101, 2023, 2003, 1037, 3231,  102,    1,    1,    1,    1])
test_eq(pad2max(num_list), padded_num_list)

pad2max = Pad2Max(3, 1)
num_list = torch.tensor([ 101, 2023, 2003, 1037, 3231,  102])
padded_num_list = torch.tensor([ 101, 2023, 2003])
test_eq(pad2max(num_list), padded_num_list)

pad2max = Pad2Max(6, 1)
num_list = torch.tensor([ 101, 2023, 2003, 1037, 3231,  102])
padded_num_list = torch.tensor([ 101, 2023, 2003, 1037, 3231,  102])
test_eq(pad2max(num_list), padded_num_list)
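
Internally, a pad-or-truncate transform just needs to slice over-long tensors and concatenate a filled tensor onto short ones. Here is a minimal sketch assuming fastcore's Transform; the name Pad2MaxSketch and its body are illustrative assumptions rather than the library's actual implementation.

import torch
from fastcore.transform import Transform

class Pad2MaxSketch(Transform):
    "Pad a rank-one tensor with pad_idx up to max_len, truncating longer inputs (illustrative sketch)"
    def __init__(self, max_len, pad_idx): self.max_len, self.pad_idx = max_len, pad_idx
    def encodes(self, t):
        if len(t) >= self.max_len: return t[:self.max_len]        # truncate over-long sequences
        pad = t.new_full((self.max_len - len(t),), self.pad_idx)  # padding keeps t's dtype and device
        return torch.cat([t, pad])                                # pad on the right

test_eq(Pad2MaxSketch(10, 1)(num_list), torch.tensor([101, 2023, 2003, 1037, 3231, 102, 1, 1, 1, 1]))
test_eq(Pad2MaxSketch(3, 1)(num_list), torch.tensor([101, 2023, 2003]))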