Numericalization and Padding
TransformersNumericalize
TransformersNumericalize turns a list of tokens into their vocabulary ids using a Hugging Face tokenizer, and decodes ids back into a readable string.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tok_list = ['[CLS]', 'this', 'is', 'a', 'test', '[SEP]']
num_list = TensorText([101, 2023, 2003, 1037, 3231, 102])

transformersNumericalizer = TransformersNumericalize(tokenizer)
# encodes maps tokens -> ids; decodes maps ids -> a readable string
test_eq(transformersNumericalizer.encodes(tok_list), num_list)
test_eq(transformersNumericalizer.decodes(num_list), '[CLS] this is a test [SEP]')
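For context, a transform with this behavior takes only a few lines of fastai. The sketch below is an illustration, not the library's implementation: the class name TokensToIds is hypothetical, and it assumes the tokenizer exposes convert_tokens_to_ids, convert_ids_to_tokens, and convert_tokens_to_string (all standard Hugging Face tokenizer methods).

from fastai.text.all import Transform, TensorText, TitledStr

class TokensToIds(Transform):
    "Hypothetical stand-in showing how a numericalizing Transform can work"
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, toks):
        # Look up each token's vocabulary id
        return TensorText(self.tokenizer.convert_tokens_to_ids(toks))
    def decodes(self, ids):
        # Recover the tokens and join them into a readable string
        toks = self.tokenizer.convert_ids_to_tokens(ids.tolist())
        return TitledStr(self.tokenizer.convert_tokens_to_string(toks))

With the BERT tokenizer loaded above, TokensToIds(tokenizer).encodes(tok_list) should reproduce the ids checked in the test.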
Pad2Max
Pad2Max(max_len, pad_idx) brings every sequence to a fixed length max_len, padding with pad_idx.

import torch

# A sequence shorter than max_len=10 is right-padded with pad_idx=1
pad2max = Pad2Max(10, 1)
num_list = torch.tensor([101, 2023, 2003, 1037, 3231, 102])
padded_num_list = torch.tensor([101, 2023, 2003, 1037, 3231, 102, 1, 1, 1, 1])
test_eq(pad2max(num_list), padded_num_list)
# A sequence longer than max_len=3 is truncated to the first max_len ids
pad2max = Pad2Max(3, 1)
num_list = torch.tensor([101, 2023, 2003, 1037, 3231, 102])
padded_num_list = torch.tensor([101, 2023, 2003])
test_eq(pad2max(num_list), padded_num_list)
# A sequence of exactly max_len=6 passes through unchanged
pad2max = Pad2Max(6, 1)
num_list = torch.tensor([101, 2023, 2003, 1037, 3231, 102])
padded_num_list = torch.tensor([101, 2023, 2003, 1037, 3231, 102])
test_eq(pad2max(num_list), padded_num_list)
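The three cases above (pad, truncate, pass through) amount to a few lines of plain PyTorch. The sketch below is illustrative, not Pad2Max's actual code, and pad_to_max is a hypothetical name:

import torch
import torch.nn.functional as F

def pad_to_max(t, max_len, pad_idx):
    # Truncate sequences longer than max_len...
    if len(t) >= max_len: return t[:max_len]
    # ...and right-pad shorter ones with pad_idx
    return F.pad(t, (0, max_len - len(t)), value=pad_idx)

t = torch.tensor([101, 2023, 2003, 1037, 3231, 102])
test_eq(pad_to_max(t, 10, 1), torch.tensor([101, 2023, 2003, 1037, 3231, 102, 1, 1, 1, 1]))
test_eq(pad_to_max(t, 3, 1), torch.tensor([101, 2023, 2003]))
test_eq(pad_to_max(t, 6, 1), t)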