data_dir = Path('../data/tiny_data/birds')

Items

s = "This��small��bird��has��a��white��belly,��black"
test_eq(preprocess_text(s), 'this small bird has a white belly black')
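preprocess_text itself isn't defined in this section. A minimal sketch that is consistent with the test above and with the captions shown below (lowercase, drop punctuation, collapse any whitespace run, including non-breaking spaces); the real implementation may differ:

import re

def preprocess_text(s):
    # Drop punctuation, lowercase, and collapse all whitespace runs
    # (including non-breaking spaces) into single spaces.
    s = re.sub(r'[^\w\s]', '', s.lower())
    return ' '.join(s.split())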
get_caps(data_dir/'caps', '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg')
['light tan colored bird with a white head and an orange beak',
 'the bird has a very thick curved and beige beak',
 'this bird has a long neck that is grainy and a pastel orangeblue narrow beak that droops down at the tip',
 'this bird is light brown has a long hooked bill and looks dumb',
 'this large white bird has a large curved bill and a brown eye',
 'this bird is white with grey and has a long pointy beak',
 'this bird is white with grey and has a long pointy beak',
 'the crown of the bird is white with light brown tones throughout',
 'the crown of the bird has distinctive tones of white and brown throughout',
 'this bird has a long neck and an orange bill']
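get_caps is only shown in use. A sketch, assuming one .txt caption file per image (mirroring the image's relative path, one raw caption per line):

def get_caps(caps_dir, img_rel_path):
    # Assumed layout: caps_dir mirrors the image folders, with one .txt file
    # per image and one raw caption per line.
    cap_file = (Path(caps_dir)/img_rel_path).with_suffix('.txt')
    return [preprocess_text(l) for l in cap_file.read_text().splitlines() if l.strip()]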
train_items, valid_items = get_items(data_dir)
len(train_items), train_items[0]
(282,
 (Path('../data/tiny_data/birds/imgs/001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg'),
  (#4) [60.0,27.0,325.0,304.0],
  ['light tan colored bird with a white head and an orange beak',
   'the bird has a very thick curved and beige beak',
   'this bird has a long neck that is grainy and a pastel orangeblue narrow beak that droops down at the tip',
   'this bird is light brown has a long hooked bill and looks dumb',
   'this large white bird has a large curved bill and a brown eye',
   'this bird is white with grey and has a long pointy beak',
   'this bird is white with grey and has a long pointy beak',
   'the crown of the bird is white with light brown tones throughout',
   'the crown of the bird has distinctive tones of white and brown throughout',
   'this bird has a long neck and an orange bill']))
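get_items returns (train_items, valid_items), where each item is (image path, bounding box, captions). A sketch assuming CUB-200-style metadata files; the file names and layout are assumptions, and the commented-out split_items kept below shows an earlier version of the split logic:

def get_items(data_dir):
    # Assumed CUB-style metadata: images.txt, bounding_boxes.txt and
    # train_test_split.txt, each keyed by image id.
    data_dir = Path(data_dir)
    imgs   = dict(l.split() for l in (data_dir/'images.txt').read_text().splitlines())
    bboxes = {l.split()[0]: [float(x) for x in l.split()[1:]]
              for l in (data_dir/'bounding_boxes.txt').read_text().splitlines()}
    splits = dict(l.split() for l in (data_dir/'train_test_split.txt').read_text().splitlines())
    train_items, valid_items = [], []
    for img_id, rel_path in imgs.items():
        item = (data_dir/'imgs'/rel_path, bboxes[img_id], get_caps(data_dir/'caps', rel_path))
        (train_items if splits[img_id] == '1' else valid_items).append(item)
    return train_items, valid_items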
# # exporti
# def split_items(items):
#     ''' items: List of (split, img_loc, bbox, caps)
#         returns: (train_items, valid_items), each a List of (img_loc, bbox, caps) '''
#     train_items = []
#     valid_items = []
#     for item in items:
#         if item[0]=='1':
#             train_items.append((item[1], item[2], item[3]))
#         elif item[0]=='0':
#             valid_items.append((item[1], item[2], item[3]))
#         else:
#             raise Exception('Oops!!!')
#     return train_items, valid_items
# train_items, valid_items = split_items(items)
# test_eq(len(train_items), 150)
# test_eq(len(valid_items), 132)
# train_items[0]

Datasets

tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

tokenizer.encode('This is a cat', max_length=2, padding='max_length', truncation=True, add_special_tokens=False)
[48, 25]

class BertTokenizer[source]

BertTokenizer()

tokenizer = BertTokenizer()
ori_str = 'the bird has a very thick curved and beige beak'
out, tok_len = tokenizer.encode(ori_str)
test_eq(out, [14, 2838, 63, 21, 253, 2318, 9279, 17, 44, 49, 834, 44, 1378, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
test_eq(tok_len, 13)
test_eq(len(out), tokenizer.max_seq_len)
test_eq(tokenizer.decode(out), ori_str)
tokenizer.pad_id, tokenizer.vocab_sz
(0, 30000)
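The [source] link above points at the real BertTokenizer; from the tests it wraps the same albert-base-v2 tokenizer with a fixed max_seq_len of 25 and zero padding. A minimal sketch of the behaviour the tests check (the class body itself is an assumption):

class BertTokenizer:
    def __init__(self, max_seq_len=25):
        # Wrap the albert-base-v2 tokenizer used above; 25 is inferred from
        # len(out) in the tests.
        self.tok = AutoTokenizer.from_pretrained('albert-base-v2')
        self.max_seq_len = max_seq_len
        self.pad_id = self.tok.pad_token_id    # 0
        self.vocab_sz = self.tok.vocab_size    # 30000

    def encode(self, s):
        # Truncate to max_seq_len, pad with pad_id, and return the unpadded length.
        ids = self.tok.encode(s, add_special_tokens=False)[:self.max_seq_len]
        tok_len = len(ids)
        return ids + [self.pad_id]*(self.max_seq_len - tok_len), tok_len

    def decode(self, ids):
        # Strip padding before decoding back to text.
        return self.tok.decode([i for i in ids if i != self.pad_id])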
dset = BirdsDataset(train_items)
cap, cap_len, img = dset[0]
test_eq(cap.shape, (dset.tokenizer.max_seq_len,))
test_eq(cap_len.shape, ())
test_eq(img.shape, (256, 256, 3))

print(cap, cap_len)
plt.imshow(img)
tensor([  48, 2838,   63,   21,  175, 1425,   17,   40, 2987, 1071,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]) tensor(10)
<matplotlib.image.AxesImage at 0x7efb7254aa90>

class Datasets[source]

Datasets(data_dir, pct=1)

dsets = Datasets(data_dir)
test_eq(len(dsets.train), 282)
test_eq(len(dsets.valid), 0)
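Datasets just builds the two splits from get_items; a sketch assuming pct subsamples each split:

class Datasets:
    def __init__(self, data_dir, pct=1):
        train_items, valid_items = get_items(data_dir)
        # pct < 1 keeps only a fraction of each split (an assumed meaning of pct).
        train_items = train_items[:int(len(train_items)*pct)]
        valid_items = valid_items[:int(len(valid_items)*pct)]
        self.train, self.valid = BirdsDataset(train_items), BirdsDataset(valid_items)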

DataLoaders

class DataLoaders[source]

DataLoaders(dsets, bs=64)

dls = DataLoaders(dsets, bs=16)
for cap, cap_len, img in dls.train:
    test_eq(cap.shape, (16, dls.train.dataset.tokenizer.max_seq_len))
    test_eq(cap_len.shape, (16,))
    test_eq(img.shape, (16, 256, 256, 3))
    break
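DataLoaders wraps the two datasets in batch loaders. A sketch assuming plain PyTorch DataLoaders; shuffling and drop_last on the training side are assumptions, but they match the fixed batch shapes checked above:

import torch

class DataLoaders:
    def __init__(self, dsets, bs=64):
        # dls.train.dataset (used in the test above) is exposed by DataLoader itself.
        self.train = torch.utils.data.DataLoader(dsets.train, batch_size=bs,
                                                 shuffle=True, drop_last=True)
        self.valid = torch.utils.data.DataLoader(dsets.valid, batch_size=bs)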