data_dir = Path('../data/tiny_data/birds')
s = "This��small��bird��has��a��white��belly,��black"
test_eq(preprocess_text(s), 'this small bird has a white belly black')
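# A minimal sketch consistent with the test above (the notebook's actual
# preprocess_text is defined in an earlier export cell and may differ):
# keep letters only, lowercase, and collapse runs of whitespace/garbage.
import re

def preprocess_text_sketch(s):
    return ' '.join(re.sub(r'[^a-zA-Z]+', ' ', s).lower().split())

test_eq(preprocess_text_sketch(s), 'this small bird has a white belly black')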
get_caps(data_dir/'caps', '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg')
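# Rough sketch of what get_caps appears to do, assuming the standard CUB caption
# layout of one matching <image>.txt file with one caption per line; the real
# get_caps (defined earlier in the notebook) may differ in details:
def get_caps_sketch(caps_dir, img_loc):
    txt = (Path(caps_dir)/img_loc).with_suffix('.txt')
    return [preprocess_text(line) for line in txt.read_text().splitlines() if line.strip()]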
train_items, valid_items = get_items(data_dir)
len(train_items), train_items[0]
# # exporti
# def split_items(items):
#     ''' items: List of (split, img_loc, bbox, caps)
#         returns: train_items, valid_items (List of (img_loc, bbox, caps)) '''
#     train_items = []
#     valid_items = []
#     for item in items:
#         if item[0] == '1':
#             train_items.append((item[1], item[2], item[3]))
#         elif item[0] == '0':
#             valid_items.append((item[1], item[2], item[3]))
#         else:
#             raise Exception('Oops: unexpected split flag!')
#     return train_items, valid_items
#
# train_items, valid_items = split_items(items)
# test_eq(len(train_items), 150)
# test_eq(len(valid_items), 132)
# train_items[0]
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
tokenizer.encode('This is a cat', max_length=2, pad_to_max_length=True, add_special_tokens=False)
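# pad_to_max_length is deprecated in newer transformers releases; the equivalent
# call with the current API (behaviour should match the cell above):
tokenizer.encode('This is a cat', max_length=2, padding='max_length',
                 truncation=True, add_special_tokens=False)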
tokenizer = BertTokenizer()
ori_str = 'the bird has a very thick curved and beige beak'
out, tok_len = tokenizer.encode(ori_str)
test_eq(out, [14, 2838, 63, 21, 253, 2318, 9279, 17, 44, 49, 834, 44, 1378, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
test_eq(tok_len, 13)
test_eq(len(out), tokenizer.max_seq_len)
test_eq(tokenizer.decode(out), ori_str)
tokenizer.pad_id, tokenizer.vocab_sz
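# The tests above pin down the contract of the custom BertTokenizer: encode()
# returns ids padded/truncated to max_seq_len plus the true token length. A
# minimal sketch of that padding behaviour (illustration only, not the class itself):
def pad_to_max(ids, max_seq_len, pad_id=0):
    ids = ids[:max_seq_len]                                   # truncate long captions
    return ids + [pad_id] * (max_seq_len - len(ids)), len(ids)

test_eq(pad_to_max([14, 2838, 63], 5), ([14, 2838, 63, 0, 0], 3))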
dset = BirdsDataset(train_items)
cap, cap_len, img = dset[0]
test_eq(cap.shape, (dset.tokenizer.max_seq_len,))
test_eq(cap_len.shape, ())
test_eq(img.shape, (256, 256, 3))
print(cap, cap_len)
plt.imshow(img)
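# Rough sketch of the (cap, cap_len, img) item layout BirdsDataset produces.
# Assumptions (not read from the source): images live under data_dir/'images',
# bbox is (x, y, w, h), and one caption is sampled at random per item.
import random
import numpy as np
from PIL import Image

class BirdsDatasetSketch:
    def __init__(self, items, tokenizer, data_dir, img_size=256):
        self.items, self.tokenizer = items, tokenizer
        self.data_dir, self.img_size = Path(data_dir), img_size

    def __len__(self):
        return len(self.items)

    def __getitem__(self, i):
        img_loc, bbox, caps = self.items[i]
        cap, cap_len = self.tokenizer.encode(random.choice(caps))    # one caption per draw
        img = Image.open(self.data_dir/'images'/img_loc).convert('RGB')
        if bbox is not None:                                          # crop to the bird
            x, y, w, h = bbox
            img = img.crop((x, y, x + w, y + h))
        img = img.resize((self.img_size, self.img_size))
        return np.array(cap), np.array(cap_len), np.asarray(img)      # (max_seq_len,), (), (256, 256, 3)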
dsets = Datasets(data_dir)
test_eq(len(dsets.train), 282)
test_eq(len(dsets.valid), 0)
dls = DataLoaders(dsets, bs=16)
for cap, cap_len, img in dls.train:
    test_eq(cap.shape, (16, dls.train.dataset.tokenizer.max_seq_len))
    test_eq(cap_len.shape, (16,))
    test_eq(img.shape, (16, 256, 256, 3))
    break
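# Because every item already has a fixed shape, plain stacking is enough to collate
# batches. A sketch assuming DataLoaders wraps torch.utils.data.DataLoader
# (drop_last keeps the (16, ...) shapes asserted above exact on the last batch):
from torch.utils.data import DataLoader

def make_dls_sketch(train_ds, valid_ds, bs=16):
    return (DataLoader(train_ds, batch_size=bs, shuffle=True, drop_last=True),
            DataLoader(valid_ds, batch_size=bs, shuffle=False))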