In order to have a good model, the first step is to have a great dataset. Here are a few sources which contain a large number of pictures of high quality:
- Open Images Dataset
- ImageNet - though it contains a bit too many pictures of dogs…
We initially save the data in ../data/train
# Root data folder; the cells below work on its `train`, `valid` and `removed` subfolders.
path = Path('../data')
We verify images and remove incorrect files.
# Collect the image files found under `path` and run them through verify_images.
fns = get_image_files(path)
# NOTE(review): `failed` is not used anywhere in the visible code, although the text
# above says incorrect files are removed. Presumably verify_images returns the files
# that failed verification and they still need to be deleted — confirm.
failed = verify_images(fns)
# show_path_images is defined outside this view; presumably it previews the images
# of the training folder — TODO confirm.
show_path_images(path / 'train')
We remove all files that are too small.
def move_file(f, init_path, dest_path):
    """Move file `f` from `init_path` to `dest_path`, keeping folder structure.

    The location of `f` relative to `init_path` is reproduced under
    `dest_path`; missing destination folders are created on the way.

    Args:
        f: Path of the file to move; it must be located under `init_path`.
        init_path: Root folder that currently contains `f`.
        dest_path: Root folder (a `Path`) that `f` is moved into.

    Raises:
        ValueError: If `f` is not located under `init_path`.
    """
    import shutil  # function-scope import so the block does not rely on file-level imports

    new_path = dest_path / f.relative_to(init_path)
    new_path.parent.mkdir(parents=True, exist_ok=True)
    print(f'Moving {f} to {new_path}')
    # Perform the actual move; shutil.move also works when source and
    # destination live on different filesystems.
    shutil.move(str(f), str(new_path))
def move_small_file_size(path, low_size_path, file_ext='jpg', size_kb=16):
    """Move every too-small `*.{file_ext}` file found under `path` to `low_size_path`.

    The folder tree is walked with pathlib instead of shelling out to `find`,
    so paths containing spaces or shell metacharacters are handled safely and
    the glob pattern cannot be expanded by a shell.

    Args:
        path: Root folder that is searched recursively.
        low_size_path: Root folder that receives the small files (the folder
            structure relative to `path` is kept by `move_file`).
        file_ext: File extension to look for, without the leading dot.
        size_kb: Size limit in KiB. The selection mirrors `find -size -{size_kb}k`:
            a file is selected when its size, rounded up to whole KiB, is
            strictly below `size_kb`, i.e. when it holds at most
            `(size_kb - 1) * 1024` bytes.
    """
    root = Path(path)
    max_bytes = (size_kb - 1) * 1024
    # Build the full list first so the tree is not modified while it is being walked.
    small_files = [
        f for f in root.rglob(f'*.{file_ext}')
        if f.is_file() and f.stat().st_size <= max_bytes
    ]
    for f in small_files:
        move_file(f, root, low_size_path)
# Destination folder for the undersized training images.
low_size_path = path/'removed'/'low_size'
# Only the training folder is scanned; the default 'jpg' extension is used.
move_small_file_size(path/'train', low_size_path)
To detect black & white images, we just need to look for images with neutral A & B channels in the LAB space.
We also take this opportunity to remove images with colorizing filters. Looking at the range of LAB values in typical images, we notice that the A & B channels usually cover the entire range [0, 255]. When this is not the case, the picture typically has a strong color accent due to a filter, which we want to avoid.
def find_black_and_white(path, black_and_white_path, display_only=True, thresh=300):
    """Find black & white pictures under `path` and optionally move them away.

    For every image, the value range (max - min) of the two entries of the AB
    part returned by the LAB pipeline is summed. Images whose summed range is
    below `thresh` are reported or moved.

    Args:
        path: Root folder whose images are scanned.
        black_and_white_path: Root folder that receives the flagged images.
        display_only: When True, flagged files are only printed; when False,
            they are moved to `black_and_white_path` with `move_file`.
        thresh: Minimum delta required on channels A & B -> delta(A) + delta(B).
    """
    # NOTE(review): the pipeline is assumed to yield (L, AB) with AB indexable as
    # [0] -> A channel and [1] -> B channel — confirm against Split_L_AB.
    getLAB = Pipeline([PILImage.create, RGBToLAB(), ToTensor(), Split_L_AB()])
    for f in progress_bar(get_image_files(path)):
        _, img_AB = getLAB(f)
        AB_delta = sum((img_AB[i].max() - img_AB[i].min()).item() for i in (0, 1))
        if AB_delta >= thresh:
            continue  # enough color variation: keep the image where it is
        if display_only:
            print(f'file {f} - AB_delta {AB_delta}')
        else:
            # Only touch the files when the caller explicitly disabled display_only.
            move_file(f, path, black_and_white_path)
# Destination folder for the images flagged as black & white.
black_and_white_path = path/'removed'/'black_and_white'
# Scan the training folder with display_only disabled.
find_black_and_white(path/'train', black_and_white_path, display_only=False)
We keep some images as part of the validation set.
Training set is assumed to be at ../data/train
and validation set at ../data/valid
def create_validation_set(train_path, valid_path, n_valid):
    """Create a validation set out of randomly chosen training images.

    Nothing is done (apart from printing a message) when `valid_path`
    already exists. Otherwise `n_valid` distinct files are drawn from the
    images found under `train_path` and handed to `move_file`, which keeps
    their folder structure under `valid_path`.
    """
    if valid_path.exists():
        print(f'Validation set already exists at {valid_path}')
        return None

    candidates = get_image_files(train_path)
    # replace=False: every file can be picked at most once
    chosen = np.random.choice(candidates, n_valid, replace=False)
    for img_file in chosen:
        move_file(img_file, train_path, valid_path)
    print(f'Validation set created at {valid_path}')
# Request a validation set of 2**11 = 2048 images drawn from the training folder.
create_validation_set(path/'train', path/'valid', 2**11)