"""Drop dataset examples whose image file cannot be decoded.

For each of the ``train``, ``valid`` and ``test`` splits this script reads
``{split}_dataset.json`` (one JSON object per line) from the prepared dataset
directory, tries to decode the file named by each example's ``image_path``
as an RGB image, and writes the examples that decoded successfully to
``{split}_dataset_filtered.json`` in the same directory.

Examples are kept or dropped based on whether the image actually decodes,
not on the file extension.
"""

import json
import os

import torch
from torchvision.io import ImageReadMode, read_image

# Directory holding the per-split JSON-lines files. Reads the USER environment
# variable at import time and raises KeyError if it is not set.
DATASET_DIR = f"/home/{os.environ['USER']}/data/wit/prepared_dataset"
SPLITS = ("train", "valid", "test")


def _load_examples(path: str) -> list:
    """Parse a JSON-lines file into a list of decoded objects.

    Blank lines are skipped so that stray empty lines do not abort the run.
    The file is read as UTF-8 because the records are written back with
    ``ensure_ascii=False`` and may therefore contain non-ASCII text.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _is_decodable(example: dict) -> bool:
    """Return True if the example's image can be decoded as RGB.

    On failure the reason is printed and False is returned.
    """
    try:
        # Only success/failure matters; the decoded tensor is discarded
        # immediately instead of being kept alive in a local.
        read_image(example["image_path"], mode=ImageReadMode.RGB)
    except Exception as e:
        # Deliberately broad: any decoding or I/O failure excludes the file
        # rather than stopping the whole filtering run.
        print(f"Excluding file: {example['image_path']} due to error: {e}")
        return False
    return True


for split in SPLITS:
    examples = _load_examples(f"{DATASET_DIR}/{split}_dataset.json")

    supported_examples = [
        json.dumps(example, ensure_ascii=False)
        for example in examples
        if _is_decodable(example)
    ]
    print(f"Total {split} examples: {len(supported_examples)}")

    # Explicit UTF-8: with ensure_ascii=False the serialized records may hold
    # characters that the locale's default encoding cannot represent.
    with open(
        f"{DATASET_DIR}/{split}_dataset_filtered.json", "w", encoding="utf-8"
    ) as f:
        f.write("\n".join(supported_examples))