import json
import os

import torch
from torchvision.io import ImageReadMode, read_image
# SUPPORTED_EXTENSIONS = {'PNG', 'JPG', 'png', 'JPEG', 'jpg', 'jpeg'}
# Filter every dataset split down to the examples whose image can actually be
# decoded, and write the survivors to "<split>_dataset_filtered.json" next to
# the input file.
dataset_dir = f"/home/{os.environ['USER']}/data/wit/prepared_dataset"

for split in ["train", "valid", "test"]:
    # The input is read as one JSON object per line. UTF-8 is requested
    # explicitly so the result does not depend on the machine's locale.
    with open(f"{dataset_dir}/{split}_dataset.json", encoding="utf-8") as f:
        # Blank lines (e.g. a stray trailing newline) are skipped rather than
        # handed to json.loads, which would raise on them.
        examples = [json.loads(line) for line in f if line.strip()]

    supported_examples = []
    for example in examples:
        image_path = example["image_path"]
        try:
            # Decoding is the check itself; the decoded tensor is not needed.
            read_image(image_path, mode=ImageReadMode.RGB)
        except Exception as e:
            # Deliberately broad: any failure to read or decode the image
            # means the example is excluded; the run keeps going.
            print(f"Excluding file: {image_path} due to error: {e}")
        else:
            supported_examples.append(json.dumps(example, ensure_ascii=False))

    print(f"Total {split} examples: {len(supported_examples)}")

    # ensure_ascii=False above emits raw non-ASCII characters, so the output
    # file must be opened as UTF-8 to be written reliably.
    with open(f"{dataset_dir}/{split}_dataset_filtered.json", "w", encoding="utf-8") as f:
        f.write("\n".join(supported_examples))