clip-spanish / discard_incorrect_files.py

Add training scripts and initial model trained on 1% of the data.

8e2b754 over 3 years ago

931 Bytes

	import json
	import os

	import torch
	from torchvision.io import ImageReadMode, read_image

	# SUPPORTED_EXTENSIONS = {'PNG', 'JPG', 'png', 'JPEG', 'jpg', 'jpeg'}

	# Filter each dataset split down to the examples whose image file can be
	# decoded, and write the survivors to "<split>_dataset_filtered.json".
	#
	# Input and output are JSON-lines files (one JSON object per line). Each
	# object must have an "image_path" key pointing at the image to check.
	# The directory is built once because it is identical for every split.
	dataset_dir = f"/home/{os.environ['USER']}/data/wit/prepared_dataset"

	for split in ["train", "valid", "test"]:
	    # UTF-8 is set explicitly: the records are re-serialized below with
	    # ensure_ascii=False, so relying on the locale's default encoding could
	    # fail on non-ASCII text. Blank lines are skipped so that a stray or
	    # trailing empty line does not abort the run inside json.loads.
	    with open(f"{dataset_dir}/{split}_dataset.json", encoding="utf-8") as f:
	        examples = [json.loads(line) for line in f if line.strip()]

	    supported_examples = []
	    for example in examples:
	        image_path = example["image_path"]
	        try:
	            # Decoding the file is the validity check; the decoded tensor
	            # itself is not needed, so it is discarded immediately.
	            read_image(image_path, mode=ImageReadMode.RGB)
	        except Exception as e:
	            # Deliberately broad: this is a best-effort filter, so any
	            # failure to read/decode the image just excludes the example.
	            print(f"Excluding file: {image_path} due to error: {e}")
	        else:
	            supported_examples.append(json.dumps(example, ensure_ascii=False))

	    print(f"Total {split} examples: {len(supported_examples)}")
	    # Records are newline-separated with no trailing newline, as before.
	    with open(f"{dataset_dir}/{split}_dataset_filtered.json", "w", encoding="utf-8") as f:
	        f.write("\n".join(supported_examples))