File size: 1,069 Bytes
8e2b754 4463ade 8e2b754 2daf3c7 8e2b754 2daf3c7 8e2b754 2daf3c7 8e2b754 2daf3c7 4463ade 2daf3c7 8e2b754 4463ade 8e2b754 2daf3c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import json
import os
from tqdm import tqdm
JOINT_JSON_DIRECTORY = f"/home/{os.environ['USER']}/data/wit/all_jsons"
SCALE_CONVERTED_DIRECTORY = f"/home/{os.environ['USER']}/data/wit_scale_converted"
def remap_supported_examples(examples, valid_files, target_directory):
    """Keep examples whose image file is available and re-point their path.

    Args:
        examples: Iterable of dicts, each with an ``"image_path"`` entry.
            Dicts that are kept are updated in place.
        valid_files: Collection of bare file names (no directory part) that
            are considered available.
        target_directory: Directory that replaces the original directory part
            of every kept ``"image_path"``.

    Returns:
        A list of JSON strings, one per kept example, in input order.
        Non-ASCII characters are written as-is (``ensure_ascii=False``).

    Raises:
        KeyError: If an example has no ``"image_path"`` entry.
    """
    supported = []
    for example in examples:
        # Only the file name matters for matching; the original directory
        # part is discarded.
        filename = os.path.basename(example["image_path"])
        if filename in valid_files:
            example["image_path"] = os.path.join(target_directory, filename)
            supported.append(json.dumps(example, ensure_ascii=False))
    return supported


def main():
    """Rewrite each split's JSON file so image paths point at the converted images.

    For every split, reads ``<split>_dataset_all_98_1_1_split.json`` from
    ``JOINT_JSON_DIRECTORY``, drops examples whose image file name is not
    present in ``SCALE_CONVERTED_DIRECTORY``, and writes the remaining
    examples (one JSON object per line, no trailing newline) to
    ``<split>_dataset_scale_converted_98_1_1_split.json`` in
    ``SCALE_CONVERTED_DIRECTORY``.
    """
    # The listing is the same for every split, so read the directory once
    # instead of once per split.
    valid_files = set(os.listdir(SCALE_CONVERTED_DIRECTORY))

    for split in ["train", "valid", "test"]:
        print("Reading json")
        source_path = f"{JOINT_JSON_DIRECTORY}/{split}_dataset_all_98_1_1_split.json"
        # Explicit UTF-8: the records are written back with ensure_ascii=False,
        # so relying on the locale's default encoding can corrupt or reject
        # non-ASCII text.
        with open(source_path, encoding="utf-8") as f:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads fail.
            examples = [json.loads(line) for line in f if line.strip()]

        supported_examples = remap_supported_examples(
            tqdm(examples), valid_files, SCALE_CONVERTED_DIRECTORY
        )
        print(f"Total {split} examples: {len(supported_examples)}")

        with open(
            f"{SCALE_CONVERTED_DIRECTORY}/{split}_dataset_scale_converted_98_1_1_split.json",
            "w",
            encoding="utf-8",
        ) as f:
            f.write("\n".join(supported_examples))

    print("DONE!")


if __name__ == "__main__":
    main()
|