# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
"""
Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere.

Usage: <script> input_dir output_dir

Every ``metadata.json`` found recursively under ``input_dir`` is copied to the
same relative location under ``output_dir``, with scene-related paths rewritten
as ``<dataset key>/<path relative to the dataset root>`` for portability.
"""
import os
import glob
from tqdm import tqdm
import shutil
import json
from datasets.habitat_sim.paths import *
import argparse
import collections

# Metadata keys whose (non-empty) values are paths rewritten relative to a known dataset root.
_PATH_KEYS = ("scene_dataset_config_file", "scene", "navmesh")


def _to_portable_path(path, scenes_dataset_paths):
    """Replace the dataset root prefix of *path* by the generic dataset key.

    Args:
        path: Path string read from a metadata file.
        scenes_dataset_paths: Mapping from dataset key to dataset root path,
            iterated in order; the first root that *path* starts with wins.

    Returns:
        ``os.path.join(dataset_key, <path relative to the dataset root>)``.

    Raises:
        KeyError: If *path* does not start with any of the known dataset roots.
    """
    for dataset, dataset_path in scenes_dataset_paths.items():
        if path.startswith(dataset_path):
            return os.path.join(dataset, os.path.relpath(path, dataset_path))
    raise KeyError("Unknown path:" + path)


def main():
    """Pack all metadata files from the input directory into the output directory."""
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    args = parser.parse_args()

    input_dirname = args.input_dir
    output_dirname = args.output_dir

    input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True)

    images_count = collections.defaultdict(int)

    # Data paths are sorted by decreasing length to avoid potential bugs due to paths
    # starting by the same string pattern. This does not depend on the file being
    # processed, so it is computed once rather than for every metadata file.
    scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True))

    # No exist_ok here: this raises FileExistsError if the output directory already exists.
    os.makedirs(output_dirname)
    for input_filename in tqdm(input_metadata_filenames):
        # Skip metadata files that do not contain any view.
        with open(input_filename, "r") as f:
            original_metadata = json.load(f)
        if not original_metadata.get("multiviews"):
            print("No views in", input_filename)
            continue

        relpath = os.path.relpath(input_filename, input_dirname)
        print(relpath)

        # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability.
        metadata = dict()
        for key, value in original_metadata.items():
            if key in _PATH_KEYS and value != "":
                value = _to_portable_path(value, scenes_dataset_paths)
            metadata[key] = value

        # Compile some general statistics while packing data.
        # Scenes under "hm3d" are counted per second-level directory, others per dataset key.
        scene_split = metadata["scene"].split("/")
        upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0]
        images_count[upper_level] += len(metadata["multiviews"])

        output_filename = os.path.join(output_dirname, relpath)
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, "w") as f:
            json.dump(metadata, f)

    # Print statistics
    print("Images count:")
    for upper_level, count in images_count.items():
        print(f"- {upper_level}: {count}")


if __name__ == "__main__":
    main()