LLaVA / utils /expand_coco_images.py
starriver030515's picture
Upload folder using huggingface_hub
bd4d522 verified
import os
import shutil
import json
import random
# ---------------------------------------------------------------------------
# Configuration: absolute paths used by this script.
# ---------------------------------------------------------------------------
# Folder holding the generated images (the copy source).
source_folder = "/mnt/petrelfs/zhuchenglin/diffusion/coco/images/train2017"
# LLaVA training images root; per the original note it already holds the
# initial 660 numbered subfolders.
target_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/images"
# LLaVA training annotations folder.
# NOTE(review): not referenced anywhere in the visible part of this script.
target_anno_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain"
# COCO captions annotations file (JSON).
annotations_coco_path = (
    "/mnt/petrelfs/zhuchenglin/diffusion/coco/annotations/captions_train2017.json"
)

# Load the whole annotations file up front. The encoding is given explicitly
# so decoding does not depend on the machine's locale settings.
with open(annotations_coco_path, "r", encoding="utf-8") as f:
    annotations = json.load(f)

# NOTE(review): nothing is appended to this list in the visible part of the
# script; it is kept because code outside this view may rely on the name.
new_annotations = []
# Copy the image of each of the first 200,000 annotation entries into
# numbered subfolders of ``target_folder``, 10,000 entries per subfolder.
# NOTE(review): if several entries share an image_id, the same source image
# is copied once per entry under different target names -- confirm intended.
for index, annotation in enumerate(annotations["annotations"][:200000]):
    print(index)
    # 680 is the starting subfolder index; it advances every 10,000 entries.
    folder_index = 680 + (index // 10000)
    target_subfolder = f"{folder_index:05d}"
    target_subfolder_path = os.path.join(target_folder, target_subfolder)
    # Target image name: 5-digit folder index followed by the 4-digit
    # position inside that folder, e.g. 006800000.jpg.
    target_image_name = f"{folder_index:05d}{index % 10000:04d}.jpg"
    target_image_path = os.path.join(target_subfolder_path, target_image_name)
    # exist_ok=True replaces the separate exists() check: no race between
    # the check and the creation, and one path computation instead of three.
    os.makedirs(target_subfolder_path, exist_ok=True)
    # Source images are looked up by the entry's image_id, zero-padded to
    # 12 digits.
    source_image_path = os.path.join(
        source_folder, f"{annotation['image_id']:012d}.jpg"
    )
    # Entries whose source image is missing are skipped silently, which
    # leaves a gap in the target numbering for that index.
    if os.path.exists(source_image_path):
        shutil.copy(source_image_path, target_image_path)