import json


# JSON files passed to merge_json as the score source for each dataset.
clipscore_llava_path = "/mnt/petrelfs/zhuchenglin/clipscore/llava_raw_200k.json"
clipscore_coco_path = "/mnt/petrelfs/zhuchenglin/clipscore/coco_raw_200k.json"

# JSON files passed to merge_json as the caption source for each dataset.
caption_llava_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/llava_raw_200k.json"
caption_coco_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/coco_raw_200k.json"

# Output file that receives the merged, score-ranked selection.
result_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/select_raw_200k.json"


def merge_json(clipscore_path, caption_path):
    """Copy each "clipscore" value onto the caption record at the same index.

    Both files are parsed as JSON and paired strictly by position, so they
    must list their entries in the same order.

    Args:
        clipscore_path: Path to a JSON list whose items each have a
            "clipscore" key.
        caption_path: Path to a JSON list of caption records (mutable
            mappings).

    Returns:
        The parsed caption list, with a "clipscore" key set on every record.

    Raises:
        ValueError: If the two files do not contain the same number of
            entries, since positional pairing would then be unreliable.
        KeyError: If a score entry has no "clipscore" key.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(clipscore_path, encoding="utf-8") as f:
        clipscores = json.load(f)
    with open(caption_path, encoding="utf-8") as f:
        captions = json.load(f)

    # Fail fast on a mismatch: otherwise a short score file leaves some
    # captions without a "clipscore", and a long one runs off the end.
    if len(clipscores) != len(captions):
        raise ValueError(
            f"entry count mismatch: {len(clipscores)} clipscores in "
            f"{clipscore_path!r} vs {len(captions)} captions in {caption_path!r}"
        )

    for caption, clipscore_data in zip(captions, clipscores):
        caption["clipscore"] = clipscore_data["clipscore"]
    return captions


# Attach scores to both caption sets, then pool them into a single list.
data1 = merge_json(clipscore_coco_path, caption_coco_path)
data2 = merge_json(clipscore_llava_path, caption_llava_path)
combined_data = data1 + data2

# Rank by "clipscore", highest first, and keep at most the top 200000 records.
sorted_combined_data = sorted(
    combined_data, key=lambda x: x["clipscore"], reverse=True
)[:200000]

# Report how many of the selected records have an "id" beginning with "006".
# NOTE(review): the original indentation was lost; the count is assumed to be
# printed once after the scan rather than per record - confirm.
count = sum(1 for data in sorted_combined_data if data["id"][:3] == "006")
print(count)

# Write the selection out as JSON indented by four spaces.
with open(result_path, "w") as outfile:
    json.dump(sorted_combined_data, outfile, indent=4)