import json

# Score files: JSON sequences whose entries each carry a "clipscore" value
# (read by merge_json below).
clipscore_llava_path = "/mnt/petrelfs/zhuchenglin/clipscore/llava_raw_200k.json"
clipscore_coco_path = "/mnt/petrelfs/zhuchenglin/clipscore/coco_raw_200k.json"
# Caption files: JSON sequences that merge_json pairs with the score files by
# position; the script below also reads an "id" field from each entry.
caption_llava_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/llava_raw_200k.json"
caption_coco_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/coco_raw_200k.json"
# Destination of the selected (highest-scoring) entries written at the end of
# the script.
result_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/select_raw_200k.json"

def merge_json(clipscore_path, caption_path):
    """Attach the score from each clipscore entry to the matching caption entry.

    Both files are parsed as JSON sequences and paired by position: the
    i-th entry of the score file is assumed to describe the i-th entry of
    the caption file.

    Args:
        clipscore_path: Path to a JSON file whose entries each contain a
            "clipscore" key.
        caption_path: Path to a JSON file with the caption entries (dicts).

    Returns:
        The caption entries, each extended in place with a "clipscore" key.

    Raises:
        ValueError: If the two files do not contain the same number of
            entries, since positional pairing would then be meaningless.
        KeyError: If a score entry has no "clipscore" key.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(clipscore_path, encoding="utf-8") as f:
        clipscores = json.load(f)
    with open(caption_path, encoding="utf-8") as f:
        captions = json.load(f)

    # Fail loudly on a mismatch instead of returning captions that silently
    # lack a score (or dying with a bare IndexError part-way through).
    if len(clipscores) != len(captions):
        raise ValueError(
            f"entry count mismatch: {len(clipscores)} clipscores in "
            f"{clipscore_path!r} vs {len(captions)} captions in "
            f"{caption_path!r}"
        )

    for caption, clipscore_data in zip(captions, clipscores):
        caption["clipscore"] = clipscore_data["clipscore"]
    return captions

# Load each dataset with its CLIP scores attached to the caption entries.
data1 = merge_json(clipscore_coco_path, caption_coco_path)
data2 = merge_json(clipscore_llava_path, caption_llava_path)

# Pool both datasets into a single candidate list.
combined_data = data1 + data2

# Rank the pooled entries from highest to lowest clipscore, then keep only
# the first 200k of them.
sorted_combined_data = list(combined_data)
sorted_combined_data.sort(key=lambda entry: entry["clipscore"], reverse=True)
sorted_combined_data = sorted_combined_data[:200000]

# Report how many of the kept entries have an id beginning with "006".
count = 0
for data in sorted_combined_data:
    if data["id"][:3] != "006":
        continue
    count += 1
print(count)

# Persist the selection as indented JSON.
with open(result_path, "w") as result_file:
    json.dump(sorted_combined_data, result_file, indent=4)