"""Merge CLIPScore results into caption records and keep the top-scoring ones.

For each dataset, a clipscore JSON file and a caption JSON file are loaded and
the ``clipscore`` of every score record is copied onto the caption record at
the same list position. The merged records of both datasets are concatenated,
sorted by ``clipscore`` in descending order, truncated to ``TOP_K`` entries and
written to ``result_path``.
"""
import json

clipscore_llava_path = "/mnt/petrelfs/zhuchenglin/clipscore/llava_raw_200k.json"
clipscore_coco_path = "/mnt/petrelfs/zhuchenglin/clipscore/coco_raw_200k.json"
caption_llava_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/llava_raw_200k.json"
caption_coco_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/coco_raw_200k.json"
result_path = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/select_raw_200k.json"

# Number of highest-scoring records kept in the output file.
TOP_K = 200000


def merge_json(clipscore_path, caption_path):
    """Copy each ``clipscore`` onto the caption record at the same index.

    Both files must contain JSON lists of equal length; records are paired
    purely by position, so the two files have to be in the same order.

    Args:
        clipscore_path: Path to a JSON list of objects with a ``clipscore`` key.
        caption_path: Path to a JSON list of caption objects.

    Returns:
        The list loaded from ``caption_path``, with a ``clipscore`` key added
        to every element.

    Raises:
        ValueError: If the two lists differ in length. Pairing by position
            would otherwise either fail with an ``IndexError`` or silently
            leave trailing captions without a score.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(clipscore_path, encoding="utf-8") as f:
        clipscores = json.load(f)
    with open(caption_path, encoding="utf-8") as f:
        captions = json.load(f)

    if len(clipscores) != len(captions):
        raise ValueError(
            "record count mismatch: {} clipscores in {!r} vs {} captions in {!r}".format(
                len(clipscores), clipscore_path, len(captions), caption_path
            )
        )

    for caption, clipscore_data in zip(captions, clipscores):
        caption["clipscore"] = clipscore_data["clipscore"]
    return captions


def _select_top(records, limit):
    """Return the ``limit`` records with the highest ``clipscore``, best first."""
    return sorted(records, key=lambda x: x["clipscore"], reverse=True)[:limit]


def _count_id_prefix(records, prefix):
    """Return how many records have an ``id`` string starting with ``prefix``."""
    return sum(1 for record in records if record["id"].startswith(prefix))


def main():
    """Merge both datasets, keep the top ``TOP_K`` by clipscore and save them."""
    # Merge JSON data
    data1 = merge_json(clipscore_coco_path, caption_coco_path)
    data2 = merge_json(clipscore_llava_path, caption_llava_path)

    # Combine the data
    combined_data = data1 + data2

    # Sort combined data by clipscore in descending order and take the top TOP_K
    sorted_combined_data = _select_top(combined_data, TOP_K)

    # NOTE(review): ids starting with "006" presumably identify one of the two
    # source datasets -- confirm against the caption files.
    count = _count_id_prefix(sorted_combined_data, "006")
    print(count)

    # Save the result to a JSON file
    with open(result_path, "w", encoding="utf-8") as outfile:
        json.dump(sorted_combined_data, outfile, indent=4)


if __name__ == "__main__":
    main()