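"""Build a per-video CHAIR report.

Combines the per-video coverage and hallucination detail files with the
prediction and ground-truth caption JSONs, and writes one record per
object_id containing per-bucket coverage/hallucination scores, the predicted
and ground-truth tags, the model caption (masp_inference), and the refined
ground-truth caption (refine_caption).
"""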
import argparse
import collections
import json
from pathlib import Path


def eval_video_chair(file_name, metric):
    """Compute per-bucket coverage or hallucination percentages from a detail file.

    Each entry maps an object_id to per-bucket [numerator, denominator] pairs.
    Buckets missing from an entry are recorded as the sentinel -100 so callers
    can report them as "N/A".
    """
    with file_name.open("r") as json_file:
        data = json.load(json_file)

    items = {}  # object_id -> row index into the per-bucket score lists
    coverages = collections.defaultdict(list)
    hallucinations = collections.defaultdict(list)
    buckets = ['subjects', 'attributes', 'activities', 'locations', 'text_overlays']
    for index, (object_id, tag_info) in enumerate(data.items()):
        items[object_id] = index
        for tag in buckets:
            if tag in tag_info:
                # Percentage of matched items for this bucket.
                cvg = round(tag_info[tag][0] * 100 / tag_info[tag][1], 2)
                if metric == "coverage":
                    coverages[tag].append(cvg)
                else:
                    hallucinations[tag].append(round(100 - cvg, 2))
            elif metric == "coverage":
                coverages[tag].append(-100)
            else:
                hallucinations[tag].append(-100)
    return (items, coverages) if metric == "coverage" else (items, hallucinations)


def get_dict_val(inputs, object_id, key):
    """Return the cap_info list stored under `key` for the entry matching object_id."""
    for dd in inputs:
        if str(dd["object_id"]) == str(object_id):
            return dd["cap_info"].get(key, [])
    return []


def get_instance_result(pred_file, gt_file, coverage_file, hallucination_file, save_file):
    """Merge per-video coverage and hallucination scores with the predicted and ground-truth tags."""
    buckets = ['subjects', 'attributes', 'activities', 'locations', 'text_overlays']
    with open(pred_file, "r") as f:
        pred = json.load(f)
    with open(gt_file, "r") as f:
        gt = json.load(f)
    output_dir = Path(pred_file).parent

    items1, coverages = eval_video_chair(output_dir / coverage_file, "coverage")
    items2, hallucinations = eval_video_chair(output_dir / hallucination_file, "hallucination")

    gt_map = {str(item['object_id']): item for item in gt}
    pred_map = {str(item['object_id']): item for item in pred}

    out = []
    for obj_id, idx_1 in items1.items():
        # Only report videos that appear in both the coverage and hallucination files.
        if obj_id not in items2:
            continue
        idx_2 = items2[obj_id]
        res = {"object_id": obj_id}
        for key in buckets:
            # -100 marks a bucket that was absent from the detail file.
            res["coverage_" + key] = coverages[key][idx_1] if coverages[key][idx_1] != -100 else "N/A"
            res["hallucination_" + key] = hallucinations[key][idx_2] if hallucinations[key][idx_2] != -100 else "N/A"
            if key == "attributes":
                continue
            res["pred_" + key] = get_dict_val(pred, obj_id, key)
            res["gt_" + key] = get_dict_val(gt, obj_id, key)
        res['masp_inference'] = pred_map[obj_id]['masp_inference']
        res['refine_caption'] = gt_map[obj_id]['refine_caption']
        out.append(res)

    with (output_dir / save_file).open("w") as json_data:
        json.dump(out, json_data, indent=4)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--pred_file", type=str, default='/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_public800k_unfreeze_qformer/video_chair/video_chair_1k_res_info.json')
    parser.add_argument("--gt_file", type=str, default='/mnt/bn/algo-masp-nas-2/kaili.zhao/data/masp_data/eval/eval_v1.0/eval_benchmark_pos_diverse_1k_11policies_gt.json')
    parser.add_argument("--coverage_file", type=str, default='each_video_coverage_detail.json')
    parser.add_argument("--hallucination_file", type=str, default='each_video_halluciantion_detail.json')
    parser.add_argument("--save_file", type=str, default='video_chair_final.json')
    args = parser.parse_args()

    get_instance_result(args.pred_file, args.gt_file, args.coverage_file, args.hallucination_file, args.save_file)
    print("===== Completed video chair for each individual computation! =====")