import fire
import json
from pathlib import Path

from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import get_model_name_from_path
from videollava.model.multimodal_encoder.languagebind.video.processing_video import LanguageBindVideoProcessor

from eval_classification import accuracy_precision_recall
from eval_referring import referring_expression
from classification_segmentation import classification_segmentation

from ben_utils import run_ben_inference
from aid_fmow_ucmerced_utils import run_aid_fmow_ucmerced_inference
from qfabric_utils import run_qfabric_inference
from geochat_utils import run_geochat_inference
from s2looking_utils import run_s2looking_inference
from xbd_utils import run_xbd_inference
from cdvqa_utils import run_cdvqa_inference


def aggregated(answer_path, dataset=None, verbose=False, split=None):
    """
    Aggregated metric for our created instruction-following datasets.

    Combines the referring expression metric with accuracy/precision/recall,
    plus classification/segmentation metrics for qfabric and xbd and
    per-question referring metrics for s2looking.
    """
    saving_path_root = Path(answer_path).parent

    with open(answer_path, 'r') as f:
        answers = json.load(f)

    print("Referring expression")
    referring_expression(answer_path, dataset, False, saving_path_root, split=split)
    print()
    print("Accuracy")
    accuracy_precision_recall(answer_path, dataset, verbose=False)
    print()

    if dataset == 'qfabric' or dataset == 'xbd':
        classification_segmentation(answer_path, dataset)

    if dataset == "s2looking":
        # Evaluate each S2Looking task type separately.
        question1 = 'temporal_question_answering: Are there any buildings in the first image which were {destructed,torn down} in the second?'
        question2 = 'temporal_referring_expression: Identify the buildings in the first image which were {built,constructed,destructed,torn down} as seen in the second image.'
        question3 = 'localization_task: Identify all changed buildings.'
        question4 = 'referring_expression: identify the {constructed, destructed} buildings in the image.'
        question5 = 'question_answering: Have any buildings been {constructed,destructed} in the area? Please answer with Yes or No'

        for question in [question1, question2, question3, question4, question5]:
            dataset_question = {}
            for data in answers:
                if answers[data]['task'] == question:
                    dataset_question[data] = answers[data]
            if len(dataset_question) > 0:
                print('Evaluating for question ', question)
                print('Size of the dataset is ', len(dataset_question))
                referring_expression(dataset_question, dataset, False, saving_path_root, split=split)
                print()

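# Illustrative call (the answers path is a placeholder): re-score an existing
# S2Looking answers file without rerunning inference.
#
#   aggregated(
#       "scripts/geovlm/eval/s2looking/answers/example_answers.json",
#       dataset="s2looking",
#       split="val",
#   )
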
def load_model(model_path, model_base, cache_dir, device, vision_type=None, load_4bit=False, load_8bit=False):
    model_name = get_model_name_from_path(model_path)

    tokenizer, model, processor, _ = load_pretrained_model(
        model_path,
        model_base,
        model_name,
        load_4bit=load_4bit,
        load_8bit=load_8bit,
        device=device,
        cache_dir=cache_dir,
        vision_type=vision_type,
    )

    if vision_type is None:
        # Pick the vision type from whichever processor is available, preferring
        # the image processor when both are present.
        vision_types = ['image', 'video']
        if processor['image'] is None and processor['video'] is None:
            raise ValueError("Both image and video processors are None")
        for vision_type in vision_types:
            vision_processor = processor[vision_type]
            if vision_processor is not None:
                break
    else:
        vision_processor = processor[vision_type]
    use_video_data = vision_type == 'video'
    return tokenizer, model, vision_processor, use_video_data

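# Illustrative call (the checkpoint path and cache directory are placeholders):
#
#   tokenizer, model, vision_processor, use_video_data = load_model(
#       model_path="/path/to/teochat-lora-checkpoint",
#       model_base="LanguageBind/Video-LLaVA-7B",
#       cache_dir="/path/to/vllava_cache",
#       device="cuda",
#   )
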
def infer_eval(
    dataset_path,
    model_path,
    model_base="LanguageBind/Video-LLaVA-7B",
    cache_dir="/deep/group/aicc-bootcamp/geovlm/models/vllava_cache",
    outname=None,
    open_prompt=None,
    repeat_frames=None,
    prompt_strategy="interleave",
    chronological_prefix=True,
    load_8bit=False,
    load_4bit=False,
    verbose=False,
    rerun=False,
    vision_type=None,
    data_frac=None,
    data_size=None,
    conv_mode="v1",
    delete_system_prompt=False,
    start_ind=None,
    end_ind=None,
    last_image=None,
    print_prompt=False
):
    """
    Args:
        dataset_path: path to the dataset
        model_path: path to the model
        model_base: model base name
        cache_dir: cache directory
        outname: output file name (derived from the arguments if None)
        open_prompt: one of None, "open", "multi-open"
        repeat_frames: one of None, "uniform", "first", "last"
        prompt_strategy: one of None, "interleave"
        chronological_prefix: whether to add the "in chronological order" prefix to the prompt
        load_8bit: whether to load the model in 8-bit
        load_4bit: whether to load the model in 4-bit
        verbose: whether to print verbose output
        rerun: whether to rerun inference even if an answer file already exists
        vision_type: "image" or "video"
        data_frac: fraction of the data to use
        data_size: number of data samples to use
        conv_mode: conversation mode (should be "v1" for our models, geochat, and videollava)
        delete_system_prompt: whether to delete the system prompt
        start_ind: start index of the data
        end_ind: end index of the data
        last_image: whether to use the last image in the video
        print_prompt: whether to print the prompt
    """
    args = locals()
    print("Arguments passed to infer_eval:")
    for k, v in args.items():
        print(f"{k} ({type(v).__name__}): {v}")

    if data_size is not None and data_frac is not None:
        raise ValueError("data_size and data_frac cannot both be set")
    if data_size is None and data_frac is None:
        data_frac = 1

    dataset2metrics = {
        "lrben": [accuracy_precision_recall],
        "hrben": [accuracy_precision_recall],
        "fmow": [accuracy_precision_recall],
        "s2looking": [aggregated],
        "xbd": [aggregated],
        "qfabric": [aggregated],
        "aid": [accuracy_precision_recall],
        "ucmerced": [accuracy_precision_recall],
        "cdvqa": [accuracy_precision_recall]
    }
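    # Note: the temporal change datasets (s2looking, xbd, qfabric) are scored with
    # the aggregated() metric defined above; the classification/VQA datasets are
    # scored with accuracy/precision/recall only.
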
    eval_outdir = Path('scripts/geovlm/eval/')

    if "lrben" in dataset_path.lower():
        dataset = "lrben"
        run_inference = run_ben_inference
        outdir = eval_outdir / "RSVQA-LRBEN/answers/"
        if open_prompt is not None:
            raise ValueError("LRBEN dataset does not support open prompt")
    elif "hrben" in dataset_path.lower():
        dataset = "hrben"
        run_inference = run_ben_inference
        outdir = eval_outdir / "RSVQA-HRBEN/answers/"
        if open_prompt is not None:
            raise ValueError("HRBEN dataset does not support open prompt")
    elif "fmow" in dataset_path.lower():
        dataset = "fmow"
        run_inference = run_aid_fmow_ucmerced_inference
        outdir = eval_outdir / "fmow-highres/answers/"
    elif "s2looking" in dataset_path.lower():
        dataset = "s2looking"
        run_inference = run_s2looking_inference
        outdir = eval_outdir / "s2looking/answers/"
    elif "xbd" in dataset_path.lower():
        dataset = "xbd"
        run_inference = run_xbd_inference
        outdir = eval_outdir / "xBD/answers/"
    elif 'qfabric' in dataset_path.lower():
        dataset = "qfabric"
        run_inference = run_qfabric_inference
        outdir = eval_outdir / "QFabric/answers/"
    elif 'geochat' in dataset_path.lower():
        dataset = "geochat"
        run_inference = run_geochat_inference
        outdir = eval_outdir / "GeoChat/answers/"
    elif 'aid' in dataset_path.lower():
        dataset = "aid"
        run_inference = run_aid_fmow_ucmerced_inference
        outdir = eval_outdir / "AID/answers/"
    elif 'ucmerced' in dataset_path.lower():
        dataset = "ucmerced"
        run_inference = run_aid_fmow_ucmerced_inference
        outdir = eval_outdir / "UCMerced/answers/"
    elif 'cdvqa' in dataset_path.lower():
        dataset = "cdvqa"
        run_inference = run_cdvqa_inference
        outdir = eval_outdir / "CDVQA/answers/"
    else:
        raise ValueError(f"No supported dataset found in {dataset_path}, supported datasets: fmow, lrben, hrben, aid, ucmerced, s2looking, xbd, qfabric, geochat, cdvqa")

    if (start_ind is not None or end_ind is not None) and dataset not in ['qfabric', 'hrben', 'lrben']:
        raise ValueError("start_ind and end_ind can only be used with qfabric, hrben, or lrben datasets")

    if 'test' in dataset_path.lower():
        split = 'test'
    elif any(s in dataset_path.lower() for s in ('val', 'valid', 'validation')):
        split = 'val'
    elif 'train' in dataset_path.lower():
        split = 'train'
    else:
        split = None
        print("Warning: Could not determine split from dataset path")

    outdir.mkdir(parents=True, exist_ok=True)
    model_name = Path(model_path).stem

    if 'llava' not in model_name.lower() and 'teochat' not in model_name.lower():
        if model_base is not None:
            # Derive a model name from the last two components of the checkpoint path.
            if model_path[-1] == "/":
                model_path = model_path[:-1]
            model_name = model_path.split("/")[-2] + "-" + model_path.split("/")[-1]
            print("Model name used: ", model_name)
        else:
            raise ValueError(f"Model name {model_name} does not contain 'llava' or 'teochat'")
    if 'lora' not in model_name:
        print("Warning: Model name does not contain 'lora'")

    if outname is None:
        dataset_path_name = Path(dataset_path).stem
        outname = f"{model_name}_{dataset}_{dataset_path_name}_{split}.json"

    if ".json" not in outname:
        outname = f"{outname}.json"

    args_to_determine_path = [
        'open_prompt',
        'repeat_frames',
        'prompt_strategy',
        'chronological_prefix',
        'load_8bit',
        'load_4bit',
        'data_frac',
        'data_size',
        'delete_system_prompt',
        'start_ind',
        'end_ind',
        'last_image'
    ]
    for arg in args_to_determine_path:
        if args[arg] is not None:
            outname = outname.replace(".json", f"_{arg}_{args[arg]}.json")
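    # Any argument that is not None is appended above, so non-None defaults such as
    # prompt_strategy="interleave", chronological_prefix=True, and load_8bit=False
    # also end up in the file name.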

    answer_path = outdir / outname

    print(f'answer_path: {answer_path}')

    args_path = outdir / outname.replace(".json", "_args.json")

    if len(str(args_path)) < 255:
        with open(args_path, 'w') as f:
            json.dump(args, f)
    else:
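        # The args path is too long (>= 255 characters), so shorten each appended
        # argument name in the file name to its initials
        # (e.g., "delete_system_prompt" -> "dsp", "data_frac" -> "df").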
        for arg in args_to_determine_path:
            if args[arg] is not None:
                first_letters = ''.join([word[0] for word in arg.split('_')])
                outname = outname.replace(arg, first_letters)

        answer_path = outdir / outname
        args_path = outdir / outname.replace(".json", "_args.json")
        with open(args_path, 'w') as f:
            json.dump(args, f)
        print(f'New answer_path: {answer_path}')

    if answer_path.exists() and not rerun:
        # Answers already exist: just (re)compute the metrics and exit.
        for metric in dataset2metrics[dataset]:
            if dataset == "s2looking":
                metric(answer_path, dataset=dataset, verbose=verbose, split=split)
            else:
                metric(answer_path, dataset=dataset, verbose=verbose)
        return

    disable_torch_init()
    device = 'cuda'
    tokenizer, model, processor, use_video_data = load_model(
        model_path,
        model_base,
        cache_dir,
        device,
        load_4bit=load_4bit,
        load_8bit=load_8bit,
        vision_type=vision_type
    )

    if use_video_data:
        if dataset == "lrben":
            raise ValueError("LRBEN dataset does not support video processing")

        # Rebuild the video processor with the "image_list" decode backend so
        # temporal image sequences can be passed directly as frame lists.
        processor.config.vision_config.video_decode_backend = "image_list"
        processor = LanguageBindVideoProcessor(processor.config, tokenizer)

    if rerun or not answer_path.exists():
        answers = run_inference(
            model,
            dataset_path,
            processor,
            tokenizer,
            conv_mode,
            answer_path=answer_path,
            open_prompt=open_prompt,
            repeat_frames=repeat_frames,
            use_video_data=use_video_data,
            prompt_strategy=prompt_strategy,
            chronological_prefix=chronological_prefix,
            data_size=data_size,
            data_frac=data_frac,
            delete_system_prompt=delete_system_prompt,
            start_ind=start_ind,
            end_ind=end_ind,
            last_image=last_image,
            print_prompt=print_prompt
        )

        with open(answer_path, 'w') as f:
            json.dump(answers, f, indent=4)
    else:
        with open(answer_path, 'r') as f:
            answers = json.load(f)

    for metric in dataset2metrics[dataset]:
        if dataset == "s2looking":
            metric(answer_path, dataset=dataset, verbose=verbose, split=split)
        else:
            metric(answer_path, dataset=dataset, verbose=verbose)


if __name__ == '__main__':
    """Example usage:
    export CUDA_VISIBLE_DEVICES=0;
    export PYTHONPATH=/path/to/aicc-win24-geo-vlm/videollava/:$PYTHONPATH;
    python videollava/eval/video/infer_eval.py infer_eval \
        --dataset_path /path/to/fmow_test_dataset.json \
        --model_path /path/to/model
    """
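    # A second, illustrative example: re-score an existing answers file without
    # rerunning inference (the answers path is a placeholder):
    #   python videollava/eval/video/infer_eval.py aggregated \
    #       --answer_path scripts/geovlm/eval/s2looking/answers/example_answers.json \
    #       --dataset s2looking \
    #       --split val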
    fire.Fire()