import fire
import json
from pathlib import Path

from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import get_model_name_from_path
from videollava.model.multimodal_encoder.languagebind.video.processing_video import LanguageBindVideoProcessor

from eval_classification import accuracy_precision_recall
from eval_referring import referring_expression
from classification_segmentation import classification_segmentation

from ben_utils import run_ben_inference
from aid_fmow_ucmerced_utils import run_aid_fmow_ucmerced_inference
from qfabric_utils import run_qfabric_inference
from geochat_utils import run_geochat_inference
from s2looking_utils import run_s2looking_inference
from xbd_utils import run_xbd_inference
from cdvqa_utils import run_cdvqa_inference


def aggregated(answer_path, dataset=None, verbose=False, split=None):
    """
    Aggregated metric for our created instruction-following datasets.

    Combines the referring expression metric with accuracy/precision/recall,
    plus classification/segmentation metrics for qfabric and xbd and
    per-question referring metrics for s2looking.
    """
    saving_path_root = Path(answer_path).parent

    with open(answer_path, 'r') as f:
        answers = json.load(f)

    print("Referring expression")
    referring_expression(answer_path, dataset, False, saving_path_root, split=split)
    print()
    print("Accuracy")
    accuracy_precision_recall(answer_path, dataset, verbose=False)
    print()

    if dataset == 'qfabric' or dataset == 'xbd':
        classification_segmentation(answer_path, dataset)

    if dataset == "s2looking":
        # Evaluate each S2Looking task type separately.
        question1 = 'temporal_question_answering: Are there any buildings in the first image which were {destructed,torn down} in the second?'
        question2 = 'temporal_referring_expression: Identify the buildings in the first image which were {built,constructed,destructed,torn down} as seen in the second image.'
        question3 = 'localization_task: Identify all changed buildings.'
        question4 = 'referring_expression: identify the {constructed, destructed} buildings in the image.'
        question5 = 'question_answering: Have any buildings been {constructed,destructed} in the area? Please answer with Yes or No'

        for question in [question1, question2, question3, question4, question5]:
            dataset_question = {}
            for data in answers:
                if answers[data]['task'] == question:
                    dataset_question[data] = answers[data]
            if len(dataset_question) > 0:
                print('Evaluating for question ', question)
                print('Size of the dataset is ', len(dataset_question))
                referring_expression(dataset_question, dataset, False, saving_path_root, split=split)
                print()

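# Illustrative call (the answers path is a placeholder): re-score an existing
# S2Looking answers file without rerunning inference.
#
#   aggregated(
#       "scripts/geovlm/eval/s2looking/answers/example_answers.json",
#       dataset="s2looking",
#       split="val",
#   )
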
def load_model(model_path, model_base, cache_dir, device, vision_type=None, load_4bit=False, load_8bit=False):
    model_name = get_model_name_from_path(model_path)

    tokenizer, model, processor, _ = load_pretrained_model(
        model_path,
        model_base,
        model_name,
        load_4bit=load_4bit,
        load_8bit=load_8bit,
        device=device,
        cache_dir=cache_dir,
        vision_type=vision_type,
    )

    if vision_type is None:
        # Pick the vision type from whichever processor is available, preferring
        # the image processor when both are present.
        vision_types = ['image', 'video']
        if processor['image'] is None and processor['video'] is None:
            raise ValueError("Both image and video processors are None")
        for vision_type in vision_types:
            vision_processor = processor[vision_type]
            if vision_processor is not None:
                break
    else:
        vision_processor = processor[vision_type]
    use_video_data = vision_type == 'video'
    return tokenizer, model, vision_processor, use_video_data

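# Illustrative call (the checkpoint path and cache directory are placeholders):
#
#   tokenizer, model, vision_processor, use_video_data = load_model(
#       model_path="/path/to/teochat-lora-checkpoint",
#       model_base="LanguageBind/Video-LLaVA-7B",
#       cache_dir="/path/to/vllava_cache",
#       device="cuda",
#   )
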
def infer_eval(
    dataset_path,
    model_path,
    model_base="LanguageBind/Video-LLaVA-7B",
    cache_dir="/deep/group/aicc-bootcamp/geovlm/models/vllava_cache",
    outname=None,
    open_prompt=None,
    repeat_frames=None,
    prompt_strategy="interleave",
    chronological_prefix=True,
    load_8bit=False,
    load_4bit=False,
    verbose=False,
    rerun=False,
    vision_type=None,
    data_frac=None,
    data_size=None,
    conv_mode="v1",
    delete_system_prompt=False,
    start_ind=None,
    end_ind=None,
    last_image=None,
    print_prompt=False
):
    """
    Args:
        dataset_path: path to the dataset
        model_path: path to the model
        model_base: model base name
        cache_dir: cache directory
        outname: output file name (derived from the arguments if None)
        open_prompt: one of None, "open", "multi-open"
        repeat_frames: one of None, "uniform", "first", "last"
        prompt_strategy: one of None, "interleave"
        chronological_prefix: whether to add the "in chronological order" prefix to the prompt
        load_8bit: whether to load the model in 8-bit
        load_4bit: whether to load the model in 4-bit
        verbose: whether to print verbose output
        rerun: whether to rerun inference even if an answer file already exists
        vision_type: "image" or "video"
        data_frac: fraction of the data to use
        data_size: number of data samples to use
        conv_mode: conversation mode (should be "v1" for our models, geochat, and videollava)
        delete_system_prompt: whether to delete the system prompt
        start_ind: start index of the data
        end_ind: end index of the data
        last_image: whether to use the last image in the video
        print_prompt: whether to print the prompt
    """
    args = locals()
    print("Arguments passed to infer_eval:")
    for k, v in args.items():
        print(f"{k} ({type(v).__name__}): {v}")

    if data_size is not None and data_frac is not None:
        raise ValueError("data_size and data_frac cannot both be set")
    if data_size is None and data_frac is None:
        data_frac = 1

    dataset2metrics = {
        "lrben": [accuracy_precision_recall],
        "hrben": [accuracy_precision_recall],
        "fmow": [accuracy_precision_recall],
        "s2looking": [aggregated],
        "xbd": [aggregated],
        "qfabric": [aggregated],
        "aid": [accuracy_precision_recall],
        "ucmerced": [accuracy_precision_recall],
        "cdvqa": [accuracy_precision_recall]
    }
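    # Note: the temporal change datasets (s2looking, xbd, qfabric) are scored with
    # the aggregated() metric defined above; the classification/VQA datasets are
    # scored with accuracy/precision/recall only.
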
    eval_outdir = Path('scripts/geovlm/eval/')

    if "lrben" in dataset_path.lower():
        dataset = "lrben"
        run_inference = run_ben_inference
        outdir = eval_outdir / "RSVQA-LRBEN/answers/"
        if open_prompt is not None:
            raise ValueError("LRBEN dataset does not support open prompt")
    elif "hrben" in dataset_path.lower():
        dataset = "hrben"
        run_inference = run_ben_inference
        outdir = eval_outdir / "RSVQA-HRBEN/answers/"
        if open_prompt is not None:
            raise ValueError("HRBEN dataset does not support open prompt")
    elif "fmow" in dataset_path.lower():
        dataset = "fmow"
        run_inference = run_aid_fmow_ucmerced_inference
        outdir = eval_outdir / "fmow-highres/answers/"
    elif "s2looking" in dataset_path.lower():
        dataset = "s2looking"
        run_inference = run_s2looking_inference
        outdir = eval_outdir / "s2looking/answers/"
    elif "xbd" in dataset_path.lower():
        dataset = "xbd"
        run_inference = run_xbd_inference
        outdir = eval_outdir / "xBD/answers/"
    elif 'qfabric' in dataset_path.lower():
        dataset = "qfabric"
        run_inference = run_qfabric_inference
        outdir = eval_outdir / "QFabric/answers/"
    elif 'geochat' in dataset_path.lower():
        dataset = "geochat"
        run_inference = run_geochat_inference
        outdir = eval_outdir / "GeoChat/answers/"
    elif 'aid' in dataset_path.lower():
        dataset = "aid"
        run_inference = run_aid_fmow_ucmerced_inference
        outdir = eval_outdir / "AID/answers/"
    elif 'ucmerced' in dataset_path.lower():
        dataset = "ucmerced"
        run_inference = run_aid_fmow_ucmerced_inference
        outdir = eval_outdir / "UCMerced/answers/"
    elif 'cdvqa' in dataset_path.lower():
        dataset = "cdvqa"
        run_inference = run_cdvqa_inference
        outdir = eval_outdir / "CDVQA/answers/"
    else:
        raise ValueError(f"No supported dataset found in {dataset_path}, supported datasets: fmow, lrben, hrben, aid, ucmerced, s2looking, xbd, qfabric, geochat, cdvqa")

    if (start_ind is not None or end_ind is not None) and dataset not in ['qfabric', 'hrben', 'lrben']:
        raise ValueError("start_ind and end_ind can only be used with qfabric, hrben, or lrben datasets")

    if 'test' in dataset_path.lower():
        split = 'test'
    elif any(s in dataset_path.lower() for s in ('val', 'valid', 'validation')):
        split = 'val'
    elif 'train' in dataset_path.lower():
        split = 'train'
    else:
        split = None
        print("Warning: Could not determine split from dataset path")

    outdir.mkdir(parents=True, exist_ok=True)
    model_name = Path(model_path).stem

    if 'llava' not in model_name.lower() and 'teochat' not in model_name.lower():
        if model_base is not None:
            # Derive a model name from the last two components of the checkpoint path.
            if model_path[-1] == "/":
                model_path = model_path[:-1]
            model_name = model_path.split("/")[-2] + "-" + model_path.split("/")[-1]
            print("Model name used: ", model_name)
        else:
            raise ValueError(f"Model name {model_name} does not contain 'llava' or 'teochat'")
    if 'lora' not in model_name:
        print("Warning: Model name does not contain 'lora'")

    if outname is None:
        dataset_path_name = Path(dataset_path).stem
        outname = f"{model_name}_{dataset}_{dataset_path_name}_{split}.json"

    if ".json" not in outname:
        outname = f"{outname}.json"

    args_to_determine_path = [
        'open_prompt',
        'repeat_frames',
        'prompt_strategy',
        'chronological_prefix',
        'load_8bit',
        'load_4bit',
        'data_frac',
        'data_size',
        'delete_system_prompt',
        'start_ind',
        'end_ind',
        'last_image'
    ]
    for arg in args_to_determine_path:
        if args[arg] is not None:
            outname = outname.replace(".json", f"_{arg}_{args[arg]}.json")
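    # Any argument that is not None is appended above, so non-None defaults such as
    # prompt_strategy="interleave", chronological_prefix=True, and load_8bit=False
    # also end up in the file name.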

    answer_path = outdir / outname

    print(f'answer_path: {answer_path}')

    args_path = outdir / outname.replace(".json", "_args.json")

    if len(str(args_path)) < 255:
        with open(args_path, 'w') as f:
            json.dump(args, f)
    else:
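        # The args path is too long (>= 255 characters), so shorten each appended
        # argument name in the file name to its initials
        # (e.g., "delete_system_prompt" -> "dsp", "data_frac" -> "df").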
        for arg in args_to_determine_path:
            if args[arg] is not None:
                first_letters = ''.join([word[0] for word in arg.split('_')])
                outname = outname.replace(arg, first_letters)

        answer_path = outdir / outname
        args_path = outdir / outname.replace(".json", "_args.json")
        with open(args_path, 'w') as f:
            json.dump(args, f)
        print(f'New answer_path: {answer_path}')

    if answer_path.exists() and not rerun:
        # Answers already exist: just (re)compute the metrics and exit.
        for metric in dataset2metrics[dataset]:
            if dataset == "s2looking":
                metric(answer_path, dataset=dataset, verbose=verbose, split=split)
            else:
                metric(answer_path, dataset=dataset, verbose=verbose)
        return

    disable_torch_init()
    device = 'cuda'
    tokenizer, model, processor, use_video_data = load_model(
        model_path,
        model_base,
        cache_dir,
        device,
        load_4bit=load_4bit,
        load_8bit=load_8bit,
        vision_type=vision_type
    )

    if use_video_data:
        if dataset == "lrben":
            raise ValueError("LRBEN dataset does not support video processing")

        # Rebuild the video processor with the "image_list" decode backend so
        # temporal image sequences can be passed directly as frame lists.
        processor.config.vision_config.video_decode_backend = "image_list"
        processor = LanguageBindVideoProcessor(processor.config, tokenizer)

    if rerun or not answer_path.exists():
        answers = run_inference(
            model,
            dataset_path,
            processor,
            tokenizer,
            conv_mode,
            answer_path=answer_path,
            open_prompt=open_prompt,
            repeat_frames=repeat_frames,
            use_video_data=use_video_data,
            prompt_strategy=prompt_strategy,
            chronological_prefix=chronological_prefix,
            data_size=data_size,
            data_frac=data_frac,
            delete_system_prompt=delete_system_prompt,
            start_ind=start_ind,
            end_ind=end_ind,
            last_image=last_image,
            print_prompt=print_prompt
        )

        with open(answer_path, 'w') as f:
            json.dump(answers, f, indent=4)
    else:
        with open(answer_path, 'r') as f:
            answers = json.load(f)

    for metric in dataset2metrics[dataset]:
        if dataset == "s2looking":
            metric(answer_path, dataset=dataset, verbose=verbose, split=split)
        else:
            metric(answer_path, dataset=dataset, verbose=verbose)


if __name__ == '__main__':
    """Example usage:
    export CUDA_VISIBLE_DEVICES=0;
    export PYTHONPATH=/path/to/aicc-win24-geo-vlm/videollava/:$PYTHONPATH;
    python videollava/eval/video/infer_eval.py infer_eval \
        --dataset_path /path/to/fmow_test_dataset.json \
        --model_path /path/to/model
    """
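    # A second, illustrative example: re-score an existing answers file without
    # rerunning inference (the answers path is a placeholder):
    #   python videollava/eval/video/infer_eval.py aggregated \
    #       --answer_path scripts/geovlm/eval/s2looking/answers/example_answers.json \
    #       --dataset s2looking \
    #       --split val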
    fire.Fire()