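# Evaluation script for long-movie question answering with GoldFish-LV on the
# LLaMA-VID / MovieNet benchmark. Each movie is split into short clips, every
# clip is summarised by the vision-language model (optionally together with its
# subtitles), the clip-level texts are indexed in a MemoryIndex, and the
# benchmark questions are answered with retrieval-augmented generation over
# that index.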
import sys
import os

project_dir = os.getcwd()
sys.path.append(project_dir)

import argparse
import json
import random
import re
import shutil

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import yaml
from PIL import Image
from tqdm import tqdm

from goldfish_lv import GoldFish_LV, split_subtitles, time_to_seconds
from index import MemoryIndex
def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--neighbours", type=int, default=-1)
    parser.add_argument("--name", type=str, default="ckpt_92", help="name of the experiment")
    parser.add_argument("--add_unknown", action='store_true')
    parser.add_argument("--use_chatgpt", action='store_true')
    parser.add_argument("--use_choices_for_info", action='store_true')
    parser.add_argument("--use_gt_information", action='store_true')
    parser.add_argument("--inference_text", action='store_true')
    parser.add_argument("--use_gt_information_with_distraction", action='store_true')
    parser.add_argument("--num_distraction", type=int, default=2)
    parser.add_argument("--add_confidance_score", action='store_true')
    parser.add_argument("--use_original_video", action='store_true')
    parser.add_argument("--use_video_embedding", action='store_true')
    parser.add_argument("--use_clips_for_info", action='store_true')
    parser.add_argument("--use_GT_video", action='store_true')
    parser.add_argument("--use_gt_summary", action='store_true')
    parser.add_argument("--index_subtitles", action='store_true')
    parser.add_argument("--index_subtitles_together", action='store_true')
    parser.add_argument("--ask_the_question_early", action='store_true')
    parser.add_argument("--clip_in_ask_early", action='store_true')
    parser.add_argument("--summary_with_subtitles_only", action='store_true')
    parser.add_argument("--use_coherent_description", action='store_true')
    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=100000, type=int)
    parser.add_argument("--exp_name", type=str, default="", help="name of eval folder")
    parser.add_argument("--vision_only", action='store_true')
    parser.add_argument("--model_summary_only", action='store_true')
    parser.add_argument("--subtitles_only", action='store_true')
    parser.add_argument("--info_only", action='store_true')
    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="path to the video")
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--annotation_path", type=str, help="path to the annotation file")
    parser.add_argument("--videos_path", type=str, help="path to the videos directory")
    parser.add_argument("--subtitle_path", type=str, help="path to the subtitles directory")
    parser.add_argument("--movienet_annotations_dir", type=str, help="path to the movienet annotations directory")
    parser.add_argument("--video_clips_saving_path", type=str, help="path to save the split small video clips")
    parser.add_argument("--save_path", type=str, help="path to save the results")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()
def time_to_seconds(subrip_time):
    return subrip_time.hours * 3600 + subrip_time.minutes * 60 + subrip_time.seconds + subrip_time.milliseconds / 1000


def clean_text(subtitles_text):
    # Remove unwanted characters except for letters, digits, whitespace, and single quotes.
    subtitles_text = re.sub(r'[^a-zA-Z0-9\s\']', '', subtitles_text)
    # Replace multiple whitespace characters with a single space.
    subtitles_text = re.sub(r'\s+', ' ', subtitles_text)
    return subtitles_text.strip()
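

# LlamaVidQAEval wraps GoldFish_LV for the movie QA benchmark. The main entry
# point is eval_llama_vid(), which per movie (i) builds clip summaries with
# _get_movie_summaries() or subtitle paragraphs with _store_subtitles_paragraphs(),
# (ii) caches them as a JSON file plus an embedding pkl via movie_inference(), and
# (iii) answers the movie's questions with answer_movie_questions_RAG().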
class LlamaVidQAEval(GoldFish_LV):
    def __init__(self, args):
        super().__init__(args)
        self.save_json_path = "new_workspace/clips_summary/movienet"
        if args.use_openai_embedding:
            self.save_pkls_path = "new_workspace/open_ai_embedding/movienet"
        else:
            self.save_pkls_path = "new_workspace/embedding/movienet"
        os.makedirs(self.save_json_path, exist_ok=True)
        # Make sure the embedding cache directory exists as well.
        os.makedirs(self.save_pkls_path, exist_ok=True)
        annotation_path = args.annotation_path
        with open(annotation_path, 'r') as f:
            self.movies_dict = json.load(f)
        self.max_sub_len = 400
        self.max_num_images = 45
    def _get_movie_data(self, videoname):
        video_images_path = f"{args.videos_path}/{videoname}"
        movie_clips_path = f"{args.video_clips_saving_path}/{videoname}"
        subtitle_path = f"{args.subtitle_path}/{videoname}.srt"
        annotation_file = f"{args.movienet_annotations_dir}/{videoname}.json"
        # Load the MovieNet annotation file for this movie.
        with open(annotation_file, 'r') as f:
            movie_annotation = json.load(f)
        return video_images_path, subtitle_path, movie_annotation, movie_clips_path
    def _store_subtitles_paragraphs(self, subtitle_path, important_data, number_of_paragraphs):
        movie_name = subtitle_path.split('/')[-1].split('.')[0]
        # If there is no story annotation, split the subtitles into paragraphs.
        paragraphs = split_subtitles(subtitle_path, number_of_paragraphs)
        for i, paragraph in enumerate(paragraphs):
            paragraph = clean_text(paragraph)
            important_data.update({f"subtitle_{i}__{movie_name}_clip_{str(i).zfill(2)}": paragraph})
        return important_data
    def _get_shots_subtitles(self, movie_annotation):
        shots_subtitles = {}
        if movie_annotation['story'] is not None:
            for section in movie_annotation['story']:
                for shot in section['subtitle']:
                    shot_number = shot['shot']
                    shot_subtitle = ' '.join(shot['sentences'])
                    shots_subtitles[shot_number] = clean_text(shot_subtitle)
        return shots_subtitles
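    # prepare_input_images builds the interleaved image/text prompt for one clip:
    # it samples at most self.max_num_images frames evenly from the clip folder,
    # appends an '<Img><ImageHere>' placeholder per frame, and (when subtitles are
    # enabled) attaches each shot's subtitle as a '<Cap>...' tag until
    # self.max_sub_len words have been used. Short clips are padded by repeating
    # the last frame so every sample has exactly self.max_num_images images.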
    def prepare_input_images(self, clip_path, shots_subtitles, use_subtitles):
        video_images_list = sorted(os.listdir(clip_path))
        total_num_frames = len(video_images_list)
        sampling_interval = round(total_num_frames / self.max_num_images)
        if sampling_interval == 0:
            sampling_interval = 1
        number_of_words = 0
        images = []
        img_placeholder = ""
        for i, frame_name in enumerate(video_images_list):
            if i % sampling_interval == 0:
                frame = Image.open(os.path.join(clip_path, frame_name)).convert("RGB")
                frame = self.vis_processor(frame)
                images.append(frame)
                img_placeholder += '<Img><ImageHere>'
                # Shot ids in the annotation are integers (see _get_movie_summaries),
                # so cast before looking up the subtitle.
                shot_num = int(frame_name.split('_')[1])
                if shots_subtitles.get(shot_num) is not None:
                    sub = clean_text(shots_subtitles[shot_num])
                    number_of_words += len(sub.split(' '))
                    if number_of_words <= self.max_sub_len and use_subtitles:
                        img_placeholder += f'<Cap>{sub}'
            if len(images) >= self.max_num_images:
                break
        if len(images) == 0:
            print("Video not found", clip_path)
        if 0 < len(images) < self.max_num_images:
            last_item = images[-1]
            while len(images) < self.max_num_images:
                images.append(last_item)
                img_placeholder += '<Img><ImageHere>'
        images = torch.stack(images)
        return images, img_placeholder
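    # _get_movie_summaries walks over the movie's keyframes in chunks of 135
    # consecutive frames (taking every 3rd frame, i.e. at most 45 images per
    # clip), copies each chunk into its own clip folder, and runs the model in
    # batches to produce one caption per clip. The returned dict keys encode the
    # clip name (caption_*/subtitle_*__<movie>_clip_<nn>) so that retrieval
    # results can later be mapped back to clips.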
    def _get_movie_summaries(self, video_images_path, use_subtitles, shots_subtitles, movie_clips_path):
        video_images_list = sorted(os.listdir(video_images_path))
        max_caption_index = 0
        preds = {}
        movie_name = movie_clips_path.split('/')[-1]
        videos_summaries = []
        previous_caption = ""
        batch_size = args.batch_size
        batch_images = []
        batch_instructions = []
        clip_numbers = []
        clip_number = 0
        conversations = []
        for i in tqdm(range(0, len(video_images_list), 135), desc="Inference video clips"):
            images = []
            # Optionally prepend the summary of the previous clips to the new clip (disabled):
            # if batch_size == 1:
            #     previous_caption = "You are analysing one long video of multiple clips and this is the summary of all previous clips: " + videos_summaries[-1] + "\n\n" if len(videos_summaries) > 0 else ""
            if previous_caption != "":
                img_placeholder = previous_caption + " "
            else:
                img_placeholder = ""
            number_of_words = 0
            max_num_words = 400
            max_num_images = 45
            clip_number_str = str(clip_number).zfill(2)
            clip_path = os.path.join(movie_clips_path, f"{movie_name}_clip_{clip_number_str}")
            os.makedirs(clip_path, exist_ok=True)
            conversation = ""
            for j in range(i, i + 135, 3):
                if j >= len(video_images_list):
                    break
                image_path = os.path.join(video_images_path, video_images_list[j])
                # Copy the frame into the clip folder; skip it if it is already there.
                if not os.path.exists(os.path.join(clip_path, video_images_list[j])):
                    shutil.copy(image_path, clip_path)
                img = Image.open(image_path)
                images.append(self.vis_processor(img))
                img_placeholder += '<Img><ImageHere>'
                shot_num = int(video_images_list[j].split('_')[1])
                if use_subtitles:
                    if shots_subtitles.get(shot_num) is not None:
                        sub = clean_text(shots_subtitles[shot_num])
                        number_of_words += len(sub.split(' '))
                        if number_of_words <= max_num_words and use_subtitles:
                            img_placeholder += f'<Cap>{sub}'
                            conversation += sub + " "
                if len(images) >= max_num_images:
                    break
            if len(images) == 0:
                print("Video not found", video_images_path)
                continue
            if 0 < len(images) < max_num_images:
                last_item = images[-1]
                while len(images) < max_num_images:
                    images.append(last_item)
                    img_placeholder += '<Img><ImageHere>'
            images = torch.stack(images)
            print(images.shape)
            clip_numbers.append(clip_number_str)
            clip_number += 1
            conversations.append(clean_text(conversation))
            instruction = img_placeholder + '\n' + self.summary_instruction
            batch_images.append(images)
            batch_instructions.append(instruction)
            if len(batch_images) < batch_size:
                continue
            # Run inference for the full batch.
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for k, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[k]}"
                else:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = pred
                if conversations[k] != "" and use_subtitles:
                    preds[f'subtitle_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = conversations[k]
            batch_images = []
            batch_instructions = []
            clip_numbers = []
            conversations = []
        # Run inference for the last partial batch.
        if len(batch_images) > 0:
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for k, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[k]}"
                else:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = pred
                if conversations[k] != "" and use_subtitles:
                    preds[f'subtitle_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = conversations[k]
        return preds
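    # movie_inference runs the clip summarisation once per movie and caches the
    # result: the clip texts are written to a JSON file and their embeddings to a
    # pkl, with file names that reflect whether subtitles were indexed. If the
    # JSON already exists, the cached paths are returned directly.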
    def movie_inference(self, videoname, use_subtitles):
        embedding_path = os.path.join(self.save_pkls_path, f"{videoname}.pkl")
        if args.index_subtitles_together:
            file_path = os.path.join(self.save_json_path, f"{videoname}.json")
            embedding_path = os.path.join(self.save_pkls_path, f"{videoname}.pkl")
        else:
            file_path = os.path.join(self.save_json_path, f"no_subtiltles_{videoname}.json")
            embedding_path = os.path.join(self.save_pkls_path, f"no_subtiltles_{videoname}.pkl")
        if args.subtitles_only:
            file_path = os.path.join(self.save_json_path, f"subtiltles_only_{videoname}.json")
            embedding_path = os.path.join(self.save_pkls_path, f"subtiltles_only_{videoname}.pkl")
        if os.path.exists(file_path):
            print("Already processed")
            return file_path, embedding_path
        important_data = {}
        video_images_path, subtitle_path, movie_annotation, movie_clips_path = self._get_movie_data(videoname)
        shots_subtitles = {}
        if use_subtitles:
            if movie_annotation['story'] is not None:
                shots_subtitles = self._get_shots_subtitles(movie_annotation)
        if args.subtitles_only:
            number_of_paragraphs = 20
            important_data = self._store_subtitles_paragraphs(subtitle_path, important_data, number_of_paragraphs)
        else:
            preds = self._get_movie_summaries(video_images_path, use_subtitles, shots_subtitles, movie_clips_path)
            if len(shots_subtitles) == 0 and use_subtitles:
                number_of_paragraphs = len(preds)
                important_data = self._store_subtitles_paragraphs(subtitle_path, important_data, number_of_paragraphs)
            important_data.update(preds)
        with open(file_path, 'w') as file:
            json.dump(important_data, file, indent=4)
        return file_path, embedding_path
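    # answer_movie_questions_RAG answers a batch of questions for one movie. It
    # loads (or builds) two MemoryIndex instances over the cached clip texts: one
    # limited to args.neighbours results for general questions and one over all
    # clips for summary questions. The retrieved clip summaries/subtitles are
    # concatenated into a context string and passed to inference_RAG (or its
    # ChatGPT variant); with --use_clips_for_info the retrieved clips are
    # additionally re-run through the video model via use_clips_for_info().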
    def answer_movie_questions_RAG(self, qa_list, information_RAG_path, embedding_path):
        QA_external_memory = MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
        if os.path.exists(embedding_path):
            QA_external_memory.load_embeddings_from_pkl(embedding_path)
        else:
            QA_external_memory.load_documents_from_json(information_RAG_path, embedding_path)
        summarization_external_memory = MemoryIndex(-1, use_openai=args.use_openai_embedding)
        if os.path.exists(embedding_path):
            summarization_external_memory.load_embeddings_from_pkl(embedding_path)
        else:
            summarization_external_memory.load_documents_from_json(information_RAG_path, embedding_path)
        # Get the most similar context from the external memory for each question.
        general_related_context_keys_list = []
        general_related_context_documents_list = []
        summary_related_context_documents_list = []
        summary_related_context_keys_list = []
        total_batch_pred = []
        related_text = []
        qa_general_prompts = []
        qa_summary_prompts = []
        qa_general = []
        qa_summary = []
        for qa in qa_list:
            if qa['q_type'] == 'summary':
                related_context_documents, related_context_keys = summarization_external_memory.search_by_similarity(qa['Q'])
                summary_related_context_documents_list.append(related_context_documents)
                summary_related_context_keys_list.append(related_context_keys)
                prompt = self.prepare_prompt(qa)
                qa_summary_prompts.append(prompt)
                qa_summary.append(qa)
            else:
                related_context_documents, related_context_keys = QA_external_memory.search_by_similarity(qa['Q'])
                general_related_context_keys_list.append(related_context_keys)
                general_related_context_documents_list.append(related_context_documents)
                prompt = self.prepare_prompt(qa)
                qa_general_prompts.append(prompt)
                qa_general.append(qa)
        # Answer the summary questions first; they never need extra clip inference.
        if len(qa_summary_prompts) > 0:
            # Here the retrieved clips are all of the movie's clips.
            context_information_list = []
            for related_context_keys in summary_related_context_keys_list:
                most_related_clips = self.get_most_related_clips(related_context_keys)
                context_information = ""
                for clip_name in most_related_clips:
                    clip_conversation = ""
                    general_sum = ""
                    for key in related_context_keys:
                        if clip_name in key and 'caption' in key:
                            general_sum = "Clip Summary: " + summarization_external_memory.documents[key]
                        if clip_name in key and 'subtitle' in key:
                            clip_conversation = "Clip Subtitles: " + summarization_external_memory.documents[key]
                    if args.use_coherent_description:
                        context_information += f"{general_sum}\n"
                    else:
                        if args.model_summary_only:
                            context_information += f"{general_sum}\n"
                        elif args.subtitles_only:
                            context_information += f"{clip_conversation}\n"
                        else:
                            context_information += f"{general_sum},{clip_conversation}\n"
                context_information_list.append(context_information)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(qa_summary_prompts, context_information_list)
            else:
                batch_pred = self.inference_RAG(qa_summary_prompts, context_information_list)
            total_batch_pred.extend(batch_pred)
            related_text.extend(context_information_list)
        if args.use_clips_for_info:
            batch_pred, general_related_context_keys_list = self.use_clips_for_info(qa_general, general_related_context_keys_list, QA_external_memory)
            total_batch_pred.extend(batch_pred)
            related_text.extend(general_related_context_keys_list)
        else:
            related_context_documents_text_list = []
            for related_context_documents, related_context_keys in zip(general_related_context_documents_list, general_related_context_keys_list):
                related_information = ""
                most_related_clips = self.get_most_related_clips(related_context_keys)
                for clip_name in most_related_clips:
                    clip_conversation = ""
                    general_sum = ""
                    for key in QA_external_memory.documents.keys():
                        if clip_name in key and 'caption' in key:
                            general_sum = "Clip Summary: " + QA_external_memory.documents[key]
                        if clip_name in key and 'subtitle' in key:
                            clip_conversation = "Clip Subtitles: " + QA_external_memory.documents[key]
                    if args.use_coherent_description:
                        related_information += f"{general_sum}\n"
                    else:
                        if args.model_summary_only:
                            related_information += f"{general_sum}\n"
                        elif args.subtitles_only:
                            related_information += f"{clip_conversation}\n"
                        else:
                            related_information += f"{general_sum},{clip_conversation}\n"
                related_context_documents_text_list.append(related_information)
            # Only run inference (and extend the results) when there are general
            # questions; otherwise batch_pred would still hold the summary predictions.
            if len(qa_general_prompts) > 0:
                if args.use_chatgpt:
                    batch_pred = self.inference_RAG_chatGPT(qa_general_prompts, related_context_documents_text_list)
                else:
                    batch_pred = self.inference_RAG(qa_general_prompts, related_context_documents_text_list)
                total_batch_pred.extend(batch_pred)
                related_text.extend(related_context_documents_text_list)
        assert len(total_batch_pred) == len(related_text)
        return total_batch_pred, related_text
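    # get_most_related_clips maps retrieved document keys
    # (caption_*/subtitle_*__<clip>) back to clip names, keeping at most
    # args.neighbours clips.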
    def get_most_related_clips(self, related_context_keys):
        most_related_clips = []
        for context_key in related_context_keys:
            if len(context_key.split('__')) > 1:
                most_related_clips.append(context_key.split('__')[1])
            if len(most_related_clips) == args.neighbours:
                break
        assert len(most_related_clips) != 0, f"No related clips found {related_context_keys}"
        return most_related_clips
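    # clip_inference runs the vision-language model directly on a batch of clips:
    # each clip's frames (plus shot subtitles unless --vision_only is set) are
    # packed into an interleaved prompt and answered in a single run_images call.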
    def clip_inference(self, clips_name, prompts):
        setup_seeds(seed)
        images_batch, instructions_batch = [], []
        for clip_name, prompt in zip(clips_name, prompts):
            movie_name = clip_name.split('_')[0]
            video_images_path, subtitle_path, movie_annotation, movie_clips_path = self._get_movie_data(movie_name)
            clip_path = os.path.join(movie_clips_path, clip_name)
            if movie_annotation['story'] is not None:
                shots_subtitles = self._get_shots_subtitles(movie_annotation)
            else:
                shots_subtitles = {}
            images, img_placeholder = self.prepare_input_images(clip_path, shots_subtitles, use_subtitles=not args.vision_only)
            instruction = img_placeholder + '\n' + prompt
            images_batch.append(images)
            instructions_batch.append(instruction)
        # Run inference for the whole batch.
        images_batch = torch.stack(images_batch)
        batch_pred = self.run_images(images_batch, instructions_batch)
        return batch_pred
    def prepare_prompt(self, qa):
        prompt = qa["Q"]
        return prompt
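    # use_clips_for_info implements the second retrieval stage: for every general
    # question, the retrieved clips are re-run through the model with an
    # information-extraction prompt, and the extracted snippets (optionally
    # combined with the cached clip summary and subtitles) become the context for
    # the final RAG answer.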
    def use_clips_for_info(self, qa_list, related_context_keys_list, external_memory):
        total_batch_pred = []
        questions = []
        related_information_list = []
        related_context_keys_list_new = []
        for qa, related_context_keys in zip(qa_list, related_context_keys_list):
            most_related_clips = self.get_most_related_clips(related_context_keys)
            question = qa['Q']
            # prompt = self.prepare_prompt(qa)
            # prompt += " and also provide an EXPLANATION for your answer, and if you don't know the answer, say that you don't know.\n\n"
            prompt = f"From this video, extract the information related to this question and provide an explanation for your answer. If you can't find related information, say 'I DON'T KNOW' as option 5 because the question may not be related to the video content.\nThe question is:\n{question}\nYour answer:"
            # all_info = self.clip_inference(most_related_clips, [prompt] * len(most_related_clips))
            # Make most_related_clips unique (retrieval may return both the vision
            # summary and the subtitles of the same clip).
            most_related_clips = list(set(most_related_clips))
            batch_inference = []
            all_info = []
            for related_clip in most_related_clips:
                batch_inference.append(related_clip)
                if len(batch_inference) < args.batch_size:
                    continue
                all_info.extend(self.clip_inference(batch_inference, [prompt] * len(batch_inference)))
                batch_inference = []
            if len(batch_inference) > 0:
                all_info.extend(self.clip_inference(batch_inference, [prompt] * len(batch_inference)))
            related_information = ""
            for info, clip_name in zip(all_info, most_related_clips):
                clip_conversation = ""
                general_sum = ""
                for key in external_memory.documents.keys():
                    if clip_name in key and 'caption' in key:
                        general_sum = "Clip Summary: " + external_memory.documents[key]
                    if clip_name in key and 'subtitle' in key:
                        clip_conversation = "Clip Subtitles: " + external_memory.documents[key]
                if args.use_coherent_description:
                    related_information += f"question_related_information: {info},{general_sum}\n"
                else:
                    if args.model_summary_only:
                        related_information += f"{general_sum},question_related_information: {info}\n"
                    elif args.info_only:
                        related_information += f"question_related_information: {info}\n"
                    elif args.subtitles_only:
                        related_information += f"{clip_conversation},question_related_information: {info}\n"
                    else:
                        related_information += f"{general_sum},{clip_conversation},question_related_information: {info}\n"
                # related_information += f"question_related_information: {info},{clip_conversation}\n"
            questions.append(question)
            related_information_list.append(related_information)
            related_context_keys.append(related_information)
            related_context_keys_list_new.append(related_context_keys)
            if len(questions) < args.batch_size:
                continue
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)
            for pred in batch_pred:
                total_batch_pred.append(pred)
            questions = []
            related_information_list = []
        if len(questions) > 0:
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)
            for pred in batch_pred:
                total_batch_pred.append(pred)
        return total_batch_pred, related_context_keys_list_new
    def define_save_name(self):
        save_name = "subtitles" if args.index_subtitles_together else "no_subtitles"
        save_name += "_clips_for_info" if args.use_clips_for_info else ""
        save_name += "_chatgpt" if args.use_chatgpt else ""
        save_name += "_vision_only" if args.vision_only else ""
        save_name += "_model_summary_only" if args.model_summary_only else ""
        save_name += "_subtitles_only" if args.subtitles_only else ""
        save_name += "_info_only" if args.info_only else ""
        print("save_name", save_name)
        return save_name
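    # eval_llama_vid is the top-level evaluation loop: for every movie in the
    # annotation file (restricted to the [--start, --end) window) it builds or
    # loads the clip index, answers the movie's questions in batches, writes a
    # per-movie prediction JSON, and finally dumps the aggregated predictions.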
    def eval_llama_vid(self):
        # LLaMA-VID QA evaluation loop.
        full_questions_result = []
        movie_number = 0
        start = args.start
        end = args.end
        save_name = self.define_save_name()
        save_dir = f"new_workspace/results/llama_vid/{args.exp_name}/{save_name}_{args.neighbours}_neighbours"
        for movie in tqdm(self.movies_dict.keys()):
            if args.start <= movie_number < args.end:
                if os.path.exists(f"{save_dir}/{movie}.json"):
                    print(f"Movie {movie} already processed")
                    with open(f"{save_dir}/{movie}.json", 'r') as f:
                        pred_json = json.load(f)
                    full_questions_result.extend(pred_json)
                    continue
                use_subtitles_while_generating_summary = not args.vision_only
                information_RAG_path, embedding_path = self.movie_inference(movie, use_subtitles_while_generating_summary)
                # Building the index here also caches the clip embeddings at
                # embedding_path so answer_movie_questions_RAG can reload them per batch.
                external_memory = MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
                if os.path.exists(embedding_path):
                    external_memory.load_embeddings_from_pkl(embedding_path)
                else:
                    external_memory.load_documents_from_json(information_RAG_path, embedding_path)
                os.makedirs(save_dir, exist_ok=True)
                pred_json = []
                batch_questions = []
                for qa in tqdm(self.movies_dict[movie], desc="Inference questions"):
                    batch_questions.append(qa)
                    if len(batch_questions) < args.batch_size:
                        continue
                    model_ans, related_text = self.answer_movie_questions_RAG(batch_questions, information_RAG_path, embedding_path)
                    for qa, ans, related_info in zip(batch_questions, model_ans, related_text):
                        qa.update({'pred': ans})
                        qa.update({'related_info': related_info})
                        pred_json.append(qa)
                    batch_questions = []
                if len(batch_questions) > 0:
                    model_ans, related_text = self.answer_movie_questions_RAG(batch_questions, information_RAG_path, embedding_path)
                    for qa, ans, related_info in zip(batch_questions, model_ans, related_text):
                        qa.update({'pred': ans})
                        qa.update({'related_info': related_info})
                        pred_json.append(qa)
                full_questions_result.extend(pred_json)
                with open(f"{save_dir}/{movie}.json", 'w') as fp:
                    json.dump(pred_json, fp)
                print(f"Movie {movie} prediction saved to {save_dir}/{movie}.json")
            movie_number += 1
        with open(f"{save_dir}/full_pred_s{start}_end{end}.json", 'w') as fp:
            json.dump(full_questions_result, fp)
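

# Module-level setup: parse the CLI arguments, read the evaluation seed from the
# test config, and seed all RNGs before building the evaluator.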
args = get_arguments()


def setup_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True


# Read the test config to get the run seed.
with open('test_configs/llama2_test_config.yaml') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
seed = config['run']['seed']
print("seed", seed)

if __name__ == "__main__":
    setup_seeds(seed)
    llama_vid_eval = LlamaVidQAEval(args)
    llama_vid_eval.eval_llama_vid()