# video-llama2-8x7b/handler.py
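"""Custom Hugging Face Inference Endpoint handler for VideoLLaMA2-8x7B.

Loads the pretrained model once at startup and answers a question about a
single image or video supplied in the request payload.
"""
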
from typing import Dict, List, Any
import torch
from videollama2.conversation import conv_templates, SeparatorStyle
from videollama2.constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video, process_image
from videollama2.model.builder import load_pretrained_model

class EndpointHandler:
    def __init__(self, path="DAMO-NLP-SG/VideoLLaMA2-8x7B"):
        # Load the tokenizer, model, and visual processor once and keep the model on the GPU.
        model_name = get_model_name_from_path(path)
        self.tokenizer, self.model, self.processor, self.context_len = load_pretrained_model(path, None, model_name)
        self.model = self.model.to('cuda:0')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Get inputs
        paths = data.get("paths", [])
        questions = data.get("questions", [])
        modal_list = data.get("modal_list", [])

        # Check that all required inputs are provided (questions[0] is used below)
        if not paths or not questions or not modal_list:
            return [{"error": "Missing paths, questions, or modal_list"}]
        # Visual preprocess (load & transform image or video)
        if modal_list[0] == 'video':
            tensor = process_video(paths[0], self.processor, self.model.config.image_aspect_ratio).to(dtype=torch.float16, device='cuda', non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
            modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
        else:
            tensor = process_image(paths[0], self.processor, self.model.config.image_aspect_ratio)[0].to(dtype=torch.float16, device='cuda', non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["IMAGE"]
            modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
        tensor = [tensor]
        # Text preprocess (tag process & generate prompt)
        question = default_mm_token + "\n" + questions[0]
        conv_mode = 'llama_2'
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_MMODAL_token(prompt, self.tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to('cuda:0')
        # Generate a response according to visual signals and prompts
        stop_str = conv.sep if conv.sep_style in [SeparatorStyle.SINGLE] else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images_or_videos=tensor,
                modal_list=modal_list,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )
        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return [{"output": outputs[0]}]