from typing import Any, Dict, List

import torch

from videollama2.conversation import conv_templates, SeparatorStyle
from videollama2.constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
from videollama2.mm_utils import (
    KeywordsStoppingCriteria,
    get_model_name_from_path,
    process_image,
    process_video,
    tokenizer_MMODAL_token,
)
from videollama2.model.builder import load_pretrained_model


class EndpointHandler:
    def __init__(self, path="DAMO-NLP-SG/VideoLLaMA2-8x7B"):
        # Load the tokenizer, model, and multimodal processor once at startup,
        # then move the model onto the GPU.
        model_name = get_model_name_from_path(path)
        self.tokenizer, self.model, self.processor, self.context_len = load_pretrained_model(
            path, None, model_name
        )
        self.model = self.model.to('cuda:0')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Expected payload: media paths, one question per item, and a matching
        # modality ("video" or "image") for each path.
        paths = data.get("paths", [])
        questions = data.get("questions", [])
        modal_list = data.get("modal_list", [])

        if not paths or not questions or not modal_list:
            return [{"error": "Missing paths, questions, or modal_list"}]
        # Preprocess the first input according to its modality; only the first
        # (path, question, modality) triple is answered per request.
        if modal_list[0] == 'video':
            tensor = process_video(
                paths[0], self.processor, self.model.config.image_aspect_ratio
            ).to(dtype=torch.float16, device='cuda', non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
            modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
        else:
            tensor = process_image(
                paths[0], self.processor, self.model.config.image_aspect_ratio
            )[0].to(dtype=torch.float16, device='cuda', non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["IMAGE"]
            modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
        tensor = [tensor]
        # Build a single-turn llama_2 conversation: the modality placeholder
        # token is prepended to the question, then the prompt is tokenized with
        # the matching multimodal token index.
        question = default_mm_token + "\n" + questions[0]
        conv_mode = 'llama_2'
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_MMODAL_token(
            prompt, self.tokenizer, modal_token_index, return_tensors='pt'
        ).unsqueeze(0).to('cuda:0')
        # Stop generation at the conversation separator, then decode the reply.
        stop_str = conv.sep if conv.sep_style in [SeparatorStyle.SINGLE] else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images_or_videos=tensor,
                modal_list=modal_list,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )

        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return [{"output": outputs[0]}]
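

# Minimal local smoke test (a sketch only): the Inference Endpoints runtime
# normally constructs EndpointHandler and calls it with the request payload
# itself. The media path below is a hypothetical placeholder, not a file that
# ships with this repository.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "paths": ["assets/example_video.mp4"],  # hypothetical local file
        "questions": ["What is happening in this video?"],
        "modal_list": ["video"],  # use "image" for a still image instead
    }
    print(handler(payload))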