# video-llama2-8x7b/handler.py
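"""Custom Hugging Face Inference Endpoint handler for VideoLLaMA2-8x7B.

Loads the pretrained model once at startup and answers a question about a
single image or video supplied in the request payload.
"""
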
from typing import Dict, List, Any
import torch
from videollama2.conversation import conv_templates, SeparatorStyle
from videollama2.constants import DEFAULT_MMODAL_TOKEN, MMODAL_TOKEN_INDEX
from videollama2.mm_utils import get_model_name_from_path, tokenizer_MMODAL_token, KeywordsStoppingCriteria, process_video, process_image
from videollama2.model.builder import load_pretrained_model

class EndpointHandler:
    def __init__(self, path="DAMO-NLP-SG/VideoLLaMA2-8x7B"):
        # Load the tokenizer, model, and visual processor once and keep the model on the GPU.
        model_name = get_model_name_from_path(path)
        self.tokenizer, self.model, self.processor, self.context_len = load_pretrained_model(path, None, model_name)
        self.model = self.model.to('cuda:0')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Get inputs
        paths = data.get("paths", [])
        questions = data.get("questions", [])
        modal_list = data.get("modal_list", [])

        # Check that all required inputs are provided (questions[0] is used below)
        if not paths or not questions or not modal_list:
            return [{"error": "Missing paths, questions, or modal_list"}]
        # Visual preprocess (load & transform image or video)
        if modal_list[0] == 'video':
            tensor = process_video(paths[0], self.processor, self.model.config.image_aspect_ratio).to(dtype=torch.float16, device='cuda', non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["VIDEO"]
            modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]
        else:
            tensor = process_image(paths[0], self.processor, self.model.config.image_aspect_ratio)[0].to(dtype=torch.float16, device='cuda', non_blocking=True)
            default_mm_token = DEFAULT_MMODAL_TOKEN["IMAGE"]
            modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
        tensor = [tensor]
        # Text preprocess (tag process & generate prompt)
        question = default_mm_token + "\n" + questions[0]
        conv_mode = 'llama_2'
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_MMODAL_token(prompt, self.tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to('cuda:0')
        # Generate a response according to visual signals and prompts
        stop_str = conv.sep if conv.sep_style in [SeparatorStyle.SINGLE] else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images_or_videos=tensor,
                modal_list=modal_list,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )
        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return [{"output": outputs[0]}]