|
import torch |
|
from transformers import AutoProcessor, AutoModelForImageTextToText |
|
from typing import List, Dict |
|
import logging |
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Device the model is placed on: the GPU when PyTorch reports CUDA as
# available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info("Using device: %s", DEVICE)
|
|
|
class VideoAnalyzer:
    """Produce a free-text description of a video with SmolVLM2.

    Constructing an instance loads the processor and the model named by
    ``model_path`` (bfloat16 weights, ``flash_attention_2`` attention) and
    moves the model to ``DEVICE``. CUDA is mandatory.
    """

    def __init__(self):
        """Load the processor and model.

        Raises:
            RuntimeError: If CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is required but not available!")

        logger.info("Initializing VideoAnalyzer")
        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
        logger.info("Loading model from %s", self.model_path)

        # No dtype is passed here: floating-point inputs are cast to the
        # model's dtype in process_video(), right before generation.
        self.processor = AutoProcessor.from_pretrained(self.model_path)

        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            _attn_implementation="flash_attention_2",
        ).to(DEVICE)
        logger.info(
            "Model loaded on device: %s using attention implementation: flash_attention_2",
            self.model.device,
        )

    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
        """Generate a detailed description of the video at *video_path*.

        Args:
            video_path: Path of the video; passed to the processor's chat
                template as the ``"video"`` content entry.
            frame_interval: Currently only logged; it does not influence how
                the video is processed by this method. Kept so existing
                callers keep working.

        Returns:
            A one-element list ``[{"description": <generated text>}]``.

        Raises:
            Exception: Any error raised while preparing inputs, generating or
                decoding is logged with its traceback and re-raised unchanged.
        """
        logger.info(
            "Processing video: %s with frame_interval=%s", video_path, frame_interval
        )
        try:
            messages = [
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are a detailed video analysis assistant that can understand videos. Your task is to provide comprehensive descriptions including all events, actions, and important details with their timestamps. Focus on being specific and thorough.",
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "path": video_path},
                        {"type": "text", "text": "Please provide a detailed analysis of this video. Include:\n1. All significant actions and events\n2. Temporal information and timestamps\n3. Important visual details and context\n4. Any text or speech content if present\n5. Scene transitions and changes\nBe thorough and specific so the description can be used for detailed searching later."},
                    ],
                },
            ]

            # Cast floating-point inputs (e.g. pixel values) to the model's
            # dtype as well as moving them to its device; the weights are
            # bfloat16, so float32 inputs would not match them.
            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device, dtype=self.model.dtype)

            generated_ids = self.model.generate(
                **inputs,
                do_sample=True,
                temperature=0.7,
                max_new_tokens=512,
            )

            # Decode only the tokens produced after the prompt. Splitting the
            # full decoded text on "Assistant: " would truncate any answer
            # that happens to contain that literal itself.
            prompt_length = inputs["input_ids"].shape[1]
            description = self.processor.batch_decode(
                generated_ids[:, prompt_length:],
                skip_special_tokens=True,
            )[0]

            return [{"description": description.strip()}]

        except Exception as e:
            # Log with traceback, then let the caller decide how to recover.
            logger.exception("Error processing video: %s", e)
            raise