File size: 2,947 Bytes
5f42812 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing import List, Dict
import decord
import numpy as np
import logging
# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Torch device string chosen once at import time: GPU when CUDA is usable,
# otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

logger.info(f"Using device: {DEVICE}")
class VideoAnalyzer:
    """Produce text descriptions of videos with the SmolVLM2 instruct model.

    Constructing an instance loads the processor and the model weights
    (using ``/models`` as the Hugging Face cache directory) and places the
    model on CUDA device 0 in bfloat16.
    """

    # Instruction sent to the model together with the video.
    _PROMPT = (
        "Describe this video in detail - with all the timestamps and the "
        "actions happening in the video. I should be able to understand the "
        "video by reading the description, and search for it later."
    )

    def __init__(self):
        """Load the processor and model.

        Raises:
            RuntimeError: If CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is required but not available!")

        logger.info("Initializing VideoAnalyzer")

        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
        logger.info(f"Loading model from {self.model_path}")

        cache_dir = "/models"
        logger.info(f"Using cache directory: {cache_dir}")

        # Load processor and model
        self.processor = AutoProcessor.from_pretrained(
            self.model_path,
            cache_dir=cache_dir,
            torch_dtype=torch.bfloat16,
        )

        # Load model directly to CUDA; the empty-string key maps the whole
        # model to GPU 0.
        device_map = {"": 0}
        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map=device_map,
            _attn_implementation="flash_attention_2",
            cache_dir=cache_dir,
        )
        logger.info(f"Model loaded on device: {self.model.device}")

    def process_video(
        self,
        video_path: str,
        frame_interval: int = 30,
        max_new_tokens: int = 100,
    ) -> List[Dict]:
        """Generate a textual description of a video.

        Args:
            video_path: Path of the video, passed to the processor's chat
                template as the ``"path"`` of a ``"video"`` content item.
            frame_interval: Only logged; this method does not use it to
                select frames.
            max_new_tokens: Upper bound on the number of tokens generated.
                Defaults to 100, the value that was previously hard-coded.

        Returns:
            A single-element list holding a dict whose ``"description"`` key
            maps to the decoded text of the first generated sequence.

        Raises:
            Exception: Any error raised while preparing inputs, generating,
                or decoding is logged with its traceback and re-raised.
        """
        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
        try:
            # Create message for model
            messages = [{
                "role": "user",
                "content": [
                    {"type": "video", "path": video_path},
                    {"type": "text", "text": self._PROMPT},
                ],
            }]

            # Process video using chat template. Floating-point inputs are
            # cast to the model's dtype (the model is loaded in bfloat16) so
            # they match the weights, as in the official SmolVLM2 usage.
            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device, dtype=self.model.dtype)

            # Generate description (greedy decoding)
            generated_ids = self.model.generate(
                **inputs,
                do_sample=False,
                max_new_tokens=max_new_tokens,
            )

            # NOTE(review): the full generated sequence is decoded, so the
            # prompt text is presumably part of the description - confirm
            # whether callers rely on that before trimming it.
            description = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
            )[0]

            return [{
                "description": description
            }]
        except Exception as e:
            # Log with traceback, then let the caller decide how to recover.
            logger.exception(f"Error processing video: {str(e)}")
            raise