# smollvm/src/video_processor/processor.py
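"""Video analysis for the SmolVLM2 HF Space.

Loads HuggingFaceTB/SmolVLM2-2.2B-Instruct onto the GPU and generates a
natural-language description of a video that can be stored and searched later.
"""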
import logging
from typing import Dict, List

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
logger = logging.getLogger(__name__)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
class VideoAnalyzer:
def __init__(self):
if not torch.cuda.is_available():
raise RuntimeError("CUDA is required but not available!")
logger.info("Initializing VideoAnalyzer")
self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
logger.info(f"Loading model from {self.model_path}")
cache_dir = "/models"
logger.info(f"Using cache directory: {cache_dir}")
        # Load the processor (tokenizer plus image/video preprocessing);
        # torch_dtype is a model argument and does not apply here
        self.processor = AutoProcessor.from_pretrained(
            self.model_path,
            cache_dir=cache_dir
        )
        # Load the model directly onto the GPU in bfloat16 with FlashAttention-2
        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map={"": 0},  # force the whole model onto GPU 0
            attn_implementation="flash_attention_2",
            cache_dir=cache_dir
        )
logger.info(f"Model loaded on device: {self.model.device}")
    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
        # NOTE: frame extraction and sampling are handled internally by the
        # processor's chat template; frame_interval is currently unused and
        # is kept only for API compatibility.
        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
try:
# Create message for model
messages = [{
"role": "user",
"content": [
{"type": "video", "path": video_path},
{"type": "text", "text": "Describe this video in detail - with all the timestamps and the actions happening in the video. I should be able to understand the video by reading the description, and search for it later."}
]
}]
# Process video using chat template
inputs = self.processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt"
            ).to(self.model.device, dtype=torch.bfloat16)
            # Generate the description (greedy decoding); no gradients needed
            with torch.inference_mode():
                generated_ids = self.model.generate(
                    **inputs,
                    do_sample=False,
                    max_new_tokens=100
                )
            # Decode only the newly generated tokens, skipping the echoed prompt
            description = self.processor.batch_decode(
                generated_ids[:, inputs["input_ids"].shape[1]:],
                skip_special_tokens=True
            )[0].strip()
return [{
"description": description
}]
except Exception as e:
logger.error(f"Error processing video: {str(e)}", exc_info=True)
raise
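

# Minimal usage sketch, assuming a CUDA GPU is available and that
# "sample.mp4" (a hypothetical local file) exists next to this script.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    analyzer = VideoAnalyzer()
    for result in analyzer.process_video("sample.mp4"):  # hypothetical path
        print(result["description"])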