# Spaces: becteur92/smollvm (Paused) - File size: 2,947 Bytes - 5f42812

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing import List, Dict
import decord
import numpy as np
import logging

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Pick the compute device once at import time: the GPU when torch reports
# CUDA as available, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

logger.info(f"Using device: {DEVICE}")

class VideoAnalyzer:
    """Generate a text description of a video with a SmolVLM2 model on GPU 0.

    The processor and model are loaded once in ``__init__`` and reused by
    every ``process_video`` call.

    Attributes:
        model_path: Hub identifier (or path) the processor and model were
            loaded from.
        processor: Processor returned by ``AutoProcessor.from_pretrained``.
        model: Model returned by
            ``AutoModelForImageTextToText.from_pretrained``, placed on GPU 0.
    """

    DEFAULT_MODEL_PATH = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
    DEFAULT_CACHE_DIR = "/models"

    # Instruction sent to the model together with the video.
    PROMPT = (
        "Describe this video in detail - with all the timestamps and the "
        "actions happening in the video. I should be able to understand the "
        "video by reading the description, and search for it later."
    )

    # Single source of truth for the dtype: the weights are loaded in it and
    # the floating-point inputs are cast to it before generation.
    _TORCH_DTYPE = torch.bfloat16

    def __init__(
        self,
        model_path: str = DEFAULT_MODEL_PATH,
        cache_dir: str = DEFAULT_CACHE_DIR,
    ):
        """Load the processor and the model.

        Args:
            model_path: Model identifier passed to ``from_pretrained``.
            cache_dir: Directory passed to ``from_pretrained`` as the
                download cache.

        Raises:
            RuntimeError: If ``torch.cuda.is_available()`` is false.
        """
        # Fail fast: the model is pinned to GPU 0 below, so there is no CPU
        # fallback.
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is required but not available!")

        logger.info("Initializing VideoAnalyzer")
        self.model_path = model_path
        logger.info("Loading model from %s", self.model_path)
        logger.info("Using cache directory: %s", cache_dir)

        # The processor holds no weights, so no dtype is passed here.
        self.processor = AutoProcessor.from_pretrained(
            self.model_path,
            cache_dir=cache_dir,
        )

        # An empty-string key maps the whole model onto GPU 0.
        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=self._TORCH_DTYPE,
            device_map={"": 0},
            _attn_implementation="flash_attention_2",
            cache_dir=cache_dir,
        )
        logger.info("Model loaded on device: %s", self.model.device)

    def process_video(
        self,
        video_path: str,
        frame_interval: int = 30,
        *,
        max_new_tokens: int = 100,
    ) -> List[Dict]:
        """Describe the video at ``video_path``.

        Args:
            video_path: Location of the video, handed to the processor's chat
                template as the ``"path"`` of a ``"video"`` content item.
            frame_interval: Only logged; it does not influence processing.
                Kept so existing callers keep working.
            max_new_tokens: Upper bound on generated tokens, forwarded to
                ``model.generate``.

        Returns:
            A one-element list holding a dict with a single ``"description"``
            key.

        Raises:
            Exception: Any error raised while building the inputs or
                generating is logged with its traceback and re-raised
                unchanged.
        """
        logger.info(
            "Processing video: %s with frame_interval=%s",
            video_path,
            frame_interval,
        )
        try:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "video", "path": video_path},
                    {"type": "text", "text": self.PROMPT},
                ],
            }]

            # Cast floating-point inputs to the dtype the weights were loaded
            # in; moving them to the device alone leaves them in the
            # processor's default float dtype, which does not match the
            # bfloat16 model.
            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device, dtype=self._TORCH_DTYPE)

            # Greedy decoding (do_sample=False).
            generated_ids = self.model.generate(
                **inputs,
                do_sample=False,
                max_new_tokens=max_new_tokens,
            )

            # NOTE(review): the full generated sequence is decoded, so the
            # text presumably contains the prompt as well as the answer --
            # confirm whether callers rely on that before trimming it.
            description = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True,
            )[0]

            return [{"description": description}]

        except Exception as e:
            # Boundary logging only: record the traceback, then let the
            # original exception propagate to the caller.
            logger.exception("Error processing video: %s", e)
            raise