import logging
from typing import Dict, List

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

logger = logging.getLogger(__name__)

def _grab_best_device(use_gpu: bool = True) -> str:
    """Return "cuda" when a GPU is available and requested, otherwise "cpu"."""
    if use_gpu and torch.cuda.is_available():
        return "cuda"
    return "cpu"

DEVICE = _grab_best_device()

logger.info(f"Using device: {DEVICE}")

class VideoAnalyzer:
    def __init__(self):
        if not torch.cuda.is_available():
            # Fail fast: the model below is loaded in bfloat16, which is
            # impractical to run on CPU.
            raise RuntimeError("CUDA is required but not available!")
            
        logger.info("Initializing VideoAnalyzer")
        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
        logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
        
        # Load processor and model
        self.processor = AutoProcessor.from_pretrained(self.model_path)

        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            # _attn_implementation="flash_attention_2",  # optional: enable when flash-attn is installed
        ).to(DEVICE)
        logger.info(f"Model loaded on device: {self.model.device}")
        
    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
        # NOTE: frame_interval is currently unused; the processor samples
        # frames from the video internally.
        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
        try:
            # Create message for model with detailed system prompt
            messages = [
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text", 
                            "text": "You are a detailed video analysis assistant that can understand videos. Your task is to provide comprehensive descriptions including all events, actions, and important details with their timestamps. Focus on being specific and thorough."
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "path": video_path},
                        {"type": "text", "text": "Please provide a detailed analysis of this video. Include:\n1. All significant actions and events\n2. Temporal information and timestamps\n3. Important visual details and context\n4. Any text or speech content if present\n5. Scene transitions and changes\nBe thorough and specific so the description can be used for detailed searching later."}
                    ]
                }
            ]

            logger.info(f"Applying chat template - message: {messages}")
            
            # Process the video through the chat template; the processor
            # loads the file at video_path and preprocesses sampled frames.
            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt"
            ).to(DEVICE, dtype=torch.bfloat16)  # the dtype cast applies only to floating-point tensors; token ids stay integral
            
            logger.info(f"Generating IDs")
            # Generate description with increased token limit
            generated_ids = self.model.generate(
                **inputs, 
                do_sample=True,
                temperature=0.7,
                max_new_tokens=512  # Increased from 100 to get more detailed descriptions
            )

            logger.info(f"batch decoding...")
            description = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )[0]

            return [{
                "description": description.split("Assistant: ")[-1]  # Remove assistant prefix if present
            }]
            
        except Exception as e:
            logger.error(f"Error processing video: {str(e)}", exc_info=True)
            raise
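

if __name__ == "__main__":
    # Minimal usage sketch (an illustration, not part of the original module):
    # assumes a CUDA GPU and a local video file. "sample.mp4" is a
    # placeholder path; pass a real file as the first CLI argument.
    import sys

    logging.basicConfig(level=logging.INFO)
    analyzer = VideoAnalyzer()
    video = sys.argv[1] if len(sys.argv) > 1 else "sample.mp4"
    for result in analyzer.process_video(video):
        print(result["description"])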