|
import json
import logging
import math
import os
import subprocess
import tempfile
from typing import List, Dict

import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
|
|
|
# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)
|
|
|
def _grab_best_device(use_gpu=True): |
|
if torch.cuda.device_count() > 0 and use_gpu: |
|
device = "cuda" |
|
else: |
|
device = "cpu" |
|
return device |
|
|
|
def get_video_duration_seconds(video_path: str) -> float:
    """Use ffprobe to get video duration in seconds.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        The ``format.duration`` value from ffprobe's JSON report, as a float.

    Raises:
        FileNotFoundError: If the ``ffprobe`` executable cannot be found.
        RuntimeError: If ffprobe exits with a non-zero status, or its output
            does not contain a parseable ``format.duration``.
    """
    cmd = [
        "ffprobe",
        # "error" instead of "quiet": stdout stays pure JSON, but stderr now
        # carries the reason when probing fails, so it can be surfaced below.
        "-v", "error",
        "-print_format", "json",
        "-show_format",
        video_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        reason = result.stderr.strip() or "no error output"
        raise RuntimeError(
            f"ffprobe failed for {video_path!r} "
            f"(exit code {result.returncode}): {reason}"
        )

    try:
        info = json.loads(result.stdout)
        return float(info["format"]["duration"])
    except (ValueError, KeyError, TypeError) as err:
        # ValueError also covers json.JSONDecodeError (empty / malformed output).
        raise RuntimeError(
            f"ffprobe did not report a usable duration for {video_path!r}"
        ) from err
|
|
|
def format_duration(seconds: float) -> str:
    """Format a number of seconds as a zero-padded ``MM:SS`` string.

    Minutes are not wrapped into hours, so 3725 seconds becomes ``"62:05"``.

    Args:
        seconds: Elapsed time in seconds. Fractional values are truncated to
            whole seconds first, because the ``d`` format code rejects floats.

    Returns:
        The ``MM:SS`` representation of ``seconds``.
    """
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
|
|
|
# Device string resolved once at import time; VideoAnalyzer moves the model
# and its inputs onto this device.
DEVICE = _grab_best_device()

logger.info(f"Using device: {DEVICE}")
|
|
|
class VideoAnalyzer:
    """Describe a video segment by segment with the SmolVLM2 instruct model.

    The video is cut into fixed-length clips with ffmpeg, and each clip is
    passed to the model together with a fixed system/user prompt pair.
    """

    # A trailing sliver shorter than this is not analyzed when earlier segments
    # exist: timestamps are reported at whole-second resolution, and
    # NOTE(review): presumably container duration can run slightly past the
    # last video frame, which would make the final clip frame-less - confirm.
    _MIN_TAIL_SECONDS = 1.0

    _SYSTEM_PROMPT = (
        "You are a detailed video analysis assistant with expertise in scene description. Your task is to:\n"
        "1. Describe the visual content with precise details\n"
        "2. Note any significant actions or movements\n"
        "3. Describe important objects, people, or elements in the scene\n"
        "4. Capture the mood, atmosphere, or emotional content if present\n"
        "5. Mention any scene transitions or camera movements\n"
        "Be specific and thorough, but focus only on what is visually present in this segment."
    )

    _USER_PROMPT = (
        "Describe this video segment in detail. Focus on:\n"
        "- What objects, people, or elements are visible?\n"
        "- What actions or movements are occurring?\n"
        "- What is the setting or environment?\n"
        "- Are there any notable visual effects or transitions?\n"
        "- What is the overall mood or atmosphere?\n"
        "Be specific about visual details but stay concise."
    )

    def __init__(self):
        """Load the processor and model onto ``DEVICE``.

        Raises:
            RuntimeError: If CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is required but not available!")

        logger.info("Initializing VideoAnalyzer")
        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
        logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")

        self.processor = AutoProcessor.from_pretrained(self.model_path)
        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            _attn_implementation="flash_attention_2",
        ).to(DEVICE)
        logger.info(f"Model loaded on device: {self.model.device}")

    def analyze_segment(self, video_path: str, start_time: float) -> str:
        """Analyze a single video segment.

        Args:
            video_path: Path of the clip handed to the processor.
            start_time: Offset of the clip in the source video. Currently
                unused; kept so existing callers keep working.

        Returns:
            The decoded model output that follows the last ``"Assistant: "``
            marker.
        """
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self._SYSTEM_PROMPT}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "video", "path": video_path},
                    {"type": "text", "text": self._USER_PROMPT},
                ],
            },
        ]

        inputs = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(DEVICE, dtype=torch.bfloat16)

        # Sampling is enabled, so repeated calls on the same clip may differ.
        outputs = self.model.generate(
            **inputs,
            do_sample=True,
            temperature=0.7,
            max_new_tokens=256,
        )

        decoded = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
        # The decoded text still contains the prompt; keep only the reply.
        return decoded.split("Assistant: ")[-1]

    @staticmethod
    def _segment_windows(duration: float, segment_length: int) -> list:
        """Plan the ``(start, end)`` second offsets of the clips to analyze.

        The final window is clamped to ``duration`` so a trailing partial
        segment is still covered. A trailing window shorter than
        ``_MIN_TAIL_SECONDS`` is dropped unless it is the only window.

        Raises:
            ValueError: If ``segment_length`` is not positive.
        """
        if segment_length <= 0:
            raise ValueError(f"segment_length must be positive, got {segment_length!r}")

        windows = []
        # ceil, not int(): truncation silently discarded the last partial
        # segment, and every video shorter than one segment.
        for segment_idx in range(math.ceil(duration / segment_length)):
            start_time = segment_idx * segment_length
            if start_time >= duration:
                break
            end_time = min(start_time + segment_length, duration)
            if windows and end_time - start_time < VideoAnalyzer._MIN_TAIL_SECONDS:
                break
            windows.append((start_time, end_time))
        return windows

    def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
        """Split a video into segments and describe each one.

        Args:
            video_path: Path of the source video.
            segment_length: Length of each segment in seconds; must be positive.

        Returns:
            One dict per analyzed segment, in order, with keys ``"timestamp"``
            (``MM:SS`` of the segment start) and ``"description"``.

        Raises:
            ValueError: If ``segment_length`` is not positive.
            subprocess.CalledProcessError: If ffmpeg fails to cut a segment.
            Exception: Any other error is logged with its traceback and
                re-raised unchanged.
        """
        try:
            duration = get_video_duration_seconds(video_path)
            windows = self._segment_windows(duration, segment_length)
            total_segments = len(windows)
            logger.info(f"Processing {total_segments} segments for video of length {duration:.2f} seconds")

            segments_info = []
            # The context manager removes the scratch directory (and any clip
            # left in it) even when ffmpeg or the model raises.
            with tempfile.TemporaryDirectory() as temp_dir:
                for segment_idx, (start_time, end_time) in enumerate(windows):
                    segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
                    cmd = [
                        "ffmpeg",
                        "-y",
                        # -ss before -i seeks in the input instead of decoding
                        # from the start of the file for every segment; the
                        # clip is re-encoded, so the cut stays accurate.
                        "-ss", str(start_time),
                        "-i", video_path,
                        "-t", str(end_time - start_time),
                        "-c:v", "libx264",
                        "-preset", "ultrafast",
                        "-pix_fmt", "yuv420p",
                        segment_path,
                    ]
                    subprocess.run(cmd, check=True)

                    description = self.analyze_segment(segment_path, start_time)

                    segments_info.append({
                        # int(): the formatter expects whole seconds even if a
                        # caller passes a fractional segment_length.
                        "timestamp": format_duration(int(start_time)),
                        "description": description,
                    })

                    # Delete each clip right away so disk use stays at one
                    # segment regardless of video length.
                    os.remove(segment_path)

                    logger.info(f"Processed segment {segment_idx + 1}/{total_segments} ({start_time}-{end_time}s)")

            return segments_info

        except Exception as e:
            logger.error(f"Error processing video: {str(e)}", exc_info=True)
            raise