"""Video analysis with SmolVLM2: generate a searchable, timestamped text
description of a video using HuggingFaceTB/SmolVLM2-2.2B-Instruct."""

import logging
from typing import Dict, List

import decord  # video-decoding backend; not referenced directly in this module
import numpy as np  # companion to decord for frame arrays; not referenced directly here
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

logger = logging.getLogger(__name__)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")

class VideoAnalyzer:
    def __init__(self):
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is required but not available!")

        logger.info("Initializing VideoAnalyzer")
        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
        logger.info(f"Loading model from {self.model_path}")

        cache_dir = "/models"
        logger.info(f"Using cache directory: {cache_dir}")

        # Note: torch_dtype is not a processor argument (it applies to model
        # weights, which the processor has none of), so it is omitted here.
        self.processor = AutoProcessor.from_pretrained(
            self.model_path,
            cache_dir=cache_dir,
        )

        # Pin the whole model to GPU 0; flash_attention_2 requires the
        # flash-attn package to be installed.
        device_map = {"": 0}
        self.model = AutoModelForImageTextToText.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map=device_map,
            _attn_implementation="flash_attention_2",
            cache_dir=cache_dir,
        )
        logger.info(f"Model loaded on device: {self.model.device}")

    def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
        # frame_interval is logged for traceability but not used below:
        # frame sampling is handled inside the processor's chat template.
        logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
        try:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "video", "path": video_path},
                    {
                        "type": "text",
                        "text": (
                            "Describe this video in detail, with timestamps and the "
                            "actions happening in the video. I should be able to "
                            "understand the video by reading the description, and "
                            "search for it later."
                        ),
                    },
                ],
            }]

            # Cast floating-point inputs (pixel values) to bfloat16 so they
            # match the model weights loaded above.
            inputs = self.processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device, dtype=torch.bfloat16)

            generated_ids = self.model.generate(
                **inputs,
                do_sample=False,
                max_new_tokens=100,  # raise this if descriptions come back truncated
            )
            # Decode only the newly generated tokens so the returned
            # description does not include the chat-template prompt.
            description = self.processor.batch_decode(
                generated_ids[:, inputs["input_ids"].shape[-1]:],
                skip_special_tokens=True,
            )[0].strip()

            return [{
                "description": description
            }]

        except Exception as e:
            logger.error(f"Error processing video: {str(e)}", exc_info=True)
            raise
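

# Usage sketch, not part of the original module: drives the analyzer end to
# end from the command line. The video path below is a placeholder; point it
# at a real file.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    analyzer = VideoAnalyzer()
    results = analyzer.process_video("/videos/example.mp4")  # placeholder path
    print(results[0]["description"])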