# Chroma-Extra / vlm_captions.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from decord import VideoReader, cpu
import spaces


class VLMCaptioning:
    def __init__(self):
        # Load the MiniCPM-o vision-language model and its tokenizer once at startup.
        print("Loading MiniCPM-O model...")
        self.model = AutoModel.from_pretrained(
            'openbmb/MiniCPM-o-2_6',
            trust_remote_code=True,
            attn_implementation='sdpa',
            torch_dtype=torch.bfloat16
        )
        self.model = self.model.eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

    @spaces.GPU()
    def analyze_image(self, image_path, question="Describe this image in detail."):
        """Generate a description for a single image."""
        try:
            image = Image.open(image_path).convert('RGB')
            msgs = [{'role': 'user', 'content': [image, question]}]
            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer
            )
            return response
        except Exception as e:
            return f"Error analyzing image: {str(e)}"

    @spaces.GPU()
    def analyze_video_frames(self, video_path, frame_interval=30):
        """Extract frames from a video at a fixed interval and describe each one."""
        try:
            # Load the video with decord on the CPU
            vr = VideoReader(video_path, ctx=cpu(0))
            total_frames = len(vr)

            # Sample one frame every `frame_interval` frames
            frame_indices = list(range(0, total_frames, frame_interval))
            frames = vr.get_batch(frame_indices).asnumpy()

            descriptions = []
            for frame in frames:
                # Convert the raw RGB frame array to a PIL Image
                frame_pil = Image.fromarray(frame)

                # Generate a description of the frame
                msgs = [{'role': 'user', 'content': [frame_pil, "Describe the main action in this scene."]}]
                description = self.model.chat(
                    image=None,
                    msgs=msgs,
                    tokenizer=self.tokenizer
                )
                descriptions.append(description)

            return descriptions
        except Exception as e:
            return [f"Error processing video: {str(e)}"]