# Hugging Face Spaces header (extraction artifact): "Running on Zero" — this app runs on ZeroGPU.
import spaces
import torch
from decord import VideoReader, cpu
from PIL import Image
from transformers import AutoModel, AutoTokenizer
class VLMCaptioning:
    """Caption images and sampled video frames with the MiniCPM-o 2.6 VLM."""

    def __init__(self):
        """Load the MiniCPM-o 2.6 checkpoint and tokenizer onto the GPU.

        Side effects: downloads/loads the model, prints a progress line,
        and moves the model to CUDA in eval mode.
        """
        print("Loading MiniCPM-O model...")
        self.model = AutoModel.from_pretrained(
            'openbmb/MiniCPM-o-2_6',
            trust_remote_code=True,       # checkpoint ships its own chat() code
            attn_implementation='sdpa',
            torch_dtype=torch.bfloat16
        )
        self.model = self.model.eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            'openbmb/MiniCPM-o-2_6', trust_remote_code=True
        )

    def analyze_image(self, image_path, question="Describe this image in detail."):
        """Generate a description for a single image.

        Args:
            image_path: Path to an image file readable by PIL.
            question: Prompt sent to the model alongside the image.

        Returns:
            The model's text response, or an ``"Error analyzing image: ..."``
            string on failure (best-effort API: never raises).
        """
        try:
            image = Image.open(image_path).convert('RGB')
            # The image travels inside msgs; the chat() image kwarg stays None.
            msgs = [{'role': 'user', 'content': [image, question]}]
            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer
            )
            return response
        except Exception as e:
            return f"Error analyzing image: {str(e)}"

    def analyze_video_frames(self, video_path, frame_interval=30,
                             question="Describe the main action in this scene."):
        """Sample frames from a video and caption each one.

        Args:
            video_path: Path to a video file readable by decord.
            frame_interval: Caption every Nth frame (default: every 30th).
            question: Prompt sent to the model for each sampled frame.
                Backward-compatible addition; the default reproduces the
                previous hard-coded prompt.

        Returns:
            A list with one description per sampled frame ([] if the video
            yields no frames), or a single-element list containing an
            ``"Error processing video: ..."`` string on failure.
        """
        try:
            # Decode on CPU; only the model runs on GPU.
            vr = VideoReader(video_path, ctx=cpu(0))
            total_frames = len(vr)
            frame_indices = list(range(0, total_frames, frame_interval))
            if not frame_indices:
                # Empty video: avoid handing decord an empty batch.
                return []
            frames = vr.get_batch(frame_indices).asnumpy()
            descriptions = []
            for frame in frames:
                frame_pil = Image.fromarray(frame)
                msgs = [{'role': 'user', 'content': [frame_pil, question]}]
                description = self.model.chat(
                    image=None,
                    msgs=msgs,
                    tokenizer=self.tokenizer
                )
                descriptions.append(description)
            return descriptions
        except Exception as e:
            return [f"Error processing video: {str(e)}"]