import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from decord import VideoReader, cpu
import spaces


class VLMCaptioning:
    def __init__(self):
        # Load the vision-language model once at startup; weights are kept in
        # bfloat16 and moved to the GPU for inference.
        print("Loading MiniCPM-o model...")
        self.model = AutoModel.from_pretrained(
            'openbmb/MiniCPM-o-2_6',
            trust_remote_code=True,
            attn_implementation='sdpa',
            torch_dtype=torch.bfloat16,
            init_vision=True,
        )
        self.model = self.model.eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            'openbmb/MiniCPM-o-2_6', trust_remote_code=True
        )

    @spaces.GPU()
    def describe_image(
        self,
        image: str,
        question: str = "Describe this image in detail.",
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 40,
        max_new_tokens: int = 512,
        stream: bool = False,
        sampling: bool = False,
    ) -> str:
        """
        Generate a description for a single image.

        Args:
            image (str): Path to the image file
            question (str): Question to ask about the image
            temperature (float): Sampling temperature
            top_p (float): Nucleus sampling parameter
            top_k (int): Top-k sampling parameter
            max_new_tokens (int): Maximum number of new tokens to generate
            stream (bool): If True, return a generator of text chunks
                instead of a complete string
            sampling (bool): If True, decode with sampling; if False,
                temperature/top_p/top_k are ignored

        Returns:
            str: Generated description
        """
        try:
            if not image:
                return "Please provide an image."

            # Convert image to RGB (handles grayscale and RGBA inputs)
            image = Image.open(image).convert('RGB')

            # Prepare message: the image is passed inline in the message content
            msgs = [{'role': 'user', 'content': [image, question]}]

            # Generate response
            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                max_new_tokens=max_new_tokens,
                stream=stream,
                sampling=sampling,
            )
            return response
        except Exception as e:
            return f"Error analyzing image: {str(e)}"

    @spaces.GPU()
    def describe_video(
        self,
        video_path: str,
        frame_interval: int = 30,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 40,
        max_new_tokens: int = 512,
        stream: bool = False,
        sampling: bool = False,
    ) -> str:
        """
        Generate a description from sampled video frames.

        Args:
            video_path (str): Path to the video file
            frame_interval (int): Sample every Nth frame (30 is roughly one
                frame per second for 30 fps footage)
            temperature (float): Sampling temperature
            top_p (float): Nucleus sampling parameter
            top_k (int): Top-k sampling parameter
            max_new_tokens (int): Maximum number of new tokens to generate
            stream (bool): If True, return a generator of text chunks
                instead of a complete string
            sampling (bool): If True, decode with sampling; if False,
                temperature/top_p/top_k are ignored

        Returns:
            str: Generated description
        """
        try:
            if not video_path:
                return "Please provide a video."

            # Load the video on the CPU and sample every `frame_interval`-th frame
            vr = VideoReader(video_path, ctx=cpu(0))
            total_frames = len(vr)
            frame_indices = list(range(0, total_frames, frame_interval))
            frames = vr.get_batch(frame_indices).asnumpy()

            # Convert frames to PIL Images
            frame_images = [Image.fromarray(frame) for frame in frames]

            # Put all frames into a single user turn so the model generates one
            # response for the whole clip rather than one turn per frame
            msgs = [{
                'role': 'user',
                'content': frame_images + ["Describe the main action in this scene."],
            }]

            # Generate a single response covering all sampled frames
            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                max_new_tokens=max_new_tokens,
                stream=stream,
                sampling=sampling,
            )
            return response
        except Exception as e:
            return f"Error processing video: {str(e)}"