# Chroma-Extra / vlm_captions.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from decord import VideoReader, cpu
import spaces


class VLMCaptioning:
    def __init__(self):
        # Load the MiniCPM-o vision-language model and its tokenizer once at startup.
        print("Loading MiniCPM-O model...")
        self.model = AutoModel.from_pretrained(
            'openbmb/MiniCPM-o-2_6',
            trust_remote_code=True,
            attn_implementation='sdpa',
            torch_dtype=torch.bfloat16
        )
        self.model = self.model.eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

    @spaces.GPU()
    def analyze_image(self, image_path, question="Describe this image in detail."):
        """Generate a description for a single image."""
        try:
            image = Image.open(image_path).convert('RGB')
            msgs = [{'role': 'user', 'content': [image, question]}]
            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer
            )
            return response
        except Exception as e:
            return f"Error analyzing image: {str(e)}"

    @spaces.GPU()
    def analyze_video_frames(self, video_path, frame_interval=30):
        """Extract frames from a video at a fixed interval and describe each one."""
        try:
            # Load the video with decord on the CPU
            vr = VideoReader(video_path, ctx=cpu(0))
            total_frames = len(vr)

            # Sample one frame every `frame_interval` frames
            frame_indices = list(range(0, total_frames, frame_interval))
            frames = vr.get_batch(frame_indices).asnumpy()

            descriptions = []
            for frame in frames:
                # Convert the raw RGB frame array to a PIL Image
                frame_pil = Image.fromarray(frame)

                # Generate a description of the frame
                msgs = [{'role': 'user', 'content': [frame_pil, "Describe the main action in this scene."]}]
                description = self.model.chat(
                    image=None,
                    msgs=msgs,
                    tokenizer=self.tokenizer
                )
                descriptions.append(description)

            return descriptions
        except Exception as e:
            return [f"Error processing video: {str(e)}"]