File size: 4,175 Bytes
91840f8
 
 
 
 
 
 
 
 
 
 
 
 
5e83df6
 
91840f8
 
 
 
 
b1c4e30
 
 
 
 
 
 
 
8030ab7
 
b1c4e30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91840f8
b1c4e30
 
 
 
 
 
 
91840f8
 
b1c4e30
91840f8
 
 
b1c4e30
 
 
 
8030ab7
 
 
91840f8
 
 
 
b1c4e30
91840f8
b1c4e30
 
 
 
 
 
 
 
8030ab7
 
b1c4e30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91840f8
b1c4e30
91840f8
 
 
 
b1c4e30
 
 
91840f8
b1c4e30
 
 
 
 
91840f8
b1c4e30
 
 
 
 
 
 
 
8030ab7
 
 
b1c4e30
 
91840f8
 
b1c4e30
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from decord import VideoReader, cpu
import spaces

class VLMCaptioning:
    """Image/video captioning wrapper around the MiniCPM-o 2.6 vision-language model.

    Loads the model onto the GPU at construction time. Both public methods
    return a plain string: either the generated description or a human-readable
    error message (they never raise — errors are reported in-band).
    """

    # Single source of truth for the checkpoint id (was duplicated for the
    # model and tokenizer loads).
    MODEL_NAME = 'openbmb/MiniCPM-o-2_6'

    def __init__(self):
        """Load the MiniCPM-o model (bfloat16, SDPA attention) and its tokenizer."""
        print("Loading MiniCPM-O model...")
        self.model = AutoModel.from_pretrained(
            self.MODEL_NAME,
            trust_remote_code=True,
            attn_implementation='sdpa',
            torch_dtype=torch.bfloat16,
            init_vision=True,
        )
        # eval() disables dropout; the model is served from GPU memory.
        self.model = self.model.eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_NAME, trust_remote_code=True
        )

    @spaces.GPU()
    def describe_image(
        self,
        image: str,
        question: str = "Describe this image in detail.",
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 40,
        max_new_tokens: int = 512,
        stream=False,
        sampling=False
    ) -> str:
        """
        Generate a description for a single image.

        Args:
            image (str): Path to the image file.
            question (str): Question to ask about the image.
            temperature (float): Sampling temperature (used when sampling=True).
            top_p (float): Nucleus sampling parameter.
            top_k (int): Top-k sampling parameter.
            max_new_tokens (int): Maximum new tokens to generate.
            stream: Passed through to model.chat; enables token streaming.
            sampling: Passed through to model.chat; enables stochastic decoding.

        Returns:
            str: Generated description, or an error message on failure.
        """
        try:
            if not image:
                return "Please provide an image."

            # Keep the path parameter distinct from the decoded image
            # (the original shadowed `image` with the PIL object).
            pil_image = Image.open(image).convert('RGB')

            # MiniCPM-o chat format: interleaved content list of images and text.
            msgs = [{'role': 'user', 'content': [pil_image, question]}]

            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                max_new_tokens=max_new_tokens,
                stream=stream,
                sampling=sampling
            )
            return response
        except Exception as e:
            # Errors are reported in-band rather than raised (UI-friendly).
            return f"Error analyzing image: {str(e)}"

    @spaces.GPU()
    def describe_video(
        self,
        video_path: str,
        frame_interval: int = 30,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 40,
        max_new_tokens: int = 512,
        stream=False,
        sampling=False,
        question: str = "Describe the main action in this scene."
    ) -> str:
        """
        Generate a description for a video by sampling frames.

        Args:
            video_path (str): Path to the video file.
            frame_interval (int): Interval between sampled frames (clamped to >= 1).
            temperature (float): Sampling temperature (used when sampling=True).
            top_p (float): Nucleus sampling parameter.
            top_k (int): Top-k sampling parameter.
            max_new_tokens (int): Maximum new tokens to generate.
            stream: Passed through to model.chat; enables token streaming.
            sampling: Passed through to model.chat; enables stochastic decoding.
            question (str): Prompt asked about the sampled frames.

        Returns:
            str: Generated description, or an error message on failure.
        """
        try:
            if not video_path:
                return "Please provide a video."

            # Decode on CPU; frames come back as a (N, H, W, C) uint8 batch.
            vr = VideoReader(video_path, ctx=cpu(0))
            total_frames = len(vr)
            if total_frames == 0:
                return "Error processing video: video contains no frames."

            # Guard against a non-positive step, which would raise ValueError.
            step = max(1, frame_interval)
            frame_indices = list(range(0, total_frames, step))
            frames = vr.get_batch(frame_indices).asnumpy()
            frame_images = [Image.fromarray(frame) for frame in frames]

            # MiniCPM-o expects all frames of one query inside a single user
            # message's content list (frames..., question). The previous code
            # emitted one user message per frame — consecutive user turns with
            # no assistant replies — which does not match the model's chat
            # template for video inference.
            msgs = [{'role': 'user', 'content': frame_images + [question]}]

            response = self.model.chat(
                image=None,
                msgs=msgs,
                tokenizer=self.tokenizer,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                max_new_tokens=max_new_tokens,
                stream=stream,
                sampling=sampling
            )
            return response

        except Exception as e:
            # Errors are reported in-band rather than raised (UI-friendly).
            return f"Error processing video: {str(e)}"