Chroma-Extra

Running on Zero

App Files Files Community

gokaygokay committed on Feb 5

Commit

b1c4e30

1 Parent(s): 91840f8

understanding

Browse files

Files changed (2) hide show

llm_inference_video.py +2 -2
vlm_captions.py +81 -25

llm_inference_video.py CHANGED Viewed

@@ -20,11 +20,11 @@ class VideoLLMInferenceNode:
     def analyze_image(self, image_path, question=None):
         """Analyze image using MiniCPM-O"""
-        return self.vlm.analyze_image(image_path, question)
     def analyze_video(self, video_path):
         """Analyze video using MiniCPM-O"""
-        return self.vlm.analyze_video_frames(video_path)
     def generate_video_prompt(
         self,

     def analyze_image(self, image_path, question=None):
         """Analyze image using MiniCPM-O"""
+        return self.vlm.describe_image(image_path, question)
     def analyze_video(self, video_path):
         """Analyze video using MiniCPM-O"""
+        return self.vlm.describe_video(video_path)
     def generate_video_prompt(
         self,

vlm_captions.py CHANGED Viewed

@@ -17,48 +17,104 @@ class VLMCaptioning:
         self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
     @spaces.GPU()
-    def analyze_image(self, image_path, question="Describe this image in detail."):
-        """Generate description for a single image"""
         try:
-            image = Image.open(image_path).convert('RGB')
             msgs = [{'role': 'user', 'content': [image, question]}]
             response = self.model.chat(
                 image=None,
                 msgs=msgs,
-                tokenizer=self.tokenizer
             )
             return response
         except Exception as e:
             return f"Error analyzing image: {str(e)}"
     @spaces.GPU()
-    def analyze_video_frames(self, video_path, frame_interval=30):
-        """Extract and analyze frames from video"""
         try:
-            # Load video
             vr = VideoReader(video_path, ctx=cpu(0))
             total_frames = len(vr)
-            # Extract frames at intervals
             frame_indices = list(range(0, total_frames, frame_interval))
             frames = vr.get_batch(frame_indices).asnumpy()
-            descriptions = []
-            for frame in frames:
-                # Convert frame to PIL Image
-                frame_pil = Image.fromarray(frame)
-                # Generate description
-                msgs = [{'role': 'user', 'content': [frame_pil, "Describe the main action in this scene."]}]
-                description = self.model.chat(
-                    image=None,
-                    msgs=msgs,
-                    tokenizer=self.tokenizer
-                )
-                descriptions.append(description)
-            return descriptions
         except Exception as e:
-            return [f"Error processing video: {str(e)}"]

         self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
     @spaces.GPU()
+    def describe_image(
+        self,
+        image: str,
+        question: str = "Describe this image in detail.",
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 40,
+        max_new_tokens: int = 512,
+    ) -> str:
+        """
+        Generate description for a single image
+        Args:
+            image (str): Path to image file
+            question (str): Question to ask about the image
+            temperature (float): Sampling temperature
+            top_p (float): Nucleus sampling parameter
+            top_k (int): Top-k sampling parameter
+            max_new_tokens (int): Maximum new tokens to generate
+        Returns:
+            str: Generated description
+        """
         try:
+            if not image:
+                return "Please provide an image."
+            # Convert image to RGB
+            image = Image.open(image).convert('RGB')
+            # Prepare message
             msgs = [{'role': 'user', 'content': [image, question]}]
+            # Generate response
             response = self.model.chat(
                 image=None,
                 msgs=msgs,
+                tokenizer=self.tokenizer,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                max_new_tokens=max_new_tokens
             )
             return response
         except Exception as e:
             return f"Error analyzing image: {str(e)}"
     @spaces.GPU()
+    def describe_video(
+        self,
+        video_path: str,
+        frame_interval: int = 30,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 40,
+        max_new_tokens: int = 512,
+    ) -> str:
+        """
+        Generate description for video frames
+        Args:
+            video_path (str): Path to video file
+            frame_interval (int): Interval between frames to analyze
+            temperature (float): Sampling temperature
+            top_p (float): Nucleus sampling parameter
+            top_k (int): Top-k sampling parameter
+            max_new_tokens (int): Maximum new tokens to generate
+        Returns:
+            str: Generated description
+        """
         try:
+            # Load video and extract frames
             vr = VideoReader(video_path, ctx=cpu(0))
             total_frames = len(vr)
             frame_indices = list(range(0, total_frames, frame_interval))
             frames = vr.get_batch(frame_indices).asnumpy()
+            # Convert frames to PIL Images
+            frame_images = [Image.fromarray(frame) for frame in frames]
+            # Prepare messages for all frames
+            msgs = [
+                {'role': 'user', 'content': [frame, "Describe the main action in this scene."]}
+                for frame in frame_images
+            ]
+            # Generate response for all frames at once
+            response = self.model.chat(
+                image=None,
+                msgs=msgs,
+                tokenizer=self.tokenizer,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                max_new_tokens=max_new_tokens
+            )
+            return response
         except Exception as e:
+            return f"Error processing video: {str(e)}"