Spaces:

AffordableAI
/

Mutimodal_Video_Chat_RAG

Sleeping

App Files Files Community

capradeepgujaran committed on Oct 22, 2024

Commit

9d6df4b

verified ·

1 Parent(s): 5a09cf2

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -13

app.py CHANGED Viewed

@@ -1,11 +1,91 @@
-import gradio as gr
-from video_rag_tool import VideoRAGTool
-import tempfile
-import os
-from PIL import Image
 import cv2
 import numpy as np
 import torch
 class VideoRAGApp:
     def __init__(self):
@@ -18,7 +98,6 @@ class VideoRAGApp:
         if video_file is None:
             return "Please upload a video first."
-        # Save uploaded video to temporary file
         temp_dir = tempfile.mkdtemp()
         temp_path = os.path.join(temp_dir, "uploaded_video.mp4")
@@ -37,12 +116,11 @@ class VideoRAGApp:
     def query_video(self, query_text):
         """Query the video and return relevant frames with descriptions"""
         if not self.processed:
-            return "Please process a video first."
         try:
             results = self.rag_tool.query_video(query_text, k=4)
-            # Extract frames for display
             frames = []
             captions = []
@@ -63,10 +141,10 @@ class VideoRAGApp:
             cap.release()
-            return frames, captions
         except Exception as e:
-            return f"Error querying video: {str(e)}"
     def create_interface(self):
         """Create and return Gradio interface"""
@@ -108,7 +186,6 @@ class VideoRAGApp:
                 interactive=False
             )
-            # Set up event handlers
             process_button.click(
                 fn=self.process_video,
                 inputs=[video_input],
@@ -123,10 +200,10 @@ class VideoRAGApp:
         return interface
-# For Hugging Face Spaces deployment
 app = VideoRAGApp()
 interface = app.create_interface()
-# Launch the app (for local testing)
 if __name__ == "__main__":
     interface.launch()

 import cv2
 import numpy as np
+from transformers import CLIPProcessor, CLIPModel
 import torch
+from PIL import Image
+import faiss
+import pickle
+from typing import List, Dict, Tuple
+import logging
+import gradio as gr
+import tempfile
+import os
class VideoRAGTool:
    """Index sampled video frames with CLIP and retrieve them by text query.

    Frames are sampled every ``frame_interval`` frames, embedded with the CLIP
    image encoder, and stored in a flat FAISS L2 index. Text queries are
    embedded with the CLIP text encoder and searched against that index.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """
        Initialize the Video RAG Tool with CLIP model for frame analysis.

        Args:
            model_name: Model identifier handed to ``CLIPModel.from_pretrained``
                and ``CLIPProcessor.from_pretrained``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = CLIPModel.from_pretrained(model_name).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        # Both stay empty until process_video() succeeds.
        self.frame_index = None
        self.frame_data = []
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Return the shared 'VideoRAGTool' logger, configured exactly once."""
        logger = logging.getLogger('VideoRAGTool')
        logger.setLevel(logging.INFO)
        # logging.getLogger returns the same object for the same name, so only
        # attach a handler the first time; otherwise every new instance would
        # add another handler and each message would be printed repeatedly.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def process_video(self, video_path: str, frame_interval: int = 30) -> None:
        """Process video file and extract features from frames.

        Replaces any previously indexed video: ``self.frame_index`` and
        ``self.frame_data`` are only swapped in once the new video has been
        processed successfully, so they always describe the same frames.

        Args:
            video_path: Path of the video file to open with OpenCV.
            frame_interval: Sample one frame every ``frame_interval`` frames.

        Raises:
            ValueError: If the video cannot be opened or yields no frames.
        """
        self.logger.info(f"Processing video: {video_path}")
        cap = cv2.VideoCapture(video_path)
        frame_data = []
        features_list = []
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video: {video_path}")

            # Read FPS once; OpenCV reports 0 when the container does not
            # expose a frame rate, which would otherwise divide by zero.
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps or fps <= 0:
                self.logger.warning("Video reports no FPS; timestamps will be 0.0")
                fps = 0.0

            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count % frame_interval == 0:
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    image = Image.fromarray(frame_rgb)
                    inputs = self.processor(images=image, return_tensors="pt").to(self.device)
                    # Inference only: skip autograd bookkeeping.
                    with torch.no_grad():
                        image_features = self.model.get_image_features(**inputs)
                    frame_data.append({
                        'frame_number': frame_count,
                        'timestamp': frame_count / fps if fps else 0.0,
                    })
                    features_list.append(image_features.cpu().detach().numpy())
                frame_count += 1
        finally:
            # Release the capture handle even if decoding or the model fails.
            cap.release()

        if not features_list:
            raise ValueError(f"No frames could be read from video: {video_path}")

        # FAISS expects a contiguous float32 matrix.
        features_array = np.ascontiguousarray(np.vstack(features_list), dtype=np.float32)
        frame_index = faiss.IndexFlatL2(features_array.shape[1])
        frame_index.add(features_array)

        # Publish index and metadata together so row i of the index always
        # corresponds to frame_data[i], including after re-processing.
        self.frame_index = frame_index
        self.frame_data = frame_data
        self.logger.info(f"Processed {len(self.frame_data)} frames from video")

    def query_video(self, query_text: str, k: int = 5) -> List[Dict]:
        """Query the video using natural language and return relevant frames.

        Args:
            query_text: Free-text description of what to look for.
            k: Maximum number of frames to return; capped at the number of
                indexed frames.

        Returns:
            Up to ``k`` dicts, in the order returned by the index search, each
            a copy of the stored frame metadata (``frame_number``,
            ``timestamp``) plus ``relevance_score`` = ``1 / (1 + distance)``.

        Raises:
            RuntimeError: If no video has been processed yet.
        """
        self.logger.info(f"Processing query: {query_text}")
        if self.frame_index is None or not self.frame_data:
            raise RuntimeError("No video has been processed; call process_video() first.")

        # Asking the index for more neighbours than it holds yields -1
        # placeholders, which would silently alias the last frame via
        # negative list indexing. Cap k instead.
        k = min(k, len(self.frame_data))
        if k <= 0:
            return []

        # truncation=True keeps over-long queries within the tokenizer's limit.
        inputs = self.processor(
            text=[query_text], return_tensors="pt", truncation=True
        ).to(self.device)
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)

        distances, indices = self.frame_index.search(
            text_features.cpu().detach().numpy(),
            k
        )

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            if idx < 0:
                # Defensive: skip any "no result" placeholder from the index.
                continue
            frame_info = self.frame_data[int(idx)].copy()
            frame_info['relevance_score'] = float(1 / (1 + distance))
            results.append(frame_info)
        return results
 class VideoRAGApp:
     def __init__(self):
         if video_file is None:
             return "Please upload a video first."
         temp_dir = tempfile.mkdtemp()
         temp_path = os.path.join(temp_dir, "uploaded_video.mp4")
     def query_video(self, query_text):
         """Query the video and return relevant frames with descriptions"""
         if not self.processed:
+            return None, "Please process a video first."
         try:
             results = self.rag_tool.query_video(query_text, k=4)
             frames = []
             captions = []
             cap.release()
+            return frames, "\n\n".join(captions)
         except Exception as e:
+            return None, f"Error querying video: {str(e)}"
     def create_interface(self):
         """Create and return Gradio interface"""
                 interactive=False
             )
             process_button.click(
                 fn=self.process_video,
                 inputs=[video_input],
         return interface
+# Initialize and create the interface
 app = VideoRAGApp()
 interface = app.create_interface()
+# Launch the app
 if __name__ == "__main__":
     interface.launch()