Sa2VA-simple-demo

Runtime error

App Files Files Community

aiqcamp committed on Jan 11

Commit

e4f7d0b

verified ·

1 Parent(s): 6feb9f3

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -39

app.py CHANGED Viewed

@@ -6,14 +6,18 @@ import os
 import tempfile
 import spaces
 import gradio as gr
 import subprocess
 import sys
 def install_flash_attn_wheel():
     flash_attn_wheel_url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
     try:
-        # Call pip to install the wheel file
         subprocess.check_call([sys.executable, "-m", "pip", "install", flash_attn_wheel_url])
         print("Wheel installed successfully!")
     except subprocess.CalledProcessError as e:
@@ -21,7 +25,6 @@ def install_flash_attn_wheel():
 install_flash_attn_wheel()
-import cv2
 try:
     from mmengine.visualization import Visualizer
 except ImportError:
@@ -43,25 +46,75 @@ tokenizer = AutoTokenizer.from_pretrained(
     trust_remote_code = True,
 )
 from third_parts import VideoReader
 def read_video(video_path, video_interval):
     vid_frames = VideoReader(video_path)[::video_interval]
     temp_dir = tempfile.mkdtemp()
     os.makedirs(temp_dir, exist_ok=True)
-    image_paths = []  # List to store paths of saved images
     for frame_idx in range(len(vid_frames)):
         frame_image = vid_frames[frame_idx]
-        frame_image = frame_image[..., ::-1]  # BGR (opencv system) to RGB (numpy system)
         frame_image = Image.fromarray(frame_image)
         vid_frames[frame_idx] = frame_image
-        # Save the frame as a .jpg file in the temporary folder
         image_path = os.path.join(temp_dir, f"frame_{frame_idx:04d}.jpg")
         frame_image.save(image_path, format="JPEG")
-        # Append the image path to the list
         image_paths.append(image_path)
     return vid_frames, image_paths
@@ -71,17 +124,10 @@ def visualize(pred_mask, image_path, work_dir):
     visualizer.set_image(img)
     visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
     visual_result = visualizer.get_image()
     output_path = os.path.join(work_dir, os.path.basename(image_path))
     cv2.imwrite(output_path, visual_result)
     return output_path
-# 코드 상단에 import 추가
-from deep_translator import GoogleTranslator
-# 번역 함수 수정
 def translate_to_korean(text):
     try:
         translator = GoogleTranslator(source='en', target='ko')
@@ -92,7 +138,6 @@ def translate_to_korean(text):
 @spaces.GPU
 def image_vision(image_input_path, prompt):
-    # 한글 입력 확인
     is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
     image_path = image_input_path
@@ -109,9 +154,7 @@ def image_vision(image_input_path, prompt):
     print(return_dict)
     answer = return_dict["prediction"]
-    # 한글 프롬프트인 경우 응답을 한글로 번역
     if is_korean:
-        # [SEG]는 보존하면서 나머지 텍스트만 번역
         if '[SEG]' in answer:
             parts = answer.split('[SEG]')
             translated_parts = [translate_to_korean(part.strip()) for part in parts]
@@ -133,7 +176,6 @@ def image_vision(image_input_path, prompt):
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
-    # 한글 입력 확인
     is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
     cap = cv2.VideoCapture(video_input_path)
@@ -151,7 +193,6 @@ def video_vision(video_input_path, prompt, video_interval):
     prediction = result['prediction']
     print(prediction)
-    # 한글 프롬프트인 경우 응답을 한글로 번역
     if is_korean:
         if '[SEG]' in prediction:
             parts = prediction.split('[SEG]')
@@ -185,31 +226,38 @@ def video_vision(video_input_path, prompt, video_interval):
         print(f"Video created successfully at {output_video}")
         return prediction, output_video
     else:
         return prediction, None
-# Gradio UI
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Column():
         gr.Markdown("# Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos")
-        gr.HTML("""
-        <div style="display:flex;column-gap:4px;">
-            <a href="https://github.com/magic-research/Sa2VA">
-                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-            </a>
-            <a href="https://arxiv.org/abs/2501.04001">
-                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
-            </a>
-            <a href="https://huggingface.co/spaces/fffiloni/Sa2VA-simple-demo?duplicate=true">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
-            </a>
-            <a href="https://huggingface.co/fffiloni">
-                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
-            </a>
-        </div>
-        """)
         with gr.Tab("Single Image"):
             with gr.Row():
                 with gr.Column():
@@ -226,6 +274,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 inputs = [image_input, instruction],
                 outputs = [output_res, output_image]
             )
         with gr.Tab("Video"):
             with gr.Row():
                 with gr.Column():
@@ -243,5 +292,34 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 inputs = [video_input, vid_instruction, frame_interval],
                 outputs = [vid_output_res, output_video]
             )
 demo.queue().launch(show_api=False, show_error=True)

 import tempfile
 import spaces
 import gradio as gr
 import subprocess
 import sys
+import cv2
+import threading
+import queue
+import time
+from collections import deque
+from deep_translator import GoogleTranslator
 def install_flash_attn_wheel():
     flash_attn_wheel_url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"
     try:
         subprocess.check_call([sys.executable, "-m", "pip", "install", flash_attn_wheel_url])
         print("Wheel installed successfully!")
     except subprocess.CalledProcessError as e:
 install_flash_attn_wheel()
 try:
     from mmengine.visualization import Visualizer
 except ImportError:
     trust_remote_code = True,
 )
class WebcamProcessor:
    """Grab webcam frames on one thread and run the model on batches of them on another.

    The capture thread reads frames from a ``cv2.VideoCapture``, converts them
    from BGR to RGB, resizes them to 640x480 and appends at most ``fps_target``
    frames per second to ``frame_buffer``. The processing thread waits until
    the buffer holds ``buffer_size`` frames, passes them to
    ``model.predict_forward`` and puts whatever it returns on ``result_queue``.

    Args:
        model: Object exposing ``predict_forward(video=..., text=..., tokenizer=...)``.
        tokenizer: Forwarded unchanged to ``predict_forward``.
        fps_target: Maximum number of frames buffered per second.
        buffer_size: Number of frames handed to the model per call.
        prompt: Text sent to the model with every batch.
        camera_index: Device index given to ``cv2.VideoCapture``.

    NOTE(review): frames are passed to the model exactly as produced by
    ``cv2.resize``, whereas ``read_video`` in this file wraps its frames with
    ``Image.fromarray`` -- confirm ``predict_forward`` accepts both.
    """

    def __init__(self, model, tokenizer, fps_target=15, buffer_size=5,
                 prompt="<image>Describe what you see", camera_index=0):
        self.model = model
        self.tokenizer = tokenizer
        self.fps_target = fps_target
        self.frame_interval = 1.0 / fps_target
        self.buffer_size = buffer_size
        self.prompt = prompt
        self.camera_index = camera_index
        self.frame_buffer = deque(maxlen=buffer_size)
        self.result_queue = queue.Queue()
        self.is_running = False
        # Timestamp of the last frame that was buffered (name kept for
        # compatibility with existing readers of this attribute).
        self.last_process_time = 0
        self.capture = None
        self.capture_thread = None
        self.process_thread = None
        # frame_buffer is appended to by the capture thread and copied/cleared
        # by the processing thread; copying a deque while it is being mutated
        # raises RuntimeError, so every access goes through this lock.
        self._buffer_lock = threading.Lock()

    def start(self):
        """Open the camera and launch both worker threads.

        Returns:
            True if the processor is running after the call, False if the
            camera could not be opened (in which case nothing is started).
        """
        if self.is_running:
            # Already running: do not spawn a second pair of threads.
            return True
        capture = cv2.VideoCapture(self.camera_index)
        if not capture.isOpened():
            capture.release()
            print(f"Could not open camera {self.camera_index}")
            return False
        self.capture = capture
        self.is_running = True
        # Daemon threads so a forgotten stop() cannot keep the interpreter alive.
        self.capture_thread = threading.Thread(target=self._capture_loop, daemon=True)
        self.process_thread = threading.Thread(target=self._process_loop, daemon=True)
        self.capture_thread.start()
        self.process_thread.start()
        return True

    def stop(self):
        """Stop both worker threads and release the camera.

        Safe to call when the processor was never started or is already
        stopped.

        Returns:
            True if the processor was running when called, False otherwise.
        """
        was_running = self.is_running
        self.is_running = False
        for worker in (self.capture_thread, self.process_thread):
            if worker is not None:
                worker.join()
        self.capture_thread = None
        self.process_thread = None
        if self.capture is not None:
            self.capture.release()
            self.capture = None
        return was_running

    def _capture_loop(self):
        """Read frames until stopped, buffering at most one per ``frame_interval``."""
        while self.is_running:
            ret, frame = self.capture.read()
            if not ret:
                # Back off instead of spinning at full speed on a dead camera.
                time.sleep(self.frame_interval)
                continue
            current_time = time.time()
            if current_time - self.last_process_time < self.frame_interval:
                # Too soon after the previous buffered frame: drop this one
                # before paying for the colour conversion and resize.
                continue
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (640, 480))
            with self._buffer_lock:
                self.frame_buffer.append(frame)
            self.last_process_time = current_time

    def _process_loop(self):
        """Run the model on each full buffer and queue its result until stopped."""
        while self.is_running:
            frames = None
            with self._buffer_lock:
                if len(self.frame_buffer) >= self.buffer_size:
                    frames = list(self.frame_buffer)
            if frames is not None:
                try:
                    result = self.model.predict_forward(
                        video=frames,
                        text=self.prompt,
                        tokenizer=self.tokenizer,
                    )
                    self.result_queue.put(result)
                except Exception as e:
                    # Best effort: a failed batch is reported and skipped so
                    # the worker thread keeps running.
                    print(f"Processing error: {e}")
                # Discard the processed frames and anything captured while
                # the model was busy, so the next batch is collected afresh.
                with self._buffer_lock:
                    self.frame_buffer.clear()
            time.sleep(0.1)
 from third_parts import VideoReader
 def read_video(video_path, video_interval):
     vid_frames = VideoReader(video_path)[::video_interval]
     temp_dir = tempfile.mkdtemp()
     os.makedirs(temp_dir, exist_ok=True)
+    image_paths = []
     for frame_idx in range(len(vid_frames)):
         frame_image = vid_frames[frame_idx]
+        frame_image = frame_image[..., ::-1]
         frame_image = Image.fromarray(frame_image)
         vid_frames[frame_idx] = frame_image
         image_path = os.path.join(temp_dir, f"frame_{frame_idx:04d}.jpg")
         frame_image.save(image_path, format="JPEG")
         image_paths.append(image_path)
     return vid_frames, image_paths
     visualizer.set_image(img)
     visualizer.draw_binary_masks(pred_mask, colors='g', alphas=0.4)
     visual_result = visualizer.get_image()
     output_path = os.path.join(work_dir, os.path.basename(image_path))
     cv2.imwrite(output_path, visual_result)
     return output_path
 def translate_to_korean(text):
     try:
         translator = GoogleTranslator(source='en', target='ko')
 @spaces.GPU
 def image_vision(image_input_path, prompt):
     is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
     image_path = image_input_path
     print(return_dict)
     answer = return_dict["prediction"]
     if is_korean:
         if '[SEG]' in answer:
             parts = answer.split('[SEG]')
             translated_parts = [translate_to_korean(part.strip()) for part in parts]
 @spaces.GPU(duration=80)
 def video_vision(video_input_path, prompt, video_interval):
     is_korean = any(ord('가') <= ord(char) <= ord('힣') for char in prompt)
     cap = cv2.VideoCapture(video_input_path)
     prediction = result['prediction']
     print(prediction)
     if is_korean:
         if '[SEG]' in prediction:
             parts = prediction.split('[SEG]')
         print(f"Video created successfully at {output_video}")
         return prediction, output_video
     else:
         return prediction, None
@spaces.GPU
def webcam_vision(prompt):
    """Return the next webcam prediction, translated to Korean for Korean prompts.

    A single ``WebcamProcessor`` is created on first use and cached as the
    ``processor`` attribute of this function; it is started whenever it is not
    running. The call then waits up to five seconds for a result.

    Args:
        prompt: Instruction text. In this function it is only inspected for
            Hangul syllables to decide whether to translate the answer; it is
            not forwarded to the processor.

    Returns:
        The prediction string, ``"No results available yet"`` when nothing
        arrives in time, or ``"Error: ..."`` for any other failure.
    """
    hangul_first, hangul_last = ord('가'), ord('힣')
    is_korean = any(hangul_first <= ord(ch) <= hangul_last for ch in prompt)

    try:
        processor = webcam_vision.processor
    except AttributeError:
        processor = WebcamProcessor(model, tokenizer)
        webcam_vision.processor = processor

    if not processor.is_running:
        processor.start()

    try:
        prediction = processor.result_queue.get(timeout=5)['prediction']
        return translate_to_korean(prediction) if is_korean else prediction
    except queue.Empty:
        return "No results available yet"
    except Exception as e:
        return f"Error: {e}"
+# Gradio UI
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Column():
         gr.Markdown("# Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos")
         with gr.Tab("Single Image"):
             with gr.Row():
                 with gr.Column():
                 inputs = [image_input, instruction],
                 outputs = [output_res, output_image]
             )
         with gr.Tab("Video"):
             with gr.Row():
                 with gr.Column():
                 inputs = [video_input, vid_instruction, frame_interval],
                 outputs = [vid_output_res, output_video]
             )
+        with gr.Tab("Webcam"):
+            with gr.Row():
+                with gr.Column():
+                    webcam_input = gr.Image(source="webcam", streaming=True)
+                    with gr.Row():
+                        webcam_instruction = gr.Textbox(
+                            label="Instruction",
+                            placeholder="Enter instruction here...",
+                            scale=4
+                        )
+                        start_button = gr.Button("Start", scale=1)
+                        stop_button = gr.Button("Stop", scale=1)
+                with gr.Column():
+                    webcam_output = gr.Textbox(label="Response")
+                    processed_view = gr.Image(label="Processed View")
+            status_text = gr.Textbox(label="Status", value="Ready")
+            start_button.click(
+                fn=lambda x: webcam_vision(x),
+                inputs=[webcam_instruction],
+                outputs=[webcam_output]
+            )
+            stop_button.click(
+                fn=lambda: "Stopped" if hasattr(webcam_vision, 'processor') and webcam_vision.processor.stop() else "Not running",
+                outputs=[status_text]
+            )
 demo.queue().launch(show_api=False, show_error=True)