Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -9,118 +9,203 @@ import base64, os
 from huggingface_hub import snapshot_download
 import traceback
 import warnings
+import sys
 
 # Suppress specific warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", message=".*_supports_sdpa.*")
 
-# …
-…
+# CRITICAL: Fix Florence2 model before any imports
+def fix_florence2_import():
+    """Pre-patch the Florence2 model class before it's imported"""
+    import importlib.util
+    import types
+
+    # Create a custom import hook
+    class Florence2ImportHook:
+        def find_spec(self, fullname, path, target=None):
+            if "florence2" in fullname.lower() or "modeling_florence2" in fullname:
+                return importlib.util.spec_from_loader(fullname, Florence2Loader())
+            return None
+
+    class Florence2Loader:
+        def create_module(self, spec):
+            return None
+
+        def exec_module(self, module):
+            # Load the original module
+            import importlib.machinery
+            import importlib.util
+
+            # Find the actual florence2 module
+            for path in sys.path:
+                florence_path = os.path.join(path, "modeling_florence2.py")
+                if os.path.exists(florence_path):
+                    spec = importlib.util.spec_from_file_location("modeling_florence2", florence_path)
+                    if spec and spec.loader:
+                        spec.loader.exec_module(module)
+
+                    # Patch the module after loading
+                    if hasattr(module, 'Florence2ForConditionalGeneration'):
+                        original_init = module.Florence2ForConditionalGeneration.__init__
+
+                        def patched_init(self, config):
+                            # Add the missing attribute before calling super().__init__
+                            self._supports_sdpa = False
+                            original_init(self, config)
+
+                        module.Florence2ForConditionalGeneration.__init__ = patched_init
+                        module.Florence2ForConditionalGeneration._supports_sdpa = False
+                    break
+
+    # Install the import hook
+    hook = Florence2ImportHook()
+    sys.meta_path.insert(0, hook)
+
+# Apply the fix before any model imports
+try:
+    fix_florence2_import()
+except Exception as e:
+    print(f"Warning: Could not apply import hook: {e}")
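The hook relies on Python's standard `sys.meta_path` protocol: a finder returns a module spec whose loader executes the module and can mutate it before the first importer ever sees it. A minimal, self-contained sketch of the same mechanism (the module name `demo_module` and its attributes are made up for illustration):

```python
import importlib.abc
import importlib.util
import sys

class PatchingLoader(importlib.abc.Loader):
    def create_module(self, spec):
        return None  # fall back to the default module object

    def exec_module(self, module):
        # Populate the module, then "patch" it before anyone can import it
        module.answer = 41
        module.answer += 1

class PatchingFinder(importlib.abc.MetaPathFinder):
    def find_spec(self, fullname, path, target=None):
        if fullname == "demo_module":
            return importlib.util.spec_from_loader(fullname, PatchingLoader())
        return None  # defer to the normal import machinery for everything else

sys.meta_path.insert(0, PatchingFinder())

import demo_module
print(demo_module.answer)  # -> 42
```

Inserting at position 0 matters: meta-path finders are consulted in order, so the hook must win over the regular path-based finders.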
+
+# Alternative fix: Monkey-patch transformers before importing utils
+def monkey_patch_transformers():
+    """Monkey patch transformers to handle _supports_sdpa"""
+    try:
+        import transformers.modeling_utils as modeling_utils
+
+        original_check = modeling_utils.PreTrainedModel._check_and_adjust_attn_implementation
+
+        def patched_check(self, *args, **kwargs):
+            # Add the attribute if missing
+            if not hasattr(self, '_supports_sdpa'):
+                self._supports_sdpa = False
+            try:
+                return original_check(self, *args, **kwargs)
+            except AttributeError as e:
+                if '_supports_sdpa' in str(e):
+                    # Return a safe default
+                    return "eager"
+                raise
+
+        modeling_utils.PreTrainedModel._check_and_adjust_attn_implementation = patched_check
+
+        # Also patch the getter; delegate to the original lookup first so the
+        # override cannot recurse through hasattr()
+        original_getattr = modeling_utils.PreTrainedModel.__getattribute__
+
+        def patched_getattr(self, name):
+            try:
+                return original_getattr(self, name)
+            except AttributeError:
+                if name == '_supports_sdpa':
+                    # Safe default when the model class never defined it
+                    return False
+                raise
+
+        modeling_utils.PreTrainedModel.__getattribute__ = patched_getattr
+
+        print("Successfully patched transformers for Florence2 compatibility")
+
+    except Exception as e:
+        print(f"Warning: Could not patch transformers: {e}")
+
+# Apply the monkey patch
+monkey_patch_transformers()
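`monkey_patch_transformers()` above is the classic wrap-and-delegate pattern applied to `PreTrainedModel`: capture the original attribute, install a wrapper, and let the wrapper fall back to the original. The same pattern in isolation (the `Legacy` class and `speak` method are toys, not transformers API):

```python
class Legacy:
    def speak(self):
        return "hello"

# Capture the original before replacing it, so the wrapper can delegate
original_speak = Legacy.speak

def patched_speak(self):
    # Adjust behavior without touching the class's source
    return original_speak(self).upper()

Legacy.speak = patched_speak
print(Legacy().speak())  # -> "HELLO"
```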
+
+# Now import the utils after patching
+from util.utils import check_ocr_box, get_yolo_model, get_som_labeled_img
 
 # Download repository (if not already downloaded)
-repo_id = "microsoft/OmniParser-v2.0"
-local_dir = "weights"
+repo_id = "microsoft/OmniParser-v2.0"
+local_dir = "weights"
 
-# Check if weights already exist to avoid re-downloading
 if not os.path.exists(local_dir):
     snapshot_download(repo_id=repo_id, local_dir=local_dir)
     print(f"Repository downloaded to: {local_dir}")
 else:
     print(f"Weights already exist at: {local_dir}")
 
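The `os.path.exists` guard only skips the download when the directory is already present; `snapshot_download` can also restrict which files it fetches. A sketch of a narrower call, assuming only the detector and caption folders are needed (the `allow_patterns` globs are guesses based on the model paths used below, not confirmed repo layout):

```python
from huggingface_hub import snapshot_download

# Hypothetical selective download; patterns assume the weights live under
# icon_detect/ and icon_caption/.
snapshot_download(
    repo_id="microsoft/OmniParser-v2.0",
    local_dir="weights",
    allow_patterns=["icon_detect/*", "icon_caption/*"],
)
```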
-# …
-def patch_florence2_model():
-    """…
+# Custom function to load caption model with proper error handling
+def load_caption_model_safe(model_name="florence2", model_name_or_path="weights/icon_caption"):
+    """Safely load caption model with multiple fallback methods"""
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    try:
+        # Method 1: Try the original function with patching
+        from util.utils import get_caption_model_processor
+        return get_caption_model_processor(model_name, model_name_or_path)
+    except AttributeError as e:
+        if '_supports_sdpa' in str(e):
+            print("SDPA error detected, trying alternative loading method...")
+        else:
+            raise
+
+    # Method 2: Load directly with specific configuration
     try:
-        import …
-        from transformers import AutoModelForCausalLM
+        from transformers import AutoProcessor, AutoModelForCausalLM
 
-        …
+        print(f"Loading caption model from {model_name_or_path} with alternative method...")
+
+        # Load processor
+        processor = AutoProcessor.from_pretrained(
+            model_name_or_path,
+            trust_remote_code=True,
+            revision="main"
+        )
 
-        # …
-        …
+        # Try to load model with different configurations
+        configs_to_try = [
+            {"attn_implementation": "eager", "use_cache": False},
+            {"use_flash_attention_2": False, "use_cache": False},
+            {"torch_dtype": torch.float32},  # Try float32 instead of float16
+        ]
 
-        …
+        model = None
+        for config in configs_to_try:
+            try:
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_name_or_path,
+                    trust_remote_code=True,
+                    device_map="auto" if torch.cuda.is_available() else None,
+                    **config
+                )
 
-        …
+                # Ensure the attribute exists
+                if not hasattr(model, '_supports_sdpa'):
+                    model._supports_sdpa = False
+
+                print(f"Model loaded successfully with config: {config}")
+                break
+
+            except Exception as e:
+                print(f"Failed with config {config}: {e}")
+                continue
+
+        if model is None:
+            raise RuntimeError("Could not load model with any configuration")
 
-        …
+        # Move to device if needed
+        if device.type == 'cuda' and not next(model.parameters()).is_cuda:
+            model = model.to(device)
+
+        return {'model': model, 'processor': processor}
 
     except Exception as e:
-        print(f"…
-        …
-
-# Apply the patch before loading models
-patch_florence2_model()
+        print(f"Error in alternative loading: {e}")
+        raise
 
-# Load models
+# Load models
 try:
     print("Loading YOLO model...")
     yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
     print("YOLO model loaded successfully")
 
     print("Loading caption model...")
-    …
-    caption_model_processor = get_caption_model_processor(
-        model_name="florence2",
-        model_name_or_path="weights/icon_caption"
-    )
-    print("Florence2 caption model loaded successfully")
-except Exception as e:
-    print(f"Error loading Florence2, trying alternative approach: {e}")
-    # Alternative loading method
-    import sys
-    sys.path.insert(0, "weights/icon_caption")
-
-    from transformers import AutoProcessor, AutoModelForCausalLM
-
-    # Load with specific configurations to avoid SDPA issues
-    processor = AutoProcessor.from_pretrained(
-        "weights/icon_caption",
-        trust_remote_code=True,
-        revision="main"
-    )
-
-    model = AutoModelForCausalLM.from_pretrained(
-        "weights/icon_caption",
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        trust_remote_code=True,
-        revision="main",
-        attn_implementation="eager",  # Avoid SDPA issues
-        device_map="auto" if torch.cuda.is_available() else None
-    )
-
-    # Add missing attribute
-    if not hasattr(model, '_supports_sdpa'):
-        model._supports_sdpa = False
-
-    caption_model_processor = {'model': model, 'processor': processor}
-    print("Caption model loaded with alternative method")
-
+    caption_model_processor = load_caption_model_safe()
+    print("Caption model loaded successfully")
+
 except Exception as e:
     print(f"Critical error loading models: {e}")
     print(traceback.format_exc())
-    # Try to continue with a dummy model for testing
     caption_model_processor = None
-    raise
+    # Don't raise here, let the UI handle it
 
 # Markdown header text
 MARKDOWN = """
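`load_caption_model_safe` returns either whatever `get_caption_model_processor` produces or a plain `{'model', 'processor'}` dict, so a captioning call downstream presumably looks like standard Florence2 usage. A sketch under that assumption (the `<CAPTION>` task prompt and the generate arguments are typical Florence2 conventions, not taken from this repo):

```python
import torch
from PIL import Image

def caption_crop(bundle: dict, crop: Image.Image) -> str:
    """Caption one cropped UI element with the loaded model/processor pair."""
    model, processor = bundle['model'], bundle['processor']
    inputs = processor(text="<CAPTION>", images=crop, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=32,
        )
    return processor.batch_decode(ids, skip_special_tokens=True)[0]
```

If the model was loaded in float16, the pixel values may additionally need a matching dtype cast before `generate`.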
@@ -149,22 +234,6 @@ button:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(0,0,0,0.
 .gr-padded { padding: 16px; }
 """
 
-def safe_process_wrapper(*args, **kwargs):
-    """Wrapper to handle SDPA attribute errors"""
-    try:
-        return process(*args, **kwargs)
-    except AttributeError as e:
-        if '_supports_sdpa' in str(e):
-            # Try to fix the model on the fly
-            global caption_model_processor
-            if caption_model_processor and 'model' in caption_model_processor:
-                model = caption_model_processor['model']
-                if not hasattr(model, '_supports_sdpa'):
-                    model._supports_sdpa = False
-            return process(*args, **kwargs)
-        else:
-            raise
-
 @spaces.GPU
 @torch.inference_mode()
 def process(
@@ -182,7 +251,7 @@ def process(
 
     # Check if caption model is loaded
    if caption_model_processor is None:
-        return None, "⚠️ Caption model not loaded. Please …
+        return None, "⚠️ Caption model not loaded. There was an error during initialization. Please check the logs."
 
     try:
         # Log processing parameters
@@ -191,7 +260,7 @@ def process(
 
         # Calculate overlay ratio based on input image width
         image_width = image_input.size[0]
-        box_overlay_ratio = max(0.5, min(2.0, image_width / 3200))
+        box_overlay_ratio = max(0.5, min(2.0, image_width / 3200))
 
         draw_bbox_config = {
             'text_scale': 0.8 * box_overlay_ratio,
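The overlay ratio scales annotation size linearly with image width and clamps it to [0.5, 2.0], so small screenshots still get readable labels and very wide ones don't get oversized boxes:

```python
# Quick check of the clamp used above
for w in (800, 1920, 3200, 9999):
    print(w, max(0.5, min(2.0, w / 3200)))
# 800 -> 0.5, 1920 -> 0.6, 3200 -> 1.0, 9999 -> 2.0
```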
@@ -200,7 +269,7 @@ def process(
             'thickness': max(int(3 * box_overlay_ratio), 1),
         }
 
-        # Run OCR bounding box detection
+        # Run OCR bounding box detection
         try:
             ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
                 image_input,
@@ -230,9 +299,9 @@ def process(
             print(f"OCR error: {e}, continuing with empty OCR results")
             text, ocr_bbox = [], []
 
-        # Get labeled image and parsed content
+        # Get labeled image and parsed content
         try:
-            # …
+            # Ensure the model has the required attribute
             if isinstance(caption_model_processor, dict) and 'model' in caption_model_processor:
                 model = caption_model_processor['model']
                 if not hasattr(model, '_supports_sdpa'):
@@ -243,10 +312,10 @@ def process(
                 yolo_model,
                 BOX_TRESHOLD=box_threshold,
                 output_coord_in_ratio=True,
-                ocr_bbox=ocr_bbox if ocr_bbox else [],
+                ocr_bbox=ocr_bbox if ocr_bbox else [],
                 draw_bbox_config=draw_bbox_config,
                 caption_model_processor=caption_model_processor,
-                ocr_text=text if text else [],
+                ocr_text=text if text else [],
                 iou_threshold=iou_threshold,
                 imgsz=imgsz
             )
@@ -254,24 +323,9 @@ def process(
             if dino_labled_img is None:
                 raise ValueError("Failed to generate labeled image")
 
-        except AttributeError as e:
-            if '_supports_sdpa' in str(e):
-                print(f"SDPA attribute error, attempting to fix: {e}")
-                # Try to fix and retry
-                if isinstance(caption_model_processor, dict) and 'model' in caption_model_processor:
-                    caption_model_processor['model']._supports_sdpa = False
-                    # Retry the operation
-                    dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
-                        image_input, yolo_model, BOX_TRESHOLD=box_threshold,
-                        output_coord_in_ratio=True, ocr_bbox=ocr_bbox if ocr_bbox else [],
-                        draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor,
-                        ocr_text=text if text else [], iou_threshold=iou_threshold, imgsz=imgsz
-                    )
-                else:
-                    raise
         except Exception as e:
             print(f"Error in SOM processing: {e}")
-
+            print(traceback.format_exc())
             return image_input, f"⚠️ Error during element detection: {str(e)}"
 
         # Decode processed image from base64
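The decode step itself is hidden by the diff context; for a base64-encoded image string such as `dino_labled_img`, a typical implementation looks like this (a sketch, not the repo's exact code):

```python
import base64
import io
from PIL import Image

def decode_labeled_image(encoded: str) -> Image.Image:
    # Reverse of base64-encoding an image buffer
    return Image.open(io.BytesIO(base64.b64decode(encoded)))
```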
@@ -282,7 +336,7 @@ def process(
             print(f"Error decoding image: {e}")
             return image_input, f"⚠️ Error decoding processed image: {str(e)}"
 
-        # Format parsed content list
+        # Format parsed content list
         if parsed_content_list and len(parsed_content_list) > 0:
             parsed_text = "🎯 **Detected Elements:**\n\n"
             for i, v in enumerate(parsed_content_list):
@@ -300,10 +354,14 @@ def process(
         print(traceback.format_exc())
         return None, error_msg
 
-# Build Gradio UI
+# Build Gradio UI
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro") as demo:
     gr.Markdown(MARKDOWN)
 
+    # Check if models loaded successfully
+    if caption_model_processor is None:
+        gr.Markdown("### ⚠️ Warning: Caption model failed to load. Some features may not work.")
+
     with gr.Row():
         # Left sidebar: Upload and settings
         with gr.Column(scale=1):
@@ -323,7 +381,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro"
                 maximum=1.0,
                 step=0.01,
                 value=0.05,
-                info="Lower values detect more elements…
+                info="Lower values detect more elements"
             )
 
             iou_threshold_component = gr.Slider(
@@ -332,13 +390,13 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro"
                 maximum=1.0,
                 step=0.01,
                 value=0.1,
-                info="Controls overlap filtering…
+                info="Controls overlap filtering"
             )
 
             use_paddleocr_component = gr.Checkbox(
                 label='🔤 Use PaddleOCR',
                 value=True,
-                info="✓ PaddleOCR…
+                info="✓ PaddleOCR | ✗ EasyOCR"
             )
 
             imgsz_component = gr.Slider(
@@ -347,7 +405,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro"
                 maximum=1920,
                 step=32,
                 value=640,
-                info="Higher = better accuracy but slower…
+                info="Higher = better accuracy but slower"
             )
 
             submit_button_component = gr.Button(
@@ -356,13 +414,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro"
                 size='lg'
             )
 
-            # Add examples section
             gr.Markdown("### 💡 Quick Tips")
             gr.Markdown("""
-            - **…
-            - **…
-            - **…
-            - **Too many boxes…
+            - **Mobile apps:** Use default settings
+            - **Desktop apps:** Try image size 1280
+            - **Complex UIs:** Lower box threshold to 0.03
+            - **Too many boxes:** Increase IOU threshold
             """)
 
         # Right main area: Results tabs
@@ -380,13 +437,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro"
                         value="*Parsed elements will appear here after processing...*",
                         elem_classes=["parsed-text"]
                     )
-
-    # Add status indicator
-    status_text = gr.Markdown("", visible=True)
 
-    # Button click event
+    # Button click event
     submit_button_component.click(
-        fn=…
+        fn=process,
         inputs=[
             image_input_component,
             box_threshold_component,
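The context cuts the `.click(...)` wiring off mid-list. Presumably it continues with the remaining controls and the two result components; a hypothetical completion (the components past `box_threshold_component`, and especially the `outputs` names, are inferred, not shown in this diff):

```python
# Sketch only: output component names here are assumptions.
submit_button_component.click(
    fn=process,
    inputs=[
        image_input_component,
        box_threshold_component,
        iou_threshold_component,
        use_paddleocr_component,
        imgsz_component,
    ],
    outputs=[image_output_component, text_output_component],
    show_progress=True,
)
```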
@@ -398,13 +452,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="OmniParser V2 Pro"
         show_progress=True
     )
 
-# Launch with queue support
+# Launch with queue support
 if __name__ == "__main__":
     try:
-        # Set environment variables
+        # Set environment variables
         os.environ['TRANSFORMERS_OFFLINE'] = '0'
         os.environ['HF_HUB_OFFLINE'] = '0'
-        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # For better error messages
 
         demo.queue(max_size=10)
         demo.launch(
@@ -415,5 +468,4 @@ if __name__ == "__main__":
         )
     except Exception as e:
         print(f"Failed to launch app: {e}")
-        print(traceback.format_exc())
-        raise
+        print(traceback.format_exc())
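The `demo.launch(` arguments are likewise elided by the context. A typical Spaces-friendly configuration uses standard Gradio kwargs, though the exact values in this commit are not visible:

```python
# Hypothetical launch arguments; only the kwargs themselves are standard Gradio.
demo.queue(max_size=10)
demo.launch(
    server_name="0.0.0.0",  # listen on all interfaces inside the container
    server_port=7860,
    share=False,
)
```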