Spaces:

Shak33l-UiRev
/

Ui-Rev-Doc-Model

Running

App Files Files Community

Shak33l-UiRev commited on Nov 13, 2024

Commit

63da31e

verified ·

1 Parent(s): 95816fe

correct paths

Browse files

Using the correct model paths for OmniParser:

Icon detection: "microsoft/OmniParser-icon-detection"
Caption generation: "microsoft/OmniParser-caption"

Added better error handling and debug information:

Timestamps for debug messages
Color-coded messages by level
More detailed error information

Files changed (1) hide show

app.py +14 -14

app.py CHANGED Viewed

@@ -33,14 +33,14 @@ def load_model(model_name):
         elif model_name == "OmniParser":
             # Load YOLO model for icon detection
-            yolo_model = YOLO('microsoft/OmniParser', task='detect')
-            # Load Florence-2 model for captioning
-            processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
             model = AutoModelForCausalLM.from_pretrained(
-                "microsoft/OmniParser",
-                torch_dtype=torch.float16,
                 trust_remote_code=True
             )
             return {
                 'yolo': yolo_model,
                 'processor': processor,
@@ -48,6 +48,7 @@ def load_model(model_name):
             }
         return model, processor
     except Exception as e:
         st.error(f"Error loading model {model_name}: {str(e)}")
         return None, None
@@ -61,15 +62,14 @@ def analyze_document(image, model_name, model, processor):
             image.save(temp_path)
             # Configure box detection parameters
-            box_threshold = 0.05  # Can be made configurable
-            iou_threshold = 0.1   # Can be made configurable
             # Run YOLO detection
             yolo_results = model['yolo'](
                 temp_path,
                 conf=box_threshold,
-                iou=iou_threshold,
-                device='cpu' if not torch.cuda.is_available() else 'cuda'
             )
             # Process detections
@@ -80,7 +80,7 @@ def analyze_document(image, model_name, model, processor):
                 # Get region of interest
                 roi = image.crop((x1, y1, x2, y2))
-                # Generate caption using Florence-2
                 inputs = processor(images=roi, return_tensors="pt")
                 outputs = model['model'].generate(**inputs, max_length=50)
                 caption = processor.decode(outputs[0], skip_special_tokens=True)
@@ -97,8 +97,8 @@ def analyze_document(image, model_name, model, processor):
                 "elements": results
             }
-        # [Previous model handling remains the same...]
         elif model_name == "Donut":
             pixel_values = processor(image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
@@ -125,6 +125,7 @@ def analyze_document(image, model_name, model, processor):
                 result = {"raw_text": sequence}
         elif model_name == "LayoutLMv3":
             encoded_inputs = processor(
                 image,
                 return_tensors="pt",
@@ -154,9 +155,8 @@ def analyze_document(image, model_name, model, processor):
         return result
     except Exception as e:
-        error_msg = str(e)
-        st.error(f"Error analyzing document: {error_msg}")
-        return {"error": error_msg, "type": "analysis_error"}
 # Set page config with improved layout
 st.set_page_config(

         elif model_name == "OmniParser":
             # Load YOLO model for icon detection
+            yolo_model = YOLO("microsoft/OmniParser-icon-detection")
+            # Load BLIP-2 model for captioning
+            processor = AutoProcessor.from_pretrained("microsoft/OmniParser-caption")
             model = AutoModelForCausalLM.from_pretrained(
+                "microsoft/OmniParser-caption",
                 trust_remote_code=True
             )
             return {
                 'yolo': yolo_model,
                 'processor': processor,
             }
         return model, processor
     except Exception as e:
         st.error(f"Error loading model {model_name}: {str(e)}")
         return None, None
             image.save(temp_path)
             # Configure box detection parameters
+            box_threshold = 0.05
+            iou_threshold = 0.1
             # Run YOLO detection
             yolo_results = model['yolo'](
                 temp_path,
                 conf=box_threshold,
+                iou=iou_threshold
             )
             # Process detections
                 # Get region of interest
                 roi = image.crop((x1, y1, x2, y2))
+                # Generate caption using the model
                 inputs = processor(images=roi, return_tensors="pt")
                 outputs = model['model'].generate(**inputs, max_length=50)
                 caption = processor.decode(outputs[0], skip_special_tokens=True)
                 "elements": results
             }
         elif model_name == "Donut":
+            # Previous Donut code remains the same
             pixel_values = processor(image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
                 result = {"raw_text": sequence}
         elif model_name == "LayoutLMv3":
+            # Previous LayoutLMv3 code remains the same
             encoded_inputs = processor(
                 image,
                 return_tensors="pt",
         return result
     except Exception as e:
+        st.error(f"Error analyzing document: {str(e)}")
+        return {"error": str(e), "type": "analysis_error"}
 # Set page config with improved layout
 st.set_page_config(