Spaces:

Shak33l-UiRev
/

Ui-Rev-Doc-Model

Running

App Files Community

Shak33l-UiRev committed on Nov 14, 2024

Commit

956f2af

verified ·

1 Parent(s): 932131a

donut str error & omniparser path error

Browse files

Files changed (1) hide show

app.py +65 -65

app.py CHANGED Viewed

@@ -45,17 +45,17 @@ def load_model(model_name):
         elif model_name == "OmniParser":
             # Load YOLO model for icon detection
-            yolo_model = YOLO("microsoft/OmniParser")
             # Load Florence-2 processor and model for captioning
             processor = AutoProcessor.from_pretrained(
-                "microsoft/Florence-2-base",
                 trust_remote_code=True
             )
             # Load the captioning model
             caption_model = AutoModelForCausalLM.from_pretrained(
-                "microsoft/OmniParser",
                 trust_remote_code=True
             )
@@ -75,16 +75,7 @@ def load_model(model_name):
 @spaces.GPU
 @torch.inference_mode()
 def analyze_document(image, model_name, models_dict):
-    """Analyze document using selected model
-    Args:
-        image (PIL.Image): Input image to analyze
-        model_name (str): Name of the model to use ("Donut", "LayoutLMv3", or "OmniParser")
-        models_dict (dict): Dictionary containing loaded model components
-    Returns:
-        dict: Analysis results including detected elements, text, and/or coordinates
-    """
     try:
         if models_dict is None:
             return {"error": "Model failed to load", "type": "model_error"}
@@ -98,77 +89,82 @@ def analyze_document(image, model_name, models_dict):
             temp_path = "temp_image.png"
             image.save(temp_path)
-            # Run YOLO detection
-            yolo_results = models_dict['yolo'](
-                temp_path,
-                conf=box_threshold,
-                iou=iou_threshold
-            )
-            # Process detections and generate captions
-            results = []
-            for det in yolo_results[0].boxes.data:
-                x1, y1, x2, y2, conf, cls = det
-                # Get region of interest
-                roi = image.crop((int(x1), int(y1), int(x2), int(y2)))
-                # Generate caption using the model
-                inputs = models_dict['processor'](
-                    images=roi,
-                    return_tensors="pt"
-                )
-                outputs = models_dict['model'].generate(
-                    **inputs,
-                    max_length=50,
-                    num_beams=4,
-                    temperature=0.7
                 )
-                caption = models_dict['processor'].decode(outputs[0], skip_special_tokens=True)
-                results.append({
-                    "bbox": [float(x) for x in [x1, y1, x2, y2]],
-                    "confidence": float(conf),
-                    "class": int(cls),
-                    "caption": caption
-                })
-            # Clean up temporary file
-            if os.path.exists(temp_path):
-                os.remove(temp_path)
-            return {
-                "detected_elements": len(results),
-                "elements": results
-            }
         elif model_name == "Donut":
             # Process image with Donut
-            pixel_values = models_dict['processor'](image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
-            decoder_input_ids = models_dict['processor'].tokenizer(
                 task_prompt,
                 add_special_tokens=False,
                 return_tensors="pt"
             ).input_ids
-            outputs = models_dict['model'].generate(
                 pixel_values,
                 decoder_input_ids=decoder_input_ids,
                 max_length=512,
                 early_stopping=True,
-                pad_token_id=models_dict['processor'].tokenizer.pad_token_id,
-                eos_token_id=models_dict['processor'].tokenizer.eos_token_id,
                 use_cache=True,
                 num_beams=4,
-                bad_words_ids=[[models_dict['processor'].tokenizer.unk_token_id]],
                 return_dict_in_generate=True
             )
-            sequence = models_dict['processor'].batch_decode(outputs.sequences)[0]
             sequence = sequence.replace(task_prompt, "").replace("</s_cord>", "").strip()
             try:
@@ -179,19 +175,22 @@ def analyze_document(image, model_name, models_dict):
             return result
         elif model_name == "LayoutLMv3":
             # Process image with LayoutLMv3
-            encoded_inputs = models_dict['processor'](
                 image,
                 return_tensors="pt",
                 add_special_tokens=True,
                 return_offsets_mapping=True
             )
-            outputs = models_dict['model'](**encoded_inputs)
             predictions = outputs.logits.argmax(-1).squeeze().tolist()
             # Convert predictions to labels
-            words = models_dict['processor'].tokenizer.convert_ids_to_tokens(
                 encoded_inputs.input_ids.squeeze().tolist()
             )
@@ -215,6 +214,7 @@ def analyze_document(image, model_name, models_dict):
     except Exception as e:
         import traceback
         error_details = traceback.format_exc()
         return {
             "error": str(e),
             "type": "processing_error",

         elif model_name == "OmniParser":
             # Load YOLO model for icon detection
+            yolo_model = YOLO("microsoft/OmniParser-icon-detection")
             # Load Florence-2 processor and model for captioning
             processor = AutoProcessor.from_pretrained(
+                "microsoft/OmniParser-caption",
                 trust_remote_code=True
             )
             # Load the captioning model
             caption_model = AutoModelForCausalLM.from_pretrained(
+                "microsoft/OmniParser-caption",
                 trust_remote_code=True
             )
 @spaces.GPU
 @torch.inference_mode()
 def analyze_document(image, model_name, models_dict):
+    """Analyze document using selected model"""
     try:
         if models_dict is None:
             return {"error": "Model failed to load", "type": "model_error"}
             temp_path = "temp_image.png"
             image.save(temp_path)
+            try:
+                # Run YOLO detection
+                yolo_results = models_dict['yolo'](
+                    temp_path,
+                    conf=box_threshold,
+                    iou=iou_threshold
                 )
+                # Process detections and generate captions
+                results = []
+                for det in yolo_results[0].boxes.data:
+                    x1, y1, x2, y2, conf, cls = det
+                    # Get region of interest
+                    roi = image.crop((int(x1), int(y1), int(x2), int(y2)))
+                    # Generate caption using the model
+                    inputs = models_dict['processor'](
+                        images=roi,
+                        return_tensors="pt"
+                    )
+                    outputs = models_dict['model'].generate(
+                        **inputs,
+                        max_length=50,
+                        num_beams=4,
+                        temperature=0.7
+                    )
+                    caption = models_dict['processor'].decode(outputs[0], skip_special_tokens=True)
+                    results.append({
+                        "bbox": [float(x) for x in [x1, y1, x2, y2]],
+                        "confidence": float(conf),
+                        "class": int(cls),
+                        "caption": caption
+                    })
+                return {
+                    "detected_elements": len(results),
+                    "elements": results
+                }
+            finally:
+                # Clean up temporary file
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
         elif model_name == "Donut":
+            model = models_dict['model']
+            processor = models_dict['processor']
             # Process image with Donut
+            pixel_values = processor(image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
+            decoder_input_ids = processor.tokenizer(
                 task_prompt,
                 add_special_tokens=False,
                 return_tensors="pt"
             ).input_ids
+            outputs = model.generate(
                 pixel_values,
                 decoder_input_ids=decoder_input_ids,
                 max_length=512,
                 early_stopping=True,
+                pad_token_id=processor.tokenizer.pad_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id,
                 use_cache=True,
                 num_beams=4,
+                bad_words_ids=[[processor.tokenizer.unk_token_id]],
                 return_dict_in_generate=True
             )
+            sequence = processor.batch_decode(outputs.sequences)[0]
             sequence = sequence.replace(task_prompt, "").replace("</s_cord>", "").strip()
             try:
             return result
         elif model_name == "LayoutLMv3":
+            model = models_dict['model']
+            processor = models_dict['processor']
             # Process image with LayoutLMv3
+            encoded_inputs = processor(
                 image,
                 return_tensors="pt",
                 add_special_tokens=True,
                 return_offsets_mapping=True
             )
+            outputs = model(**encoded_inputs)
             predictions = outputs.logits.argmax(-1).squeeze().tolist()
             # Convert predictions to labels
+            words = processor.tokenizer.convert_ids_to_tokens(
                 encoded_inputs.input_ids.squeeze().tolist()
             )
     except Exception as e:
         import traceback
         error_details = traceback.format_exc()
+        logger.error(f"Analysis error: {str(e)}\n{error_details}")
         return {
             "error": str(e),
             "type": "processing_error",