Spaces:

Shak33l-UiRev
/

Ui-Rev-Doc-Model

Running

App Files Community

Shak33l-UiRev committed on Nov 14, 2024

Commit

dea33ff

verified ·

1 Parent(s): 63da31e

updated device management

Browse files

Files changed (1) hide show

app.py +104 -43

app.py CHANGED Viewed

@@ -17,7 +17,14 @@ from datetime import datetime
 @st.cache_resource
 def load_model(model_name):
-    """Load the selected model and processor"""
     try:
         if model_name == "Donut":
             processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
@@ -27,63 +34,98 @@ def load_model(model_name):
             model.config.pad_token_id = processor.tokenizer.pad_token_id
             model.config.vocab_size = len(processor.tokenizer)
         elif model_name == "LayoutLMv3":
             processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
             model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
         elif model_name == "OmniParser":
             # Load YOLO model for icon detection
-            yolo_model = YOLO("microsoft/OmniParser-icon-detection")
-            # Load BLIP-2 model for captioning
-            processor = AutoProcessor.from_pretrained("microsoft/OmniParser-caption")
-            model = AutoModelForCausalLM.from_pretrained(
-                "microsoft/OmniParser-caption",
                 trust_remote_code=True
             )
             return {
                 'yolo': yolo_model,
                 'processor': processor,
-                'model': model
             }
-        return model, processor
     except Exception as e:
         st.error(f"Error loading model {model_name}: {str(e)}")
-        return None, None
-def analyze_document(image, model_name, model, processor):
-    """Analyze document using selected model"""
     try:
         if model_name == "OmniParser":
-            # Save image temporarily
             temp_path = "temp_image.png"
             image.save(temp_path)
-            # Configure box detection parameters
-            box_threshold = 0.05
-            iou_threshold = 0.1
             # Run YOLO detection
-            yolo_results = model['yolo'](
                 temp_path,
                 conf=box_threshold,
                 iou=iou_threshold
             )
-            # Process detections
             results = []
             for det in yolo_results[0].boxes.data:
                 x1, y1, x2, y2, conf, cls = det
                 # Get region of interest
-                roi = image.crop((x1, y1, x2, y2))
                 # Generate caption using the model
-                inputs = processor(images=roi, return_tensors="pt")
-                outputs = model['model'].generate(**inputs, max_length=50)
-                caption = processor.decode(outputs[0], skip_special_tokens=True)
                 results.append({
                     "bbox": [float(x) for x in [x1, y1, x2, y2]],
@@ -92,31 +134,40 @@ def analyze_document(image, model_name, model, processor):
                     "caption": caption
                 })
             return {
                 "detected_elements": len(results),
                 "elements": results
             }
         elif model_name == "Donut":
-            # Previous Donut code remains the same
-            pixel_values = processor(image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
-            decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
-            outputs = model.generate(
                 pixel_values,
                 decoder_input_ids=decoder_input_ids,
                 max_length=512,
                 early_stopping=True,
-                pad_token_id=processor.tokenizer.pad_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id,
                 use_cache=True,
                 num_beams=4,
-                bad_words_ids=[[processor.tokenizer.unk_token_id]],
                 return_dict_in_generate=True
             )
-            sequence = processor.batch_decode(outputs.sequences)[0]
             sequence = sequence.replace(task_prompt, "").replace("</s_cord>", "").strip()
             try:
@@ -124,19 +175,22 @@ def analyze_document(image, model_name, model, processor):
             except json.JSONDecodeError:
                 result = {"raw_text": sequence}
         elif model_name == "LayoutLMv3":
-            # Previous LayoutLMv3 code remains the same
-            encoded_inputs = processor(
                 image,
                 return_tensors="pt",
                 add_special_tokens=True,
                 return_offsets_mapping=True
             )
-            outputs = model(**encoded_inputs)
             predictions = outputs.logits.argmax(-1).squeeze().tolist()
-            words = processor.tokenizer.convert_ids_to_tokens(
                 encoded_inputs.input_ids.squeeze().tolist()
             )
@@ -152,11 +206,19 @@ def analyze_document(image, model_name, model, processor):
                 "confidence_scores": outputs.logits.softmax(-1).max(-1).values.squeeze().tolist()
             }
-        return result
     except Exception as e:
-        st.error(f"Error analyzing document: {str(e)}")
-        return {"error": str(e), "type": "analysis_error"}
 # Set page config with improved layout
 st.set_page_config(
@@ -372,6 +434,7 @@ st.markdown("""
 """)
 # Add performance metrics section
 if st.checkbox("Show Performance Metrics"):
     st.markdown("""
     ### Model Performance Metrics
@@ -379,8 +442,7 @@ if st.checkbox("Show Performance Metrics"):
     |-------|---------------------|--------------|-----------|
     | Donut | 2-3 seconds | 6-8GB | 85-90% |
     | LayoutLMv3 | 3-4 seconds | 12-15GB | 88-93% |
-    | BROS | 1-2 seconds | 4-6GB | 82-87% |
-    | LLaVA-1.5 | 4-5 seconds | 25-40GB | 90-95% |
     *Accuracy varies based on document type and quality
     """)
@@ -389,7 +451,7 @@ if st.checkbox("Show Performance Metrics"):
 st.markdown("---")
 st.markdown("""
 v1.1 - Created with Streamlit
-\nFor issues or feedback, please visit our [GitHub repository](https://github.com/yourusername/doc-analysis)
 """)
 # Add model selection guidance
@@ -398,6 +460,5 @@ if st.checkbox("Show Model Selection Guide"):
     ### How to Choose the Right Model
     1. **Donut**: Choose for structured documents with clear layouts
     2. **LayoutLMv3**: Best for documents with complex layouts and relationships
-    3. **BROS**: Ideal for quick analysis and simple documents
-    4. **LLaVA-1.5**: Perfect for complex documents requiring deep understanding
     """)

 @st.cache_resource
 def load_model(model_name):
+    """Load the selected model and processor
+    Args:
+        model_name (str): Name of the model to load ("Donut", "LayoutLMv3", or "OmniParser")
+    Returns:
+        dict: Dictionary containing model components
+    """
     try:
         if model_name == "Donut":
             processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
             model.config.pad_token_id = processor.tokenizer.pad_token_id
             model.config.vocab_size = len(processor.tokenizer)
+            return {'model': model, 'processor': processor}
         elif model_name == "LayoutLMv3":
             processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
             model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
+            return {'model': model, 'processor': processor}
         elif model_name == "OmniParser":
             # Load YOLO model for icon detection
+            yolo_model = YOLO("microsoft/OmniParser")
+            # Load Florence-2 processor and model for captioning
+            processor = AutoProcessor.from_pretrained(
+                "microsoft/Florence-2-base",
+                trust_remote_code=True
+            )
+            # Load the captioning model
+            caption_model = AutoModelForCausalLM.from_pretrained(
+                "microsoft/OmniParser",
                 trust_remote_code=True
             )
             return {
                 'yolo': yolo_model,
                 'processor': processor,
+                'model': caption_model
             }
+        else:
+            raise ValueError(f"Unknown model name: {model_name}")
     except Exception as e:
         st.error(f"Error loading model {model_name}: {str(e)}")
+        return None
+@spaces.GPU
+@torch.inference_mode()
+def analyze_document(image, model_name, models_dict):
+    """Analyze document using selected model
+    Args:
+        image (PIL.Image): Input image to analyze
+        model_name (str): Name of the model to use ("Donut", "LayoutLMv3", or "OmniParser")
+        models_dict (dict): Dictionary containing loaded model components
+    Returns:
+        dict: Analysis results including detected elements, text, and/or coordinates
+    """
     try:
+        if models_dict is None:
+            return {"error": "Model failed to load", "type": "model_error"}
         if model_name == "OmniParser":
+            # Configure detection parameters
+            box_threshold = 0.05  # Confidence threshold for detection
+            iou_threshold = 0.1   # IoU threshold for NMS
+            # Save image temporarily for YOLO processing
             temp_path = "temp_image.png"
             image.save(temp_path)
             # Run YOLO detection
+            yolo_results = models_dict['yolo'](
                 temp_path,
                 conf=box_threshold,
                 iou=iou_threshold
             )
+            # Process detections and generate captions
             results = []
             for det in yolo_results[0].boxes.data:
                 x1, y1, x2, y2, conf, cls = det
                 # Get region of interest
+                roi = image.crop((int(x1), int(y1), int(x2), int(y2)))
                 # Generate caption using the model
+                inputs = models_dict['processor'](
+                    images=roi,
+                    return_tensors="pt"
+                )
+                outputs = models_dict['model'].generate(
+                    **inputs,
+                    max_length=50,
+                    num_beams=4,
+                    temperature=0.7
+                )
+                caption = models_dict['processor'].decode(outputs[0], skip_special_tokens=True)
                 results.append({
                     "bbox": [float(x) for x in [x1, y1, x2, y2]],
                     "caption": caption
                 })
+            # Clean up temporary file
+            if os.path.exists(temp_path):
+                os.remove(temp_path)
             return {
                 "detected_elements": len(results),
                 "elements": results
             }
         elif model_name == "Donut":
+            # Process image with Donut
+            pixel_values = models_dict['processor'](image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
+            decoder_input_ids = models_dict['processor'].tokenizer(
+                task_prompt,
+                add_special_tokens=False,
+                return_tensors="pt"
+            ).input_ids
+            outputs = models_dict['model'].generate(
                 pixel_values,
                 decoder_input_ids=decoder_input_ids,
                 max_length=512,
                 early_stopping=True,
+                pad_token_id=models_dict['processor'].tokenizer.pad_token_id,
+                eos_token_id=models_dict['processor'].tokenizer.eos_token_id,
                 use_cache=True,
                 num_beams=4,
+                bad_words_ids=[[models_dict['processor'].tokenizer.unk_token_id]],
                 return_dict_in_generate=True
             )
+            sequence = models_dict['processor'].batch_decode(outputs.sequences)[0]
             sequence = sequence.replace(task_prompt, "").replace("</s_cord>", "").strip()
             try:
             except json.JSONDecodeError:
                 result = {"raw_text": sequence}
+            return result
         elif model_name == "LayoutLMv3":
+            # Process image with LayoutLMv3
+            encoded_inputs = models_dict['processor'](
                 image,
                 return_tensors="pt",
                 add_special_tokens=True,
                 return_offsets_mapping=True
             )
+            outputs = models_dict['model'](**encoded_inputs)
             predictions = outputs.logits.argmax(-1).squeeze().tolist()
+            # Convert predictions to labels
+            words = models_dict['processor'].tokenizer.convert_ids_to_tokens(
                 encoded_inputs.input_ids.squeeze().tolist()
             )
                 "confidence_scores": outputs.logits.softmax(-1).max(-1).values.squeeze().tolist()
             }
+            return result
+        else:
+            return {"error": f"Unknown model: {model_name}", "type": "model_error"}
     except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        return {
+            "error": str(e),
+            "type": "processing_error",
+            "details": error_details
+        }
 # Set page config with improved layout
 st.set_page_config(
 """)
 # Add performance metrics section
 if st.checkbox("Show Performance Metrics"):
     st.markdown("""
     ### Model Performance Metrics
     |-------|---------------------|--------------|-----------|
     | Donut | 2-3 seconds | 6-8GB | 85-90% |
     | LayoutLMv3 | 3-4 seconds | 12-15GB | 88-93% |
+    | OmniParser | 2-3 seconds | 8-10GB | 85-90% |
     *Accuracy varies based on document type and quality
     """)
 st.markdown("---")
 st.markdown("""
 v1.1 - Created with Streamlit
+\nPowered by Hugging Face Spaces 🤗
 """)
 # Add model selection guidance
     ### How to Choose the Right Model
     1. **Donut**: Choose for structured documents with clear layouts
     2. **LayoutLMv3**: Best for documents with complex layouts and relationships
+    3. **OmniParser**: Best for UI elements and screen parsing
     """)