Spaces:

Keemoz0
/

my-table-transformer-structure-recognition

Running

App Files Files Community

Keemoz0 committed on Sep 23, 2024

Commit

a6fc7d1

1 Parent(s): 470e893

Grab column boxes and ocr the text in it

Browse files

Files changed (2) hide show

app.py +45 -6
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -2,33 +2,72 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 from PIL import Image
 import torch
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
 # Load the processor and model for table structure recognition
 processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-structure-recognition")
 model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")
-# Define the inference function
 def predict(image):
     # Preprocess the input image
-    inputs = processor(images=image, return_tensors="pt")
     # Perform object detection using the model
     with torch.no_grad():
         outputs = model(**inputs)
-    # Extract bounding boxes and class labels
     predicted_boxes = outputs.pred_boxes[0].cpu().numpy()  # First image
     predicted_classes = outputs.logits.argmax(-1).cpu().numpy()  # Class predictions
-    # Return the bounding boxes for display
-    return {"boxes": predicted_boxes.tolist(), "classes": predicted_classes.tolist()}
 # Set up the Gradio interface
 interface = gr.Interface(
     fn=predict,  # The function that gets called when an image is uploaded
     inputs=gr.Image(type="pil"),  # Image input (as PIL image)
-    outputs="json",  # Outputting a JSON with the boxes and classes
 )
 # Launch the Gradio app

 from huggingface_hub import hf_hub_download
 from PIL import Image
 import torch
+import pytesseract
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
 # One checkpoint name feeds both the image processor and the detection model
 _CHECKPOINT = "microsoft/table-transformer-structure-recognition"
 processor = AutoImageProcessor.from_pretrained(_CHECKPOINT)
 model = AutoModelForObjectDetection.from_pretrained(_CHECKPOINT)
 # Run on CUDA when torch reports it as available; otherwise stay on the CPU
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 model.to(device)
+# Define the inference and OCR function
 def predict(image):
     """Detect table columns in ``image`` and OCR the text inside each one.

     The image is run through the table-structure model, the raw predictions
     are converted to pixel-space corner boxes by the image processor, and
     every detection labelled as a table column is cropped and passed to
     Tesseract.

     Args:
         image: PIL image supplied by the Gradio input, or ``None`` when the
             user submits without uploading anything.

     Returns:
         ``{"ocr_results": [...]}`` where each entry is
         ``{"box": [left, top, right, bottom], "text": <ocr text>}`` with the
         box expressed in pixels of the original image.
     """
     # Nothing uploaded: return an empty result instead of failing on image.size
     if image is None:
         return {"ocr_results": []}
     image_width, image_height = image.size  # PIL reports (width, height)
     # Preprocess the input image and move the tensors to the model's device
     inputs = processor(images=image, return_tensors="pt").to(device)
     # Perform object detection using the model
     with torch.no_grad():
         outputs = model(**inputs)
     # outputs.pred_boxes holds normalized (center_x, center_y, width, height)
     # values, not corner coordinates, and most queries predict "no object".
     # The processor's post-processing drops low-confidence / no-object
     # queries and returns (x_min, y_min, x_max, y_max) boxes in pixels.
     # target_sizes is given as (height, width).
     detections = processor.post_process_object_detection(
         outputs,
         threshold=0.5,  # minimum class score to keep a detection (adjust if needed)
         target_sizes=[(image_height, image_width)],
     )[0]
     labels = detections["labels"].cpu().tolist()
     boxes = detections["boxes"].cpu().tolist()
     id2label = model.config.id2label
     ocr_results = []
     for label, (x_min, y_min, x_max, y_max) in zip(labels, boxes):
         # Select columns by predicted class rather than by box aspect ratio
         # NOTE(review): assumes the checkpoint names this class "table column" - confirm against model.config.id2label
         if id2label[label] != "table column":
             continue
         # Clamp to the image so the crop never reaches outside it
         left = max(0, int(x_min))
         top = max(0, int(y_min))
         right = min(image_width, int(x_max))
         bottom = min(image_height, int(y_max))
         # Skip degenerate boxes that would produce an empty crop
         if right <= left or bottom <= top:
             continue
         # Crop the column region and OCR it
         cropped_image = image.crop((left, top, right, bottom))
         ocr_text = pytesseract.image_to_string(cropped_image)
         ocr_results.append({
             "box": [left, top, right, bottom],
             "text": ocr_text
         })
     # Return OCR results
     return {"ocr_results": ocr_results}
 # Gradio UI: one uploaded image (delivered as a PIL image) in, JSON out
 interface = gr.Interface(
     predict,  # called each time an image is submitted
     gr.Image(type="pil"),  # hand the upload to predict() as a PIL image
     "json",  # render the returned OCR results as JSON
 )
 # Launch the Gradio app

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ torch
 transformers
 gradio
 Pillow
-timm

 transformers
 gradio
 Pillow
+timm
+pytesseract