Spaces:

Keemoz0
/

my-table-transformer-structure-recognition

Runtime error

App Files Files Community

Keemoz0 committed on Sep 23, 2024

Commit

fa8646f

1 Parent(s): 02ce220

RevertToOriginal

Browse files

Files changed (1) hide show

app.py +6 -53

app.py CHANGED Viewed

@@ -2,81 +2,34 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 from PIL import Image
 import torch
-import pytesseract
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
-import os
-os.system('chmod 777 /tmp')  # open up permissions on /tmp; the exit status is not checked
-os.system('apt-get update -y')  # refresh the apt package index every time the module is loaded
-os.system('apt-get install tesseract-ocr -y')  # install the Tesseract OCR package; the exit status is not checked
-os.system('pip install -q pytesseract')  # NOTE(review): runs after the `import pytesseract` above, so it cannot help that import — confirm ordering
 # Load the processor and model for table structure recognition (both from the same pretrained checkpoint name)
 processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-structure-recognition")
 model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")
-# Use the GPU when torch reports CUDA as available; otherwise stay on the CPU
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-# Inference and OCR function: detect boxes on one image, crop each box, and run pytesseract OCR on the crop
 def predict(image):
     # Preprocess the input image into PyTorch tensors
-    inputs = processor(images=image, return_tensors="pt").to(device)
     # Perform object detection using the model (gradients disabled for inference)
     with torch.no_grad():
         outputs = model(**inputs)
-    # Extract bounding boxes and class predictions (the classes are not used to filter the boxes below)
     predicted_boxes = outputs.pred_boxes[0].cpu().numpy()  # First image
     predicted_classes = outputs.logits.argmax(-1).cpu().numpy()  # Class predictions; NOTE(review): not indexed with [0] like the boxes above, so the batch axis is presumably kept — confirm
-    # Prepare OCR results
-    ocr_results = []
-    image_width, image_height = image.size  # Get original image dimensions
-    # Iterate over every detected box and perform OCR on its crop
-    for box in predicted_boxes:
-        # Unpack the box as normalized (x_min, y_min, x_max, y_max); NOTE(review): the model docs describe pred_boxes as (center_x, center_y, width, height) — confirm
-        x_min, y_min, x_max, y_max = box
-        # Skip boxes whose coordinates are not strictly increasing (x_max > x_min, y_max > y_min)
-        if x_min >= x_max or y_min >= y_max:
-            continue
-        # Convert normalized coordinates to pixel values, clamped to the image bounds
-        left = max(int(x_min * image_width), 0)
-        top = max(int(y_min * image_height), 0)
-        right = min(int(x_max * image_width), image_width)
-        bottom = min(int(y_max * image_height), image_height)
-        # Integer truncation and clamping can collapse a box, so re-check that it still has positive area
-        if right <= left or bottom <= top:
-            continue
-        # Crop the image to the bounding box area
-        cropped_image = image.crop((left, top, right, bottom))
-        # Perform OCR on the cropped image
-        ocr_text = pytesseract.image_to_string(cropped_image)
-        # Append the pixel box and its OCR text for this detection
-        ocr_results.append({
-            "box": [left, top, right, bottom],
-            "text": ocr_text
-        })
-    # Return the OCR results wrapped in a dict under the "ocr_results" key
-    return {"ocr_results": ocr_results}
-# relaunch (marker comment with no effect on the code; presumably added to force a rebuild — confirm)
 # Set up the Gradio interface: one image in, JSON out
 interface = gr.Interface(
     fn=predict,  # The function that gets called when an image is uploaded
     inputs=gr.Image(type="pil"),  # Image input (as PIL image)
-    outputs="json",  # Outputting a JSON with the OCR results
 )
 # Launch the Gradio app (runs at module load; there is no __main__ guard)
 interface.launch()
-# recheck Gradio bugging (marker comment with no effect on the code)

 from huggingface_hub import hf_hub_download
 from PIL import Image
 import torch
 from transformers import AutoImageProcessor, AutoModelForObjectDetection
 # Load the processor and model for table structure recognition (both from the same pretrained checkpoint name)
 processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-structure-recognition")
 model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")
+# Inference function: run the table-structure model on one image and return its raw predictions
 def predict(image):
     # Preprocess the input image into PyTorch tensors
+    inputs = processor(images=image, return_tensors="pt")
     # Perform object detection using the model (gradients disabled for inference)
     with torch.no_grad():
         outputs = model(**inputs)
+    # Extract bounding boxes and class indices (argmax over the last logits axis)
     predicted_boxes = outputs.pred_boxes[0].cpu().numpy()  # First image
     predicted_classes = outputs.logits.argmax(-1).cpu().numpy()  # Class predictions; NOTE(review): not indexed with [0] like the boxes above, so the batch axis is presumably kept — confirm
+    # Return every predicted box and class index as plain lists for JSON output; no score filtering is applied here
+    return {"boxes": predicted_boxes.tolist(), "classes": predicted_classes.tolist()}
 # Set up the Gradio interface: one image in, JSON out
 interface = gr.Interface(
     fn=predict,  # The function that gets called when an image is uploaded
     inputs=gr.Image(type="pil"),  # Image input (as PIL image)
+    outputs="json",  # Outputs the dict returned by predict ("boxes" and "classes") as JSON
 )
 # Launch the Gradio app (runs at module load; there is no __main__ guard)
 interface.launch()