|
import gradio as gr |
|
from huggingface_hub import hf_hub_download |
|
from PIL import Image |
|
import torch |
|
import pytesseract |
|
from transformers import AutoImageProcessor, AutoModelForObjectDetection |
|
import os |
|
|
|
def _run_setup_command(command):
    """Run one setup command without a shell; return True when it succeeds.

    Failures are reported on stderr instead of being silently ignored, but
    they never abort start-up: the setup steps are best-effort.
    """
    import subprocess
    import sys

    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as error:
        print(f"Setup step {' '.join(command)!r} failed: {error}", file=sys.stderr)
        return False
    return True


def _prepare_runtime():
    """Make /tmp writable and ensure the Tesseract binary and bindings exist."""
    import shutil
    import sys

    # 0o1777 keeps /tmp world-writable *and* keeps the sticky bit, so users
    # cannot delete each other's temporary files (plain 777 drops that bit).
    try:
        os.chmod("/tmp", 0o1777)
    except OSError as error:
        print(f"Could not change permissions of /tmp: {error}", file=sys.stderr)

    # Skip the slow package-manager round trip when the binary is already on PATH.
    if shutil.which("tesseract") is None:
        _run_setup_command(["apt-get", "update", "-y"])
        _run_setup_command(["apt-get", "install", "tesseract-ocr", "-y"])

    # Install into the interpreter that is actually running this script.
    _run_setup_command([sys.executable, "-m", "pip", "install", "-q", "pytesseract"])


_prepare_runtime()
|
|
|
# Single checkpoint shared by the image processor and the detection model.
_CHECKPOINT = "microsoft/table-transformer-structure-recognition"

# Run on the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

processor = AutoImageProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForObjectDetection.from_pretrained(_CHECKPOINT)

# Move the model weights onto the selected device (in place).
model.to(device)
|
|
|
|
|
def predict(image, score_threshold=0.5):
    """Detect table-structure regions in an image and OCR each of them.

    Args:
        image: PIL image handed over by the Gradio input component, or
            ``None`` when the form is submitted without an image.
        score_threshold: Minimum detection confidence for a region to be
            kept. NOTE(review): 0.5 is a starting point, tune as needed.

    Returns:
        ``{"ocr_results": [...]}`` where every entry holds the pixel box
        ``[left, top, right, bottom]``, the predicted ``label``, its
        ``score`` and the OCR ``text`` read from that region.
    """
    if image is None:
        return {"ocr_results": []}

    image_width, image_height = image.size

    # The detector expects three colour channels; OCR still uses the
    # original image so pytesseract can apply its own mode handling.
    model_input = image.convert("RGB")
    inputs = processor(images=model_input, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # The raw `pred_boxes` are normalized (center_x, center_y, width, height).
    # The processor's post-processing converts them to absolute
    # (x_min, y_min, x_max, y_max) pixels, drops the "no object" class and
    # applies the score threshold.
    detections = processor.post_process_object_detection(
        outputs,
        threshold=score_threshold,
        target_sizes=[(image_height, image_width)],
    )[0]

    ocr_results = []
    for score, label, box in zip(
        detections["scores"], detections["labels"], detections["boxes"]
    ):
        x_min, y_min, x_max, y_max = box.tolist()

        # Clamp to the image so the crop never reaches outside the picture.
        left = max(int(x_min), 0)
        top = max(int(y_min), 0)
        right = min(int(x_max), image_width)
        bottom = min(int(y_max), image_height)

        # Skip regions that collapse to nothing after clamping/truncation.
        if right <= left or bottom <= top:
            continue

        cropped_image = image.crop((left, top, right, bottom))
        ocr_text = pytesseract.image_to_string(cropped_image)

        ocr_results.append({
            "box": [left, top, right, bottom],
            "label": model.config.id2label[label.item()],
            "score": float(score),
            "text": ocr_text,
        })

    return {"ocr_results": ocr_results}
|
|
|
|
|
# Gradio front end: the uploaded picture reaches `predict` as a PIL image
# and the dictionary it returns is rendered as JSON.
_image_input = gr.Image(type="pil")
interface = gr.Interface(fn=predict, inputs=_image_input, outputs="json")

# Start the web server.
interface.launch()
|
|