import gradio as gr
from transformers import AutoImageProcessor, AutoModelForObjectDetection
import torch

# Both the image processor and the detection model come from the same
# pretrained checkpoint, so name it once and reuse it for the two loads.
_CHECKPOINT = "microsoft/table-transformer-structure-recognition"
processor = AutoImageProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForObjectDetection.from_pretrained(_CHECKPOINT)

# Define the inference function
def predict(image):
    """Run table structure recognition on one image and list the detections.

    Args:
        image: The input image as handed over by the Gradio image component
            (a PIL image), or ``None`` when nothing was uploaded.

    Returns:
        A JSON-serializable list with one dict per detected object:
        ``class_id`` (int), ``class_name`` (label from the model config),
        ``score`` (softmax probability of the predicted class) and
        ``bounding_box`` (the raw box emitted by the model; per the
        Transformers documentation this is normalized
        ``(center_x, center_y, width, height)``). Queries whose best class
        is the model's "no object" slot are omitted. An empty list is
        returned when ``image`` is ``None``.
    """
    # Gradio passes None when the user submits without uploading an image.
    if image is None:
        return []

    # Preprocess the input image
    inputs = processor(images=image, return_tensors="pt")

    # Perform object detection using the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Per-query class probabilities and best class for the first (only) image.
    probabilities = outputs.logits[0].softmax(-1)
    scores, class_ids = probabilities.max(-1)
    boxes = outputs.pred_boxes[0].cpu().tolist()
    class_names = model.config.id2label  # Class id -> class name mapping

    result = []
    for class_id, score, box in zip(class_ids.tolist(), scores.tolist(), boxes):
        # The classification head has one extra "no object" logit that has no
        # entry in id2label; looking it up would raise KeyError, and such
        # queries are not real detections anyway.
        if class_id not in class_names:
            continue
        result.append({
            "class_id": class_id,
            "class_name": class_names[class_id],
            "score": score,
            "bounding_box": box,
        })

    return result

# Set up the Gradio interface: one image in, the list of detections out as JSON.
interface = gr.Interface(
    fn=predict,  # Called with the uploaded image each time the user submits
    inputs=gr.Image(type="pil"),  # Deliver the upload to `predict` as a PIL image
    outputs="json",  # Render the returned list of detections as JSON
    title="Table Structure Recognition",
    description=(
        "Upload an image of a table to see the detected table structure "
        "elements with their class IDs, class names, and bounding boxes."
    ),
)

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module (e.g. to reuse `predict`) does not start a web server.
if __name__ == "__main__":
    interface.launch()