# Credits to IDEA Research for the model:
# https://huggingface.co/IDEA-Research/grounding-dino-tiny
"""Gradio Space: zero-shot object detection with Grounding DINO (tiny).

Accepts a base64-encoded image plus a free-text query string and returns
the grounded detections (scores, labels, boxes) as plain JSON.
"""
from base64 import b64decode
from io import BytesIO

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Loaded once at module import so every request reuses the same weights.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)


# FIX: `spaces` was imported but never used — on a ZeroGPU Space the handler
# must be decorated with @spaces.GPU for CUDA to be available; it is a no-op
# when running outside Hugging Face Spaces.
@spaces.GPU
def predict(
    base64: str,
    queries: str,
    box_threshold: float = 0.4,
    text_threshold: float = 0.3,
):
    """Run grounded object detection on a base64-encoded image.

    Args:
        base64: Base64-encoded image bytes (any PIL-readable format).
        queries: Text prompt(s) describing what to detect. NOTE(review):
            Grounding DINO is usually prompted with lowercase phrases
            separated by periods (e.g. "a cat. a dog.") — the comma-separated
            placeholder below may underperform; verify against the model card.
        box_threshold: Minimum box confidence to keep a detection.
            Defaults added so the two-input Gradio form can call this fn.
        text_threshold: Minimum text-match confidence to keep a detection.

    Returns:
        A list (one entry per image, here always length 1) of dicts with
        JSON-serializable "scores", "labels" and "boxes" fields.
    """
    image_bytes = b64decode(base64)
    # Normalize to RGB: PIL may yield RGBA/palette images the processor
    # does not expect.
    image = Image.open(BytesIO(image_bytes)).convert("RGB")

    inputs = processor(images=image, text=queries, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        # PIL size is (width, height); the processor wants (height, width).
        target_sizes=[image.size[::-1]],
    )

    # FIX: raw results hold torch.Tensors, which gr.JSON cannot serialize —
    # convert to plain Python lists before returning.
    return [
        {
            "scores": r["scores"].tolist(),
            "labels": r["labels"],
            "boxes": r["boxes"].tolist(),
        }
        for r in results
    ]


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Image (B64)"),
        gr.Text(label="Queries", placeholder="A photo of a dog,A photo of a cat"),
        # FIX: predict() required 4 arguments but the form supplied only 2,
        # so every call raised TypeError — expose the thresholds as sliders.
        gr.Slider(0.0, 1.0, value=0.4, label="Box threshold"),
        gr.Slider(0.0, 1.0, value=0.3, label="Text threshold"),
    ],
    outputs=gr.JSON(label="Predictions"),
)

demo.launch()