# Credits to IDEA Research for the model:
# https://huggingface.co/IDEA-Research/grounding-dino-tiny
"""Gradio Space: zero-shot object detection with Grounding DINO (tiny).

Accepts a base64-encoded image plus a free-text query string and returns
the grounded detections (scores, labels, boxes) as plain JSON.
"""
from base64 import b64decode
from io import BytesIO

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Loaded once at module import so every request reuses the same weights.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)


# FIX: `spaces` was imported but never used — on a ZeroGPU Space the handler
# must be decorated with @spaces.GPU for CUDA to be available; it is a no-op
# when running outside Hugging Face Spaces.
@spaces.GPU
def predict(
    base64: str,
    queries: str,
    box_threshold: float = 0.4,
    text_threshold: float = 0.3,
):
    """Run grounded object detection on a base64-encoded image.

    Args:
        base64: Base64-encoded image bytes (any PIL-readable format).
        queries: Text prompt(s) describing what to detect. NOTE(review):
            Grounding DINO is usually prompted with lowercase phrases
            separated by periods (e.g. "a cat. a dog.") — the comma-separated
            placeholder below may underperform; verify against the model card.
        box_threshold: Minimum box confidence to keep a detection.
            Defaults added so the two-input Gradio form can call this fn.
        text_threshold: Minimum text-match confidence to keep a detection.

    Returns:
        A list (one entry per image, here always length 1) of dicts with
        JSON-serializable "scores", "labels" and "boxes" fields.
    """
    image_bytes = b64decode(base64)
    # Normalize to RGB: PIL may yield RGBA/palette images the processor
    # does not expect.
    image = Image.open(BytesIO(image_bytes)).convert("RGB")

    inputs = processor(images=image, text=queries, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        # PIL size is (width, height); the processor wants (height, width).
        target_sizes=[image.size[::-1]],
    )

    # FIX: raw results hold torch.Tensors, which gr.JSON cannot serialize —
    # convert to plain Python lists before returning.
    return [
        {
            "scores": r["scores"].tolist(),
            "labels": r["labels"],
            "boxes": r["boxes"].tolist(),
        }
        for r in results
    ]


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Image (B64)"),
        gr.Text(label="Queries", placeholder="A photo of a dog,A photo of a cat"),
        # FIX: predict() required 4 arguments but the form supplied only 2,
        # so every call raised TypeError — expose the thresholds as sliders.
        gr.Slider(0.0, 1.0, value=0.4, label="Box threshold"),
        gr.Slider(0.0, 1.0, value=0.3, label="Text threshold"),
    ],
    outputs=gr.JSON(label="Predictions"),
)

demo.launch()