from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor import torch import supervision as sv import cv2 import numpy as np from PIL import Image import gradio as gr import spaces BOX_ANNOTATOR = sv.BoxAnnotator() LABEL_ANNOTATOR = sv.LabelAnnotator() MASK_ANNOTATOR = sv.MaskAnnotator() DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_id = "google/paligemma2-3b-pt-448" model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(DEVICE) processor = PaliGemmaProcessor.from_pretrained(model_id) @spaces.GPU def process_image(input_image,input_text,class_names): class_list = class_names.split(',') cv_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR) model_inputs = processor(text=input_text, images=input_image, return_tensors="pt").to(torch.bfloat16).to(model.device) input_len = model_inputs["input_ids"].shape[-1] with torch.inference_mode(): generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False) generation = generation[0][input_len:] result = processor.decode(generation, skip_special_tokens=True) detections = sv.Detections.from_lmm( sv.LMM.PALIGEMMA, result, resolution_wh=(input_image.width, input_image.height), classes=class_list ) annotated_image = BOX_ANNOTATOR.annotate( scene=cv_image.copy(), detections=detections ) annotated_image = LABEL_ANNOTATOR.annotate( scene=annotated_image, detections=detections ) annotated_image = MASK_ANNOTATOR.annotate( scene=annotated_image, detections=detections ) annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB) annotated_image = Image.fromarray(annotated_image) return annotated_image, result with gr.Blocks() as app: gr.Markdown( """ ## PaliGemma 2 Detection with Supervision - Demo \n\n

\n\n PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343) vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question answering, text reading, object detection and object segmentation. This space show how to use PaliGemma 2 for object detection with supervision. You can input an image and a text prompt """) with gr.Row(): with gr.Column(): input_image = gr.Image(type="pil", label="Input Image") input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter prompt for example 'detect person;dog") class_names = gr.Textbox(lines=1, placeholder="Enter class names separated by commas...", label="Class Names") with gr.Column(): annotated_image = gr.Image(type="pil", label="Annotated Image") detection_result = gr.Textbox(label="Detection Result") gr.Button("Submit").click( fn=process_image, inputs=[input_image, input_text, class_names], outputs=[annotated_image, detection_result] ) if __name__ == "__main__": app.launch()