"""Near real-time webcam object detection with RT-DETR, served through Gradio.

Webcam frames are streamed to ``inference``, which runs the RT-DETR detector
on each frame and returns the same frame with the detections drawn on it.
"""

import spaces
import gradio as gr
import cv2
import tempfile
import torch
from ultralytics import YOLOv10
from PIL import Image, ImageDraw, ImageFont
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

# Loaded once at import time so every streamed frame reuses the same weights.
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")


def draw_bounding_boxes(image, results, model, threshold=0.3):
    """Draw labelled red boxes on ``image`` for detections above ``threshold``.

    The image is modified in place and also returned.

    Args:
        image: PIL image to draw on.
        results: Iterable of mappings, each providing parallel ``"scores"``,
            ``"labels"`` and ``"boxes"`` sequences of tensors.
        model: Object whose ``config.id2label`` maps a label id to its name.
        threshold: Detections with a score not strictly greater than this
            value are skipped.

    Returns:
        The same ``image`` object, with boxes and captions drawn on it.
    """
    draw = ImageDraw.Draw(image)
    for result in results:
        detections = zip(result["scores"], result["labels"], result["boxes"])
        for score, label_id, box in detections:
            if score <= threshold:
                continue
            label = model.config.id2label[label_id.item()]
            # Round the box coordinates to whole pixels before drawing.
            box = [round(coord) for coord in box.tolist()]
            draw.rectangle(box, outline="red", width=3)
            # Caption is anchored at the first two box coordinates.
            draw.text((box[0], box[1]), f"{label}: {score:.2f}", fill="red")
    return image


@spaces.GPU
def inference(image, conf_threshold):
    """Run RT-DETR on one frame and return the frame with detections drawn.

    Args:
        image: PIL image of the current webcam frame, or ``None`` when no
            frame is available.
        conf_threshold: Minimum confidence a detection needs to be kept.

    Returns:
        The annotated PIL image, or ``None`` if ``image`` was ``None``.
    """
    if image is None:
        # Nothing to process; avoid failing on ``image.size`` below.
        return None

    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    results = image_processor.post_process_object_detection(
        outputs,
        # ``image.size`` is reversed before being handed over as target size.
        target_sizes=torch.tensor([image.size[::-1]]),
        # Use the slider value instead of a fixed 0.3 so that thresholds
        # below 0.3 are not silently clamped by the post-processing step.
        threshold=conf_threshold,
    )
    return draw_bounding_boxes(image, results, model, threshold=conf_threshold)


# NOTE(review): ``max-height: 600`` has no unit and the ``;`` after the final
# ``}`` is stray, so browsers likely ignore those parts. The string is kept
# byte-for-byte; confirm the intended layout before changing it.
css = (
    ".my-group {max-width: 600px !important; max-height: 600 !important;} "
    ".my-column {display: flex !important; justify-content: center !important; "
    "align-items: center !important};"
)

with gr.Blocks(css=css) as app:
    gr.HTML("\n\nNear Real-Time Webcam Stream with RT-DETR\n\n")
    gr.HTML("\n\narXiv | github\n\n")
    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
            image = gr.Image(
                type="pil",
                label="Image",
                visible=True,
                sources="webcam",
                height=500,
                width=500,
            )
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.85,
            )
        # Each streamed frame is sent through ``inference`` and the annotated
        # result is written back into the same image component.
        image.stream(
            fn=inference,
            inputs=[image, conf_threshold],
            outputs=[image],
            stream_every=0.2,
            time_limit=30,
        )

if __name__ == "__main__":
    app.launch()