"""Near real-time webcam object detection with RT-DETR, served through Gradio.

Webcam frames are streamed to an RT-DETR model; detections whose score is
above the user-selected confidence threshold are drawn on the frame, which is
then streamed back to the same image component.
"""

# `spaces` stays first, as in the original file.
# NOTE(review): ZeroGPU setups generally expect `spaces` to be imported before
# any CUDA-using library -- confirm before reordering.
import spaces

import tempfile

import cv2
import gradio as gr
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
from ultralytics import YOLOv10  # unused here; kept because other code may rely on it

# Pre-processor and detector are loaded once at import time and shared by
# every inference call.
image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")


def draw_bounding_boxes(image, results, model, threshold=0.3):
    """Draw labelled detection boxes onto ``image`` and return it.

    Args:
        image: PIL image to annotate. It is modified in place.
        results: Iterable of dicts with ``"scores"``, ``"labels"`` and
            ``"boxes"`` entries, as produced by
            ``image_processor.post_process_object_detection``.
        model: Model whose ``config.id2label`` maps label ids to names.
        threshold: Only detections with a score strictly above this value
            are drawn.

    Returns:
        The same ``image`` object, with boxes and captions drawn in red.
    """
    draw = ImageDraw.Draw(image)
    for result in results:
        detections = zip(result["scores"], result["labels"], result["boxes"])
        for score, label_id, box in detections:
            if score > threshold:
                label = model.config.id2label[label_id.item()]
                # Box coordinates are rounded to whole pixels before drawing.
                box = [round(coord) for coord in box.tolist()]
                draw.rectangle(box, outline="red", width=3)
                draw.text((box[0], box[1]), f"{label}: {score:.2f}", fill="red")
    return image


@spaces.GPU
def yolov10_inference(image, conf_threshold):
    """Run RT-DETR on one frame and return the annotated frame.

    The function name is historical; the detector used is the RT-DETR model
    loaded at module level.

    Args:
        image: PIL image from the webcam stream, or ``None`` when no frame
            is available.
        conf_threshold: Minimum confidence a detection needs to be kept and
            drawn.

    Returns:
        The annotated PIL image, or ``None`` if no frame was supplied.
    """
    if image is None:
        # Nothing to process; avoid crashing inside the image processor.
        return None

    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); reverse it to (height, width) for the
    # post-processor's target size.
    target_sizes = torch.tensor([image.size[::-1]])
    # Use the slider value here too: a hard-coded 0.3 made every slider
    # setting below 0.3 (including the 0.25 default) ineffective.
    results = image_processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=conf_threshold
    )
    return draw_bounding_boxes(image, results, model, threshold=conf_threshold)


def app():
    """Build the webcam image, the threshold slider and the streaming event.

    Must be called inside an active ``gr.Blocks`` context so the components
    are attached to the enclosing page.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                image = gr.Image(type="pil", label="Image", visible=True, sources="webcam", height=500, width=500)
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
        # Each streamed frame is replaced in the same component by its
        # annotated version.
        image.stream(
            fn=yolov10_inference,
            inputs=[image, conf_threshold],
            outputs=[image],
            stream_every=0.2,
            time_limit=30,
        )


css = """.my-group {max-width: 600px !important; max-height: 600px !important;} .my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""

# The Blocks object is bound to `demo`, not `app`: binding it to `app` would
# shadow the `app()` builder above, so the call below would hit the Blocks
# instance instead of the function.
with gr.Blocks(css=css) as demo:
    gr.HTML(
        """

Near Real-Time Webcam Stream with RTDetr

""")
    gr.HTML(
        """

arXiv | github

""")
    with gr.Row():
        with gr.Column():
            app()

if __name__ == '__main__':
    demo.launch()