# Near Real-Time Webcam Stream with RT-DETR

import tempfile

import cv2
import gradio as gr
import spaces
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
from ultralytics import YOLOv10

# Checkpoint identifier passed to both `from_pretrained` calls below.
_RTDETR_CHECKPOINT = "PekingU/rtdetr_r50vd"

# Both objects are created once at import time and reused by `inference`.
model = RTDetrForObjectDetection.from_pretrained(_RTDETR_CHECKPOINT)
image_processor = RTDetrImageProcessor.from_pretrained(_RTDETR_CHECKPOINT)


def draw_bounding_boxes(image, results, model, threshold=0.3):
    """Annotate *image* in place with each detection scoring above *threshold*.

    Args:
        image: Image to draw on; it is wrapped in ``ImageDraw.Draw`` and
            handed back once drawing is finished.
        results: Iterable of mappings, each providing parallel ``"scores"``,
            ``"labels"`` and ``"boxes"`` sequences.
        model: Object whose ``config.id2label`` maps a label id to its name.
        threshold: A detection is drawn only when ``score > threshold``.

    Returns:
        The same ``image`` object that was passed in.
    """
    canvas = ImageDraw.Draw(image)
    for detections in results:
        triples = zip(
            detections["scores"], detections["labels"], detections["boxes"]
        )
        for confidence, class_id, raw_box in triples:
            # Guard clause: skip anything that does not clear the threshold.
            if not (confidence > threshold):
                continue
            name = model.config.id2label[class_id.item()]
            # Round the box coordinates to whole numbers before drawing.
            corners = list(map(round, raw_box.tolist()))
            canvas.rectangle(corners, outline="red", width=3)
            # The caption is anchored at the first coordinate pair of the box.
            anchor = (corners[0], corners[1])
            canvas.text(anchor, f"{name}: {confidence:.2f}", fill="red")
    return image


@spaces.GPU
def inference(image, conf_threshold):
    """Run the detector on one frame and return the frame with boxes drawn.

    Args:
        image: PIL image of the current frame (the UI component that feeds
            this function is declared with ``type="pil"``), or ``None`` when
            no frame is available.
        conf_threshold: Minimum confidence a detection needs to be kept and
            drawn.

    Returns:
        The input image, annotated in place by ``draw_bounding_boxes``, or
        ``None`` when ``image`` is ``None``.
    """
    # Nothing to process when no frame was delivered.
    if image is None:
        return None

    # Keep the inputs on whatever device the model currently lives on.
    device = model.device
    inputs = image_processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # `image.size` is reversed here so the post-processor receives the
    # frame dimensions in the opposite order.
    target_sizes = torch.tensor([image.size[::-1]], device=device)

    # Use the caller's threshold here too: a hard-coded 0.3 silently dropped
    # every detection below 0.3 even when a lower threshold was requested.
    results = image_processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=conf_threshold
    )

    return draw_bounding_boxes(image, results, model, threshold=conf_threshold)


def app():
    """Build a minimal webcam-streaming UI and return it.

    The UI holds a webcam image component and a confidence slider; every
    streamed frame is passed to ``inference`` and the result is written back
    to the same image component.

    NOTE: the module-level ``with gr.Blocks(css=css) as app:`` statement
    later in this file rebinds the name ``app``, so after the module has been
    executed this function is no longer reachable under that name.

    Returns:
        The ``gr.Blocks`` instance containing the components and the
        streaming event.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                image = gr.Image(
                    type="pil",
                    label="Image",
                    visible=True,
                    sources="webcam",
                    height=500,
                    width=500,
                )
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
        # `yolov10_inference` is not defined anywhere in this file; the
        # detector entry point here is `inference`.
        image.stream(
            fn=inference,
            inputs=[image, conf_threshold],
            outputs=[image],
            stream_every=0.2,
            time_limit=30,
        )
    # Hand the built UI back so a caller can launch it.
    return demo


# Custom CSS handed to `gr.Blocks(css=...)`. The two class names match the
# `elem_classes` given to the Group ("my-group") and Column ("my-column").
# `max-height` now carries a `px` unit and the last declaration ends inside
# the braces (previously `600` had no unit and a stray `;` followed `}`).
css = """
.my-group {max-width: 600px !important; max-height: 600px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}
"""

# Module-level UI definition. NOTE: this binds the name `app` to the Blocks
# instance, replacing the `app()` function defined earlier in the file.
with gr.Blocks(css=css) as app:
    # Page title.
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Near Real-Time Webcam Stream with RT-DETR
    </h1>
    """
    )
    # Links to the arXiv entry and the GitHub repository.
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://github.com/lyuwenyu/RT-DETR' target='_blank'>github</a>
        </h3>
        """
    )
    # "my-column" / "my-group" are the classes styled by the `css` string.
    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
            # Webcam-only image component; frames are delivered as PIL images.
            image = gr.Image(
                type="pil",
                label="Image",
                visible=True,
                sources="webcam",
                height=500,
                width=500,
            )
            # Threshold forwarded to `inference` as its second argument.
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.85,
            )
            # The image component is both the input and the output of the
            # stream event, so `inference`'s return value is written back to it.
            # NOTE(review): `stream_every` / `time_limit` are presumably in
            # seconds (frame interval and per-session cap) — confirm against
            # the Gradio streaming documentation.
            image.stream(
                fn=inference,
                inputs=[image, conf_threshold],
                outputs=[image],
                stream_every=0.2,
                time_limit=30,
            )
# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    app.launch()