YOLOv10-webcam-stream

Sleeping

File size: 3,442 Bytes

780389c
8b2cbe6
1e8e71b
 
794b1a6
ccc35d4
8b2cbe6
385e56e
 
1e8e71b
790227b
ccc35d4
 
 
790227b
 
 
ccc35d4
 
 
 
 
 
 
 
1e8e71b
66947f7
385e56e
 
 
 
 
790227b
 
 
385e56e
ccc35d4
8b2cbe6
 
 
 
 
 
790227b
 
 
 
 
 
 
 
8b2cbe6
 
2172fc2
8b2cbe6
2172fc2
8b2cbe6
 
f8727f7
 
 
 
8337710
790227b
8b2cbe6
 
790227b
 
ccc35d4
8b2cbe6
ccc35d4
8b2cbe6
 
 
66947f7
8b2cbe6
790227b
 
8b2cbe6
 
 
66947f7
8b2cbe6
790227b
 
 
 
 
 
 
 
 
 
 
 
66947f7
 
 
 
 
 
 
 
 
 
 
 
790227b
66947f7
790227b
ccc35d4

import tempfile

import cv2
import gradio as gr
import spaces
import torch
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLOv10

# The pre/post-processor and the detector are both built with
# ``from_pretrained`` from the same checkpoint identifier, so it is named once.
# NOTE(review): RTDetrImageProcessor and RTDetrForObjectDetection are not
# imported anywhere in the visible file -- presumably they come from
# ``transformers``; confirm and add the import, otherwise these lines raise
# NameError at import time.
_RTDETR_CHECKPOINT = "PekingU/rtdetr_r50vd"

image_processor = RTDetrImageProcessor.from_pretrained(_RTDETR_CHECKPOINT)
model = RTDetrForObjectDetection.from_pretrained(_RTDETR_CHECKPOINT)


def draw_bounding_boxes(image, results, model, threshold=0.3):
    """Draw labelled red boxes on *image* for every detection above *threshold*.

    Args:
        image: Image handed to ``ImageDraw.Draw``; it is drawn on in place.
        results: Iterable of mappings with ``"scores"``, ``"labels"`` and
            ``"boxes"`` entries that are iterated in lockstep. Each label must
            support ``.item()`` and each box ``.tolist()``.
        model: Object whose ``config.id2label`` maps a label id to its name.
        threshold: Detections whose score is not strictly greater than this
            value are skipped.

    Returns:
        The same ``image`` object that was passed in, now annotated.
    """
    canvas = ImageDraw.Draw(image)
    id_to_name = model.config.id2label

    for detection in results:
        triples = zip(
            detection["scores"], detection["labels"], detection["boxes"]
        )
        for confidence, class_id, raw_box in triples:
            # Guard clause: only strictly-greater scores are drawn.
            if not (confidence > threshold):
                continue

            name = id_to_name[class_id.item()]
            corners = list(map(round, raw_box.tolist()))
            canvas.rectangle(corners, outline="red", width=3)
            # The caption is anchored at the first two box coordinates.
            canvas.text(
                (corners[0], corners[1]),
                f"{name}: {confidence:.2f}",
                fill="red",
            )

    return image


@spaces.GPU
def inference(image, conf_threshold):
    """Run the detector on one frame and return the frame with boxes drawn.

    Args:
        image: The frame to analyse (the UI supplies a PIL image), or ``None``
            when no frame is available.
        conf_threshold: Minimum score a detection must exceed to be kept and
            drawn.

    Returns:
        The input image annotated in place by ``draw_bounding_boxes``, or
        ``None`` when ``image`` is ``None``.
    """
    # Nothing to annotate without a frame; the calls below would fail on None.
    if image is None:
        return None

    inputs = image_processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the caller's threshold for post-processing as well. A fixed 0.3 here
    # silently discarded every detection scoring below 0.3, so lower slider
    # values had no effect.
    results = image_processor.post_process_object_detection(
        outputs,
        # ``image.size`` is reversed before being passed as the target size.
        target_sizes=torch.tensor([image.size[::-1]]),
        threshold=conf_threshold,
    )

    return draw_bounding_boxes(image, results, model, threshold=conf_threshold)


def app():
    """Build a bare webcam-detection UI and return it.

    The layout is a single row/column holding a webcam image and a confidence
    slider; the image's stream event feeds ``inference`` and writes the result
    back into the same image component.

    NOTE: later in this module the name ``app`` is rebound by
    ``with gr.Blocks(css=css) as app``, so after the module has been fully
    executed this function is no longer reachable under this name.

    Returns:
        The constructed ``gr.Blocks`` instance (previously the function
        built the UI and discarded it).
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                image = gr.Image(
                    type="pil",
                    label="Image",
                    visible=True,
                    sources="webcam",
                    height=500,
                    width=500,
                )
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
        image.stream(
            # Was ``yolov10_inference``, a name that is not defined in this
            # file; ``inference`` is the handler that exists here.
            fn=inference,
            inputs=[image, conf_threshold],
            outputs=[image],
            stream_every=0.2,
            time_limit=30,
        )
    return demo


# Custom CSS for the page: caps the size of the "my-group" container and
# centres the contents of "my-column".
# Fixes: ``max-height`` needs a unit (a bare ``600`` is not a valid length and
# the declaration is ignored), and the ``;`` belongs inside the closing brace
# of the ``.my-column`` rule, not after it.
css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
                      .my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""

# Static header markup; whitespace inside the literals is kept exactly as is.
_TITLE_HTML = """
    <h1 style='text-align: center'>
    Near Real-Time Webcam Stream with RT-DETR
    </h1>
    """
_LINKS_HTML = """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://github.com/lyuwenyu/RT-DETR' target='_blank'>github</a>
        </h3>
        """

# Page layout: title, reference links, then a column/group holding the webcam
# image and the confidence slider. Components are created in display order.
with gr.Blocks(css=css) as app:
    gr.HTML(_TITLE_HTML)
    gr.HTML(_LINKS_HTML)

    with gr.Column(elem_classes=["my-column"]), gr.Group(elem_classes=["my-group"]):
        image = gr.Image(
            type="pil",
            label="Image",
            visible=True,
            sources="webcam",
            height=500,
            width=500,
        )
        conf_threshold = gr.Slider(
            label="Confidence Threshold",
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            value=0.85,
        )
        # Each streamed frame goes through ``inference`` together with the
        # slider value; the annotated frame is written back to the same image.
        image.stream(
            fn=inference,
            inputs=[image, conf_threshold],
            outputs=[image],
            stream_every=0.2,
            time_limit=30,
        )

# Start the server only when run as a script.
if __name__ == "__main__":
    app.launch()