YOLOv10-webcam-stream

Sleeping

File size: 2,573 Bytes

780389c
8b2cbe6
1e8e71b
 
794b1a6
ccc35d4
8b2cbe6
385e56e
 
1e8e71b
ccc35d4
 
 
 
 
 
 
 
 
 
 
 
1e8e71b
f8727f7
385e56e
 
 
 
 
 
 
 
ccc35d4
8b2cbe6
 
 
 
 
 
1535831
8b2cbe6
 
2172fc2
8b2cbe6
2172fc2
8b2cbe6
 
f8727f7
 
 
 
8337710
f8727f7
8b2cbe6
 
ccc35d4
 
8b2cbe6
ccc35d4
8b2cbe6
 
 
ccc35d4
8b2cbe6
 
 
 
 
ccc35d4
8b2cbe6
 
 
 
 
c5b9185
ccc35d4

import tempfile

import spaces
import cv2
import gradio as gr
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
from ultralytics import YOLOv10

# One checkpoint id feeds both loaders so the pre/post-processor and the
# detector are always taken from the same pretrained weights.
_RTDETR_CHECKPOINT = "PekingU/rtdetr_r50vd"

image_processor = RTDetrImageProcessor.from_pretrained(_RTDETR_CHECKPOINT)
model = RTDetrForObjectDetection.from_pretrained(_RTDETR_CHECKPOINT)

def draw_bounding_boxes(image, results, model, threshold=0.3):
    """Annotate ``image`` in place with every detection scoring above ``threshold``.

    Args:
        image: Image to draw on; it is modified directly and also returned.
        results: Iterable of mappings, each providing parallel ``"scores"``,
            ``"labels"`` and ``"boxes"`` sequences.
        model: Object whose ``config.id2label`` maps a class id to its name.
        threshold: A detection is drawn only when its score is strictly
            greater than this value.

    Returns:
        The same ``image`` object, with a red rectangle and a
        ``"<label>: <score>"`` caption for each kept detection.
    """
    canvas = ImageDraw.Draw(image)
    for detections in results:
        candidates = zip(
            detections["scores"], detections["labels"], detections["boxes"]
        )
        for confidence, class_id, raw_box in candidates:
            if not (confidence > threshold):
                continue
            class_name = model.config.id2label[class_id.item()]
            # Round the coordinates to whole pixels before drawing.
            corners = [round(coord) for coord in raw_box.tolist()]
            canvas.rectangle(corners, outline="red", width=3)
            # Caption is anchored at the first corner of the box.
            canvas.text(
                (corners[0], corners[1]),
                f"{class_name}: {confidence:.2f}",
                fill="red",
            )
    return image


@spaces.GPU
def yolov10_inference(image, conf_threshold):
    """Detect objects in one frame and return the frame with boxes drawn.

    Despite its name, this function runs the module-level ``model`` and
    ``image_processor`` pair rather than a YOLOv10 network.

    Args:
        image: Frame as a PIL image (``image.size`` is read as
            ``(width, height)``), or ``None`` when no frame is available.
        conf_threshold: Minimum confidence a detection must exceed to be
            kept and drawn.

    Returns:
        The input image annotated in place, or ``None`` if ``image`` was
        ``None``.
    """
    # Nothing to process; returning None avoids crashing on `image.size`.
    if image is None:
        return None

    inputs = image_processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); reverse it to (height, width) for
    # post-processing.
    target_sizes = torch.tensor([image.size[::-1]])

    # Use the caller's threshold here as well. A fixed 0.3 cut-off silently
    # ignored any slider value below 0.3, because those detections were
    # already discarded before drawing.
    results = image_processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=conf_threshold
    )

    return draw_bounding_boxes(image, results, model, threshold=conf_threshold)


def app():
    """Build the webcam panel and register its live-detection stream.

    Creates a webcam-only image component and a confidence slider in the
    same column, then attaches ``yolov10_inference`` as the stream handler.
    The image component is both the input and the output of the handler,
    so each captured frame is replaced by its annotated version.
    """
    with gr.Blocks():
        with gr.Row(), gr.Column():
            webcam_view = gr.Image(
                sources="webcam",
                type="pil",
                label="Image",
                visible=True,
                height=500,
                width=500,
            )
            threshold_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.25,
                step=0.05,
                label="Confidence Threshold",
            )
        # Registered inside the Blocks context but outside the Row/Column.
        stream_options = {"stream_every": 0.2, "time_limit": 30}
        webcam_view.stream(
            fn=yolov10_inference,
            inputs=[webcam_view, threshold_slider],
            outputs=[webcam_view],
            **stream_options,
        )

# Page-level stylesheet handed to gr.Blocks below.
# Fixed: `max-height` was missing its unit and the final rule ended in `};`.
# NOTE(review): no component visible in this file assigns the "my-group" or
# "my-column" class, so confirm these rules are still needed.
css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
                      .my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""

# `with ... as app` below rebinds the name `app` from the UI-builder function
# to the Blocks instance. Keep a handle on the function first; otherwise the
# call inside the layout would invoke the Blocks object instead of building
# the webcam panel.
_build_stream_ui = app

with gr.Blocks(css=css) as app:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    Near Real-Time Webcam Stream with RTDetr
    </h1>
    """)
    # NOTE(review): the "github" link targets the YOLOv10 repository while the
    # heading names RTDetr. Confirm which repository is intended.
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2304.08069' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
        </h3>
        """)
    with gr.Row():
        with gr.Column():
            _build_stream_ui()

if __name__ == '__main__':
    app.launch()