Spaces:
Running
on
Zero
Running
on
Zero
File size: 7,945 Bytes
45439d0 b0a31e1 79d95c0 1d11470 9c39e47 79d95c0 1d11470 9c39e47 45439d0 9c39e47 45439d0 9c39e47 45439d0 9c39e47 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
import spaces
import cv2
import numpy as np
import tempfile
def _placeholder_image(message, width=640, height=480):
    """Return a white RGB canvas with ``message`` centred on it in black."""
    canvas = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default(size=40)
    left, top, right, bottom = draw.textbbox((0, 0), message, font=font)
    text_x = (width - (right - left)) / 2
    text_y = (height - (bottom - top)) / 2
    draw.text((text_x, text_y), message, fill="black", font=font)
    return canvas


def _new_temp_mp4_path():
    """Create an empty temporary ``.mp4`` file and return its path (handle closed)."""
    # delete=False keeps the file on disk after the handle is closed so that
    # the video writer can reopen it by name.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        return tmp.name


@spaces.GPU
def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
    """Run a YOLO model on an image or a video and return the annotated result.

    Args:
        input_type: ``"Image"`` or ``"Video"``; any other value yields
            ``(None, None)``.
        image: Input image (used when ``input_type == "Image"``), or ``None``.
        video: Path of the input video (used when ``input_type == "Video"``),
            or ``None``.
        model_id: Weights name/path handed to ``YOLO(...)``.
        conf_threshold: Passed to ``predict`` as ``conf``.
        iou_threshold: Passed to ``predict`` as ``iou``.
        max_detection: Passed to ``predict`` as ``max_det``.

    Returns:
        A ``(annotated_image, annotated_video_path)`` tuple. Exactly one slot
        is populated for a successful run: the PIL image for image input, the
        path of a temporary ``.mp4`` for video input. When the selected input
        is missing, a placeholder image/video carrying a "No ... provided"
        message is returned instead. ``(None, None)`` is returned for an
        unknown ``input_type`` or when no frame could be read from the video.
    """
    if input_type not in ("Image", "Video"):
        return None, None

    # Options shared by the image and the per-frame video predictions.
    predict_options = {
        "conf": conf_threshold,
        "iou": iou_threshold,
        "imgsz": 640,
        "max_det": max_detection,
        "show_labels": True,
        "show_conf": True,
    }
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

    if input_type == "Image":
        if image is None:
            return _placeholder_image("No image provided"), None

        model = YOLO(model_id)
        annotated_image = None  # stays None if predict() returns nothing
        for result in model.predict(source=image, **predict_options):
            # The plotted array is treated as BGR; flip the channel axis so
            # PIL receives RGB.
            annotated_image = Image.fromarray(result.plot()[..., ::-1])
        return annotated_image, None

    # ---- input_type == "Video" ----
    if video is None:
        placeholder = _placeholder_image("No video provided")
        placeholder_path = _new_temp_mp4_path()
        # Single-frame clip at 1 fps; PIL's .size is already (width, height).
        writer = cv2.VideoWriter(placeholder_path, fourcc, 1, placeholder.size)
        try:
            writer.write(cv2.cvtColor(np.array(placeholder), cv2.COLOR_RGB2BGR))
        finally:
            writer.release()
        return None, placeholder_path

    model = YOLO(model_id)
    cap = cv2.VideoCapture(video)
    writer = None
    output_path = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 25  # fall back when the container reports no frame rate

        while True:
            ok, frame = cap.read()
            if not ok:
                break

            # The model is fed an RGB PIL image, as before.
            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            for result in model.predict(source=pil_frame, **predict_options):
                annotated = np.ascontiguousarray(result.plot())
                if writer is None:
                    # Open the output lazily so its size matches the first
                    # annotated frame and no file is created for empty input.
                    height_out, width_out = annotated.shape[:2]
                    output_path = _new_temp_mp4_path()
                    writer = cv2.VideoWriter(output_path, fourcc, fps, (width_out, height_out))
                # Frames are streamed straight to disk instead of being
                # accumulated in memory; the plotted array is written as-is,
                # avoiding a BGR -> RGB -> BGR round-trip.
                writer.write(annotated)
    finally:
        # Release native handles even if inference raises mid-video.
        cap.release()
        if writer is not None:
            writer.release()

    # output_path is None when not a single frame was decoded.
    return None, output_path
def update_visibility(input_type):
    """
    Toggle which input/output widgets are visible for the chosen input type.

    Returns four updates, in the order: image input, video input,
    image output, video output. The image widgets are shown when
    ``input_type`` is "Image"; otherwise the video widgets are shown.
    """
    show_image = input_type == "Image"
    show_video = not show_image
    return (
        gr.update(visible=show_image),
        gr.update(visible=show_video),
        gr.update(visible=show_image),
        gr.update(visible=show_video),
    )
def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
    """
    Entry point used by gr.Examples (image examples only).

    Runs the regular image inference path with no video, then returns an
    update that sets the radio to 'Image' together with the annotated image.
    """
    # Arguments follow yolo_inference's order: input_type, image, video, ...
    inference_outputs = yolo_inference(
        "Image",
        image,
        None,
        model_id,
        conf_threshold,
        iou_threshold,
        max_detection,
    )
    annotated_image, _unused_video = inference_outputs
    radio_update = gr.update(value="Image")
    return radio_update, annotated_image
with gr.Blocks() as app:
    gr.Markdown("# Yolo11: Object Detection, Instance Segmentation, Pose/Keypoints, Oriented Detection, Classification")
    gr.Markdown("Upload image(s) or video(s) for inference using the latest Ultralytics YOLO11 models.")

    with gr.Row():
        # Left column: inputs and inference settings.
        with gr.Column():
            image = gr.Image(type="pil", label="Image", visible=True)
            video = gr.Video(label="Video", visible=False)
            input_type = gr.Radio(choices=["Image", "Video"], value="Image", label="Input Type")
            model_id = gr.Dropdown(
                label="Model Name",
                # One row per model family, each in sizes n/s/m/l/x.
                choices=[
                    "yolo11n.pt", "yolo11s.pt", "yolo11m.pt", "yolo11l.pt", "yolo11x.pt",
                    "yolo11n-seg.pt", "yolo11s-seg.pt", "yolo11m-seg.pt", "yolo11l-seg.pt", "yolo11x-seg.pt",
                    "yolo11n-pose.pt", "yolo11s-pose.pt", "yolo11m-pose.pt", "yolo11l-pose.pt", "yolo11x-pose.pt",
                    "yolo11n-obb.pt", "yolo11s-obb.pt", "yolo11m-obb.pt", "yolo11l-obb.pt", "yolo11x-obb.pt",
                    "yolo11n-cls.pt", "yolo11s-cls.pt", "yolo11m-cls.pt", "yolo11l-cls.pt", "yolo11x-cls.pt",
                ],
                value="yolo11n.pt",
            )
            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
            infer_button = gr.Button("Detect Objects")

        # Right column: annotated outputs (only one is visible at a time).
        with gr.Column():
            output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
            output_video = gr.Video(label="Annotated Video", visible=False)

    # Switching the radio swaps which input/output pair is shown.
    input_type.change(
        update_visibility,
        inputs=input_type,
        outputs=[image, video, output_image, output_video],
    )

    # The button runs inference on whichever input type is selected.
    infer_button.click(
        yolo_inference,
        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image, output_video],
    )

    # Image-only examples; each row is
    # [image file, model, confidence threshold, IoU threshold, max detections].
    gr.Examples(
        label="Examples (Images)",
        examples=[
            ["zidane.jpg", "yolo11s.pt", 0.25, 0.45, 300],
            ["bus.jpg", "yolo11m.pt", 0.25, 0.45, 300],
            ["yolo_vision.jpg", "yolo11x.pt", 0.25, 0.45, 300],
            ["Tricycle.jpg", "yolo11x-cls.pt", 0.25, 0.45, 300],
            ["tcganadolu.jpg", "yolo11m-obb.pt", 0.25, 0.45, 300],
            ["San Diego Airport.jpg", "yolo11x-seg.pt", 0.25, 0.45, 300],
            ["Theodore_Roosevelt.png", "yolo11l-pose.pt", 0.25, 0.45, 300],
        ],
        fn=yolo_inference_for_examples,
        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[input_type, output_image],
        cache_examples=True,
    )

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()
|