Spaces:

atalaydenknalbant
/

Budgerigar_Gender_Determination

Running on Zero

App Files Files Community

atalaydenknalbant committed on Jul 19

Commit

c5e5663

verified ·

1 Parent(s): 6248745

Update app.py

Browse files

Files changed (1) hide show

app.py +214 -72

app.py CHANGED Viewed

@@ -3,6 +3,9 @@ from PIL import Image, ImageDraw, ImageFont
 from ultralytics import YOLO, RTDETR
 import spaces
 import os
 from huggingface_hub import hf_hub_download
 def get_model_path(model_name):
@@ -26,18 +29,22 @@ def get_model_path(model_name):
     return model_cache_path
 @spaces.GPU
-def yolo_inference(images, model_id, conf_threshold, iou_threshold, max_detection):
     """
-    Performs budgerigar gender determination inference on an image using a selected YOLO or RTDETR model.
-    This function handles input images, loads the appropriate model (YOLO or RTDETR)
-    based on the `model_id`, and then runs inference to detect budgerigars and
-    determine their gender. The results are then plotted onto the original image.
-    If no image is provided, it returns a blank image with a message.
     Args:
-        images (PIL.Image.Image or None): The input image on which to perform detection.
-                                          Can be None if no image is uploaded.
         model_id (str): The identifier of the model to use (e.g., 'budgerigar_yolo11x.pt',
                         'budgerigar_rtdetr-x.pt').
         conf_threshold (float): The confidence threshold for filtering detections.
@@ -47,78 +54,213 @@ def yolo_inference(images, model_id, conf_threshold, iou_threshold, max_detectio
         max_detection (int): The maximum number of detections to return and display.
     Returns:
-        PIL.Image.Image: The input image annotated with detection results, including
-                         bounding boxes and gender labels. Returns a blank image
-                         with a message if no input image is provided.
     """
-    if images is None:
-        # Create a blank image
-        width, height = 640, 480
-        blank_image = Image.new("RGB", (width, height), color="white")
-        draw = ImageDraw.Draw(blank_image)
-        message = "No image provided"
-        font = ImageFont.load_default(size=40)
-        bbox = draw.textbbox((0, 0), message, font=font)
-        text_width = bbox[2] - bbox[0]
-        text_height = bbox[3] - bbox[1]
-        text_x = (width - text_width) / 2
-        text_y = (height - text_height) / 2
-        draw.text((text_x, text_y), message, fill="black", font=font)
-        return blank_image
-    model_path = get_model_path(model_id)  # Download model
     model_type = RTDETR if 'rtdetr' in model_id.lower() else YOLO
     model = model_type(model_path)
-    results = model.predict(
-        source=images,
-        conf=conf_threshold,
-        iou=iou_threshold,
-        imgsz=640,
-        max_det=max_detection,
-        show_labels=True,
-        show_conf=True,
     )
-    # Process results and convert to PIL Image
-    for r in results:
-        image_array = r.plot()
-        image = Image.fromarray(image_array[..., ::-1])
-    return image
-interface = gr.Interface(
-    fn=yolo_inference,
-    inputs=[
-        gr.Image(type="pil", label="Example Image", interactive=True),
-        gr.Radio(
-            choices=[
-                'budgerigar_yolo11x.pt', 'budgerigar_yolov9e.pt',
-                'budgerigar_yolo11l.pt', 'budgerigar_yolo11m.pt',
-                'budgerigar_yolo11s.pt', 'budgerigar_yolo11n.pt',
-                'budgerigar_rtdetr-x.pt'
-            ],
-            label="Model Name",
-            value="budgerigar_yolo11x.pt",
-        ),
-        gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold"),
-        gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold"),
-        gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection"),
-    ],
-    outputs=gr.Image(type="pil", label="Annotated Image"),
-    cache_examples=True,
-    title="Budgerigar Gender Determination",
-    description=(
         "Pretrained object detection models for determining budgerigar gender based on cere color variations. "
-        "Upload image(s) for inference. For more details, refer to the paper: "
         '<a href="https://ieeexplore.ieee.org/document/10773570" target="_blank">'
         '"Advanced Computer Vision Techniques for Reliable Gender Determination in Budgerigars (Melopsittacus Undulatus)"</a>'
         "<br><br>"
         "To help us improve, please report any incorrect gender determinations by sending the original image and details to -> <a href='mailto:[email protected]'>Email</a>."
         "Your feedback is important for retraining and improving the model."
-    ),
-    examples=[
-        ["both.jpg", "budgerigar_rtdetr-x.pt", 0.25, 0.45, 300],
-        ["Male.png", "budgerigar_yolov9e.pt", 0.25, 0.45, 300],
-        ["Female.png", "budgerigar_yolo11x.pt", 0.25, 0.45, 300],
-    ],
-)
-interface.launch(mcp_server=True)

 from ultralytics import YOLO, RTDETR
 import spaces
 import os
+import cv2
+import numpy as np
+import tempfile
 from huggingface_hub import hf_hub_download
 def get_model_path(model_name):
     return model_cache_path
 @spaces.GPU
def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
    """
    Run budgerigar gender detection on an image or a video.

    Inputs are validated before the model is fetched, so a missing image/video
    or an unknown ``input_type`` never triggers a model download or load.
    For videos, every frame is annotated and written to the output file as soon
    as it is processed, so memory use does not grow with the video length.

    Args:
        input_type (str): Either "Image" or "Video". Any other value yields
            ``(None, None)``.
        image (PIL.Image.Image or None): The input image when `input_type` is
            "Image". If None, a placeholder image with a message is returned.
        video (str or None): Path to the input video when `input_type` is
            "Video". If None, a one-frame placeholder video is returned.
        model_id (str): The identifier of the model to use (e.g.,
            'budgerigar_yolo11x.pt', 'budgerigar_rtdetr-x.pt'). Identifiers
            containing "rtdetr" (case-insensitive) are loaded with RTDETR,
            everything else with YOLO.
        conf_threshold (float): Passed to ``model.predict`` as ``conf``.
        iou_threshold (float): Passed to ``model.predict`` as ``iou``.
        max_detection (int): Passed to ``model.predict`` as ``max_det``.

    Returns:
        tuple: ``(annotated_image, annotated_video_path)``:
            - PIL.Image.Image or None: The annotated (or placeholder) image when
              `input_type` is "Image", otherwise None.
            - str or None: Path to the annotated (or placeholder) video when
              `input_type` is "Video", otherwise None. None as well when no
              frame could be read from the input video.
    """
    if input_type not in ("Image", "Video"):
        return None, None

    # Cheap input checks come first: no model is needed for the placeholders.
    if input_type == "Image" and image is None:
        return _blank_message_image("No image provided"), None
    if input_type == "Video" and video is None:
        return None, _write_placeholder_video("No video provided")

    model_path = get_model_path(model_id)
    model_type = RTDETR if 'rtdetr' in model_id.lower() else YOLO
    model = model_type(model_path)

    predict_kwargs = {
        "conf": conf_threshold,
        "iou": iou_threshold,
        "imgsz": 640,
        # UI sliders may deliver the value as a float; max_det is a count.
        "max_det": int(max_detection),
        "show_labels": True,
        "show_conf": True,
    }

    if input_type == "Image":
        annotated_bgr = _annotate(model, image, predict_kwargs)
        # plot() output is BGR; reverse the channel axis to get RGB for PIL.
        return Image.fromarray(annotated_bgr[..., ::-1]), None

    return None, _annotate_video(model, video, predict_kwargs)


def _annotate(model, source, predict_kwargs):
    """Run prediction on one image/frame and return the plotted result array."""
    results = model.predict(source=source, **predict_kwargs)
    # A single source produces a single result entry.
    return results[0].plot()


def _annotate_video(model, video_path, predict_kwargs):
    """Annotate a video frame by frame; return the output path, or None if no frame was read."""
    cap = cv2.VideoCapture(video_path)
    writer = None
    output_path = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Fall back to 25 fps when the container reports no usable rate.
            fps = 25
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            annotated_bgr = _annotate(model, pil_frame, predict_kwargs)
            if writer is None:
                # The writer is created lazily so that its frame size comes
                # from the first annotated frame and no file is left behind
                # when the input yields no frames at all.
                height_out, width_out = annotated_bgr.shape[:2]
                output_path = _new_temp_video_path()
                writer = cv2.VideoWriter(
                    output_path,
                    cv2.VideoWriter_fourcc(*"mp4v"),
                    fps,
                    (width_out, height_out),
                )
            # The plotted frame is already BGR, which is what VideoWriter expects.
            writer.write(annotated_bgr)
    finally:
        cap.release()
        if writer is not None:
            writer.release()
    return output_path


def _blank_message_image(message, width=640, height=480):
    """Return a white RGB image with `message` centered in black text."""
    blank_image = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(blank_image)
    font = ImageFont.load_default(size=40)
    bbox = draw.textbbox((0, 0), message, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]
    text_x = (width - text_width) / 2
    text_y = (height - text_height) / 2
    draw.text((text_x, text_y), message, fill="black", font=font)
    return blank_image


def _write_placeholder_video(message, width=640, height=480):
    """Write a one-frame, 1 fps video showing `message` and return its path."""
    placeholder = _blank_message_image(message, width, height)
    output_path = _new_temp_video_path()
    writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), 1, (width, height))
    try:
        writer.write(cv2.cvtColor(np.array(placeholder), cv2.COLOR_RGB2BGR))
    finally:
        writer.release()
    return output_path


def _new_temp_video_path():
    """Reserve a persistent temporary .mp4 path without leaking the file handle."""
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        return tmp.name
def update_visibility(input_type):
    """
    Toggle which input/output components are shown for the chosen input type.

    The image input and image output are shown together when `input_type` is
    "Image"; for any other value the video input and video output are shown
    instead.

    Args:
        input_type (str): The selected input type, either "Image" or "Video".

    Returns:
        tuple: Four `gr.update` objects carrying the `visible` flag for, in
               order: (image input, video input, image output, video output).
    """
    image_mode = input_type == "Image"
    video_mode = not image_mode
    return (
        gr.update(visible=image_mode),
        gr.update(visible=video_mode),
        gr.update(visible=image_mode),
        gr.update(visible=video_mode),
    )
def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
    """
    Image-only adapter around `yolo_inference` for the `gr.Examples` component.

    It forwards the example values to `yolo_inference` with the input type
    fixed to "Image" and no video, then keeps only the image half of the
    returned pair.

    Args:
        image (PIL.Image.Image): The input image for the example.
        model_id (str): The identifier of the model to use.
        conf_threshold (float): The confidence threshold.
        iou_threshold (float): The IoU threshold.
        max_detection (int): The maximum number of detections.

    Returns:
        PIL.Image.Image or None: The first element returned by `yolo_inference`.
    """
    # Examples are image-only: pin the mode and leave the video slot empty.
    call_arguments = {
        "input_type": "Image",
        "image": image,
        "video": None,
        "model_id": model_id,
        "conf_threshold": conf_threshold,
        "iou_threshold": iou_threshold,
        "max_detection": max_detection,
    }
    image_result, _video_result = yolo_inference(**call_arguments)
    return image_result
# Gradio UI: inputs and settings in the left column, annotated outputs in the right.
with gr.Blocks(title="Budgerigar Gender Determination") as app:
    gr.Markdown("# Budgerigar Gender Determination")
    gr.Markdown(
        "Pretrained object detection models for determining budgerigar gender based on cere color variations. "
        "Upload image(s) or video(s) for inference. For more details, refer to the paper: "
        '<a href="https://ieeexplore.ieee.org/document/10773570" target="_blank">'
        '"Advanced Computer Vision Techniques for Reliable Gender Determination in Budgerigars (Melopsittacus Undulatus)"</a>'
        "<br><br>"
        # The trailing space keeps the two sentences from running together
        # once the adjacent literals are concatenated.
        "To help us improve, please report any incorrect gender determinations by sending the original image and details to -> <a href='mailto:[email protected]'>Email</a>. "
        "Your feedback is important for retraining and improving the model."
    )
    with gr.Row():
        with gr.Column():
            # Only one of the two inputs is visible at a time; see update_visibility.
            image = gr.Image(type="pil", label="Image Input", visible=True)
            video = gr.Video(label="Video Input", visible=False)
            input_type = gr.Radio(
                choices=["Image", "Video"],
                value="Image",
                label="Input Type",
            )
            model_id = gr.Radio(
                choices=[
                    'budgerigar_yolo11x.pt', 'budgerigar_yolov9e.pt',
                    'budgerigar_yolo11l.pt', 'budgerigar_yolo11m.pt',
                    'budgerigar_yolo11s.pt', 'budgerigar_yolo11n.pt',
                    'budgerigar_rtdetr-x.pt'
                ],
                label="Model Name",
                value="budgerigar_yolo11x.pt",
            )
            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
            infer_button = gr.Button("Detect Objects")
        with gr.Column():
            # Output visibility mirrors the input visibility above.
            output_image = gr.Image(type="pil", label="Annotated Image", visible=True)
            output_video = gr.Video(label="Annotated Video", visible=False)
            gr.DeepLinkButton()

    # Swap the visible input/output pair whenever the input type changes.
    input_type.change(
        fn=update_visibility,
        inputs=input_type,
        outputs=[image, video, output_image, output_video],
    )

    # The order of `inputs` must match the positional parameters of yolo_inference.
    infer_button.click(
        fn=yolo_inference,
        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image, output_video],
    )

    # Examples are image-only, so they go through the image-only wrapper.
    gr.Examples(
        examples=[
            ["both.jpg", "budgerigar_rtdetr-x.pt", 0.25, 0.45, 300],
            ["Male.png", "budgerigar_yolov9e.pt", 0.25, 0.45, 300],
            ["Female.png", "budgerigar_yolo11x.pt", 0.25, 0.45, 300],
        ],
        fn=yolo_inference_for_examples,
        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
        outputs=[output_image],
        label="Examples (Images)",
    )

app.launch(mcp_server=True)