Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
+import numpy as np
 import scipy.io.wavfile as wavfile
 from transformers import pipeline
 
@@ -7,6 +8,45 @@ from transformers import pipeline
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
 
+# Function to apply Non-Maximum Suppression (NMS)
+def compute_iou(box1, boxes):
+    x1 = np.maximum(box1['xmin'], boxes[:, 0])
+    y1 = np.maximum(box1['ymin'], boxes[:, 1])
+    x2 = np.minimum(box1['xmax'], boxes[:, 2])
+    y2 = np.minimum(box1['ymax'], boxes[:, 3])
+
+    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
+    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+    union = box1_area + boxes_area - intersection
+    return intersection / union
+
+def nms(detections, iou_threshold=0.5):
+    if len(detections) == 0:
+        return []
+
+    boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
+    scores = np.array([d['score'] for d in detections])
+    indices = np.argsort(scores)[::-1]
+
+    keep = []
+    while len(indices) > 0:
+        current = indices[0]
+        keep.append(current)
+        rest = indices[1:]
+
+        ious = compute_iou({
+            'xmin': boxes[current, 0],
+            'ymin': boxes[current, 1],
+            'xmax': boxes[current, 2],
+            'ymax': boxes[current, 3]
+        }, boxes[rest])
+
+        indices = rest[np.where(ious < iou_threshold)[0]]
+
+    return [detections[i] for i in keep]
+
 # Function to generate audio from text
 def generate_audio(text):
     narrated_text = narrator(text)
@@ -46,7 +86,7 @@ def draw_bounding_boxes(image, detections):
 
         label = detection['label']
         score = detection['score']
-        text = f"{label} {score:.2f}"
+        text = f"{label}: {score:.2f}"
         text_size = draw.textbbox((xmin, ymin), text, font=font)
         draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
         draw.text((xmin, ymin), text, fill="white", font=font)
@@ -56,27 +96,30 @@ def draw_bounding_boxes(image, detections):
 # Main function to process the image
 def detect_object(image):
     detections = object_detector(image)
-    processed_image = draw_bounding_boxes(image, detections)
-    description_text = read_objects(detections)
+
+    # Apply confidence threshold and NMS
+    confidence_threshold = 0.5
+    filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
+    filtered_detections = nms(filtered_detections)
+
+    processed_image = draw_bounding_boxes(image, filtered_detections)
+    description_text = read_objects(filtered_detections)
    processed_audio = generate_audio(description_text)
     return processed_image, processed_audio
 
-
 description_text = """
 Upload an image to detect objects and hear a natural language description.
-
 ### Credits:
 Developed by Taizun S
 """
 
-#
+# Google Analytics script
 ga_script = """
 <script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
 <script>
   window.dataLayer = window.dataLayer || [];
   function gtag(){dataLayer.push(arguments);}
   gtag('js', new Date());
-
   gtag('config', 'G-WEYXHDZ3GQ');
 </script>
 """
@@ -99,4 +142,3 @@ with gr.Blocks() as demo:
 
 # Launch the Blocks interface
 demo.launch()
-
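For reference, a minimal sketch (not part of the commit) of how the new confidence filter and `nms()` interact, using hand-made detections in the same `{'score', 'label', 'box'}` dict format the transformers object-detection pipeline returns. It assumes `compute_iou` and `nms` from `app.py` above (and its `numpy` import) are in scope:

```python
# Two overlapping "cat" boxes, one well-separated "dog" box, and one
# low-confidence detection that the threshold drops before NMS runs.
detections = [
    {'score': 0.95, 'label': 'cat', 'box': {'xmin': 10, 'ymin': 10, 'xmax': 110, 'ymax': 110}},
    {'score': 0.80, 'label': 'cat', 'box': {'xmin': 15, 'ymin': 12, 'xmax': 115, 'ymax': 108}},
    {'score': 0.90, 'label': 'dog', 'box': {'xmin': 200, 'ymin': 50, 'xmax': 300, 'ymax': 150}},
    {'score': 0.30, 'label': 'dog', 'box': {'xmin': 205, 'ymin': 55, 'xmax': 295, 'ymax': 145}},
]

# Same two steps detect_object() now performs.
confidence_threshold = 0.5
filtered = [d for d in detections if d['score'] > confidence_threshold]
kept = nms(filtered, iou_threshold=0.5)

print([(d['label'], round(d['score'], 2)) for d in kept])
# Expected: [('cat', 0.95), ('dog', 0.9)] -- the 0.80 cat overlaps the
# 0.95 cat (IoU about 0.87 > 0.5) and is suppressed; the 0.30 dog never
# reaches NMS because it falls below the confidence threshold.
```

Note that `nms()` as written is class-agnostic: a high-scoring box can suppress an overlapping box of a different label. For this demo's drawn boxes and narration that is usually acceptable, but it differs from per-class NMS, which only suppresses boxes sharing a label.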