aleehassan committed · verified
Commit 6e2faa9 · Parent(s): 919a251

Update app.py

Files changed (1): app.py (+50 -8)
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
+import numpy as np
 import scipy.io.wavfile as wavfile
 from transformers import pipeline
 
@@ -7,6 +8,45 @@ from transformers import pipeline
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
 
+# Function to apply Non-Maximum Suppression (NMS)
+def compute_iou(box1, boxes):
+    x1 = np.maximum(box1['xmin'], boxes[:, 0])
+    y1 = np.maximum(box1['ymin'], boxes[:, 1])
+    x2 = np.minimum(box1['xmax'], boxes[:, 2])
+    y2 = np.minimum(box1['ymax'], boxes[:, 3])
+
+    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
+    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+    union = box1_area + boxes_area - intersection
+    return intersection / union
+
+def nms(detections, iou_threshold=0.5):
+    if len(detections) == 0:
+        return []
+
+    boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
+    scores = np.array([d['score'] for d in detections])
+    indices = np.argsort(scores)[::-1]
+
+    keep = []
+    while len(indices) > 0:
+        current = indices[0]
+        keep.append(current)
+        rest = indices[1:]
+
+        ious = compute_iou({
+            'xmin': boxes[current, 0],
+            'ymin': boxes[current, 1],
+            'xmax': boxes[current, 2],
+            'ymax': boxes[current, 3]
+        }, boxes[rest])
+
+        indices = rest[np.where(ious < iou_threshold)[0]]
+
+    return [detections[i] for i in keep]
+
 # Function to generate audio from text
 def generate_audio(text):
     narrated_text = narrator(text)
@@ -46,7 +86,7 @@ def draw_bounding_boxes(image, detections):
 
         label = detection['label']
         score = detection['score']
-        text = f"{label} {score:.2f}"
+        text = f"{label}: {score:.2f}"
        text_size = draw.textbbox((xmin, ymin), text, font=font)
         draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
         draw.text((xmin, ymin), text, fill="white", font=font)
@@ -56,27 +96,30 @@ def draw_bounding_boxes(image, detections):
 # Main function to process the image
 def detect_object(image):
     detections = object_detector(image)
-    processed_image = draw_bounding_boxes(image, detections)
-    description_text = read_objects(detections)
+
+    # Apply confidence threshold and NMS
+    confidence_threshold = 0.5
+    filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
+    filtered_detections = nms(filtered_detections)
+
+    processed_image = draw_bounding_boxes(image, filtered_detections)
+    description_text = read_objects(filtered_detections)
     processed_audio = generate_audio(description_text)
     return processed_image, processed_audio
 
-
 description_text = """
 Upload an image to detect objects and hear a natural language description.
-
 ### Credits:
 Developed by Taizun S
 """
 
-# Your Google Analytics script
+# Google Analytics script
 ga_script = """
 <script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
 <script>
 window.dataLayer = window.dataLayer || [];
 function gtag(){dataLayer.push(arguments);}
 gtag('js', new Date());
-
 gtag('config', 'G-WEYXHDZ3GQ');
 </script>
 """
@@ -99,4 +142,3 @@ with gr.Blocks() as demo:
 
 # Launch the Blocks interface
 demo.launch()
-
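The new compute_iou/nms helpers operate directly on the detection format the transformers object-detection pipeline emits: a list of dicts, each with a confidence score, a label, and a box of xmin/ymin/xmax/ymax pixel corners. As a sanity check, here is a small standalone walk-through of the same IoU arithmetic on toy boxes (coordinates and scores are illustrative, not from this commit):

# Toy detections in the pipeline's output format: two heavily
# overlapping "cat" boxes plus one well-separated "dog" box.
detections = [
    {"score": 0.95, "label": "cat", "box": {"xmin": 10, "ymin": 10, "xmax": 110, "ymax": 110}},
    {"score": 0.80, "label": "cat", "box": {"xmin": 15, "ymin": 12, "xmax": 112, "ymax": 108}},
    {"score": 0.90, "label": "dog", "box": {"xmin": 200, "ymin": 50, "xmax": 300, "ymax": 150}},
]

# IoU of the two cat boxes, computed the same way compute_iou does:
# clip the intersection rectangle to non-negative width and height,
# then divide by the union of the two areas.
x1, y1 = max(10, 15), max(10, 12)      # intersection top-left
x2, y2 = min(110, 112), min(110, 108)  # intersection bottom-right
inter = max(0, x2 - x1) * max(0, y2 - y1)  # 95 * 96 = 9120
union = 100 * 100 + 97 * 96 - inter        # 10000 + 9312 - 9120 = 10192
print(inter / union)                       # ~0.89, above the 0.5 threshold

# nms(detections) would therefore keep the 0.95 "cat" and the "dog",
# and suppress the 0.80 duplicate.

Sorting by score first makes the suppression greedy: the highest-confidence box always survives, and each later box is dropped only if it overlaps an already-kept box above the threshold.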
 
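The generate_audio hunk is cut off by the diff context after narrated_text = narrator(text). For orientation only, here is a minimal sketch of how such a function is commonly finished using the scipy.io.wavfile import at the top of the file; the function name, output path, and squeeze step below are assumptions, not code from this commit:

import numpy as np
import scipy.io.wavfile as wavfile

def generate_audio_sketch(narrator, text, out_path="narration.wav"):
    # The transformers text-to-speech pipeline returns a dict holding a
    # float waveform under "audio" and its "sampling_rate".
    narrated = narrator(text)
    audio = np.squeeze(narrated["audio"]).astype(np.float32)  # assumed: flatten to 1-D
    wavfile.write(out_path, rate=narrated["sampling_rate"], data=audio)  # hypothetical output path
    return out_path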