Update app.py

app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
+import numpy as np
 import scipy.io.wavfile as wavfile
 from transformers import pipeline
 
@@ -7,6 +8,45 @@ from transformers import pipeline
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
 
+# Function to apply Non-Maximum Suppression (NMS)
+def compute_iou(box1, boxes):
+    x1 = np.maximum(box1['xmin'], boxes[:, 0])
+    y1 = np.maximum(box1['ymin'], boxes[:, 1])
+    x2 = np.minimum(box1['xmax'], boxes[:, 2])
+    y2 = np.minimum(box1['ymax'], boxes[:, 3])
+
+    intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
+    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+    union = box1_area + boxes_area - intersection
+    return intersection / union
+
+def nms(detections, iou_threshold=0.5):
+    if len(detections) == 0:
+        return []
+
+    boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
+    scores = np.array([d['score'] for d in detections])
+    indices = np.argsort(scores)[::-1]
+
+    keep = []
+    while len(indices) > 0:
+        current = indices[0]
+        keep.append(current)
+        rest = indices[1:]
+
+        ious = compute_iou({
+            'xmin': boxes[current, 0],
+            'ymin': boxes[current, 1],
+            'xmax': boxes[current, 2],
+            'ymax': boxes[current, 3]
+        }, boxes[rest])
+
+        indices = rest[np.where(ious < iou_threshold)[0]]
+
+    return [detections[i] for i in keep]
+
 # Function to generate audio from text
 def generate_audio(text):
     narrated_text = narrator(text)
@@ -46,7 +86,7 @@ def draw_bounding_boxes(image, detections):
 
         label = detection['label']
         score = detection['score']
-        text = f"{label} {score:.2f}"
+        text = f"{label}: {score:.2f}"
         text_size = draw.textbbox((xmin, ymin), text, font=font)
         draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
         draw.text((xmin, ymin), text, fill="white", font=font)
@@ -56,27 +96,30 @@ def draw_bounding_boxes(image, detections):
 # Main function to process the image
 def detect_object(image):
     detections = object_detector(image)
-    processed_image = draw_bounding_boxes(image, detections)
-    description_text = read_objects(detections)
+
+    # Apply confidence threshold and NMS
+    confidence_threshold = 0.5
+    filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
+    filtered_detections = nms(filtered_detections)
+
+    processed_image = draw_bounding_boxes(image, filtered_detections)
+    description_text = read_objects(filtered_detections)
    processed_audio = generate_audio(description_text)
     return processed_image, processed_audio
 
-
 description_text = """
 Upload an image to detect objects and hear a natural language description.
-
 ### Credits:
 Developed by Taizun S
 """
 
-#
+# Google Analytics script
 ga_script = """
 <script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
 <script>
   window.dataLayer = window.dataLayer || [];
   function gtag(){dataLayer.push(arguments);}
   gtag('js', new Date());
-
   gtag('config', 'G-WEYXHDZ3GQ');
 </script>
 """
@@ -99,4 +142,3 @@ with gr.Blocks() as demo:
 
 # Launch the Blocks interface
 demo.launch()
-
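For reference, a minimal sketch (not part of the commit) of how the new confidence filter and `nms()` interact, using hand-made detections in the same `{'score', 'label', 'box'}` dict format the transformers object-detection pipeline returns. It assumes `compute_iou` and `nms` from `app.py` above (and its `numpy` import) are in scope:

```python
# Two overlapping "cat" boxes, one well-separated "dog" box, and one
# low-confidence detection that the threshold drops before NMS runs.
detections = [
    {'score': 0.95, 'label': 'cat', 'box': {'xmin': 10, 'ymin': 10, 'xmax': 110, 'ymax': 110}},
    {'score': 0.80, 'label': 'cat', 'box': {'xmin': 15, 'ymin': 12, 'xmax': 115, 'ymax': 108}},
    {'score': 0.90, 'label': 'dog', 'box': {'xmin': 200, 'ymin': 50, 'xmax': 300, 'ymax': 150}},
    {'score': 0.30, 'label': 'dog', 'box': {'xmin': 205, 'ymin': 55, 'xmax': 295, 'ymax': 145}},
]

# Same two steps detect_object() now performs.
confidence_threshold = 0.5
filtered = [d for d in detections if d['score'] > confidence_threshold]
kept = nms(filtered, iou_threshold=0.5)

print([(d['label'], round(d['score'], 2)) for d in kept])
# Expected: [('cat', 0.95), ('dog', 0.9)] -- the 0.80 cat overlaps the
# 0.95 cat (IoU about 0.87 > 0.5) and is suppressed; the 0.30 dog never
# reaches NMS because it falls below the confidence threshold.
```

Note that `nms()` as written is class-agnostic: a high-scoring box can suppress an overlapping box of a different label. For this demo's drawn boxes and narration that is usually acceptable, but it differs from per-class NMS, which only suppresses boxes sharing a label.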