Update app.py
app.py CHANGED
@@ -3,14 +3,10 @@ from PIL import Image, ImageDraw, ImageFont
 import scipy.io.wavfile as wavfile
 
 
-
+
 from transformers import pipeline
 
-
-# "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
-#
-# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
-#                   "/3bcb8321394f671bd948ebf0d086d694dda95464")
+
 
 
 narrator = pipeline("text-to-speech",
@@ -19,35 +15,25 @@ narrator = pipeline("text-to-speech",
 object_detector = pipeline("object-detection",
                            model="facebook/detr-resnet-50")
 
-# object_detector = pipeline("object-detection",
-#                            model=model_path)
-#
-# narrator = pipeline("text-to-speech",
-#                     model=tts_model_path)
 
-# [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
-
-# Define the function to generate audio from text
 def generate_audio(text):
-
+
     narrated_text = narrator(text)
 
-
+
     wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
                   data=narrated_text["audio"][0])
 
-
+
     return "output.wav"
 
-# Could you please write me a python code that will take list of detection object as an input and it will give the response that will include all the objects (labels) provided in the input. For example if the input is like this: [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
-# The output should be, This pictuture contains 1 person and 1 dog. If there are multiple objects, do not add 'and' between every objects but 'and' should be at the end only
 
 
 def read_objects(detection_objects):
-
+
     object_counts = {}
 
-
+
     for detection in detection_objects:
         label = detection['label']
         if label in object_counts:
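Note on generate_audio above: the "text-to-speech" pipeline returns a dict with an "audio" array and a "sampling_rate" int, and the [0] index drops the batch dimension before writing the WAV. A minimal self-contained sketch of the same flow; the model id kakao-enterprise/vits-ljs is an assumption inferred from the cache path removed in the first hunk:

import scipy.io.wavfile as wavfile
from transformers import pipeline

narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")  # assumed model id

def generate_audio(text):
    narrated = narrator(text)
    # narrated is {"audio": ndarray of shape (1, num_samples), "sampling_rate": int};
    # indexing [0] drops the batch dimension so scipy writes a mono waveform.
    wavfile.write("output.wav", rate=narrated["sampling_rate"], data=narrated["audio"][0])
    return "output.wav"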
@@ -55,7 +41,7 @@ def read_objects(detection_objects):
         else:
             object_counts[label] = 1
 
-
+
     response = "This picture contains"
     labels = list(object_counts.keys())
     for i, label in enumerate(labels):
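The intended behavior of read_objects is specified by the long comment removed in the second hunk, but the hunks cut off before the sentence is assembled. Here is a self-contained sketch that satisfies that spec: the counting mirrors the diff, while the joining logic and the empty-input fallback are assumptions:

def read_objects(detection_objects):
    # Count how many times each label was detected.
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1

    # Join with commas and put a single 'and' before the last item only,
    # as the removed spec comment requires.
    parts = [f"{count} {label}" for label, count in object_counts.items()]
    if not parts:
        return "This picture contains no objects."  # assumed fallback; the spec is silent here
    if len(parts) == 1:
        return f"This picture contains {parts[0]}."
    return "This picture contains " + ", ".join(parts[:-1]) + " and " + parts[-1] + "."

Run against the sample detections from the removed comment:

detections = [
    {'score': 0.9996405839920044, 'label': 'person',
     'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}},
    {'score': 0.9995879530906677, 'label': 'dog',
     'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}},
]
print(read_objects(detections))  # This picture contains 1 person and 1 dog.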
@@ -85,18 +71,17 @@ def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
     :param font_size: Size of the font to use for text.
     :return: PIL.Image object with bounding boxes drawn.
     """
-
+
     draw_image = image.copy()
     draw = ImageDraw.Draw(draw_image)
 
-
+
     if font_path:
         font = ImageFont.truetype(font_path, font_size)
     else:
-
+
         font = ImageFont.load_default()
-
-
+
     for detection in detections:
         box = detection['box']
         xmin = box['xmin']
@@ -104,19 +89,19 @@ def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
         xmax = box['xmax']
         ymax = box['ymax']
 
-
+
         draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
 
-
+
         label = detection['label']
         score = detection['score']
         text = f"{label} {score:.2f}"
 
-
-        if font_path:
+
+        if font_path:
             text_size = draw.textbbox((xmin, ymin), text, font=font)
         else:
-
+
             text_size = draw.textbbox((xmin, ymin), text)
 
         draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
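For orientation, a sketch of how the pieces in this file presumably fit together; the diff shows the functions but not their caller, so the detect_and_narrate name and the glue below are assumptions (a Gradio interface would typically wrap something like this):

from PIL import Image

def detect_and_narrate(image_path):
    # Assumed glue: detect objects, draw them, describe them, narrate the description.
    image = Image.open(image_path)
    detections = object_detector(image)        # list of {'score', 'label', 'box'} dicts
    annotated = draw_bounding_boxes(image, detections)
    description = read_objects(detections)
    audio_path = generate_audio(description)   # writes and returns "output.wav"
    return annotated, audio_path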