Files changed (1) hide show
  1. app.py +17 -32
app.py CHANGED
@@ -3,14 +3,10 @@ from PIL import Image, ImageDraw, ImageFont
3
  import scipy.io.wavfile as wavfile
4
 
5
 
6
- # Use a pipeline as a high-level helper
7
  from transformers import pipeline
8
 
9
- # model_path = ("../Models/models--facebook--detr-resnet-50/snapshots"
10
- # "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b")
11
- #
12
- # tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
13
- # "/3bcb8321394f671bd948ebf0d086d694dda95464")
14
 
15
 
16
  narrator = pipeline("text-to-speech",
@@ -19,35 +15,25 @@ narrator = pipeline("text-to-speech",
19
# DETR (ResNet-50) object-detection pipeline, fetched from the Hugging Face hub.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
21
 
22
- # object_detector = pipeline("object-detection",
23
- # model=model_path)
24
- #
25
- # narrator = pipeline("text-to-speech",
26
- # model=tts_model_path)
27
 
28
- # [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
29
-
30
- # Define the function to generate audio from text
31
def generate_audio(text):
    """Narrate *text* with the module-level TTS pipeline and save it as a WAV.

    :param text: The string to synthesize into speech.
    :return: Path of the written audio file ("output.wav").
    """
    # Run the text-to-speech model; it yields the waveform and its sample rate.
    speech = narrator(text)

    # Persist the first (and only) channel of the generated waveform to disk.
    wavfile.write("output.wav", rate=speech["sampling_rate"],
                  data=speech["audio"][0])

    return "output.wav"
41
 
42
- # Could you please write me a python code that will take list of detection object as an input and it will give the response that will include all the objects (labels) provided in the input. For example if the input is like this: [{'score': 0.9996405839920044, 'label': 'person', 'box': {'xmin': 435, 'ymin': 282, 'xmax': 636, 'ymax': 927}}, {'score': 0.9995879530906677, 'label': 'dog', 'box': {'xmin': 570, 'ymin': 694, 'xmax': 833, 'ymax': 946}}]
43
- # The output should be: "This picture contains 1 person and 1 dog." If there are multiple objects, do not add 'and' between every object; 'and' should appear only before the last one.
44
 
45
 
46
  def read_objects(detection_objects):
47
- # Initialize counters for each object label
48
  object_counts = {}
49
 
50
- # Count the occurrences of each label
51
  for detection in detection_objects:
52
  label = detection['label']
53
  if label in object_counts:
@@ -55,7 +41,7 @@ def read_objects(detection_objects):
55
  else:
56
  object_counts[label] = 1
57
 
58
- # Generate the response string
59
  response = "This picture contains"
60
  labels = list(object_counts.keys())
61
  for i, label in enumerate(labels):
@@ -85,18 +71,17 @@ def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
85
  :param font_size: Size of the font to use for text.
86
  :return: PIL.Image object with bounding boxes drawn.
87
  """
88
- # Make a copy of the image to draw on
89
  draw_image = image.copy()
90
  draw = ImageDraw.Draw(draw_image)
91
 
92
- # Load custom font or default font if path not provided
93
  if font_path:
94
  font = ImageFont.truetype(font_path, font_size)
95
  else:
96
- # When font_path is not provided, load default font but it's size is fixed
97
  font = ImageFont.load_default()
98
- # Increase font size workaround by using a TTF font file, if needed, can download and specify the path
99
-
100
  for detection in detections:
101
  box = detection['box']
102
  xmin = box['xmin']
@@ -104,19 +89,19 @@ def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
104
  xmax = box['xmax']
105
  ymax = box['ymax']
106
 
107
- # Draw the bounding box
108
  draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
109
 
110
- # Optionally, you can also draw the label and score
111
  label = detection['label']
112
  score = detection['score']
113
  text = f"{label} {score:.2f}"
114
 
115
- # Draw text with background rectangle for visibility
116
- if font_path: # Use the custom font with increased size
117
  text_size = draw.textbbox((xmin, ymin), text, font=font)
118
  else:
119
- # Calculate text size using the default font
120
  text_size = draw.textbbox((xmin, ymin), text)
121
 
122
  draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
 
3
  import scipy.io.wavfile as wavfile
4
 
5
 
6
+
7
  from transformers import pipeline
8
 
9
+
 
 
 
 
10
 
11
 
12
  narrator = pipeline("text-to-speech",
 
15
# Object-detection pipeline backed by Facebook's DETR model with a ResNet-50 backbone.
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
17
 
 
 
 
 
 
18
 
 
 
 
19
def generate_audio(text):
    """Convert *text* to speech via the global `narrator` pipeline.

    The synthesized audio is written to "output.wav" in the working directory.

    :param text: The string to synthesize.
    :return: The path to the saved WAV file.
    """
    # Synthesize: the pipeline returns the audio array plus its sampling rate.
    tts_output = narrator(text)

    # Write the mono waveform (row 0 of the audio array) out as a WAV file.
    wavfile.write("output.wav", rate=tts_output["sampling_rate"],
                  data=tts_output["audio"][0])

    return "output.wav"
29
 
 
 
30
 
31
 
32
  def read_objects(detection_objects):
33
+
34
  object_counts = {}
35
 
36
+
37
  for detection in detection_objects:
38
  label = detection['label']
39
  if label in object_counts:
 
41
  else:
42
  object_counts[label] = 1
43
 
44
+
45
  response = "This picture contains"
46
  labels = list(object_counts.keys())
47
  for i, label in enumerate(labels):
 
71
  :param font_size: Size of the font to use for text.
72
  :return: PIL.Image object with bounding boxes drawn.
73
  """
74
+
75
  draw_image = image.copy()
76
  draw = ImageDraw.Draw(draw_image)
77
 
78
+
79
  if font_path:
80
  font = ImageFont.truetype(font_path, font_size)
81
  else:
82
+
83
  font = ImageFont.load_default()
84
+
 
85
  for detection in detections:
86
  box = detection['box']
87
  xmin = box['xmin']
 
89
  xmax = box['xmax']
90
  ymax = box['ymax']
91
 
92
+
93
  draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
94
 
95
+
96
  label = detection['label']
97
  score = detection['score']
98
  text = f"{label} {score:.2f}"
99
 
100
+
101
+ if font_path:
102
  text_size = draw.textbbox((xmin, ymin), text, font=font)
103
  else:
104
+
105
  text_size = draw.textbbox((xmin, ymin), text)
106
 
107
  draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")