File size: 3,589 Bytes
b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef 53a3094 b15b52e 5c09aef 53a3094 b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e 5c09aef b15b52e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
from transformers import pipeline
# Text-to-speech pipeline used to narrate the detection summary.
# NOTE(review): model weights are downloaded on first use — requires network access.
narrator = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")
# Object-detection pipeline (DETR with a ResNet-50 backbone).
object_detector = pipeline("object-detection",
                           model="facebook/detr-resnet-50")
def generate_audio(text):
    """Narrate *text* with the module-level TTS pipeline and save it to disk.

    :param text: Plain-text string to convert to speech.
    :return: Path of the written WAV file (always ``"output.wav"``).
    """
    speech = narrator(text)
    wavfile.write(
        "output.wav",
        rate=speech["sampling_rate"],
        data=speech["audio"][0],
    )
    return "output.wav"
def read_objects(detection_objects):
    """Build a natural-language summary of detected objects.

    :param detection_objects: List of detection dicts, each containing at
        least a ``'label'`` key (as produced by the object-detection pipeline).
    :return: Sentence such as ``"This picture contains 2 cats and 1 dog."``.
    """
    # Count occurrences of each label, preserving first-seen order so the
    # sentence is deterministic for a given detector output.
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        object_counts[label] = object_counts.get(label, 0) + 1

    # Robustness fix: the original emitted the malformed sentence
    # "This picture contains." for an empty detection list.
    if not object_counts:
        return "This picture contains no objects."

    labels = list(object_counts.keys())
    response = "This picture contains"
    for i, label in enumerate(labels):
        count = object_counts[label]
        response += f" {count} {label}"
        if count > 1:
            response += "s"  # naive pluralization, unchanged from original
        # Oxford-less list punctuation: commas between items, "and" before last.
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"
    response += "."
    return response
def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
    """
    Draws bounding boxes on the given image based on the detections.

    :param image: PIL.Image object
    :param detections: List of detection results, where each result is a dictionary containing
                       'score', 'label', and 'box' keys. 'box' itself is a dictionary with 'xmin',
                       'ymin', 'xmax', 'ymax'.
    :param font_path: Path to the TrueType font file to use for text.
    :param font_size: Size of the font to use for text. Only honored when
                      font_path is given; load_default() ignores it here.
    :return: PIL.Image object with bounding boxes drawn (input is not mutated).
    """
    # Work on a copy so the caller's image is left untouched.
    draw_image = image.copy()
    draw = ImageDraw.Draw(draw_image)
    if font_path:
        font = ImageFont.truetype(font_path, font_size)
    else:
        font = ImageFont.load_default()
    for detection in detections:
        box = detection['box']
        xmin, ymin = box['xmin'], box['ymin']
        xmax, ymax = box['xmax'], box['ymax']
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
        text = f"{detection['label']} {detection['score']:.2f}"
        # Fix: the original duplicated this call in an if/else on font_path,
        # even though `font` already holds the correct font for both branches
        # (textbbox without a font uses the same default font).
        text_size = draw.textbbox((xmin, ymin), text, font=font)
        # Filled background behind the label so white text stays readable.
        draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
        draw.text((xmin, ymin), text, fill="white", font=font)
    return draw_image
def detect_object(image):
    """Run object detection on *image* and narrate the result.

    :param image: PIL.Image object supplied by the Gradio input widget.
    :return: Tuple of (annotated PIL.Image, path to generated WAV file).
    """
    detections = object_detector(image)
    annotated_image = draw_bounding_boxes(image, detections)
    summary_text = read_objects(detections)
    audio_path = generate_audio(summary_text)
    return annotated_image, audio_path
# Gradio UI wiring: one image in -> annotated image + spoken description out.
demo = gr.Interface(fn=detect_object,
                    inputs=[gr.Image(label="Select Image",type="pil")],
                    outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
                    title="@GenAILearniverse Project 7: Object Detector with Audio",
                    description="THIS APPLICATION WILL BE USED TO HIGHLIGHT OBJECTS AND GIVES AUDIO DESCRIPTION FOR THE PROVIDED INPUT IMAGE.")
# Starts the local web server (blocking call).
demo.launch()
# print(output)
|