aleehassan's picture
Update app.py
6e2faa9 verified
raw
history blame
4.9 kB
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline
# Load pipelines
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
# Function to apply Non-Maximum Suppression (NMS)
def compute_iou(box1, boxes):
x1 = np.maximum(box1['xmin'], boxes[:, 0])
y1 = np.maximum(box1['ymin'], boxes[:, 1])
x2 = np.minimum(box1['xmax'], boxes[:, 2])
y2 = np.minimum(box1['ymax'], boxes[:, 3])
intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
union = box1_area + boxes_area - intersection
return intersection / union
def nms(detections, iou_threshold=0.5):
if len(detections) == 0:
return []
boxes = np.array([[d['box']['xmin'], d['box']['ymin'], d['box']['xmax'], d['box']['ymax']] for d in detections])
scores = np.array([d['score'] for d in detections])
indices = np.argsort(scores)[::-1]
keep = []
while len(indices) > 0:
current = indices[0]
keep.append(current)
rest = indices[1:]
ious = compute_iou({
'xmin': boxes[current, 0],
'ymin': boxes[current, 1],
'xmax': boxes[current, 2],
'ymax': boxes[current, 3]
}, boxes[rest])
indices = rest[np.where(ious < iou_threshold)[0]]
return [detections[i] for i in keep]
# Function to generate audio from text
def generate_audio(text):
narrated_text = narrator(text)
wavfile.write("output.wav", rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
return "output.wav"
# Function to read and summarize detected objects
def read_objects(detection_objects):
object_counts = {}
for detection in detection_objects:
label = detection['label']
object_counts[label] = object_counts.get(label, 0) + 1
response = "This picture contains"
labels = list(object_counts.keys())
for i, label in enumerate(labels):
response += f" {object_counts[label]} {label}"
if object_counts[label] > 1:
response += "s"
if i < len(labels) - 2:
response += ","
elif i == len(labels) - 2:
response += " and"
response += "."
return response
# Function to draw bounding boxes on the image
def draw_bounding_boxes(image, detections):
draw_image = image.copy()
draw = ImageDraw.Draw(draw_image)
font = ImageFont.load_default()
for detection in detections:
box = detection['box']
xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
label = detection['label']
score = detection['score']
text = f"{label}: {score:.2f}"
text_size = draw.textbbox((xmin, ymin), text, font=font)
draw.rectangle([(text_size[0], text_size[1]), (text_size[2], text_size[3])], fill="red")
draw.text((xmin, ymin), text, fill="white", font=font)
return draw_image
# Main function to process the image
def detect_object(image):
detections = object_detector(image)
# Apply confidence threshold and NMS
confidence_threshold = 0.5
filtered_detections = [d for d in detections if d['score'] > confidence_threshold]
filtered_detections = nms(filtered_detections)
processed_image = draw_bounding_boxes(image, filtered_detections)
description_text = read_objects(filtered_detections)
processed_audio = generate_audio(description_text)
return processed_image, processed_audio
description_text = """
Upload an image to detect objects and hear a natural language description.
### Credits:
Developed by Taizun S
"""
# Google Analytics script
ga_script = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-WEYXHDZ3GQ"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-WEYXHDZ3GQ');
</script>
"""
# Use Gradio Blocks to organize the layout
with gr.Blocks() as demo:
gr.HTML(ga_script) # Injecting Google Analytics script
gr.Markdown(description_text) # Adding the description as Markdown
# Define the Interface components within Blocks
gr.Interface(
fn=detect_object,
inputs=gr.Image(label="Upload an Image", type="pil"),
outputs=[
gr.Image(label="Processed Image", type="pil"),
gr.Audio(label="Generated Audio")
],
title="Multi-Object Detection with Audio Narration",
)
# Launch the Blocks interface
demo.launch()