"""Gradio app: detect objects in an image, draw bounding boxes, and narrate a spoken summary."""

import gradio as gr
import scipy.io.wavfile as wavfile
from PIL import ImageDraw, ImageFont
from transformers import pipeline

# Text-to-speech pipeline used to narrate the detection summary.
narrator = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")

# Object-detection pipeline (DETR with a ResNet-50 backbone).
object_detector = pipeline("object-detection",
                           model="facebook/detr-resnet-50")


def generate_audio(text):
    # Run the TTS model; the pipeline returns a dict with "audio"
    # (a NumPy array of shape (1, num_samples)) and "sampling_rate".
    narrated_text = narrator(text)

    # Write the waveform to disk so Gradio can serve it back to the browser.
    wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
                  data=narrated_text["audio"][0])

    return "output.wav"



def read_objects(detection_objects):
    # Count how many times each label appears in the detections.
    object_counts = {}
    for detection in detection_objects:
        label = detection['label']
        if label in object_counts:
            object_counts[label] += 1
        else:
            object_counts[label] = 1

    # Build a natural-language summary, pluralizing counts greater than one
    # and joining items with commas and a final "and".
    response = "This picture contains"
    labels = list(object_counts.keys())
    for i, label in enumerate(labels):
        response += f" {object_counts[label]} {label}"
        if object_counts[label] > 1:
            response += "s"
        if i < len(labels) - 2:
            response += ","
        elif i == len(labels) - 2:
            response += " and"

    response += "."

    return response
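
# Illustrative example (hypothetical input; real pipeline output also carries
# 'score' and 'box' keys, which read_objects ignores):
#   read_objects([{'label': 'cat'}, {'label': 'dog'}, {'label': 'dog'}])
#   returns "This picture contains 1 cat and 2 dogs."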



def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
    """
    Draws bounding boxes on the given image based on the detections.

    :param image: PIL.Image object
    :param detections: List of detection results, where each result is a dictionary
                       containing 'score', 'label', and 'box' keys. 'box' itself is a
                       dictionary with 'xmin', 'ymin', 'xmax', 'ymax'.
    :param font_path: Path to a TrueType font file to use for the labels.
    :param font_size: Font size to use for the labels.
    :return: PIL.Image object with bounding boxes drawn.
    """
    # Work on a copy so the caller's image is left untouched.
    draw_image = image.copy()
    draw = ImageDraw.Draw(draw_image)

    # Use the given TrueType font if provided, else PIL's built-in default.
    if font_path:
        font = ImageFont.truetype(font_path, font_size)
    else:
        font = ImageFont.load_default()

    for detection in detections:
        box = detection['box']
        xmin = box['xmin']
        ymin = box['ymin']
        xmax = box['xmax']
        ymax = box['ymax']

        # Outline the detected object.
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)

        # Label the box with the class name and confidence score.
        label = detection['label']
        score = detection['score']
        text = f"{label} {score:.2f}"

        # Draw a filled background behind the label text for readability.
        text_bbox = draw.textbbox((xmin, ymin), text, font=font)
        draw.rectangle(text_bbox, fill="red")
        draw.text((xmin, ymin), text, fill="white", font=font)

    return draw_image
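
# Illustrative call (hypothetical values; `img` is any PIL.Image):
#   boxed = draw_bounding_boxes(img, [{'label': 'cat', 'score': 0.98,
#                                      'box': {'xmin': 10, 'ymin': 20,
#                                              'xmax': 200, 'ymax': 180}}])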


def detect_object(image):
    # Full pipeline: detect objects, annotate the image, and narrate a summary.
    output = object_detector(image)
    processed_image = draw_bounding_boxes(image, output)
    natural_text = read_objects(output)
    processed_audio = generate_audio(natural_text)
    return processed_image, processed_audio


demo = gr.Interface(fn=detect_object,
                    inputs=[gr.Image(label="Select Image", type="pil")],
                    outputs=[gr.Image(label="Processed Image", type="pil"),
                             gr.Audio(label="Generated Audio")],
                    title="@GenAILearniverse Project 7: Object Detector with Audio",
                    description="This application highlights objects in the provided input image and gives an audio description of them.")
demo.launch()
