File size: 960 Bytes
7983775
 
 
 
 
 
 
 
 
 
 
8df60f2
7983775
 
 
 
 
 
 
 
8df60f2
7983775
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import gradio as gr
from PIL import Image

from asr import transcribe_audio
from qa import get_image_answer
from tts import text_to_speech

def multimodal_qa_app(image: Image.Image, audio_path: str):
    """Answer a spoken question about an image and speak the answer back.

    Runs three stages in sequence: the recording at ``audio_path`` is
    transcribed with ``transcribe_audio``, the transcription and ``image``
    are passed to ``get_image_answer``, and the answer is passed to
    ``text_to_speech``.

    Args:
        image: The image the question refers to. Gradio passes ``None``
            when the user submits without providing one.
        audio_path: Filesystem path of the recorded question. Gradio passes
            ``None`` when nothing was recorded.

    Returns:
        A 3-tuple ``(question_text, answer, audio_response)``: the
        transcription, the answer text, and whatever ``text_to_speech``
        returned for that answer.

    Raises:
        gr.Error: If the image or the recording is missing, so the UI shows
            a readable message instead of a failure inside a helper.
    """
    # Validate up front: an empty component arrives as None, and the
    # downstream helpers are not known to accept that.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")
    if not audio_path:
        raise gr.Error("Please record a question before submitting.")

    question_text = transcribe_audio(audio_path)
    answer = get_image_answer(image, question_text)
    audio_response = text_to_speech(answer)
    return question_text, answer, audio_response

# Gradio UI wiring: two input components feed multimodal_qa_app, and its
# three return values are shown by the three output components, in order.
interface = gr.Interface(
    title="Ask-the-Image: Multimodal QA",
    description="Upload an image and ask a question using your voice. The app answers and reads it out loud.",
    fn=multimodal_qa_app,
    # Inputs are passed positionally to the handler: (image, audio_path).
    inputs=[
        gr.Image(label="Upload an Image", type="pil"),
        # NOTE(review): the label advertises a 10s cap, but nothing in this
        # definition configures a recording limit; confirm where it is enforced.
        gr.Audio(label="Ask a Question via Mic (10s max)", type="filepath"),
    ],
    # Outputs receive the handler's tuple: (question_text, answer, audio_response).
    outputs=[
        gr.Textbox(label="Transcribed Question"),
        gr.Textbox(label="Answer"),
        gr.Audio(label="Spoken Answer"),
    ],
)

# Launch only when run as a script, so importing this module does not
# start the app.
if __name__ == "__main__":
    interface.launch(debug=True)