File size: 960 Bytes
7983775 8df60f2 7983775 8df60f2 7983775 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
import gradio as gr
from PIL import Image
from asr import transcribe_audio
from qa import get_image_answer
from tts import text_to_speech
def multimodal_qa_app(image: Image.Image, audio_path: str):
    """Answer a spoken question about an image and voice the answer.

    The recording at ``audio_path`` is handed to ``transcribe_audio``, the
    resulting text is passed together with ``image`` to
    ``get_image_answer``, and the answer is handed to ``text_to_speech``.

    Args:
        image: Image supplied by the UI; ``None`` when nothing was uploaded.
        audio_path: Path of the recorded question; ``None`` (or empty) when
            nothing was recorded.

    Returns:
        A 3-tuple ``(question_text, answer, audio_response)`` holding the
        results of the three helper calls, in that order.

    Raises:
        gr.Error: If the image or the recording is missing, so the UI shows
            an actionable message instead of a failure inside a helper.
    """
    # Gradio passes None for inputs the user left empty; stop early with a
    # readable message rather than forwarding None to the helpers.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")
    if not audio_path:
        raise gr.Error("Please record a question before submitting.")

    question_text = transcribe_audio(audio_path)
    answer = get_image_answer(image, question_text)
    audio_response = text_to_speech(answer)
    return question_text, answer, audio_response
# Gradio front end: one image input and one audio input feed
# multimodal_qa_app, whose three return values fill the three outputs below.
interface = gr.Interface(
    fn=multimodal_qa_app,
    title="Ask-the-Image: Multimodal QA",
    description="Upload an image and ask a question using your voice. The app answers and reads it out loud.",
    # Passed positionally to multimodal_qa_app as (image, audio_path).
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        # NOTE(review): "10s max" appears only in the label; no length limit
        # is configured on this component.
        gr.Audio(type="filepath", label="Ask a Question via Mic (10s max)"),
    ],
    # Filled in order from the returned (question_text, answer, audio_response).
    outputs=[
        gr.Textbox(label="Transcribed Question"),
        gr.Textbox(label="Answer"),
        gr.Audio(label="Spoken Answer"),
    ],
)
if __name__ == "__main__":
    # Start the web UI only when this file is run as a script, not on import.
    interface.launch(debug=True)
|