burhan112 committed on
Commit
7983775
·
verified ·
1 Parent(s): ae6da0c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -0
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+
4
+ from asr import transcribe_audio
5
+ from qa import get_image_answer
6
+ from tts import text_to_speech
7
+
8
def multimodal_qa_app(image: "Image.Image | None", audio_path: "str | None"):
    """Answer a spoken question about an image and voice the answer.

    Pipeline: the recorded question is transcribed with ``transcribe_audio``,
    the transcript and image are passed to ``get_image_answer``, and the
    resulting answer is handed to ``text_to_speech``.

    Args:
        image: The uploaded picture, or ``None`` when nothing was uploaded.
        audio_path: Path to the recorded question, or ``None``/empty when
            nothing was recorded.

    Returns:
        A ``(answer, audio_response)`` pair matching the two output
        components. ``audio_response`` is whatever ``text_to_speech``
        returns (presumably a file path or audio data the Audio output can
        play -- confirm against tts.py). When an input is missing, the pair
        is ``(<hint message>, None)`` so the UI shows guidance instead of a
        stack trace.
    """
    # Gradio calls the function with None for any input the user left empty;
    # stop here rather than forwarding None into the ASR/QA helpers.
    if image is None:
        return "Please upload an image before asking a question.", None
    if not audio_path:
        return "Please record or upload a spoken question first.", None

    question_text = transcribe_audio(audio_path)
    answer = get_image_answer(image, question_text)
    audio_response = text_to_speech(answer)
    return answer, audio_response
13
+
14
def _build_interface():
    """Assemble the Gradio UI that fronts ``multimodal_qa_app``.

    Two inputs (a PIL image and an audio file path) feed the callback; its
    two return values are shown as a text box and an audio player.
    """
    image_input = gr.Image(type="pil", label="Upload an Image")
    question_input = gr.Audio(
        type="filepath",
        label="Ask a Question via Mic (10s max)",
    )

    answer_box = gr.Textbox(label="Answer")
    spoken_answer = gr.Audio(label="Spoken Answer")

    return gr.Interface(
        fn=multimodal_qa_app,
        inputs=[image_input, question_input],
        outputs=[answer_box, spoken_answer],
        title="Ask-the-Image: Multimodal QA",
        description="Upload an image and ask a question using your voice. The app answers and reads it out loud.",
    )


# Built at import time so the module keeps exposing `interface`.
interface = _build_interface()

if __name__ == "__main__":
    # Only start the server when this file is run as a script.
    interface.launch(debug=True)