shukdevdatta123 committed on
Commit
133de89
·
verified ·
1 Parent(s): ee274c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -75
app.py CHANGED
@@ -210,13 +210,13 @@ custom_css = """
210
  }
211
  """
212
 
213
- # Gradio interface setup for multimodal chatbot
214
  def create_interface():
215
  with gr.Blocks(css=custom_css) as demo:
216
  gr.Markdown("""
217
  <div class="gradio-header">
218
- <h1>Multimodal Chatbot (Text + Image)</h1>
219
- <h3>Interact with a chatbot using text or image inputs</h3>
220
  </div>
221
  """)
222
 
@@ -224,9 +224,10 @@ def create_interface():
224
  with gr.Accordion("Click to expand for details", open=False):
225
  gr.Markdown("""
226
  ### Description:
227
- This is a multimodal chatbot that can handle both text and image inputs.
228
  - You can ask questions or provide text, and the assistant will respond.
229
- - You can also upload an image, and the assistant will process it and answer questions about the image.
 
230
  - Enter your OpenAI API key to start interacting with the model.
231
  - You can use the 'Clear History' button to remove the conversation history.
232
  - "o1" is for image chat and "o3-mini" is for text chat.
@@ -255,8 +256,13 @@ def create_interface():
255
  choices=["o1", "o3-mini"],
256
  value="o1" # Default to 'o1' for image-related tasks
257
  )
258
- submit_btn = gr.Button("Ask!", elem_id="submit-btn")
259
- clear_btn = gr.Button("Clear History", elem_id="clear-history")
 
 
 
 
 
260
 
261
  chat_history = gr.Chatbot()
262
 
@@ -266,73 +272,6 @@ def create_interface():
266
 
267
  return demo
268
 
269
- # Voice interaction (audio chat) setup for Gradio
270
- def voice_chat():
271
- # Float feature initialization
272
- float_init()
273
-
274
- # Prompt for API key
275
- api_key = get_api_key()
276
- if not api_key:
277
- gr.error("You must provide a valid OpenAI API Key to proceed.")
278
- return
279
-
280
- def initialize_session_state():
281
- if "messages" not in gr.session_state:
282
- gr.session_state.messages = [
283
- {"role": "assistant", "content": "Hi! How may I assist you today? (Please Speak Clearly)"}
284
- ]
285
-
286
- initialize_session_state()
287
-
288
- gr.title("OpenAI Conversational Chatbot (Voice Interaction) 🤖")
289
-
290
- # Footer container for the microphone
291
- footer_container = gr.container()
292
-
293
- with footer_container:
294
- audio_bytes = audio_recorder()
295
-
296
- for message in gr.session_state.messages:
297
- with gr.chat_message(message["role"]):
298
- gr.write(message["content"])
299
-
300
- if audio_bytes:
301
- # Write the audio bytes to a file
302
- with gr.spinner("Transcribing..."):
303
- webm_file_path = "temp_audio.mp3"
304
- with open(webm_file_path, "wb") as f:
305
- f.write(audio_bytes)
306
-
307
- transcript = speech_to_text(webm_file_path)
308
- if transcript:
309
- gr.session_state.messages.append({"role": "user", "content": transcript})
310
- with gr.chat_message("user"):
311
- gr.write(transcript)
312
- os.remove(webm_file_path)
313
-
314
- if gr.session_state.messages[-1]["role"] != "assistant":
315
- with gr.chat_message("assistant"):
316
- with gr.spinner("Thinking🤔..."):
317
- final_response = base_model_chatbot(gr.session_state.messages)
318
-
319
- # Final check for punctuation and completeness
320
- if not final_response.strip()[-1] in ".!?":
321
- final_response += " This is the end of the response. Let me know if you need anything else."
322
-
323
- with gr.spinner("Generating audio response..."):
324
- audio_file = text_to_speech(final_response)
325
- autoplay_audio(audio_file)
326
- gr.write(final_response)
327
- gr.session_state.messages.append({"role": "assistant", "content": final_response})
328
- os.remove(audio_file)
329
-
330
- # Float the footer container and provide CSS to target it with
331
- footer_container.float("bottom: 0rem;")
332
-
333
  if __name__ == "__main__":
334
  demo = create_interface() # Gradio multimodal chatbot
335
- demo.launch()
336
-
337
- # Gradio voice chat
338
- voice_chat()
 
210
  }
211
  """
212
 
213
+ # Gradio interface setup for multimodal chatbot with voice functionality
214
  def create_interface():
215
  with gr.Blocks(css=custom_css) as demo:
216
  gr.Markdown("""
217
  <div class="gradio-header">
218
+ <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
219
+ <h3>Interact with a chatbot using text, image, or voice inputs</h3>
220
  </div>
221
  """)
222
 
 
224
  with gr.Accordion("Click to expand for details", open=False):
225
  gr.Markdown("""
226
  ### Description:
227
+ This is a multimodal chatbot that can handle text, image, and voice inputs.
228
  - You can ask questions or provide text, and the assistant will respond.
229
+ - You can upload an image, and the assistant will process it and answer questions about the image.
230
+ - You can also speak to the assistant, and it will process your speech.
231
  - Enter your OpenAI API key to start interacting with the model.
232
  - You can use the 'Clear History' button to remove the conversation history.
233
  - "o1" is for image chat and "o3-mini" is for text chat.
 
256
  choices=["o1", "o3-mini"],
257
  value="o1" # Default to 'o1' for image-related tasks
258
  )
259
+
260
+ # Audio input (voice interaction)
261
+ with gr.Row():
262
+ voice_input = gr.Audio(label="Speak to the Assistant", type="filepath")
263
+
264
+ submit_btn = gr.Button("Ask!", elem_id="submit-btn")
265
+ clear_btn = gr.Button("Clear History", elem_id="clear-history")
266
 
267
  chat_history = gr.Chatbot()
268
 
 
272
 
273
  return demo
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  if __name__ == "__main__":
276
  demo = create_interface() # Gradio multimodal chatbot
277
+ demo.launch()