shukdevdatta123 committed on
Commit
133de89
·
verified ·
1 Parent(s): ee274c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -75
app.py CHANGED
@@ -210,13 +210,13 @@ custom_css = """
210
  }
211
  """
212
 
213
- # Gradio interface setup for multimodal chatbot
214
  def create_interface():
215
  with gr.Blocks(css=custom_css) as demo:
216
  gr.Markdown("""
217
  <div class="gradio-header">
218
- <h1>Multimodal Chatbot (Text + Image)</h1>
219
- <h3>Interact with a chatbot using text or image inputs</h3>
220
  </div>
221
  """)
222
 
@@ -224,9 +224,10 @@ def create_interface():
224
  with gr.Accordion("Click to expand for details", open=False):
225
  gr.Markdown("""
226
  ### Description:
227
- This is a multimodal chatbot that can handle both text and image inputs.
228
  - You can ask questions or provide text, and the assistant will respond.
229
- - You can also upload an image, and the assistant will process it and answer questions about the image.
 
230
  - Enter your OpenAI API key to start interacting with the model.
231
  - You can use the 'Clear History' button to remove the conversation history.
232
  - "o1" is for image chat and "o3-mini" is for text chat.
@@ -255,8 +256,13 @@ def create_interface():
255
  choices=["o1", "o3-mini"],
256
  value="o1" # Default to 'o1' for image-related tasks
257
  )
258
- submit_btn = gr.Button("Ask!", elem_id="submit-btn")
259
- clear_btn = gr.Button("Clear History", elem_id="clear-history")
 
 
 
 
 
260
 
261
  chat_history = gr.Chatbot()
262
 
@@ -266,73 +272,6 @@ def create_interface():
266
 
267
  return demo
268
 
269
- # Voice interaction (audio chat) setup for Gradio
270
- def voice_chat():
271
- # Float feature initialization
272
- float_init()
273
-
274
- # Prompt for API key
275
- api_key = get_api_key()
276
- if not api_key:
277
- gr.error("You must provide a valid OpenAI API Key to proceed.")
278
- return
279
-
280
- def initialize_session_state():
281
- if "messages" not in gr.session_state:
282
- gr.session_state.messages = [
283
- {"role": "assistant", "content": "Hi! How may I assist you today? (Please Speak Clearly)"}
284
- ]
285
-
286
- initialize_session_state()
287
-
288
- gr.title("OpenAI Conversational Chatbot (Voice Interaction) 🤖")
289
-
290
- # Footer container for the microphone
291
- footer_container = gr.container()
292
-
293
- with footer_container:
294
- audio_bytes = audio_recorder()
295
-
296
- for message in gr.session_state.messages:
297
- with gr.chat_message(message["role"]):
298
- gr.write(message["content"])
299
-
300
- if audio_bytes:
301
- # Write the audio bytes to a file
302
- with gr.spinner("Transcribing..."):
303
- webm_file_path = "temp_audio.mp3"
304
- with open(webm_file_path, "wb") as f:
305
- f.write(audio_bytes)
306
-
307
- transcript = speech_to_text(webm_file_path)
308
- if transcript:
309
- gr.session_state.messages.append({"role": "user", "content": transcript})
310
- with gr.chat_message("user"):
311
- gr.write(transcript)
312
- os.remove(webm_file_path)
313
-
314
- if gr.session_state.messages[-1]["role"] != "assistant":
315
- with gr.chat_message("assistant"):
316
- with gr.spinner("Thinking🤔..."):
317
- final_response = base_model_chatbot(gr.session_state.messages)
318
-
319
- # Final check for punctuation and completeness
320
- if not final_response.strip()[-1] in ".!?":
321
- final_response += " This is the end of the response. Let me know if you need anything else."
322
-
323
- with gr.spinner("Generating audio response..."):
324
- audio_file = text_to_speech(final_response)
325
- autoplay_audio(audio_file)
326
- gr.write(final_response)
327
- gr.session_state.messages.append({"role": "assistant", "content": final_response})
328
- os.remove(audio_file)
329
-
330
- # Float the footer container and provide CSS to target it with
331
- footer_container.float("bottom: 0rem;")
332
-
333
  if __name__ == "__main__":
334
  demo = create_interface() # Gradio multimodal chatbot
335
- demo.launch()
336
-
337
- # Gradio voice chat
338
- voice_chat()
 
210
  }
211
  """
212
 
213
+ # Gradio interface setup for multimodal chatbot with voice functionality
214
  def create_interface():
215
  with gr.Blocks(css=custom_css) as demo:
216
  gr.Markdown("""
217
  <div class="gradio-header">
218
+ <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
219
+ <h3>Interact with a chatbot using text, image, or voice inputs</h3>
220
  </div>
221
  """)
222
 
 
224
  with gr.Accordion("Click to expand for details", open=False):
225
  gr.Markdown("""
226
  ### Description:
227
+ This is a multimodal chatbot that can handle text, image, and voice inputs.
228
  - You can ask questions or provide text, and the assistant will respond.
229
+ - You can upload an image, and the assistant will process it and answer questions about the image.
230
+ - You can also speak to the assistant, and it will process your speech.
231
  - Enter your OpenAI API key to start interacting with the model.
232
  - You can use the 'Clear History' button to remove the conversation history.
233
  - "o1" is for image chat and "o3-mini" is for text chat.
 
256
  choices=["o1", "o3-mini"],
257
  value="o1" # Default to 'o1' for image-related tasks
258
  )
259
+
260
+ # Audio input (voice interaction)
261
+ with gr.Row():
262
+ voice_input = gr.Audio(label="Speak to the Assistant", type="filepath")
263
+
264
+ submit_btn = gr.Button("Ask!", elem_id="submit-btn")
265
+ clear_btn = gr.Button("Clear History", elem_id="clear-history")
266
 
267
  chat_history = gr.Chatbot()
268
 
 
272
 
273
  return demo
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  if __name__ == "__main__":
276
  demo = create_interface() # Gradio multimodal chatbot
277
+ demo.launch()