Update app.py
app.py CHANGED
@@ -82,13 +82,15 @@ def chatbot(input_text, image, audio, openai_api_key, reasoning_effort, model_ch
     if audio and input_mode == "Voice":
         input_text = transcribe_audio(audio, openai_api_key)

-    if input_mode == "
-
-
-
-
+    if input_mode == "Image" and image:
+        # If Image Mode is selected and image is uploaded
+        input_text = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
+    elif input_mode == "Text" and input_text:
+        # If Text Mode is selected
+        input_text = generate_response(input_text, None, openai_api_key, reasoning_effort, model_choice)
+
     # Append the response to the history
-    history.append((f"User: {input_text}", f"Assistant: {
+    history.append((f"User: {input_text}", f"Assistant: {input_text}"))

     return "", history

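Two things stand out in this hunk: after transcription, `input_mode` is still `"Voice"`, so neither branch calls `generate_response` for voice input, and `input_text` is reassigned to hold the model's reply, so the tuple appended to `history` carries the reply on both the user and assistant sides. A minimal sketch of the same handler with those two points addressed, assuming the file's existing `transcribe_audio` and `generate_response` helpers:

```python
# Sketch only, not part of the commit. Keeps the user's question separate
# from the model's reply, and lets transcribed voice input reach the model.
def chatbot(input_text, image, audio, openai_api_key, reasoning_effort,
            model_choice, input_mode, history):
    history = history or []

    if audio and input_mode == "Voice":
        input_text = transcribe_audio(audio, openai_api_key)

    if input_mode == "Image" and image:
        response = generate_response(input_text, image, openai_api_key,
                                     reasoning_effort, model_choice)
    elif input_text:  # Text mode, or Voice mode after transcription
        response = generate_response(input_text, None, openai_api_key,
                                     reasoning_effort, model_choice)
    else:
        return "", history  # nothing to send

    # Question on the user side, reply on the assistant side
    history.append((f"User: {input_text}", f"Assistant: {response}"))
    return "", history
```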
@@ -241,45 +243,25 @@ custom_css = """
 # Gradio interface setup
 def create_interface():
     with gr.Blocks(css=custom_css) as demo:
-        gr.Markdown("""
-        <
-
-
-        </div>
-        """)
+        gr.Markdown("""<div class="gradio-header">
+            <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
+            <h3>Interact with a chatbot using text, image, or voice inputs</h3>
+        </div>""")

-        #
-
-
-
-
-
-        - You can also upload an image, and the assistant will process it and answer questions about the image.
-        - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
-        - Enter your OpenAI API key to start interacting with the model.
-        - You can use the 'Clear History' button to remove the conversation history.
-        - "o1" is for image chat and "o3-mini" is for text chat.
-        ### Reasoning Effort:
-        The reasoning effort controls how complex or detailed the assistant's answers should be.
-        - **Low**: Provides quick, concise answers with minimal reasoning or details.
-        - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
-        - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
-        """)
+        # Choose input type (Text, Image, Voice)
+        input_mode = gr.Radio(
+            label="Choose Input Mode",
+            choices=["Text", "Image", "Voice"],
+            value="Text"
+        )

         with gr.Row():
             openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)

-
-
-
-
-            value="Text"
-        )
-
-        with gr.Row():
-            image_input = gr.Image(label="Upload an Image", type="pil")  # Image upload input
-            input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
-            audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")  # Audio upload or record input (using filepath)
+        # Text, Image, and Audio Inputs will be displayed based on the chosen mode
+        input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
+        image_input = gr.Image(label="Upload an Image", type="pil")
+        audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")

         with gr.Row():
             reasoning_effort = gr.Dropdown(
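Both dropdowns feed `generate_response`, which is defined elsewhere in app.py and not shown in this diff. As orientation only, a sketch of what that call could look like, assuming the dropdown value maps directly onto the `reasoning_effort` parameter that the Chat Completions API accepts for the o1 and o3-mini reasoning models:

```python
# Sketch only: the real generate_response lives elsewhere in app.py.
from openai import OpenAI

def generate_response(prompt, image, api_key, reasoning_effort, model_choice):
    client = OpenAI(api_key=api_key)
    # Image input (the "o1" image-chat path) would be attached as an image
    # content part here; omitted to keep the sketch short.
    resp = client.chat.completions.create(
        model=model_choice,                 # "o1" or "o3-mini"
        reasoning_effort=reasoning_effort,  # "low" | "medium" | "high"
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content
```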
@@ -290,17 +272,28 @@ def create_interface():
             model_choice = gr.Dropdown(
                 label="Select Model",
                 choices=["o1", "o3-mini"],
-                value="o1"
+                value="o1"
             )
             submit_btn = gr.Button("Ask!", elem_id="submit-btn")
             clear_btn = gr.Button("Clear History", elem_id="clear-history")

         chat_history = gr.Chatbot()

+        # Dynamically control the input visibility based on the selected mode
+        def toggle_inputs(input_mode):
+            if input_mode == "Text":
+                return input_text, None, None
+            elif input_mode == "Image":
+                return None, image_input, None
+            else:  # Voice
+                return None, None, audio_input
+
         # Button interactions
         submit_btn.click(fn=chatbot, inputs=[input_text, image_input, audio_input, openai_api_key, reasoning_effort, model_choice, input_mode, chat_history], outputs=[input_text, chat_history])
         clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])

+        input_mode.change(toggle_inputs, inputs=[input_mode], outputs=[input_text, image_input, audio_input])
+
     return demo

 # Run the interface
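One caveat on the `toggle_inputs` wiring in this hunk: in Gradio, returning a component object or `None` from an event handler updates the component's value rather than its visibility, so the unused inputs are cleared but not hidden. The documented show/hide pattern returns `gr.update(visible=...)` for each output; a sketch of that variant, which would replace the inner function inside `create_interface`:

```python
# Sketch only: toggles visibility instead of clearing values.
def toggle_inputs(input_mode):
    return (
        gr.update(visible=(input_mode == "Text")),
        gr.update(visible=(input_mode == "Image")),
        gr.update(visible=(input_mode == "Voice")),
    )

# Same wiring as the commit:
input_mode.change(toggle_inputs, inputs=[input_mode],
                  outputs=[input_text, image_input, audio_input])
```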