Spaces:

shukdevdatta123
/

Multi-modal-o1-Chatbot

Running

App Files Community

shukdevdatta123 committed on Mar 8

Commit

e555f36

verified ·

1 Parent(s): c014521

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -39

app.py CHANGED Viewed

@@ -1,13 +1,8 @@
 import gradio as gr
 import openai
 import base64
-import os
 import io
-from helpers import text_to_speech, autoplay_audio, speech_to_text, get_api_key
-from generate_answer import base_model_chatbot, with_pdf_chatbot
-from audio_recorder_streamlit import audio_recorder
-from streamlit_float import *
-from PIL import Image as stImage
 # Function to send the request to OpenAI API with an image or text input
 def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
@@ -16,28 +11,29 @@ def generate_response(input_text, image, openai_api_key, reasoning_effort="mediu
     openai.api_key = openai_api_key
-    # If the user uploaded an image, convert it to base64 and use it for API call
     if image:
         # Convert the image to base64 string
         image_info = get_base64_string_from_image(image)
         input_text = f"data:image/png;base64,{image_info}"
-    # Check for text input and pass to API
-    if not input_text:
-        return "Error: Please provide either text, image, or voice input."
-    # Prepare the messages for OpenAI API based on the selected model
-    if model_choice == "o1" and input_text:
-        messages = [{"role": "user", "content": input_text}]
-    elif model_choice == "o3-mini" and input_text:
-        messages = [{"role": "user", "content": input_text}]
     try:
         # Call OpenAI API with the selected model
         response = openai.ChatCompletion.create(
             model=model_choice,  # Dynamically choose the model (o1 or o3-mini)
             messages=messages,
-            max_tokens=2000  # Limit response tokens to 2000
         )
         return response["choices"][0]["message"]["content"]
@@ -54,11 +50,7 @@ def get_base64_string_from_image(pil_image):
     return base64_str
 # The function that will be used by Gradio interface
-def chatbot(input_text, image, voice_audio, openai_api_key, reasoning_effort, model_choice, history=[]):
-    # If voice_audio is provided, convert it to text
-    if voice_audio:
-        input_text = speech_to_text(voice_audio)  # Convert speech to text
     response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
     # Append the response to the history
@@ -140,6 +132,7 @@ custom_css = """
     #submit-btn:active {
         transform: scale(0.95);
     }
     #clear-history {
         background-color: #f04e4e; /* Slightly Darker red */
         color: white;
@@ -212,13 +205,13 @@ custom_css = """
     }
 """
-# Gradio interface setup for multimodal chatbot with voice functionality
 def create_interface():
     with gr.Blocks(css=custom_css) as demo:
         gr.Markdown("""
             <div class="gradio-header">
-                <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
-                <h3>Interact with a chatbot using text, image, or voice inputs</h3>
             </div>
         """)
@@ -226,10 +219,9 @@ def create_interface():
         with gr.Accordion("Click to expand for details", open=False):
             gr.Markdown("""
             ### Description:
-            This is a multimodal chatbot that can handle text, image, and voice inputs.
             - You can ask questions or provide text, and the assistant will respond.
-            - You can upload an image, and the assistant will process it and answer questions about the image.
-            - You can also speak to the assistant, and it will process your speech.
             - Enter your OpenAI API key to start interacting with the model.
             - You can use the 'Clear History' button to remove the conversation history.
             - "o1" is for image chat and "o3-mini" is for text chat.
@@ -258,22 +250,18 @@ def create_interface():
                 choices=["o1", "o3-mini"],
                 value="o1"  # Default to 'o1' for image-related tasks
             )
-        # Audio input (voice interaction)
-        with gr.Row():
-            voice_input = gr.Audio(label="Speak to the Assistant", type="filepath")
-        submit_btn = gr.Button("Ask!", elem_id="submit-btn")
-        clear_btn = gr.Button("Clear History", elem_id="clear-history")
         chat_history = gr.Chatbot()
         # Button interactions
-        submit_btn.click(fn=chatbot, inputs=[input_text, image_input, voice_input, openai_api_key, reasoning_effort, model_choice, chat_history], outputs=[input_text, chat_history])
         clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])
     return demo
 if __name__ == "__main__":
-    demo = create_interface()  # Gradio multimodal chatbot
-    demo.launch()

 import gradio as gr
 import openai
 import base64
+from PIL import Image
 import io
 # Function to send the request to OpenAI API with an image or text input
 def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
     openai.api_key = openai_api_key
+    # Process the input depending on whether it's text or an image
     if image:
         # Convert the image to base64 string
         image_info = get_base64_string_from_image(image)
         input_text = f"data:image/png;base64,{image_info}"
+    # Prepare the messages for OpenAI API
+    if model_choice == "o1":
+        messages = [
+            {"role": "user", "content": [{"type": "image_url", "image_url": {"url": input_text}}]}
+        ]
+    elif model_choice == "o3-mini":
+        messages = [
+            {"role": "user", "content": [{"type": "text", "text": input_text}]}
+        ]
     try:
         # Call OpenAI API with the selected model
         response = openai.ChatCompletion.create(
             model=model_choice,  # Dynamically choose the model (o1 or o3-mini)
             messages=messages,
+            reasoning_effort=reasoning_effort,  # Set reasoning_effort for the response
+            max_completion_tokens=2000  # Limit response tokens to 2000
         )
         return response["choices"][0]["message"]["content"]
     return base64_str
 # The function that will be used by Gradio interface
+def chatbot(input_text, image, openai_api_key, reasoning_effort, model_choice, history=[]):
     response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
     # Append the response to the history
     #submit-btn:active {
         transform: scale(0.95);
     }
+    /* Clear History Button: Light Red */
     #clear-history {
         background-color: #f04e4e; /* Slightly Darker red */
         color: white;
     }
 """
+# Gradio interface setup
 def create_interface():
     with gr.Blocks(css=custom_css) as demo:
         gr.Markdown("""
             <div class="gradio-header">
+                <h1>Multimodal Chatbot (Text + Image)</h1>
+                <h3>Interact with a chatbot using text or image inputs</h3>
             </div>
         """)
         with gr.Accordion("Click to expand for details", open=False):
             gr.Markdown("""
             ### Description:
+            This is a multimodal chatbot that can handle both text and image inputs.
             - You can ask questions or provide text, and the assistant will respond.
+            - You can also upload an image, and the assistant will process it and answer questions about the image.
             - Enter your OpenAI API key to start interacting with the model.
             - You can use the 'Clear History' button to remove the conversation history.
             - "o1" is for image chat and "o3-mini" is for text chat.
                 choices=["o1", "o3-mini"],
                 value="o1"  # Default to 'o1' for image-related tasks
             )
+            submit_btn = gr.Button("Ask!", elem_id="submit-btn")
+            clear_btn = gr.Button("Clear History", elem_id="clear-history")
         chat_history = gr.Chatbot()
         # Button interactions
+        submit_btn.click(fn=chatbot, inputs=[input_text, image_input, openai_api_key, reasoning_effort, model_choice, chat_history], outputs=[input_text, chat_history])
         clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])
     return demo
+# Run the interface
 if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()