shukdevdatta123 committed on
Commit
c4ff6ca
·
verified ·
1 Parent(s): e555f36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -9
app.py CHANGED
@@ -3,6 +3,8 @@ import openai
3
  import base64
4
  from PIL import Image
5
  import io
 
 
6
 
7
  # Function to send the request to OpenAI API with an image or text input
8
  def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
@@ -49,8 +51,25 @@ def get_base64_string_from_image(pil_image):
49
  base64_str = base64.b64encode(img_bytes).decode("utf-8")
50
  return base64_str
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # The function that will be used by Gradio interface
53
- def chatbot(input_text, image, openai_api_key, reasoning_effort, model_choice, history=[]):
 
 
 
 
54
  response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
55
 
56
  # Append the response to the history
@@ -98,7 +117,7 @@ custom_css = """
98
  animation: fadeIn 2s ease-out;
99
  }
100
  /* Input field styles */
101
- .gradio-textbox, .gradio-dropdown, .gradio-image {
102
  border-radius: 8px;
103
  border: 2px solid #ccc;
104
  padding: 10px;
@@ -107,7 +126,7 @@ custom_css = """
107
  font-size: 1rem;
108
  transition: all 0.3s ease;
109
  }
110
- .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus {
111
  border-color: #007bff;
112
  }
113
  /* Button styles */
@@ -132,7 +151,6 @@ custom_css = """
132
  #submit-btn:active {
133
  transform: scale(0.95);
134
  }
135
- /* Clear History Button: Light Red */
136
  #clear-history {
137
  background-color: #f04e4e; /* Slightly Darker red */
138
  color: white;
@@ -195,7 +213,7 @@ custom_css = """
195
  .gradio-chatbot {
196
  max-height: 400px;
197
  }
198
- .gradio-textbox, .gradio-dropdown, .gradio-image {
199
  width: 100%;
200
  }
201
  #submit-btn, #clear-history {
@@ -210,8 +228,8 @@ def create_interface():
210
  with gr.Blocks(css=custom_css) as demo:
211
  gr.Markdown("""
212
  <div class="gradio-header">
213
- <h1>Multimodal Chatbot (Text + Image)</h1>
214
- <h3>Interact with a chatbot using text or image inputs</h3>
215
  </div>
216
  """)
217
 
@@ -219,9 +237,10 @@ def create_interface():
219
  with gr.Accordion("Click to expand for details", open=False):
220
  gr.Markdown("""
221
  ### Description:
222
- This is a multimodal chatbot that can handle both text and image inputs.
223
  - You can ask questions or provide text, and the assistant will respond.
224
  - You can also upload an image, and the assistant will process it and answer questions about the image.
 
225
  - Enter your OpenAI API key to start interacting with the model.
226
  - You can use the 'Clear History' button to remove the conversation history.
227
  - "o1" is for image chat and "o3-mini" is for text chat.
@@ -238,6 +257,7 @@ def create_interface():
238
  with gr.Row():
239
  image_input = gr.Image(label="Upload an Image", type="pil") # Image upload input
240
  input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
 
241
 
242
  with gr.Row():
243
  reasoning_effort = gr.Dropdown(
@@ -256,7 +276,7 @@ def create_interface():
256
  chat_history = gr.Chatbot()
257
 
258
  # Button interactions
259
- submit_btn.click(fn=chatbot, inputs=[input_text, image_input, openai_api_key, reasoning_effort, model_choice, chat_history], outputs=[input_text, chat_history])
260
  clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])
261
 
262
  return demo
 
3
  import base64
4
  from PIL import Image
5
  import io
6
+ import openai
7
+ import os
8
 
9
  # Function to send the request to OpenAI API with an image or text input
10
  def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
 
51
  base64_str = base64.b64encode(img_bytes).decode("utf-8")
52
  return base64_str
53
 
54
# Function to transcribe audio to text using OpenAI Whisper API
def transcribe_audio(audio, openai_api_key):
    """Transcribe an audio file to text via OpenAI's Whisper API.

    Parameters:
        audio: Path to the uploaded/recorded audio file (as provided by the
            Gradio Audio component), or an already-open binary file object.
        openai_api_key: The caller's OpenAI API key.

    Returns:
        The transcribed text on success, or an "Error: ..." string on
        failure — callers treat the return value as plain text either way.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key
    try:
        # NOTE(review): the pre-1.0 openai SDK exposes Whisper through
        # Audio.transcribe (there is no Audio.create), and it needs an open
        # binary file object rather than a bare path string.
        if isinstance(audio, str):
            with open(audio, "rb") as audio_file:
                result = openai.Audio.transcribe(model="whisper-1", file=audio_file)
        else:
            result = openai.Audio.transcribe(model="whisper-1", file=audio)
        return result["text"]
    except Exception as e:
        # Surface the failure as chat text so the UI flow doesn't crash.
        return f"Error transcribing audio: {str(e)}"
66
+
67
  # The function that will be used by Gradio interface
68
+ def chatbot(input_text, image, audio, openai_api_key, reasoning_effort, model_choice, history=[]):
69
+ # If there's audio, transcribe it to text
70
+ if audio:
71
+ input_text = transcribe_audio(audio, openai_api_key)
72
+
73
  response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
74
 
75
  # Append the response to the history
 
117
  animation: fadeIn 2s ease-out;
118
  }
119
  /* Input field styles */
120
+ .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
121
  border-radius: 8px;
122
  border: 2px solid #ccc;
123
  padding: 10px;
 
126
  font-size: 1rem;
127
  transition: all 0.3s ease;
128
  }
129
+ .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus {
130
  border-color: #007bff;
131
  }
132
  /* Button styles */
 
151
  #submit-btn:active {
152
  transform: scale(0.95);
153
  }
 
154
  #clear-history {
155
  background-color: #f04e4e; /* Slightly Darker red */
156
  color: white;
 
213
  .gradio-chatbot {
214
  max-height: 400px;
215
  }
216
+ .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
217
  width: 100%;
218
  }
219
  #submit-btn, #clear-history {
 
228
  with gr.Blocks(css=custom_css) as demo:
229
  gr.Markdown("""
230
  <div class="gradio-header">
231
+ <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
232
+ <h3>Interact with a chatbot using text, image, or voice inputs</h3>
233
  </div>
234
  """)
235
 
 
237
  with gr.Accordion("Click to expand for details", open=False):
238
  gr.Markdown("""
239
  ### Description:
240
+ This is a multimodal chatbot that can handle text, image, and voice inputs.
241
  - You can ask questions or provide text, and the assistant will respond.
242
  - You can also upload an image, and the assistant will process it and answer questions about the image.
243
+ - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
244
  - Enter your OpenAI API key to start interacting with the model.
245
  - You can use the 'Clear History' button to remove the conversation history.
246
  - "o1" is for image chat and "o3-mini" is for text chat.
 
257
  with gr.Row():
258
  image_input = gr.Image(label="Upload an Image", type="pil") # Image upload input
259
  input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
260
+ audio_input = gr.Audio(label="Upload or Record Audio", type="file") # Audio upload or record input
261
 
262
  with gr.Row():
263
  reasoning_effort = gr.Dropdown(
 
276
  chat_history = gr.Chatbot()
277
 
278
  # Button interactions
279
+ submit_btn.click(fn=chatbot, inputs=[input_text, image_input, audio_input, openai_api_key, reasoning_effort, model_choice, chat_history], outputs=[input_text, chat_history])
280
  clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])
281
 
282
  return demo