Spaces:

shukdevdatta123
/

Multi-modal-o1-Chatbot

Running

File size: 9,629 Bytes

import gradio as gr
import openai
import base64
import os
import io
from helpers import text_to_speech, autoplay_audio, speech_to_text, get_api_key
from generate_answer import base_model_chatbot, with_pdf_chatbot
from audio_recorder_streamlit import audio_recorder
from streamlit_float import *
from PIL import Image as stImage

# Function to send the request to OpenAI API with an image or text input
def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
    """Send one user turn (text and/or image) to the selected OpenAI model.

    Args:
        input_text: The user's question. May be empty or None when an image
            is supplied instead.
        image: An image object accepted by ``get_base64_string_from_image``
            (the UI passes a PIL image), or None when no image was uploaded.
        openai_api_key: API key assigned to ``openai.api_key`` for the call.
        reasoning_effort: Forwarded to the API as ``reasoning_effort``;
            omitted from the request when falsy.
        model_choice: Model name; must be "o1" or "o3-mini".

    Returns:
        The assistant's reply text on success, otherwise a human-readable
        string starting with "Error". This function reports failures as
        strings rather than raising so the UI can display them directly.
    """
    supported_models = ("o1", "o3-mini")

    # Validate everything up front so bad input never reaches the API client.
    if not openai_api_key:
        return "Error: No API key provided."

    if model_choice not in supported_models:
        choices = ", ".join(supported_models)
        return f"Error: Unsupported model '{model_choice}'. Choose one of: {choices}."

    has_text = bool(input_text and input_text.strip())
    has_image = image is not None
    if not has_text and not has_image:
        return "Error: Please provide either text, image, or voice input."

    if has_image:
        try:
            image_info = get_base64_string_from_image(image)
        except (OSError, ValueError) as e:
            # e.g. an image mode that cannot be written as PNG.
            return f"Error processing image: {str(e)}"

        # Send the image as a real image part (not as text) and keep the
        # user's question alongside it instead of overwriting it.
        content = []
        if has_text:
            content.append({"type": "text", "text": input_text})
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_info}"},
        })
    else:
        content = input_text

    request = {
        "model": model_choice,
        "messages": [{"role": "user", "content": content}],
        # o-series models reject `max_tokens`; per the API reference this cap
        # covers reasoning tokens as well as the visible answer.
        "max_completion_tokens": 2000,
    }
    if reasoning_effort:
        request["reasoning_effort"] = reasoning_effort

    openai.api_key = openai_api_key

    try:
        response = openai.ChatCompletion.create(**request)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # UI boundary: surface any client/API failure as a message.
        return f"Error calling OpenAI API: {str(e)}"

# Function to convert an uploaded image to a base64 string
def get_base64_string_from_image(pil_image):
    """Return the image serialized as PNG and encoded as a base64 text string.

    Args:
        pil_image: Object exposing ``save(file_obj, format=...)``; the UI
            supplies a PIL image.

    Returns:
        The base64 encoding of the PNG bytes, decoded as UTF-8 text.
    """
    with io.BytesIO() as png_buffer:
        # Serialize into memory; nothing is written to disk.
        pil_image.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode("utf-8")

# The function that will be used by Gradio interface
def chatbot(input_text, image, voice_audio, openai_api_key, reasoning_effort, model_choice, history=None):
    """Handle one submit: get a model reply and append the turn to the history.

    Args:
        input_text: Text typed by the user; replaced by the transcription
            when ``voice_audio`` is provided.
        image: Optional uploaded image, forwarded to ``generate_response``.
        voice_audio: Optional recording passed to ``speech_to_text``.
        openai_api_key: API key forwarded to ``generate_response``.
        reasoning_effort: Reasoning effort forwarded to ``generate_response``.
        model_choice: Model name forwarded to ``generate_response``.
        history: List of ``(user_message, assistant_message)`` tuples so far;
            a new list is created when omitted.

    Returns:
        A tuple ``("", history)``: an empty string followed by the history
        list with the new turn appended.
    """
    # Use a None sentinel: a literal [] default is created once and would be
    # shared (and keep growing) across every call that omits `history`.
    if history is None:
        history = []

    # If voice_audio is provided, its transcription takes over as the question
    if voice_audio:
        input_text = speech_to_text(voice_audio)

    response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)

    # Append the new turn to the history
    history.append((f"User: {input_text}", f"Assistant: {response}"))

    return "", history

# Function to clear the chat history
def clear_history():
    """Return the reset values: an empty string and a new empty list."""
    blank_text = ""
    empty_history = []
    return blank_text, empty_history

# Stylesheet handed to gr.Blocks(css=...) in create_interface.
# - #submit-btn and #clear-history match the elem_id values given to the two buttons.
# - .gradio-header styles the header <div> emitted through gr.Markdown.
# NOTE(review): the remaining .gradio-* class selectors are not assigned anywhere
# in this file — confirm they match classes Gradio actually renders.
custom_css = """
    /* General body styles */
    .gradio-container {
        font-family: 'Arial', sans-serif;
        background-color: #f8f9fa;
        color: #333;
    }
    /* Header styles */
    .gradio-header {
        background-color: #007bff;
        color: white;
        padding: 20px;
        text-align: center;
        border-radius: 8px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        animation: fadeIn 1s ease-out;
    }
    .gradio-header h1 {
        font-size: 2.5rem;
    }
    .gradio-header h3 {
        font-size: 1.2rem;
        margin-top: 10px;
    }
    /* Chatbot container styles */
    .gradio-chatbot {
        background-color: #fff;
        border-radius: 10px;
        padding: 20px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        max-height: 500px;
        overflow-y: auto;
        animation: fadeIn 2s ease-out;
    }
    /* Input field styles */
    .gradio-textbox, .gradio-dropdown, .gradio-image {
        border-radius: 8px;
        border: 2px solid #ccc;
        padding: 10px;
        margin-bottom: 10px;
        width: 100%;
        font-size: 1rem;
        transition: all 0.3s ease;
    }
    .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus {
        border-color: #007bff;
    }
    /* Button styles */
    /* Send Button: Sky Blue */
    #submit-btn {
        background-color: #00aaff; /* Sky blue */
        color: white;
        border: none;
        border-radius: 8px;
        padding: 10px 19px;
        font-size: 1.1rem;
        cursor: pointer;
        transition: all 0.3s ease;
        margin-left: auto;
        margin-right: auto;
        display: block;
        margin-top: 10px;
    }
    #submit-btn:hover {
        background-color: #0099cc; /* Slightly darker blue */
    }
    #submit-btn:active {
        transform: scale(0.95);
    }
    #clear-history {
        background-color: #f04e4e; /* Slightly Darker red */
        color: white;
        border: none;
        border-radius: 8px;
        padding: 10px 13px;
        font-size: 1.1rem;
        cursor: pointer;
        transition: all 0.3s ease;
        margin-top: 10px;
    }
    #clear-history:hover {
        background-color: #f5a4a4; /* Light red */
    }
    #clear-history:active {
        transform: scale(0.95);
    }
    /* Chat history styles */
    .gradio-chatbot .message {
        margin-bottom: 10px;
    }
    .gradio-chatbot .user {
        background-color: #007bff;
        color: white;
        padding: 10px;
        border-radius: 12px;
        max-width: 70%;
        animation: slideInUser 0.5s ease-out;
    }
    .gradio-chatbot .assistant {
        background-color: #f1f1f1;
        color: #333;
        padding: 10px;
        border-radius: 12px;
        max-width: 70%;
        margin-left: auto;
        animation: slideInAssistant 0.5s ease-out;
    }
    /* Animation keyframes */
    @keyframes fadeIn {
        0% { opacity: 0; }
        100% { opacity: 1; }
    }
    @keyframes slideInUser {
        0% { transform: translateX(-100%); }
        100% { transform: translateX(0); }
    }
    @keyframes slideInAssistant {
        0% { transform: translateX(100%); }
        100% { transform: translateX(0); }
    }
    /* Mobile responsiveness */
    @media (max-width: 768px) {
        .gradio-header h1 {
            font-size: 1.8rem;
        }
        .gradio-header h3 {
            font-size: 1rem;
        }
        .gradio-chatbot {
            max-height: 400px;
        }
        .gradio-textbox, .gradio-dropdown, .gradio-image {
            width: 100%;
        }
        #submit-btn, #clear-history {
            width: 100%;
            margin-left: 0;
        }
    }
"""

# Gradio interface setup for multimodal chatbot with voice functionality
def create_interface():
    """Build the Gradio Blocks UI for the chatbot and wire up its buttons.

    Lays out the header, a collapsible description, the API-key box, the
    image/text/voice inputs, the reasoning-effort and model dropdowns, the
    two buttons and the chat history, then connects the buttons to
    ``chatbot`` and ``clear_history``.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
            <div class="gradio-header">
                <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
                <h3>Interact with a chatbot using text, image, or voice inputs</h3>
            </div>
        """)

        # Add a description with an expandable accordion
        with gr.Accordion("Click to expand for details", open=False):
            gr.Markdown("""
            ### Description:
            This is a multimodal chatbot that can handle text, image, and voice inputs. 
            - You can ask questions or provide text, and the assistant will respond.
            - You can upload an image, and the assistant will process it and answer questions about the image.
            - You can also speak to the assistant, and it will process your speech.
            - Enter your OpenAI API key to start interacting with the model.
            - You can use the 'Clear History' button to remove the conversation history.
            - "o1" is for image chat and "o3-mini" is for text chat.
            ### Reasoning Effort:
            The reasoning effort controls how complex or detailed the assistant's answers should be. 
            - **Low**: Provides quick, concise answers with minimal reasoning or details.
            - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
            - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
            """)

        with gr.Row():
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)

        with gr.Row():
            image_input = gr.Image(label="Upload an Image", type="pil")  # Image upload input
            input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)

        with gr.Row():
            reasoning_effort = gr.Dropdown(
                label="Reasoning Effort",
                choices=["low", "medium", "high"],
                value="medium"
            )
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=["o1", "o3-mini"],
                value="o1"  # Default to 'o1' for image-related tasks
            )

        # Audio input (voice interaction)
        with gr.Row():
            voice_input = gr.Audio(label="Speak to the Assistant", type="filepath")

        submit_btn = gr.Button("Ask!", elem_id="submit-btn")
        clear_btn = gr.Button("Clear History", elem_id="clear-history")

        chat_history = gr.Chatbot()

        # Button interactions
        # chatbot returns ("", history): the first value clears the textbox,
        # the second refreshes the chat display.
        submit_btn.click(
            fn=chatbot,
            inputs=[
                input_text,
                image_input,
                voice_input,
                openai_api_key,
                reasoning_effort,
                model_choice,
                chat_history,
            ],
            outputs=[input_text, chat_history],
        )
        # clear_history returns ("", []), matching the submit handler's output
        # order: the empty string goes to the textbox and the empty list to
        # the chat display (previously both values were sent to the Chatbot).
        clear_btn.click(fn=clear_history, inputs=[], outputs=[input_text, chat_history])

    return demo

# Script entry point: build the UI and start the Gradio server only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    demo = create_interface()  # Gradio multimodal chatbot
    demo.launch()