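"""Gradio demo: a minimal text + image chatbot backed by OpenAI's "o1" model.

The app collects a text prompt, an optional image, an OpenAI API key and a
reasoning-effort setting, then sends the request to the Chat Completions API.
Dependencies: gradio and openai>=1.0 (pillow is installed alongside gradio).
"""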
import base64
import io

import gradio as gr
from openai import OpenAI  # requires the openai>=1.0 SDK

# Send a chat completion request to the OpenAI API
def generate_response(prompt, openai_api_key, image_info="", reasoning_effort="medium"):
    if not openai_api_key:
        return "Error: No API key provided."

    client = OpenAI(api_key=openai_api_key)

    # Combine the text prompt with optional image info
    full_prompt = prompt
    if image_info:
        full_prompt += f"\n\nAdditional context about the image: {image_info}"

    try:
        # Call the Chat Completions API with the "o1" reasoning model.
        # o1 does not accept `temperature` or `max_tokens`; it uses
        # `max_completion_tokens`, and its hidden reasoning tokens count
        # toward that limit, so the cap is kept generous.
        response = client.chat.completions.create(
            model="o1",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": full_prompt},
            ],
            max_completion_tokens=1000,
            reasoning_effort=reasoning_effort,  # "low", "medium" or "high"
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"

# Function to convert an uploaded image to a base64 string
def get_base64_string_from_image(pil_image):
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    base64_str = base64.b64encode(img_bytes).decode("utf-8")
    return base64_str
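# NOTE: the base64 string produced above is appended to the *text* prompt as
# plain context (see generate_response) rather than sent as an actual image
# input. For true vision support, the image could instead be passed as an
# "image_url" content part in the chat message, which the o1 model accepts;
# that is left out here to keep the demo minimal.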

# The function wired to the Gradio interface
def chatbot(input_text, image, openai_api_key, reasoning_effort, history=None):
    if history is None:
        history = []
    image_info = ""

    # gr.Image(type="pil") already delivers a PIL.Image, so it can be
    # encoded directly without reopening it
    if image is not None:
        try:
            image_info = get_base64_string_from_image(image)
        except Exception as e:
            image_info = f"Error reading image: {e}"

    # Combine user input with image info (if any)
    response = generate_response(input_text, openai_api_key, image_info, reasoning_effort)

    # Append the exchange to the chat history
    history.append((f"User: {input_text}", f"Assistant: {response}"))

    return "", history

# Function to clear the input box and the chat history
def clear_history():
    return "", []

# Gradio interface setup
def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Multimodal Chatbot (Text + Image)")

        with gr.Row():
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
        
        with gr.Row():
            image_input = gr.Image(label="Upload an Image", type="pil")
            input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
        
        with gr.Row():
            reasoning_effort = gr.Dropdown(
                label="Reasoning Effort",
                choices=["low", "medium", "high"],
                value="medium",
                info="Select the reasoning effort used when generating the response.",
            )
            submit_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear History")

        chat_history = gr.Chatbot()

        # Button interactions
        submit_btn.click(fn=chatbot, inputs=[input_text, image_input, openai_api_key, reasoning_effort, chat_history], outputs=[input_text, chat_history])
        clear_btn.click(fn=clear_history, inputs=[], outputs=[input_text, chat_history])

    return demo

# Run the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()