|
import gradio as gr |
|
import openai |
|
import base64 |
|
from PIL import Image |
|
import io |
|
|
|
# Function to send the request to OpenAI API with an image or text input |
|
def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
    """Send a text and/or image prompt to the selected OpenAI model and return its reply.

    Args:
        input_text: The user's prompt. May be empty or None when only an image
            is supplied.
        image: An image object accepted by ``get_base64_string_from_image``
            (the UI passes a PIL image), or None for a text-only request.
        openai_api_key: API key; assigned to the module-level ``openai.api_key``.
        reasoning_effort: Forwarded unchanged as the ``reasoning_effort`` request
            parameter.
        model_choice: ``"o1"`` (text and/or image) or ``"o3-mini"`` (text only,
            as stated in the UI description).

    Returns:
        The assistant's reply text, or a human-readable string starting with
        ``"Error"`` when validation fails or the API call raises. This function
        never raises for API failures; callers display the string as-is.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    # Validate everything up front so a bad request never reaches the API and
    # an unknown model can no longer leave the message list undefined.
    if model_choice not in ("o1", "o3-mini"):
        return f"Error: Unsupported model '{model_choice}'. Choose 'o1' or 'o3-mini'."

    has_image = image is not None
    if has_image and model_choice == "o3-mini":
        # Previously the base64 payload was sent to o3-mini as plain text.
        return "Error: o3-mini only supports text input; select o1 to send an image."

    has_text = bool(input_text and input_text.strip())
    if not has_text and not has_image:
        return "Error: Please provide a text question or an image."

    openai.api_key = openai_api_key

    # Build one user message whose content holds the question and, when present,
    # the image. Keeping them as separate parts means the user's text is no
    # longer discarded when an image is attached.
    content = []
    if has_text:
        content.append({"type": "text", "text": input_text})
    if has_image:
        data_url = f"data:image/png;base64,{get_base64_string_from_image(image)}"
        content.append({"type": "image_url", "image_url": {"url": data_url}})
    messages = [{"role": "user", "content": content}]

    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=messages,
            reasoning_effort=reasoning_effort,
            max_completion_tokens=2000,  # cap the reply length
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # UI boundary: surface any SDK/network failure as a chat message
        # instead of crashing the Gradio callback.
        return f"Error calling OpenAI API: {str(e)}"
|
|
|
# Function to convert an uploaded image to a base64 string |
|
def get_base64_string_from_image(pil_image):
    """Encode an image as PNG and return the PNG bytes as a base64 string.

    Args:
        pil_image: Any object exposing ``save(fp, format=...)``; the UI passes
            a PIL image.

    Returns:
        The base64 text (UTF-8 decoded) of the PNG-encoded image.
    """
    png_buffer = io.BytesIO()
    pil_image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode("utf-8")
|
|
|
# Function to transcribe audio to text using OpenAI Whisper API |
|
def transcribe_audio(audio, openai_api_key):
    """Transcribe an audio file to text with OpenAI's ``whisper-1`` model.

    Args:
        audio: Filesystem path of the audio file (the UI's audio component is
            configured with ``type="filepath"``).
        openai_api_key: API key; assigned to the module-level ``openai.api_key``.

    Returns:
        The transcribed text, or a string starting with ``"Error"`` when the
        key or path is missing or the transcription call raises.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    if not audio:
        return "Error: No audio file provided."

    openai.api_key = openai_api_key

    try:
        # Hand the SDK the open file itself: it streams from disk and carries
        # the file's real name/extension, rather than an in-memory copy that
        # was always relabelled "audio.wav" regardless of the actual format.
        with open(audio, "rb") as audio_file:
            transcription = openai.Audio.transcribe(file=audio_file, model="whisper-1")
        return transcription["text"]
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error transcribing audio: {str(e)}"
|
|
|
# The function that will be used by Gradio interface |
|
def chatbot(input_text, image, audio, openai_api_key, reasoning_effort, model_choice, history=None):
    """Gradio callback: run one chat turn and append it to the history.

    Args:
        input_text: Text typed by the user.
        image: Optional image forwarded to ``generate_response``.
        audio: Optional audio file path; when present it is transcribed and the
            transcript replaces ``input_text`` for this turn.
        openai_api_key: API key forwarded to the helper functions.
        reasoning_effort: Forwarded to ``generate_response``.
        model_choice: Forwarded to ``generate_response``.
        history: List of ``(user, assistant)`` pairs shown in the chat display.
            A new list is created when omitted or None.

    Returns:
        ``("", history)`` — an empty string to clear the text box and the
        updated history list.
    """
    # A literal [] default would be shared by every call that omits `history`,
    # leaking turns between conversations.
    history = [] if history is None else history

    if audio:
        transcript = transcribe_audio(audio, openai_api_key)
        # transcribe_audio reports failures as strings with these prefixes.
        # Show the failure directly rather than sending the error text to the
        # model as if the user had said it.
        if transcript.startswith(("Error: ", "Error transcribing audio: ")):
            history.append(("User: [audio input]", f"Assistant: {transcript}"))
            return "", history
        input_text = transcript

    response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
    history.append((f"User: {input_text}", f"Assistant: {response}"))

    return "", history
|
|
|
# Function to clear the chat history |
|
def clear_history():
    """Return reset values for the UI: an empty string and a fresh empty list."""
    empty_text = ""
    empty_history = []
    return empty_text, empty_history
|
|
|
# Stylesheet passed to gr.Blocks(css=...) in create_interface().
# Sections, in order: page container, header banner, chatbot panel, input
# widgets, the #submit-btn / #clear-history buttons (matched via elem_id),
# chat bubbles, keyframe animations, and a max-width: 768px override.
# The literal must contain only valid CSS: a stray token after a declaration
# (such as a trailing "|") makes the browser drop the declaration that follows.
# NOTE(review): the .gradio-textbox / .gradio-dropdown / .gradio-image /
# .gradio-audio / .gradio-chatbot class selectors only take effect if Gradio
# renders those class names — confirm against the installed Gradio version.
custom_css = """
/* General body styles */
.gradio-container {
    font-family: 'Arial', sans-serif;
    background-color: #f8f9fa;
    color: #333;
}

/* Header styles */
.gradio-header {
    background-color: #007bff;
    color: white;
    padding: 20px;
    text-align: center;
    border-radius: 8px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    animation: fadeIn 1s ease-out;
}
.gradio-header h1 {
    font-size: 2.5rem;
}
.gradio-header h3 {
    font-size: 1.2rem;
    margin-top: 10px;
}

/* Chatbot container styles */
.gradio-chatbot {
    background-color: #fff;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    max-height: 500px;
    overflow-y: auto;
    animation: fadeIn 2s ease-out;
}

/* Input field styles */
.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
    border-radius: 8px;
    border: 2px solid #ccc;
    padding: 10px;
    margin-bottom: 10px;
    width: 100%;
    font-size: 1rem;
    transition: all 0.3s ease;
}
.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus {
    border-color: #007bff;
}

/* Button styles */
/* Send Button: Sky Blue */
#submit-btn {
    background-color: #00aaff; /* Sky blue */
    color: white;
    border: none;
    border-radius: 8px;
    padding: 10px 19px;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    margin-left: auto;
    margin-right: auto;
    display: block;
    margin-top: 10px;
}
#submit-btn:hover {
    background-color: #0099cc; /* Slightly darker blue */
}
#submit-btn:active {
    transform: scale(0.95);
}
#clear-history {
    background-color: #f04e4e; /* Slightly Darker red */
    color: white;
    border: none;
    border-radius: 8px;
    padding: 10px 13px;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    margin-top: 10px;
}
#clear-history:hover {
    background-color: #f5a4a4; /* Light red */
}
#clear-history:active {
    transform: scale(0.95);
}

/* Chat history styles */
.gradio-chatbot .message {
    margin-bottom: 10px;
}
.gradio-chatbot .user {
    background-color: #007bff;
    color: white;
    padding: 10px;
    border-radius: 12px;
    max-width: 70%;
    animation: slideInUser 0.5s ease-out;
}
.gradio-chatbot .assistant {
    background-color: #f1f1f1;
    color: #333;
    padding: 10px;
    border-radius: 12px;
    max-width: 70%;
    margin-left: auto;
    animation: slideInAssistant 0.5s ease-out;
}

/* Animation keyframes */
@keyframes fadeIn {
    0% { opacity: 0; }
    100% { opacity: 1; }
}
@keyframes slideInUser {
    0% { transform: translateX(-100%); }
    100% { transform: translateX(0); }
}
@keyframes slideInAssistant {
    0% { transform: translateX(100%); }
    100% { transform: translateX(0); }
}

/* Mobile responsiveness */
@media (max-width: 768px) {
    .gradio-header h1 {
        font-size: 1.8rem;
    }
    .gradio-header h3 {
        font-size: 1rem;
    }
    .gradio-chatbot {
        max-height: 400px;
    }
    .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
        width: 100%;
    }
    #submit-btn, #clear-history {
        width: 100%;
        margin-left: 0;
    }
}
"""
|
|
|
# Gradio interface setup |
|
def create_interface():
    """Build the Gradio Blocks UI for the multimodal chatbot.

    Lays out the header banner, a collapsible description, the API-key field,
    the image/text/audio inputs, the reasoning-effort and model selectors, the
    two action buttons and the chat display, then wires the buttons to
    ``chatbot`` and ``clear_history``.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
            <div class="gradio-header">
            <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
            <h3>Interact with a chatbot using text, image, or voice inputs</h3>
            </div>
        """)

        # Usage notes, collapsed by default.
        with gr.Accordion("Click to expand for details", open=False):
            gr.Markdown("""
                ### Description:
                This is a multimodal chatbot that can handle text, image, and voice inputs.
                - You can ask questions or provide text, and the assistant will respond.
                - You can also upload an image, and the assistant will process it and answer questions about the image.
                - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
                - Enter your OpenAI API key to start interacting with the model.
                - You can use the 'Clear History' button to remove the conversation history.
                - "o1" is for image chat and "o3-mini" is for text chat.
                ### Reasoning Effort:
                The reasoning effort controls how complex or detailed the assistant's answers should be.
                - **Low**: Provides quick, concise answers with minimal reasoning or details.
                - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
                - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
            """)

        with gr.Row():
            openai_api_key = gr.Textbox(
                label="Enter OpenAI API Key",
                type="password",
                placeholder="sk-...",
                interactive=True,
            )

        with gr.Row():
            image_input = gr.Image(label="Upload an Image", type="pil")
            input_text = gr.Textbox(
                label="Enter Text Question",
                placeholder="Ask a question or provide text",
                lines=2,
            )
            # type="filepath" hands the callback a path string, which is what
            # transcribe_audio opens.
            audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")

        with gr.Row():
            reasoning_effort = gr.Dropdown(
                label="Reasoning Effort",
                choices=["low", "medium", "high"],
                value="medium",
            )
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=["o1", "o3-mini"],
                value="o1",  # default to 'o1' for image-related tasks
            )
            # elem_id values are the hooks used by the #submit-btn and
            # #clear-history rules in custom_css.
            submit_btn = gr.Button("Ask!", elem_id="submit-btn")
            clear_btn = gr.Button("Clear History", elem_id="clear-history")

        chat_history = gr.Chatbot()

        # chatbot returns ("", history): clear the text box, refresh the chat.
        submit_btn.click(
            fn=chatbot,
            inputs=[
                input_text,
                image_input,
                audio_input,
                openai_api_key,
                reasoning_effort,
                model_choice,
                chat_history,
            ],
            outputs=[input_text, chat_history],
        )
        # clear_history returns ("", []): the empty string must go to the text
        # box and the empty list to the chat display. Listing chat_history
        # twice sent "" to the Chatbot and never cleared the text box.
        clear_btn.click(fn=clear_history, inputs=[], outputs=[input_text, chat_history])

    return demo
|
|
|
# Run the interface |
|
if __name__ == "__main__":
    # Script entry point: build the Blocks app, then start the Gradio server.
    demo = create_interface()
    demo.launch()