import base64
import io

import gradio as gr
from openai import OpenAI
from PIL import Image
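
# NOTE: this script assumes the openai>=1.0 Python SDK (the OpenAI client
# interface) and a recent Gradio release, e.g. `pip install --upgrade openai gradio`.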

# Send a request to the OpenAI API with text and/or image input.
def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
    if not openai_api_key:
        return "Error: No API key provided."
    client = OpenAI(api_key=openai_api_key)

    # Build the user message. Any text is kept as-is, and an attached image is
    # sent as a base64 data URL alongside it. Only "o1" accepts image input
    # here; "o3-mini" is text-only, so images are ignored for that model.
    content = []
    if input_text:
        content.append({"type": "text", "text": input_text})
    if image is not None and model_choice == "o1":
        base64_image = get_base64_string_from_image(image)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
        })
    if not content:
        return "Error: Please provide text, an image, or audio."
    messages = [{"role": "user", "content": content}]

    try:
        # Call the Chat Completions API with the selected model.
        response = client.chat.completions.create(
            model=model_choice,                 # "o1" or "o3-mini"
            messages=messages,
            reasoning_effort=reasoning_effort,  # "low", "medium", or "high"
            max_completion_tokens=2000,         # cap the length of the reply
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error calling OpenAI API: {e}"
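
# Example call (hypothetical key and image object):
#   generate_response("Describe this chart.", img, "sk-...", "high", "o1")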

# Convert an uploaded PIL image to a base64-encoded PNG string.
def get_base64_string_from_image(pil_image: Image.Image) -> str:
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    return base64.b64encode(img_bytes).decode("utf-8")
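
# Example: get_base64_string_from_image(img) returns "iVBORw0KGgo...", which
# the caller wraps as "data:image/png;base64,<string>" for the image_url field.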

# Transcribe audio to text using OpenAI's Whisper API.
def transcribe_audio(audio, openai_api_key):
    if not openai_api_key:
        return "Error: No API key provided."
    client = OpenAI(api_key=openai_api_key)
    try:
        # Gradio passes the recording as a filepath; stream it straight to the API.
        with open(audio, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )
        return transcription.text
    except Exception as e:
        return f"Error transcribing audio: {e}"
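
# Example (hypothetical path and output):
#   transcribe_audio("/tmp/recording.wav", "sk-...") -> "What's the weather like?"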

# Gradio callback: route the inputs, query the model, and update the chat history.
def chatbot(input_text, image, audio, openai_api_key, reasoning_effort, model_choice, history=None):
    # Avoid a mutable default argument; Gradio passes the current history in.
    history = history or []
    # If audio was provided, transcribe it and use the transcript as the question.
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)
    response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
    # Record the exchange and clear the text box.
    history.append((f"User: {input_text}", f"Assistant: {response}"))
    return "", history

# Clear the text box and the chat history.
def clear_history():
    return "", []

# Custom CSS styles with animations and button colors.
custom_css = """
/* General body styles */
.gradio-container {
    font-family: 'Arial', sans-serif;
    background-color: #f8f9fa;
    color: #333;
}
/* Header styles */
.gradio-header {
    background-color: #007bff;
    color: white;
    padding: 20px;
    text-align: center;
    border-radius: 8px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    animation: fadeIn 1s ease-out;
}
.gradio-header h1 {
    font-size: 2.5rem;
}
.gradio-header h3 {
    font-size: 1.2rem;
    margin-top: 10px;
}
/* Chatbot container styles */
.gradio-chatbot {
    background-color: #fff;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    max-height: 500px;
    overflow-y: auto;
    animation: fadeIn 2s ease-out;
}
/* Input field styles */
.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
    border-radius: 8px;
    border: 2px solid #ccc;
    padding: 10px;
    margin-bottom: 10px;
    width: 100%;
    font-size: 1rem;
    transition: all 0.3s ease;
}
.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus {
    border-color: #007bff;
}
/* Button styles */
/* Send button: sky blue */
#submit-btn {
    background-color: #00aaff; /* Sky blue */
    color: white;
    border: none;
    border-radius: 8px;
    padding: 10px 19px;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    margin-left: auto;
    margin-right: auto;
    display: block;
    margin-top: 10px;
}
#submit-btn:hover {
    background-color: #0099cc; /* Slightly darker blue */
}
#submit-btn:active {
    transform: scale(0.95);
}
#clear-history {
    background-color: #f04e4e; /* Slightly darker red */
    color: white;
    border: none;
    border-radius: 8px;
    padding: 10px 13px;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    margin-top: 10px;
}
#clear-history:hover {
    background-color: #f5a4a4; /* Light red */
}
#clear-history:active {
    transform: scale(0.95);
}
/* Chat history styles */
.gradio-chatbot .message {
    margin-bottom: 10px;
}
.gradio-chatbot .user {
    background-color: #007bff;
    color: white;
    padding: 10px;
    border-radius: 12px;
    max-width: 70%;
    animation: slideInUser 0.5s ease-out;
}
.gradio-chatbot .assistant {
    background-color: #f1f1f1;
    color: #333;
    padding: 10px;
    border-radius: 12px;
    max-width: 70%;
    margin-left: auto;
    animation: slideInAssistant 0.5s ease-out;
}
/* Animation keyframes */
@keyframes fadeIn {
    0% { opacity: 0; }
    100% { opacity: 1; }
}
@keyframes slideInUser {
    0% { transform: translateX(-100%); }
    100% { transform: translateX(0); }
}
@keyframes slideInAssistant {
    0% { transform: translateX(100%); }
    100% { transform: translateX(0); }
}
/* Mobile responsiveness */
@media (max-width: 768px) {
    .gradio-header h1 {
        font-size: 1.8rem;
    }
    .gradio-header h3 {
        font-size: 1rem;
    }
    .gradio-chatbot {
        max-height: 400px;
    }
    .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
        width: 100%;
    }
    #submit-btn, #clear-history {
        width: 100%;
        margin-left: 0;
    }
}
"""

# Gradio interface setup.
def create_interface():
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
        <div class="gradio-header">
            <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
            <h3>Interact with a chatbot using text, image, or voice inputs</h3>
        </div>
        """)
        # Expandable accordion with usage notes.
        with gr.Accordion("Click to expand for details", open=False):
            gr.Markdown("""
            ### Description:
            This is a multimodal chatbot that can handle text, image, and voice inputs.
            - Ask a question or provide text, and the assistant will respond.
            - Upload an image, and the assistant will answer questions about it.
            - Voice input is supported: upload or record an audio file, and it will be transcribed to text and sent to the assistant.
            - Enter your OpenAI API key to start interacting with the model.
            - Use the 'Clear History' button to remove the conversation history.
            - Choose "o1" for image chat and "o3-mini" for text-only chat.
            ### Reasoning Effort:
            The reasoning effort controls how complex or detailed the assistant's answers should be.
            - **Low**: Quick, concise answers with minimal reasoning or detail.
            - **Medium**: A balanced response with a reasonable level of detail and thought.
            - **High**: More detailed, analytical, or thoughtful responses that require deeper reasoning.
            """)
        with gr.Row():
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
        with gr.Row():
            image_input = gr.Image(label="Upload an Image", type="pil")  # Image upload input
            input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)
            audio_input = gr.Audio(label="Upload or Record Audio", type="filepath")  # Audio upload/record input (filepath)
        with gr.Row():
            reasoning_effort = gr.Dropdown(
                label="Reasoning Effort",
                choices=["low", "medium", "high"],
                value="medium",
            )
            model_choice = gr.Dropdown(
                label="Select Model",
                choices=["o1", "o3-mini"],
                value="o1",  # Default to "o1" for image-related tasks
            )
        submit_btn = gr.Button("Ask!", elem_id="submit-btn")
        clear_btn = gr.Button("Clear History", elem_id="clear-history")
        chat_history = gr.Chatbot()

        # Button interactions
        submit_btn.click(
            fn=chatbot,
            inputs=[input_text, image_input, audio_input, openai_api_key, reasoning_effort, model_choice, chat_history],
            outputs=[input_text, chat_history],
        )
        # clear_history returns ("", []), so wire it to the text box and the chat history.
        clear_btn.click(fn=clear_history, inputs=[], outputs=[input_text, chat_history])

    return demo

# Run the interface.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
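
# To expose a temporary public URL (e.g. from a notebook or a remote machine),
# launch with sharing enabled instead: demo.launch(share=True)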