import gradio as gr
import openai
import base64
from PIL import Image
import io


# Function to send the request to the OpenAI API with an image or text input
def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key

    # If an image was uploaded, convert it to a base64 data URL
    if image:
        image_info = get_base64_string_from_image(image)
        input_text = f"data:image/png;base64,{image_info}"

    # Prepare the messages for the OpenAI API
    if model_choice == "o1":
        if image:
            messages = [
                {"role": "user", "content": [{"type": "image_url", "image_url": {"url": input_text}}]}
            ]
        else:
            messages = [
                {"role": "user", "content": [{"type": "text", "text": input_text}]}
            ]
    elif model_choice == "o3-mini":
        messages = [
            {"role": "user", "content": [{"type": "text", "text": input_text}]}
        ]

    try:
        # Call the OpenAI API with the selected model
        response = openai.ChatCompletion.create(
            model=model_choice,                 # Dynamically choose the model (o1 or o3-mini)
            messages=messages,
            reasoning_effort=reasoning_effort,  # Set reasoning effort for the response
            max_completion_tokens=2000          # Limit response tokens to 2000
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"


# Function to convert an uploaded image to a base64 string
def get_base64_string_from_image(pil_image):
    # Convert the PIL Image to PNG bytes, then base64-encode them
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    base64_str = base64.b64encode(img_bytes).decode("utf-8")
    return base64_str


# Function to transcribe audio to text using the OpenAI Whisper API
def transcribe_audio(audio, openai_api_key):
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key

    try:
        # Open the audio file and read its contents
        with open(audio, 'rb') as audio_file:
            audio_file_content = audio_file.read()

        # Wrap the bytes in a named file-like object (the OpenAI client expects a name)
        audio_file_obj = io.BytesIO(audio_file_content)
        audio_file_obj.name = 'audio.wav'

        # Transcribe the audio to text using OpenAI's Whisper model
        audio_file_transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
        return audio_file_transcription['text']
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"


# The function used by the Gradio interface
def chatbot(input_text, image, audio, openai_api_key, reasoning_effort, model_choice, input_mode, history=[]):
    history = history or []

    # If voice mode is selected, transcribe the audio to text first
    if audio and input_mode == "Voice":
        input_text = transcribe_audio(audio, openai_api_key)

    # Generate a response based on the selected input mode
    if input_mode == "Image" and image:                    # Image mode with an uploaded image
        response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)
    elif input_mode in ("Text", "Voice") and input_text:   # Text mode, or transcribed voice input
        response = generate_response(input_text, None, openai_api_key, reasoning_effort, model_choice)
    else:
        response = "Error: No input provided."

    # Append the user input and the assistant response to the chat history
    history.append((f"User: {input_text}", f"Assistant: {response}"))
    return "", history


# Function to clear the chat history
def clear_history():
    return "", []


# Custom CSS styles with animations and button colors
custom_css = """
/* General body styles */
.gradio-container {
    font-family: 'Arial', sans-serif;
    background-color: #f8f9fa;
    color: #333;
}

/* Header styles */
.gradio-header {
    background-color: #007bff;
    color: white;
    padding: 20px;
    text-align: center;
    border-radius: 8px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    animation: fadeIn 1s ease-out;
}

.gradio-header h1 {
    font-size: 2.5rem;
}

.gradio-header h3 {
    font-size: 1.2rem;
    margin-top: 10px;
}

/* Chatbot container styles */
.gradio-chatbot {
    background-color: #fff;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    max-height: 500px;
    overflow-y: auto;
    animation: fadeIn 2s ease-out;
}

/* Input field styles */
.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
    border-radius: 8px;
    border: 2px solid #ccc;
    padding: 10px;
    margin-bottom: 10px;
    width: 100%;
    font-size: 1rem;
    transition: all 0.3s ease;
}

.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus {
    border-color: #007bff;
}

/* Button styles */
/* Send button: sky blue */
#submit-btn {
    background-color: #00aaff;  /* Sky blue */
    color: white;
    border: none;
    border-radius: 8px;
    padding: 10px 19px;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    margin-left: auto;
    margin-right: auto;
    display: block;
    margin-top: 10px;
}

#submit-btn:hover {
    background-color: #0099cc;  /* Slightly darker blue */
}

#submit-btn:active {
    transform: scale(0.95);
}

#clear-history {
    background-color: #f04e4e;  /* Slightly darker red */
    color: white;
    border: none;
    border-radius: 8px;
    padding: 10px 13px;
    font-size: 1.1rem;
    cursor: pointer;
    transition: all 0.3s ease;
    margin-top: 10px;
}

#clear-history:hover {
    background-color: #f5a4a4;  /* Light red */
}

#clear-history:active {
    transform: scale(0.95);
}

/* Chat history styles */
.gradio-chatbot .message {
    margin-bottom: 10px;
}

.gradio-chatbot .user {
    background-color: #007bff;
    color: white;
    padding: 10px;
    border-radius: 12px;
    max-width: 70%;
    animation: slideInUser 0.5s ease-out;
}

.gradio-chatbot .assistant {
    background-color: #f1f1f1;
    color: #333;
    padding: 10px;
    border-radius: 12px;
    max-width: 70%;
    margin-left: auto;
    animation: slideInAssistant 0.5s ease-out;
}

/* Animation keyframes */
@keyframes fadeIn {
    0% { opacity: 0; }
    100% { opacity: 1; }
}

@keyframes slideInUser {
    0% { transform: translateX(-100%); }
    100% { transform: translateX(0); }
}

@keyframes slideInAssistant {
    0% { transform: translateX(100%); }
    100% { transform: translateX(0); }
}

/* Mobile responsiveness */
@media (max-width: 768px) {
    .gradio-header h1 {
        font-size: 1.8rem;
    }

    .gradio-header h3 {
        font-size: 1rem;
    }

    .gradio-chatbot {
        max-height: 400px;
    }

    .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio {
        width: 100%;
    }

    #submit-btn, #clear-history {
        width: 100%;
        margin-left: 0;
    }
}
"""


# Gradio interface setup
def create_interface():
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
<div class="gradio-header">
  <h1>Multimodal Chatbot (Text + Image + Voice)</h1>
  <h3>Interact with a chatbot using text, image, or voice inputs</h3>
</div>
""") # Choose input type (Text, Image, Voice) input_mode = gr.Radio( label="Choose Input Mode", choices=["Text", "Image", "Voice"], value="Text" ) with gr.Row(): openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True) # Text, Image, and Audio Inputs will be displayed based on the chosen mode input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2) image_input = gr.Image(label="Upload an Image", type="pil") audio_input = gr.Audio(label="Upload or Record Audio", type="filepath") with gr.Row(): reasoning_effort = gr.Dropdown( label="Reasoning Effort", choices=["low", "medium", "high"], value="medium" ) model_choice = gr.Dropdown( label="Select Model", choices=["o1", "o3-mini"], value="o1" ) submit_btn = gr.Button("Ask!", elem_id="submit-btn") clear_btn = gr.Button("Clear History", elem_id="clear-history") chat_history = gr.Chatbot() # Dynamically control the input visibility based on the selected mode def toggle_inputs(input_mode): if input_mode == "Text": return input_text, None, None elif input_mode == "Image": return input_text, image_input, None else: # Voice return None, None, audio_input # Button interactions submit_btn.click(fn=chatbot, inputs=[input_text, image_input, audio_input, openai_api_key, reasoning_effort, model_choice, input_mode, chat_history], outputs=[input_text, chat_history]) clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history]) input_mode.change(toggle_inputs, inputs=[input_mode], outputs=[input_text, image_input, audio_input]) return demo # Run the interface if __name__ == "__main__": demo = create_interface() demo.launch()