Spaces:

shukdevdatta123
/

Multi-modal-o1-Chatbot

Running

App Files Files Community

Multi-modal-o1-Chatbot / app.py

shukdevdatta123

Update app.py

ee274c9 verified about 2 months ago

raw

history blame

11.6 kB

	import gradio as gr
	import openai
	import base64
	from PIL import Image
	import io
	import os
	from helpers import text_to_speech, autoplay_audio, speech_to_text, get_api_key
	from generate_answer import base_model_chatbot, with_pdf_chatbot
	from audio_recorder_streamlit import audio_recorder
	from streamlit_float import *
	from PIL import Image as stImage

	# Function to send the request to OpenAI API with an image or text input
	def generate_response(input_text, image, openai_api_key, reasoning_effort="medium", model_choice="o1"):
	if not openai_api_key:
	return "Error: No API key provided."

	openai.api_key = openai_api_key

	# Process the input depending on whether it's text or an image
	if image:
	# Convert the image to base64 string
	image_info = get_base64_string_from_image(image)
	input_text = f"data:image/png;base64,{image_info}"

	# Prepare the messages for OpenAI API
	if model_choice == "o1":
	messages = [
	{"role": "user", "content": [{"type": "image_url", "image_url": {"url": input_text}}]}
	]
	elif model_choice == "o3-mini":
	messages = [
	{"role": "user", "content": [{"type": "text", "text": input_text}]}
	]

	try:
	# Call OpenAI API with the selected model
	response = openai.ChatCompletion.create(
	model=model_choice, # Dynamically choose the model (o1 or o3-mini)
	messages=messages,
	reasoning_effort=reasoning_effort, # Set reasoning_effort for the response
	max_completion_tokens=2000 # Limit response tokens to 2000
	)

	return response["choices"][0]["message"]["content"]
	except Exception as e:
	return f"Error calling OpenAI API: {str(e)}"

	# Function to convert an uploaded image to a base64 string
	def get_base64_string_from_image(pil_image):
	# Convert PIL Image to bytes
	buffered = io.BytesIO()
	pil_image.save(buffered, format="PNG")
	img_bytes = buffered.getvalue()
	base64_str = base64.b64encode(img_bytes).decode("utf-8")
	return base64_str

	# The function that will be used by Gradio interface
	def chatbot(input_text, image, openai_api_key, reasoning_effort, model_choice, history=[]):
	response = generate_response(input_text, image, openai_api_key, reasoning_effort, model_choice)

	# Append the response to the history
	history.append((f"User: {input_text}", f"Assistant: {response}"))

	return "", history

	# Function to clear the chat history
	def clear_history():
	return "", []

	# Custom CSS styles with animations and button colors
	custom_css = """
	/* General body styles */
	.gradio-container {
	font-family: 'Arial', sans-serif;
	background-color: #f8f9fa;
	color: #333;
	}
	/* Header styles */
	.gradio-header {
	background-color: #007bff;
	color: white;
	padding: 20px;
	text-align: center;
	border-radius: 8px;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
	animation: fadeIn 1s ease-out;
	}
	.gradio-header h1 {
	font-size: 2.5rem;
	}
	.gradio-header h3 {
	font-size: 1.2rem;
	margin-top: 10px;
	}
	/* Chatbot container styles */
	.gradio-chatbot {
	background-color: #fff;
	border-radius: 10px;
	padding: 20px;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
	max-height: 500px;
	overflow-y: auto;
	animation: fadeIn 2s ease-out;
	}
	/* Input field styles */
	.gradio-textbox, .gradio-dropdown, .gradio-image {
	border-radius: 8px;
	border: 2px solid #ccc;
	padding: 10px;
	margin-bottom: 10px;
	width: 100%;
	font-size: 1rem;
	transition: all 0.3s ease;
	}
	.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus {
	border-color: #007bff;
	}
	/* Button styles */
	/* Send Button: Sky Blue */
	#submit-btn {
	background-color: #00aaff; /* Sky blue */
	color: white;
	border: none;
	border-radius: 8px;
	padding: 10px 19px;
	font-size: 1.1rem;
	cursor: pointer;
	transition: all 0.3s ease;
	margin-left: auto;
	margin-right: auto;
	display: block;
	margin-top: 10px;
	}
	#submit-btn:hover {
	background-color: #0099cc; /* Slightly darker blue */
	}
	#submit-btn:active {
	transform: scale(0.95);
	}
	#clear-history {
	background-color: #f04e4e; /* Slightly Darker red */
	color: white;
	border: none;
	border-radius: 8px;
	padding: 10px 13px;
	font-size: 1.1rem;
	cursor: pointer;
	transition: all 0.3s ease;
	margin-top: 10px;
	}
	#clear-history:hover {
	background-color: #f5a4a4; /* Light red */
	}
	#clear-history:active {
	transform: scale(0.95);
	}
	/* Chat history styles */
	.gradio-chatbot .message {
	margin-bottom: 10px;
	}
	.gradio-chatbot .user {
	background-color: #007bff;
	color: white;
	padding: 10px;
	border-radius: 12px;
	max-width: 70%;
	animation: slideInUser 0.5s ease-out;
	}
	.gradio-chatbot .assistant {
	background-color: #f1f1f1;
	color: #333;
	padding: 10px;
	border-radius: 12px;
	max-width: 70%;
	margin-left: auto;
	animation: slideInAssistant 0.5s ease-out;
	}
	/* Animation keyframes */
	@keyframes fadeIn {
	0% { opacity: 0; }
	100% { opacity: 1; }
	}
	@keyframes slideInUser {
	0% { transform: translateX(-100%); }
	100% { transform: translateX(0); }
	}
	@keyframes slideInAssistant {
	0% { transform: translateX(100%); }
	100% { transform: translateX(0); }
	}
	/* Mobile responsiveness */
	@media (max-width: 768px) {
	.gradio-header h1 {
	font-size: 1.8rem;
	}
	.gradio-header h3 {
	font-size: 1rem;
	}
	.gradio-chatbot {
	max-height: 400px;
	}
	.gradio-textbox, .gradio-dropdown, .gradio-image {
	width: 100%;
	}
	#submit-btn, #clear-history {
	width: 100%;
	margin-left: 0;
	}
	}
	"""

	# Gradio interface setup for multimodal chatbot
	def create_interface():
	with gr.Blocks(css=custom_css) as demo:
	gr.Markdown("""
	<div class="gradio-header">
	<h1>Multimodal Chatbot (Text + Image)</h1>
	<h3>Interact with a chatbot using text or image inputs</h3>
	</div>
	""")

	# Add a description with an expandable accordion
	with gr.Accordion("Click to expand for details", open=False):
	gr.Markdown("""
	### Description:
	This is a multimodal chatbot that can handle both text and image inputs.
	- You can ask questions or provide text, and the assistant will respond.
	- You can also upload an image, and the assistant will process it and answer questions about the image.
	- Enter your OpenAI API key to start interacting with the model.
	- You can use the 'Clear History' button to remove the conversation history.
	- "o1" is for image chat and "o3-mini" is for text chat.
	### Reasoning Effort:
	The reasoning effort controls how complex or detailed the assistant's answers should be.
	- Low: Provides quick, concise answers with minimal reasoning or details.
	- Medium: Offers a balanced response with a reasonable level of detail and thought.
	- High: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
	""")

	with gr.Row():
	openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)

	with gr.Row():
	image_input = gr.Image(label="Upload an Image", type="pil") # Image upload input
	input_text = gr.Textbox(label="Enter Text Question", placeholder="Ask a question or provide text", lines=2)

	with gr.Row():
	reasoning_effort = gr.Dropdown(
	label="Reasoning Effort",
	choices=["low", "medium", "high"],
	value="medium"
	)
	model_choice = gr.Dropdown(
	label="Select Model",
	choices=["o1", "o3-mini"],
	value="o1" # Default to 'o1' for image-related tasks
	)
	submit_btn = gr.Button("Ask!", elem_id="submit-btn")
	clear_btn = gr.Button("Clear History", elem_id="clear-history")

	chat_history = gr.Chatbot()

	# Button interactions
	submit_btn.click(fn=chatbot, inputs=[input_text, image_input, openai_api_key, reasoning_effort, model_choice, chat_history], outputs=[input_text, chat_history])
	clear_btn.click(fn=clear_history, inputs=[], outputs=[chat_history, chat_history])

	return demo

	# Voice interaction (audio chat) setup for Gradio
	def voice_chat():
	# Float feature initialization
	float_init()

	# Prompt for API key
	api_key = get_api_key()
	if not api_key:
	gr.error("You must provide a valid OpenAI API Key to proceed.")
	return

	def initialize_session_state():
	if "messages" not in gr.session_state:
	gr.session_state.messages = [
	{"role": "assistant", "content": "Hi! How may I assist you today? (Please Speak Clearly)"}
	]

	initialize_session_state()

	gr.title("OpenAI Conversational Chatbot (Voice Interaction) 🤖")

	# Footer container for the microphone
	footer_container = gr.container()

	with footer_container:
	audio_bytes = audio_recorder()

	for message in gr.session_state.messages:
	with gr.chat_message(message["role"]):
	gr.write(message["content"])

	if audio_bytes:
	# Write the audio bytes to a file
	with gr.spinner("Transcribing..."):
	webm_file_path = "temp_audio.mp3"
	with open(webm_file_path, "wb") as f:
	f.write(audio_bytes)

	transcript = speech_to_text(webm_file_path)
	if transcript:
	gr.session_state.messages.append({"role": "user", "content": transcript})
	with gr.chat_message("user"):
	gr.write(transcript)
	os.remove(webm_file_path)

	if gr.session_state.messages[-1]["role"] != "assistant":
	with gr.chat_message("assistant"):
	with gr.spinner("Thinking🤔..."):
	final_response = base_model_chatbot(gr.session_state.messages)

	# Final check for punctuation and completeness
	if not final_response.strip()[-1] in ".!?":
	final_response += " This is the end of the response. Let me know if you need anything else."

	with gr.spinner("Generating audio response..."):
	audio_file = text_to_speech(final_response)
	autoplay_audio(audio_file)
	gr.write(final_response)
	gr.session_state.messages.append({"role": "assistant", "content": final_response})
	os.remove(audio_file)

	# Float the footer container and provide CSS to target it with
	footer_container.float("bottom: 0rem;")

	if __name__ == "__main__":
	demo = create_interface() # Gradio multimodal chatbot
	demo.launch()

	# Gradio voice chat
	voice_chat()