Spaces:

shukdevdatta123
/

Multi-modal-o1-Chatbot

Running

App Files Files Community

Multi-modal-o1-Chatbot / app.py

shukdevdatta123

Update app.py

ea2202e verified about 2 months ago

raw

history blame

22.9 kB

	import gradio as gr
	import openai
	import base64
	from PIL import Image
	import io
	import os
	import tempfile
	import fitz # PyMuPDF for PDF handling
	import uuid
	import json

	# Class to manage document storage
	class DocumentManager:
	    """In-memory registry of uploaded PDF documents keyed by a generated ID.

	    ``documents`` maps a UUID string to a dict with three keys: the display
	    ``name``, the extracted text ``content`` and the source file ``path``.
	    """

	    def __init__(self):
	        # {doc_id: {"name": name, "content": content, "path": path}}
	        self.documents = {}

	    def add_document(self, file_path, file_name=None):
	        """Register the PDF at *file_path* and return its newly generated ID.

	        When *file_name* is omitted, the base name of *file_path* is used as
	        the display name. The text is extracted once, at registration time.
	        """
	        display_name = os.path.basename(file_path) if file_name is None else file_name
	        new_id = str(uuid.uuid4())
	        self.documents[new_id] = {
	            "name": display_name,
	            "content": extract_text_from_pdf(file_path),
	            "path": file_path,
	        }
	        return new_id

	    def get_document_content(self, doc_id):
	        """Return the stored text for *doc_id*, or ``""`` for an unknown ID."""
	        entry = self.documents.get(doc_id)
	        return "" if entry is None else entry["content"]

	    def get_document_path(self, doc_id):
	        """Return the stored file path for *doc_id*, or ``None`` for an unknown ID."""
	        entry = self.documents.get(doc_id)
	        return None if entry is None else entry["path"]

	    def get_document_list(self):
	        """Return ``(name, doc_id)`` pairs, in insertion order, for the dropdown."""
	        return [(entry["name"], key) for key, entry in self.documents.items()]

	    def clear_documents(self):
	        """Forget every registered document and return an empty list."""
	        self.documents = {}
	        return []

	# Initialize the document manager.
	# Module-level instance: the upload, selection, chat and clear handlers in
	# this file all read and mutate this one object.
	# NOTE(review): being module-level, it looks like the document list is shared
	# by everyone using the running app rather than kept per session -- confirm.
	document_manager = DocumentManager()

	# Function to extract text from PDF files
	def extract_text_from_pdf(pdf_file):
	    """Return the concatenated text of every page of a PDF.

	    Args:
	        pdf_file: Location of the PDF; passed straight to ``fitz.open``.

	    Returns:
	        The text of all pages joined in page order. If anything fails, a
	        string starting with ``"Error extracting text from PDF: "`` is
	        returned instead; no ``Exception`` is propagated to the caller.
	    """
	    try:
	        # The context manager closes the document even when reading a page
	        # raises part-way through; a trailing close() call would be skipped.
	        with fitz.open(pdf_file) as pdf_document:
	            return "".join(page.get_text() for page in pdf_document)
	    except Exception as e:
	        return f"Error extracting text from PDF: {str(e)}"

	# Function to send the request to OpenAI API with an image, text or PDF input
	_SUPPORTED_MODELS = ("o1", "o3-mini")
	# Only "o1" is sent image parts; "o3-mini" is documented in the UI as text-only.
	_IMAGE_CAPABLE_MODELS = ("o1",)


	def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort="medium", model_choice="o1"):
	    """Send one user turn to the OpenAI chat API and return the reply text.

	    Input precedence:
	      1. ``pdf_content`` together with ``input_text``: the question is
	         wrapped in a prompt that embeds the document text (text-only).
	      2. ``image``: sent as a PNG data URL; if ``input_text`` is also given
	         it is included as a text part ahead of the image.
	      3. ``input_text`` alone: sent as plain text.

	    Args:
	        input_text: The user's question, or an empty value.
	        image: An image object accepted by ``get_base64_string_from_image``,
	            or ``None``.
	        pdf_content: Extracted document text, or an empty value.
	        openai_api_key: API key; required.
	        reasoning_effort: Forwarded unchanged to the API call.
	        model_choice: ``"o1"`` or ``"o3-mini"``.

	    Returns:
	        The assistant's reply, or a string starting with ``"Error"`` when the
	        key is missing, the model is unsupported, an image is sent to a
	        text-only model, no usable input was given, or the API call fails.
	    """
	    if not openai_api_key:
	        return "Error: No API key provided."

	    # Reject unknown models up front; otherwise no message list would be built
	    # and the function would fail with an unbound local variable.
	    if model_choice not in _SUPPORTED_MODELS:
	        return f"Error: Unsupported model '{model_choice}'."

	    if pdf_content and input_text:
	        # For PDF queries, combine the PDF content with the user's question.
	        prompt = f"Based on the following document content, please answer this question: '{input_text}'\n\nDocument content:\n{pdf_content}"
	        content_parts = [{"type": "text", "text": prompt}]
	    elif image is not None:
	        if model_choice not in _IMAGE_CAPABLE_MODELS:
	            # Sending the base64 payload as plain text would be meaningless.
	            return f"Error: Model '{model_choice}' does not support image input."
	        content_parts = []
	        if input_text:
	            # Keep the question that accompanies the image.
	            content_parts.append({"type": "text", "text": input_text})
	        image_info = get_base64_string_from_image(image)
	        content_parts.append(
	            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_info}"}}
	        )
	    elif input_text:
	        content_parts = [{"type": "text", "text": input_text}]
	    else:
	        return "Error: No input provided."

	    # NOTE(review): this sets the key on the shared `openai` module, so it is
	    # process-wide rather than per request.
	    openai.api_key = openai_api_key

	    try:
	        response = openai.ChatCompletion.create(
	            model=model_choice,
	            messages=[{"role": "user", "content": content_parts}],
	            reasoning_effort=reasoning_effort,
	            max_completion_tokens=2000,
	        )
	        return response["choices"][0]["message"]["content"]
	    except Exception as e:
	        return f"Error calling OpenAI API: {str(e)}"

	# Function to convert an uploaded image to a base64 string
	def get_base64_string_from_image(pil_image):
	    """Encode *pil_image* as PNG and return those bytes as a base64 string.

	    The image is written through its ``save(buffer, format="PNG")`` method
	    into an in-memory buffer; the result is UTF-8 text with no data-URL prefix.
	    """
	    png_buffer = io.BytesIO()
	    pil_image.save(png_buffer, format="PNG")
	    return base64.b64encode(png_buffer.getvalue()).decode("utf-8")

	# Function to transcribe audio to text using OpenAI Whisper API
	def transcribe_audio(audio, openai_api_key):
	    """Transcribe an audio file to text with OpenAI's ``whisper-1`` model.

	    Args:
	        audio: Path of the audio file to transcribe.
	        openai_api_key: API key; required.

	    Returns:
	        The transcribed text, ``"Error: No API key provided."`` when the key
	        is missing, or a string starting with ``"Error transcribing audio: "``
	        when reading the file or calling the API fails.
	    """
	    if not openai_api_key:
	        return "Error: No API key provided."

	    openai.api_key = openai_api_key

	    try:
	        # Read the file fully so the handle is closed before the network call.
	        with open(audio, "rb") as audio_file:
	            audio_file_obj = io.BytesIO(audio_file.read())

	        # The in-memory object needs a file name. Use the real one so its
	        # extension matches the actual format (an MP3 labelled ".wav" is
	        # misleading); fall back to the generic name if there is no extension.
	        upload_name = os.path.basename(audio)
	        if not os.path.splitext(upload_name)[1]:
	            upload_name = "audio.wav"
	        audio_file_obj.name = upload_name

	        transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
	        return transcription["text"]
	    except Exception as e:
	        return f"Error transcribing audio: {str(e)}"

	# Function to handle PDF uploads
	def handle_pdf_upload(pdf_file):
	    """Register an uploaded PDF and refresh the document dropdown.

	    Args:
	        pdf_file: The value of the file-upload component: an object with a
	            ``name`` attribute holding the path, a plain path, or ``None``
	            when the component is empty.

	    Returns:
	        A 2-tuple of Gradio updates for the document dropdown. The event
	        wiring lists that dropdown twice as output, so the same update is
	        returned for both slots.
	    """
	    if pdf_file is None:
	        # The upload component was emptied (the chat handler resets it after
	        # every question). Registered documents are still available, so leave
	        # the dropdown's choices and current selection untouched.
	        no_change = gr.update()
	        return no_change, no_change

	    # Accept both file wrappers exposing `.name` and plain path strings.
	    file_path = getattr(pdf_file, "name", pdf_file)
	    doc_id = document_manager.add_document(file_path)

	    # A dropdown's choices only change through an explicit update; returning
	    # the raw list would be interpreted as the dropdown's *value*.
	    dropdown_update = gr.update(
	        choices=document_manager.get_document_list(),
	        value=doc_id,
	    )
	    return dropdown_update, dropdown_update

	# Function to get PDF content based on selected document
	def get_selected_document_content(doc_id):
	    """Look up the selected document in the shared document manager.

	    Returns a ``(text, path)`` pair: the text is what gets sent to the model
	    and the path feeds the PDF viewer. A falsy *doc_id* yields ``("", None)``.
	    """
	    if doc_id:
	        text = document_manager.get_document_content(doc_id)
	        location = document_manager.get_document_path(doc_id)
	        return text, location
	    return "", None

	# The function that will be used by Gradio interface
	def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key, reasoning_effort, model_choice, current_pdf_content, history=None):
	    """Handle one submit: get a model reply and append the turn to the chat log.

	    Args:
	        input_text: Typed question; replaced by the transcription when
	            ``audio`` is given.
	        image: Uploaded image or ``None``.
	        audio: Path of recorded/uploaded audio, or a falsy value.
	        pdf_file: Value of the upload component (not used here).
	        doc_selection: ID of the selected document, or a falsy value.
	        openai_api_key: API key forwarded to the OpenAI helpers.
	        reasoning_effort: Forwarded to ``generate_response``.
	        model_choice: Forwarded to ``generate_response``.
	        current_pdf_content: Text of the selected document, if any.
	        history: List of ``(user, assistant)`` tuples; a new list is started
	            when omitted or ``None``.

	    Returns:
	        A 7-tuple matching the submit handler's outputs: cleared text, image,
	        audio and upload values, then the unchanged document selection and
	        PDF text, then the updated history.
	    """
	    # A `[]` default would be created once and shared by every call, and a
	    # chat component that has no messages yet may hand over None.
	    if history is None:
	        history = []

	    # If there's audio, its transcription becomes the question.
	    if audio:
	        input_text = transcribe_audio(audio, openai_api_key)

	    response = generate_response(
	        input_text, image, current_pdf_content, openai_api_key, reasoning_effort, model_choice
	    )

	    if not input_text:
	        user_entry = "User: [Uploaded content]"
	    elif doc_selection:
	        # Show which document the question was asked against.
	        doc_name = next(
	            (name for name, doc_id in document_manager.get_document_list() if doc_id == doc_selection),
	            "Unknown Document",
	        )
	        user_entry = f"User: {input_text} [Query on: {doc_name}]"
	    else:
	        user_entry = f"User: {input_text}"

	    history.append((user_entry, f"Assistant: {response}"))

	    return "", None, None, None, doc_selection, current_pdf_content, history

	# Function to clear the chat history and reset selected document
	def clear_history():
	    """Return the reset values used by the "Clear Chat" button.

	    Order: text box, image, audio, PDF upload, document selection, stored
	    PDF text, chat history.
	    """
	    blank_text, nothing, empty_chat = "", None, []
	    return blank_text, nothing, nothing, nothing, nothing, blank_text, empty_chat

	# Function to clear all documents
	def clear_documents():
	    """Remove every registered document and reset the document widgets.

	    Returns:
	        A 4-tuple matching the "Clear All Documents" outputs: two identical
	        dropdown updates (the dropdown is listed twice as output) that empty
	        its choices and selection, an empty string for the stored PDF text,
	        and ``None`` for the PDF preview.
	    """
	    document_manager.clear_documents()

	    # Choices must be cleared through an explicit update; a bare `[]` would be
	    # taken as the dropdown's value and leave the stale entries selectable.
	    dropdown_reset = gr.update(choices=[], value=None)
	    return dropdown_reset, dropdown_reset, "", None

	# Function to update visible components based on input type selection
	def update_input_type(choice):
	    """Return visibility updates for the input widgets of the chosen mode.

	    The six updates are ordered as: text box, image, audio, PDF upload,
	    document dropdown, PDF viewer. A choice that is not one of the four
	    known modes produces no result (``None``).
	    """
	    visibility_table = (
	        ("Text", (True, False, False, False, False, False)),
	        ("Image", (True, True, False, False, False, False)),
	        ("Voice", (False, False, True, False, False, False)),
	        ("PDF", (True, False, False, True, True, True)),
	    )
	    for mode, flags in visibility_table:
	        if choice == mode:
	            return tuple(gr.update(visible=shown) for shown in flags)
	    return None

	# Custom CSS styles with animations and button colors.
	# This stylesheet is handed to gr.Blocks(css=...) when the interface is built.
	# The #submit-btn and #clear-history rules match the elem_id values given to
	# the "Ask!" and "Clear Chat" buttons; the .split-view-panel rule matches the
	# elem_classes of the two split-view columns.
	# NOTE(review): the remaining class selectors (.gradio-chatbot,
	# .gradio-textbox, .pdf-viewer wrappers, .document-manager, ...) are not
	# assigned through elem_id/elem_classes in this file, and there is no rule
	# for the "clear-docs" button -- confirm which of these rules actually apply.
	custom_css = """
	/* General body styles */
	.gradio-container {
	font-family: 'Arial', sans-serif;
	background-color: #f8f9fa;
	color: #333;
	}
	/* Header styles */
	.gradio-header {
	background-color: #007bff;
	color: white;
	padding: 20px;
	text-align: center;
	border-radius: 8px;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
	animation: fadeIn 1s ease-out;
	}
	.gradio-header h1 {
	font-size: 2.5rem;
	}
	.gradio-header h3 {
	font-size: 1.2rem;
	margin-top: 10px;
	}
	/* Chatbot container styles */
	.gradio-chatbot {
	background-color: #fff;
	border-radius: 10px;
	padding: 20px;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
	max-height: 500px;
	overflow-y: auto;
	animation: fadeIn 2s ease-out;
	}
	/* Input field styles */
	.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
	border-radius: 8px;
	border: 2px solid #ccc;
	padding: 10px;
	margin-bottom: 10px;
	width: 100%;
	font-size: 1rem;
	transition: all 0.3s ease;
	}
	.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus {
	border-color: #007bff;
	}
	/* Button styles */
	/* Send Button: Sky Blue */
	#submit-btn {
	background-color: #00aaff; /* Sky blue */
	color: white;
	border: none;
	border-radius: 8px;
	padding: 10px 19px;
	font-size: 1.1rem;
	cursor: pointer;
	transition: all 0.3s ease;
	margin-left: auto;
	margin-right: auto;
	display: block;
	margin-top: 10px;
	}
	#submit-btn:hover {
	background-color: #0099cc; /* Slightly darker blue */
	}
	#submit-btn:active {
	transform: scale(0.95);
	}
	#clear-history {
	background-color: #f04e4e; /* Slightly Darker red */
	color: white;
	border: none;
	border-radius: 8px;
	padding: 10px 13px;
	font-size: 1.1rem;
	cursor: pointer;
	transition: all 0.3s ease;
	margin-top: 10px;
	}
	#clear-history:hover {
	background-color: #f5a4a4; /* Light red */
	}
	#clear-history:active {
	transform: scale(0.95);
	}
	/* Input type selector buttons */
	#input-type-group {
	display: flex;
	justify-content: center;
	gap: 10px;
	margin-bottom: 20px;
	}
	.input-type-btn {
	background-color: #6c757d;
	color: white;
	border: none;
	border-radius: 8px;
	padding: 10px 15px;
	font-size: 1rem;
	cursor: pointer;
	transition: all 0.3s ease;
	}
	.input-type-btn.selected {
	background-color: #007bff;
	}
	.input-type-btn:hover {
	background-color: #5a6268;
	}
	/* Chat history styles */
	.gradio-chatbot .message {
	margin-bottom: 10px;
	}
	.gradio-chatbot .user {
	background-color: #007bff;
	color: white;
	padding: 10px;
	border-radius: 12px;
	max-width: 70%;
	animation: slideInUser 0.5s ease-out;
	}
	.gradio-chatbot .assistant {
	background-color: #f1f1f1;
	color: #333;
	padding: 10px;
	border-radius: 12px;
	max-width: 70%;
	margin-left: auto;
	animation: slideInAssistant 0.5s ease-out;
	}
	/* PDF preview panel */
	.pdf-preview-panel {
	border: 2px solid #ccc;
	border-radius: 8px;
	overflow: hidden;
	height: 600px;
	background-color: #f5f5f5;
	}
	/* PDF viewer iframe */
	.pdf-viewer {
	width: 100%;
	height: 100%;
	border: none;
	}
	/* Split view container */
	.split-view-container {
	display: flex;
	gap: 20px;
	}
	.split-view-panel {
	flex: 1;
	min-width: 0; /* Allow panels to shrink below their content size */
	}
	/* Animation keyframes */
	@keyframes fadeIn {
	0% { opacity: 0; }
	100% { opacity: 1; }
	}
	@keyframes slideInUser {
	0% { transform: translateX(-100%); }
	100% { transform: translateX(0); }
	}
	@keyframes slideInAssistant {
	0% { transform: translateX(100%); }
	100% { transform: translateX(0); }
	}
	/* Document management styles */
	.document-manager {
	background-color: #fff;
	border-radius: 10px;
	padding: 15px;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
	margin-bottom: 20px;
	}
	.document-manager-header {
	display: flex;
	justify-content: space-between;
	align-items: center;
	margin-bottom: 15px;
	}
	.document-list {
	max-height: 200px;
	overflow-y: auto;
	border: 1px solid #eee;
	border-radius: 5px;
	padding: 10px;
	}
	/* Mobile responsiveness */
	@media (max-width: 768px) {
	.gradio-header h1 {
	font-size: 1.8rem;
	}
	.gradio-header h3 {
	font-size: 1rem;
	}
	.gradio-chatbot {
	max-height: 400px;
	}
	.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
	width: 100%;
	}
	#submit-btn, #clear-history {
	width: 100%;
	margin-left: 0;
	}
	.split-view-container {
	flex-direction: column;
	}
	}
	"""

	# Gradio interface setup
	def create_interface():
	    """Assemble the Gradio Blocks UI and return it without launching it.

	    The layout consists of an API-key box, an input-type selector that shows
	    or hides the text/image/audio/PDF inputs, model and reasoning-effort
	    dropdowns, action buttons, and two chat displays: a regular one and a
	    split view (PDF preview beside the chat) that is shown only in PDF mode.

	    Returns:
	        The ``gr.Blocks`` instance; the caller is expected to ``launch()`` it.
	    """
	    import html  # local import: only needed to escape the PDF path below

	    with gr.Blocks(css=custom_css) as demo:
	        gr.Markdown("""
	<div class="gradio-header">
	<h1>Enhanced Multimodal Chatbot</h1>
	<h3>Interact with text, images, voice, and multiple PDFs</h3>
	</div>
	""")

	        # Add a description with an expandable accordion
	        with gr.Accordion("Click to expand for details", open=False):
	            gr.Markdown("""
	### Description:
	This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.

	- Text Mode: Ask questions or provide text for the assistant to respond.
	- Image Mode: Upload an image for the assistant to analyze and discuss.
	- Voice Mode: Upload or record audio that will be transcribed and processed.
	- PDF Mode: Upload multiple PDFs, select which one to query, and view them side-by-side with the chat.

	### PDF Features:
	- Upload and manage multiple PDFs in a single session
	- Select which document to query from a dropdown menu
	- View PDFs side-by-side with the chat interface
	- Clear document library as needed

	### Model Options:
	- "o1" is for image, voice, PDF and text chat
	- "o3-mini" is for text, PDF and voice chat only
	""")

	        # Text of the currently selected PDF, kept as per-session state
	        current_pdf_content = gr.State("")

	        with gr.Row():
	            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)

	        # Input type selector
	        with gr.Row():
	            input_type = gr.Radio(
	                ["Text", "Image", "Voice", "PDF"],
	                label="Choose Input Type",
	                value="Text"
	            )

	        # Create the input components (initially text is visible, others are hidden)
	        with gr.Row():
	            # Text input
	            input_text = gr.Textbox(
	                label="Enter Text Question",
	                placeholder="Ask a question or provide text",
	                lines=2,
	                visible=True
	            )

	            # Image input
	            image_input = gr.Image(
	                label="Upload an Image",
	                type="pil",
	                visible=False
	            )

	            # Audio input
	            audio_input = gr.Audio(
	                label="Upload or Record Audio",
	                type="filepath",
	                visible=False
	            )

	            # PDF input and document selection components
	            pdf_input = gr.File(
	                label="Upload your PDF",
	                file_types=[".pdf"],
	                visible=False
	            )

	            # Dropdown for document selection
	            doc_selection = gr.Dropdown(
	                label="Select Document to Query",
	                choices=[],
	                interactive=True,
	                visible=False
	            )

	            # PDF Viewer (initially hidden); only its visibility is toggled,
	            # the preview itself is rendered into `pdf_display` below.
	            pdf_viewer = gr.HTML(
	                label="PDF Preview",
	                visible=False
	            )

	        # Action buttons row
	        with gr.Row():
	            with gr.Column(scale=1):
	                reasoning_effort = gr.Dropdown(
	                    label="Reasoning Effort",
	                    choices=["low", "medium", "high"],
	                    value="medium"
	                )

	            with gr.Column(scale=1):
	                model_choice = gr.Dropdown(
	                    label="Select Model",
	                    choices=["o1", "o3-mini"],
	                    value="o1"
	                )

	            with gr.Column(scale=1):
	                submit_btn = gr.Button("Ask!", elem_id="submit-btn")

	            with gr.Column(scale=1):
	                clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")

	            with gr.Column(scale=1, visible=False) as clear_docs_col:
	                clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")

	        # Create a container for the split view layout when in PDF mode
	        with gr.Row(visible=False) as split_view_container:
	            with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
	                pdf_display = gr.HTML(
	                    """<div class="pdf-preview-panel">
	<iframe class="pdf-viewer" id="pdf-viewer" src="about:blank"></iframe>
	</div>"""
	                )

	            with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
	                chat_history = gr.Chatbot()

	        # Regular chat history display (when not in split view)
	        with gr.Row(visible=True) as regular_chat_container:
	            chat_history_regular = gr.Chatbot()

	        # Function to handle selection of a document from dropdown
	        def handle_doc_selection(doc_id):
	            """Return (document text, preview HTML) for the selected document."""
	            if not doc_id:
	                return "", update_pdf_viewer(None)

	            content, path = get_selected_document_content(doc_id)
	            return content, update_pdf_viewer(path)

	        # Function to update the PDF viewer
	        def update_pdf_viewer(pdf_path):
	            """Return the preview HTML for *pdf_path*, or a placeholder if falsy."""
	            if not pdf_path:
	                return """<div class="pdf-preview-panel">
	<div style="padding: 20px; text-align: center;">No PDF selected</div>
	</div>"""

	            # The path derives from an uploaded file, so escape it before
	            # placing it inside the quoted HTML attribute.
	            safe_path = html.escape(str(pdf_path), quote=True)
	            return f"""<div class="pdf-preview-panel">
	<iframe class="pdf-viewer" id="pdf-viewer" src="file={safe_path}" type="application/pdf"></iframe>
	</div>"""

	        # Function to toggle between split view and regular view based on input type
	        def toggle_view(choice):
	            """Return visibility updates for split view, regular chat and the clear-docs column."""
	            if choice == "PDF":
	                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
	            else:
	                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

	        # Connect the input type selector to the update function
	        input_type.change(
	            fn=update_input_type,
	            inputs=[input_type],
	            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer]
	        )

	        # Toggle between split view and regular view when input type changes
	        input_type.change(
	            fn=toggle_view,
	            inputs=[input_type],
	            outputs=[split_view_container, regular_chat_container, clear_docs_col]
	        )

	        # Process PDF when uploaded. The dropdown appears twice because the
	        # handler returns two values for it.
	        pdf_input.change(
	            fn=handle_pdf_upload,
	            inputs=[pdf_input],
	            outputs=[doc_selection, doc_selection]
	        )

	        # Update content when document is selected
	        doc_selection.change(
	            fn=handle_doc_selection,
	            inputs=[doc_selection],
	            outputs=[current_pdf_content, pdf_display]
	        )

	        # Button interactions
	        submit_event = submit_btn.click(
	            fn=chatbot,
	            inputs=[
	                input_text, image_input, audio_input, pdf_input,
	                doc_selection, openai_api_key, reasoning_effort,
	                model_choice, current_pdf_content, chat_history_regular  # Added chat_history_regular to avoid creating new empty list
	            ],
	            outputs=[
	                input_text, image_input, audio_input, pdf_input,
	                doc_selection, current_pdf_content, chat_history_regular
	            ]
	        )

	        # Mirror the history into the split view only AFTER the chatbot call
	        # has finished. A second independent click handler would read the
	        # history as it was at click time and stay one turn behind.
	        submit_event.then(
	            fn=lambda history: history,
	            inputs=[chat_history_regular],
	            outputs=[chat_history]
	        )

	        clear_chat_btn.click(
	            fn=clear_history,
	            inputs=[],
	            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, current_pdf_content, chat_history_regular]
	        )

	        # Also clear the split view chat history
	        clear_chat_btn.click(
	            fn=lambda: [],
	            inputs=[],
	            outputs=[chat_history]
	        )

	        # Clear all documents
	        clear_docs_btn.click(
	            fn=clear_documents,
	            inputs=[],
	            outputs=[doc_selection, doc_selection, current_pdf_content, pdf_display]
	        )

	    return demo

	# Run the interface.
	# Build and launch the UI only when this file is executed as a script, so
	# importing the module has no side effect beyond the definitions above.
	# The Blocks object is bound to the module-level name `demo`.
	if __name__ == "__main__":
	    demo = create_interface()
	    demo.launch()