import gradio as gr
import openai
import base64
from PIL import Image
import io
import os
import tempfile
import fitz  # PyMuPDF, for PDF handling
import uuid
import json


# Class to manage document storage
class DocumentManager:
    def __init__(self):
        # {doc_id: {"name": name, "content": content, "path": path}}
        self.documents = {}

    def add_document(self, file_path, file_name=None):
        """Add a document to the manager and return its ID."""
        if file_name is None:
            file_name = os.path.basename(file_path)
        doc_id = str(uuid.uuid4())
        content = extract_text_from_pdf(file_path)
        self.documents[doc_id] = {
            "name": file_name,
            "content": content,
            "path": file_path,
        }
        return doc_id

    def get_document_content(self, doc_id):
        """Get the content of a document by its ID."""
        if doc_id in self.documents:
            return self.documents[doc_id]["content"]
        return ""

    def get_document_path(self, doc_id):
        """Get the file path of a document by its ID."""
        if doc_id in self.documents:
            return self.documents[doc_id]["path"]
        return None

    def get_document_list(self):
        """Get (name, ID) pairs for the document-selection dropdown."""
        return [(self.documents[doc_id]["name"], doc_id) for doc_id in self.documents]

    def clear_documents(self):
        """Clear all documents."""
        self.documents = {}
        return []


# Initialize the document manager
document_manager = DocumentManager()


# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    try:
        text = ""
        pdf_document = fitz.open(pdf_file)
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            text += page.get_text()
        pdf_document.close()
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
# Function to send the request to the OpenAI API with text, image, or PDF input.
# NOTE: this targets the legacy (pre-1.0) openai Python SDK, where extra keyword
# arguments such as reasoning_effort are forwarded to the API unchanged.
def generate_response(input_text, image, pdf_content, openai_api_key,
                      reasoning_effort="medium", model_choice="o1"):
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key

    # Build the input depending on whether it's text, an image, or a PDF query
    if pdf_content and input_text:
        # For PDF queries, combine the document content with the user's question
        prompt = (
            f"Based on the following document content, please answer this question: "
            f"'{input_text}'\n\nDocument content:\n{pdf_content}"
        )
        input_content = prompt
    elif image:
        # Convert the image to a base64 data URL
        image_info = get_base64_string_from_image(image)
        input_content = f"data:image/png;base64,{image_info}"
    else:
        # Plain text input
        input_content = input_text

    # Prepare the messages for the OpenAI API
    if model_choice == "o1":
        if image and not pdf_content:
            messages = [
                {"role": "user",
                 "content": [{"type": "image_url", "image_url": {"url": input_content}}]}
            ]
        else:
            messages = [
                {"role": "user",
                 "content": [{"type": "text", "text": input_content}]}
            ]
    elif model_choice == "o3-mini":
        # o3-mini accepts text only
        messages = [
            {"role": "user",
             "content": [{"type": "text", "text": input_content}]}
        ]

    try:
        # Call the OpenAI API with the selected model
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=messages,
            reasoning_effort=reasoning_effort,
            max_completion_tokens=2000,
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"


# Function to convert an uploaded PIL image to a base64 string
def get_base64_string_from_image(pil_image):
    buffered = io.BytesIO()
    pil_image.save(buffered, format="PNG")
    img_bytes = buffered.getvalue()
    return base64.b64encode(img_bytes).decode("utf-8")


# Function to transcribe audio to text using the OpenAI Whisper API
def transcribe_audio(audio, openai_api_key):
    if not openai_api_key:
        return "Error: No API key provided."
    openai.api_key = openai_api_key
    try:
        # Read the audio file and wrap it in a named file object,
        # since the OpenAI client expects a file with a name attribute
        with open(audio, "rb") as audio_file:
            audio_file_content = audio_file.read()
        audio_file_obj = io.BytesIO(audio_file_content)
        audio_file_obj.name = "audio.wav"
        # Transcribe the audio using OpenAI's whisper-1 model
        transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
        return transcription["text"]
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"


# Function to handle PDF uploads
def handle_pdf_upload(pdf_file):
    if pdf_file is None:
        return gr.update(choices=[], value=None)

    # gr.File may hand back a tempfile wrapper (with .name) or a plain path
    file_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

    # Add the PDF to the document manager and select it in the dropdown
    doc_id = document_manager.add_document(file_path)
    return gr.update(choices=document_manager.get_document_list(), value=doc_id)


# Function to get a selected document's content and file path
def get_selected_document_content(doc_id):
    if not doc_id:
        return "", None
    # The content goes to the model; the path feeds the PDF viewer
    doc_path = document_manager.get_document_path(doc_id)
    return document_manager.get_document_content(doc_id), doc_path
# The main handler wired to the Gradio interface
def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key,
            reasoning_effort, model_choice, current_pdf_content, history=None):
    # Avoid a shared mutable default argument: the original `history=[]`
    # default would be shared across calls (and across users)
    history = history or []

    # If there's audio, transcribe it to text first
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)

    # Use the content of the currently selected document, if any
    pdf_content_to_use = current_pdf_content

    # Generate the response
    response = generate_response(input_text, image, pdf_content_to_use,
                                 openai_api_key, reasoning_effort, model_choice)

    # Append the exchange to the history
    if input_text:
        if doc_selection:
            # Include the queried document's name in the history entry
            doc_name = next(
                (doc[0] for doc in document_manager.get_document_list()
                 if doc[1] == doc_selection),
                "Unknown Document",
            )
            history.append((f"User: {input_text} [Query on: {doc_name}]",
                            f"Assistant: {response}"))
        else:
            history.append((f"User: {input_text}", f"Assistant: {response}"))
    else:
        history.append(("User: [Uploaded content]", f"Assistant: {response}"))

    return "", None, None, None, doc_selection, current_pdf_content, history


# Function to clear the chat history and reset the selected document
def clear_history():
    return "", None, None, None, None, "", []


# Placeholder shown in the PDF panel when nothing is selected.
# NOTE: the original markup inside the gr.HTML strings was lost in the source;
# this reconstruction reuses the .pdf-preview-panel style defined below.
NO_PDF_HTML = '<div class="pdf-preview-panel">No PDF selected</div>'


# Function to clear all documents
def clear_documents():
    document_manager.clear_documents()
    # Reset the dropdown (choices and value), the stored text, and the viewer
    return gr.update(choices=[], value=None), "", NO_PDF_HTML


# Function to show/hide input components based on the selected input type.
# Update order: input_text, image_input, audio_input, pdf_input,
# doc_selection, pdf_viewer
def update_input_type(choice):
    if choice == "Text":
        return (gr.update(visible=True), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=False))
    elif choice == "Image":
        return (gr.update(visible=True), gr.update(visible=True),
                gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=False))
    elif choice == "Voice":
        return (gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=True), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=False))
    elif choice == "PDF":
        return (gr.update(visible=True), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=True),
                gr.update(visible=True), gr.update(visible=True))


# Custom CSS with animations and button colors
custom_css = """
/* General body styles */
.gradio-container { font-family: 'Arial', sans-serif; background-color: #f8f9fa; color: #333; }

/* Header styles */
.gradio-header { background-color: #007bff; color: white; padding: 20px; text-align: center; border-radius: 8px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); animation: fadeIn 1s ease-out; }
.gradio-header h1 { font-size: 2.5rem; }
.gradio-header h3 { font-size: 1.2rem; margin-top: 10px; }

/* Chatbot container styles */
.gradio-chatbot { background-color: #fff; border-radius: 10px; padding: 20px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); max-height: 500px; overflow-y: auto; animation: fadeIn 2s ease-out; }

/* Input field styles */
.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file { border-radius: 8px; border: 2px solid #ccc; padding: 10px; margin-bottom: 10px; width: 100%; font-size: 1rem; transition: all 0.3s ease; }
.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus { border-color: #007bff; }

/* Send button: sky blue */
#submit-btn { background-color: #00aaff; color: white; border: none; border-radius: 8px; padding: 10px 19px; font-size: 1.1rem; cursor: pointer; transition: all 0.3s ease; margin-left: auto; margin-right: auto; display: block; margin-top: 10px; }
#submit-btn:hover { background-color: #0099cc; /* slightly darker blue */ }
#submit-btn:active { transform: scale(0.95); }

/* Clear-history button: red */
#clear-history { background-color: #f04e4e; /* slightly darker red */ color: white; border: none; border-radius: 8px; padding: 10px 13px; font-size: 1.1rem; cursor: pointer; transition: all 0.3s ease; margin-top: 10px; }
#clear-history:hover { background-color: #f5a4a4; /* light red */ }
#clear-history:active { transform: scale(0.95); }
/* Input type selector buttons */
#input-type-group { display: flex; justify-content: center; gap: 10px; margin-bottom: 20px; }
.input-type-btn { background-color: #6c757d; color: white; border: none; border-radius: 8px; padding: 10px 15px; font-size: 1rem; cursor: pointer; transition: all 0.3s ease; }
.input-type-btn.selected { background-color: #007bff; }
.input-type-btn:hover { background-color: #5a6268; }

/* Chat history styles */
.gradio-chatbot .message { margin-bottom: 10px; }
.gradio-chatbot .user { background-color: #007bff; color: white; padding: 10px; border-radius: 12px; max-width: 70%; animation: slideInUser 0.5s ease-out; }
.gradio-chatbot .assistant { background-color: #f1f1f1; color: #333; padding: 10px; border-radius: 12px; max-width: 70%; margin-left: auto; animation: slideInAssistant 0.5s ease-out; }

/* PDF preview panel */
.pdf-preview-panel { border: 2px solid #ccc; border-radius: 8px; overflow: hidden; height: 600px; background-color: #f5f5f5; }

/* PDF viewer iframe */
.pdf-viewer { width: 100%; height: 100%; border: none; }

/* Split view container */
.split-view-container { display: flex; gap: 20px; }
.split-view-panel { flex: 1; min-width: 0; /* allow panels to shrink below their content size */ }

/* Animation keyframes */
@keyframes fadeIn { 0% { opacity: 0; } 100% { opacity: 1; } }
@keyframes slideInUser { 0% { transform: translateX(-100%); } 100% { transform: translateX(0); } }
@keyframes slideInAssistant { 0% { transform: translateX(100%); } 100% { transform: translateX(0); } }

/* Document management styles */
.document-manager { background-color: #fff; border-radius: 10px; padding: 15px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
.document-manager-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 15px; }
.document-list { max-height: 200px; overflow-y: auto; border: 1px solid #eee; border-radius: 5px; padding: 10px; }

/* Mobile responsiveness */
@media (max-width: 768px) {
    .gradio-header h1 { font-size: 1.8rem; }
    .gradio-header h3 { font-size: 1rem; }
    .gradio-chatbot { max-height: 400px; }
    .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file { width: 100%; }
    #submit-btn, #clear-history { width: 100%; margin-left: 0; }
    .split-view-container { flex-direction: column; }
}
"""


# Gradio interface setup
def create_interface():
    with gr.Blocks(css=custom_css) as demo:
        # Header. NOTE: the original markup inside this string was lost in the
        # source; the reconstruction below reuses the .gradio-header CSS classes.
        gr.Markdown(
            """
            <div class="gradio-header">
                <h1>Enhanced Multimodal Chatbot</h1>
                <h3>Interact with text, images, voice, and multiple PDFs</h3>
            </div>
            """
        )

        # Add a description in an expandable accordion
        with gr.Accordion("Click to expand for details", open=False):
            gr.Markdown(
                """
                ### Description:
                This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.
                - **Text Mode**: Ask questions or provide text for the assistant to respond to.
                - **Image Mode**: Upload an image for the assistant to analyze and discuss.
                - **Voice Mode**: Upload or record audio that will be transcribed and processed.
                - **PDF Mode**: Upload multiple PDFs, select which one to query, and view them side by side with the chat.

                ### PDF Features:
                - Upload and manage multiple PDFs in a single session
                - Select which document to query from a dropdown menu
                - View PDFs side by side with the chat interface
                - Clear the document library as needed

                ### Model Options:
                - "o1" supports image, voice, PDF, and text chat
                - "o3-mini" supports text, PDF, and voice chat only
                """
            )

        # Store the selected PDF's extracted text as session state
        current_pdf_content = gr.State("")

        with gr.Row():
            openai_api_key = gr.Textbox(
                label="Enter OpenAI API Key",
                type="password",
                placeholder="sk-...",
                interactive=True,
            )

        # Input type selector
        with gr.Row():
            input_type = gr.Radio(
                ["Text", "Image", "Voice", "PDF"],
                label="Choose Input Type",
                value="Text",
            )

        # Input components (text is visible initially; the rest are hidden)
        with gr.Row():
            input_text = gr.Textbox(
                label="Enter Text Question",
                placeholder="Ask a question or provide text",
                lines=2,
                visible=True,
            )
            image_input = gr.Image(label="Upload an Image", type="pil", visible=False)
            audio_input = gr.Audio(label="Upload or Record Audio", type="filepath", visible=False)
            pdf_input = gr.File(label="Upload your PDF", file_types=[".pdf"], visible=False)
            doc_selection = gr.Dropdown(
                label="Select Document to Query",
                choices=[],
                interactive=True,
                visible=False,
            )
            pdf_viewer = gr.HTML(label="PDF Preview", visible=False)

        # Action buttons row
        with gr.Row():
            with gr.Column(scale=1):
                reasoning_effort = gr.Dropdown(
                    label="Reasoning Effort",
                    choices=["low", "medium", "high"],
                    value="medium",
                )
            with gr.Column(scale=1):
                model_choice = gr.Dropdown(
                    label="Select Model",
                    choices=["o1", "o3-mini"],
                    value="o1",
                )
            with gr.Column(scale=1):
                submit_btn = gr.Button("Ask!", elem_id="submit-btn")
            with gr.Column(scale=1):
                clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")
            with gr.Column(scale=1, visible=False) as clear_docs_col:
                clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")

        # Split-view layout used in PDF mode
        with gr.Row(visible=False) as split_view_container:
            with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
                pdf_display = gr.HTML(NO_PDF_HTML)
            with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
                chat_history = gr.Chatbot()

        # Regular chat history display (when not in split view)
        with gr.Row(visible=True) as regular_chat_container:
            chat_history_regular = gr.Chatbot()

        # Function to handle selection of a document from the dropdown
        def handle_doc_selection(doc_id):
            if not doc_id:
                return "", update_pdf_viewer(None)
            content, path = get_selected_document_content(doc_id)
            return content, update_pdf_viewer(path)
        # Function to update the PDF viewer.
        # NOTE: the original HTML inside this function was lost in the source;
        # the markup below is a reconstruction using the CSS classes defined
        # above. Gradio serves local files under its /file= route when access
        # to the path is permitted (uploaded files live in its temp directory).
        def update_pdf_viewer(pdf_path):
            if not pdf_path:
                return NO_PDF_HTML
            return (
                f'<div class="pdf-preview-panel">'
                f'<iframe class="pdf-viewer" src="/file={pdf_path}"></iframe>'
                f'</div>'
            )

        # Function to toggle between split view and regular view
        def toggle_view(choice):
            if choice == "PDF":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

        # Show/hide the input widgets when the input type changes
        input_type.change(
            fn=update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer],
        )

        # Toggle between split view and regular view on the same event
        input_type.change(
            fn=toggle_view,
            inputs=[input_type],
            outputs=[split_view_container, regular_chat_container, clear_docs_col],
        )

        # Register an uploaded PDF and select it in the dropdown.
        # (The original listed doc_selection twice in the outputs; a single
        # gr.update carrying both choices and value achieves the same thing.)
        pdf_input.change(
            fn=handle_pdf_upload,
            inputs=[pdf_input],
            outputs=[doc_selection],
        )

        # Load the document's text and refresh the viewer on selection
        doc_selection.change(
            fn=handle_doc_selection,
            inputs=[doc_selection],
            outputs=[current_pdf_content, pdf_display],
        )

        # Main submit flow; the chained .then() mirrors the updated history
        # into the split-view chatbot only after the main handler finishes
        submit_btn.click(
            fn=chatbot,
            inputs=[
                input_text, image_input, audio_input, pdf_input, doc_selection,
                openai_api_key, reasoning_effort, model_choice,
                current_pdf_content, chat_history_regular,
            ],
            outputs=[
                input_text, image_input, audio_input, pdf_input, doc_selection,
                current_pdf_content, chat_history_regular,
            ],
        ).then(
            fn=lambda history: history,
            inputs=[chat_history_regular],
            outputs=[chat_history],
        )

        # Clear the chat, then its split-view mirror
        clear_chat_btn.click(
            fn=clear_history,
            inputs=[],
            outputs=[input_text, image_input, audio_input, pdf_input,
                     doc_selection, current_pdf_content, chat_history_regular],
        ).then(
            fn=lambda: [],
            inputs=[],
            outputs=[chat_history],
        )

        # Clear all documents
        clear_docs_btn.click(
            fn=clear_documents,
            inputs=[],
            outputs=[doc_selection, current_pdf_content, pdf_display],
        )

    return demo


# Run the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()