Spaces:

shukdevdatta123
/

Multi-modal-o1-Chatbot

Running

App Files Files Community

shukdevdatta123 committed on Mar 11

Commit

0d11d75

verified ·

1 Parent(s): 238e053

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -56

app.py CHANGED Viewed

@@ -6,6 +6,53 @@ import io
 import os
 import tempfile
 import fitz  # PyMuPDF for PDF handling
 # Function to extract text from PDF files
 def extract_text_from_pdf(pdf_file):
@@ -101,48 +148,72 @@ def transcribe_audio(audio, openai_api_key):
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
 # The function that will be used by Gradio interface
-def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, history=[]):
     # If there's audio, transcribe it to text
     if audio:
         input_text = transcribe_audio(audio, openai_api_key)
-    # If a new PDF is uploaded, extract its text
-    new_pdf_content = pdf_content
-    if pdf_file is not None:
-        new_pdf_content = extract_text_from_pdf(pdf_file)
     # Generate the response
-    response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
     # Append the response to the history
     if input_text:
-        history.append((f"User: {input_text}", f"Assistant: {response}"))
     else:
         history.append((f"User: [Uploaded content]", f"Assistant: {response}"))
-    return "", None, None, None, new_pdf_content, history
-# Function to clear the chat history and PDF content
 def clear_history():
-    return "", None, None, None, "", []
-# Function to process a newly uploaded PDF
-def process_pdf(pdf_file):
-    if pdf_file is None:
-        return ""
-    return extract_text_from_pdf(pdf_file)
 # Function to update visible components based on input type selection
 def update_input_type(choice):
     if choice == "Text":
-        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     elif choice == "Image":
-        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
     elif choice == "Voice":
-        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
     elif choice == "PDF":
-        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
 # Custom CSS styles with animations and button colors
 custom_css = """
@@ -275,6 +346,29 @@ custom_css = """
         margin-left: auto;
         animation: slideInAssistant 0.5s ease-out;
     }
     /* Animation keyframes */
     @keyframes fadeIn {
         0% { opacity: 0; }
@@ -288,6 +382,27 @@ custom_css = """
         0% { transform: translateX(100%); }
         100% { transform: translateX(0); }
     }
     /* Mobile responsiveness */
     @media (max-width: 768px) {
         .gradio-header h1 {
@@ -306,6 +421,9 @@ custom_css = """
             width: 100%;
             margin-left: 0;
         }
     }
 """
@@ -314,8 +432,8 @@ def create_interface():
     with gr.Blocks(css=custom_css) as demo:
         gr.Markdown("""
             <div class="gradio-header">
-                <h1>Multimodal Chatbot (Text + Image + Voice + PDF)</h1>
-                <h3>Interact with a chatbot using text, image, voice, or PDF inputs</h3>
             </div>
         """)
@@ -323,23 +441,26 @@ def create_interface():
         with gr.Accordion("Click to expand for details", open=False):
             gr.Markdown("""
             ### Description:
-            This is a multimodal chatbot that can handle text, image, voice, and PDF inputs.
-            - You can ask questions or provide text, and the assistant will respond.
-            - You can upload an image, and the assistant will process it and answer questions about the image.
-            - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
-            - PDF support: Upload a PDF and ask questions about its content.
-            - Enter your OpenAI API key to start interacting with the model.
-            - You can use the 'Clear History' button to remove the conversation history.
-            - "o1" is for image, voice, PDF and text chat and "o3-mini" is for text, PDF and voice chat only.
-            ### Reasoning Effort:
-            The reasoning effort controls how complex or detailed the assistant's answers should be.
-            - **Low**: Provides quick, concise answers with minimal reasoning or details.
-            - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
-            - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
             """)
         # Store PDF content as a state variable
-        pdf_content = gr.State("")
         with gr.Row():
             openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
@@ -376,54 +497,162 @@ def create_interface():
                 visible=False
             )
-            # PDF input
             pdf_input = gr.File(
                 label="Upload your PDF",
                 file_types=[".pdf"],
                 visible=False
             )
-        with gr.Row():
-            reasoning_effort = gr.Dropdown(
-                label="Reasoning Effort",
-                choices=["low", "medium", "high"],
-                value="medium"
             )
-            model_choice = gr.Dropdown(
-                label="Select Model",
-                choices=["o1", "o3-mini"],
-                value="o1"  # Default to 'o1' for image-related tasks
             )
-            submit_btn = gr.Button("Ask!", elem_id="submit-btn")
-            clear_btn = gr.Button("Clear History", elem_id="clear-history")
-        chat_history = gr.Chatbot()
         # Connect the input type selector to the update function
         input_type.change(
             fn=update_input_type,
             inputs=[input_type],
-            outputs=[input_text, image_input, audio_input, pdf_input]
         )
         # Process PDF when uploaded
         pdf_input.change(
-            fn=process_pdf,
             inputs=[pdf_input],
-            outputs=[pdf_content]
         )
         # Button interactions
         submit_btn.click(
             fn=chatbot,
-            inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content],
-            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
         )
-        clear_btn.click(
             fn=clear_history,
             inputs=[],
-            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
         )
     return demo

 import os
 import tempfile
 import fitz  # PyMuPDF for PDF handling
+import uuid
+import json
+# Class to manage document storage
class DocumentManager:
    """In-memory registry of uploaded PDF documents, keyed by a generated ID.

    ``documents`` maps each ID (a UUID4 string) to a dict with three keys:
    ``"name"`` (display name), ``"content"`` (whatever
    ``extract_text_from_pdf`` returned for the file) and ``"path"`` (the path
    the document was registered from).
    """

    def __init__(self) -> None:
        # {doc_id: {"name": name, "content": content, "path": path}}
        self.documents = {}

    def add_document(self, file_path, file_name=None) -> str:
        """Register the PDF at *file_path* and return its newly generated ID.

        The text is extracted once, at registration time, and stored with the
        entry. When *file_name* is omitted, the basename of *file_path* is
        used as the display name.
        """
        if file_name is None:
            file_name = os.path.basename(file_path)
        doc_id = str(uuid.uuid4())
        self.documents[doc_id] = {
            "name": file_name,
            "content": extract_text_from_pdf(file_path),
            "path": file_path,
        }
        return doc_id

    def get_document_content(self, doc_id):
        """Return the stored content for *doc_id*, or ``""`` if it is unknown."""
        entry = self.documents.get(doc_id)
        return entry["content"] if entry is not None else ""

    def get_document_path(self, doc_id):
        """Return the stored file path for *doc_id*, or ``None`` if it is unknown."""
        entry = self.documents.get(doc_id)
        return entry["path"] if entry is not None else None

    def get_document_list(self) -> list:
        """Return ``(name, doc_id)`` pairs for every document, in insertion order."""
        return [(entry["name"], doc_id) for doc_id, entry in self.documents.items()]

    def clear_documents(self) -> list:
        """Forget every registered document and return an empty list."""
        # Rebind rather than .clear() so a previously handed-out reference to
        # the old mapping is left untouched.
        self.documents = {}
        return []
+# Initialize the document manager
+document_manager = DocumentManager()
 # Function to extract text from PDF files
 def extract_text_from_pdf(pdf_file):
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
+# Function to handle PDF uploads
def handle_pdf_upload(pdf_file):
    """Register a newly uploaded PDF and refresh the document dropdown.

    Args:
        pdf_file: Value emitted by the upload widget: ``None`` when the widget
            is empty, otherwise either a path string or an object whose
            ``name`` attribute holds the path of the uploaded file.

    Returns:
        Two identical ``gr.update`` payloads, one for each entry of the
        ``[doc_selection, doc_selection]`` outputs list this handler is
        wired to. Returning the same payload twice keeps the result correct
        whichever of the two entries is applied last.
    """
    if pdf_file is None:
        # An empty upload widget removes nothing from the document manager,
        # so keep offering the stored documents and leave the current
        # selection untouched.
        refresh = gr.update(choices=document_manager.get_document_list())
        return refresh, refresh

    # Accept both a plain path string and a file wrapper exposing ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    doc_id = document_manager.add_document(pdf_path)

    # A bare list or ID handed to a Dropdown is taken as its *value*; changing
    # the available choices needs an explicit update payload.
    refresh = gr.update(choices=document_manager.get_document_list(), value=doc_id)
    return refresh, refresh
+# Function to get PDF content based on selected document
def get_selected_document_content(doc_id):
    """Look up the text and file path stored for the selected document.

    Returns a ``(content, path)`` pair. A falsy *doc_id* (nothing selected)
    yields ``("", None)`` without consulting the document manager.
    """
    if doc_id:
        # Content feeds the model prompt; the path feeds the PDF viewer.
        text = document_manager.get_document_content(doc_id)
        location = document_manager.get_document_path(doc_id)
        return text, location
    return "", None
 # The function that will be used by Gradio interface
def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key, reasoning_effort, model_choice, current_pdf_content, history=[]):
    """Run one chat turn and return the values used to reset the UI.

    Audio, when present, is transcribed and replaces *input_text*. The reply
    comes from ``generate_response`` using *current_pdf_content* as the PDF
    context. The exchange is appended to *history*, tagged with the selected
    document's name when *doc_selection* is set. *pdf_file* is accepted but
    not read here.

    Returns:
        ``("", None, None, None, doc_selection, current_pdf_content, history)``.
    """
    # NOTE(review): `history` defaults to a single shared list. The submit
    # handler in this file passes only nine inputs, so every call appends to
    # that same list -- confirm this is intended before changing the default.
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)

    response = generate_response(
        input_text,
        image,
        current_pdf_content,
        openai_api_key,
        reasoning_effort,
        model_choice,
    )

    # Build the user-side label first, then record the exchange once.
    if not input_text:
        user_entry = "User: [Uploaded content]"
    elif doc_selection:
        doc_name = "Unknown Document"
        for name, ident in document_manager.get_document_list():
            if ident == doc_selection:
                doc_name = name
                break
        user_entry = f"User: {input_text} [Query on: {doc_name}]"
    else:
        user_entry = f"User: {input_text}"
    history.append((user_entry, f"Assistant: {response}"))

    return "", None, None, None, doc_selection, current_pdf_content, history
+# Function to clear the chat history and reset selected document
 def clear_history():
+    return "", None, None, None, None, "", []
+# Function to clear all documents
def clear_documents():
    """Empty the document manager and return reset values for the PDF widgets.

    Returns, in order: the list handed back by the manager's
    ``clear_documents`` call, ``None``, ``""`` and ``None``.
    """
    remaining = document_manager.clear_documents()
    return (remaining, None, "", None)
 # Function to update visible components based on input type selection
 def update_input_type(choice):
     if choice == "Text":
+        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     elif choice == "Image":
+        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     elif choice == "Voice":
+        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     elif choice == "PDF":
+        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
 # Custom CSS styles with animations and button colors
 custom_css = """
         margin-left: auto;
         animation: slideInAssistant 0.5s ease-out;
     }
+    /* PDF preview panel */
+    .pdf-preview-panel {
+        border: 2px solid #ccc;
+        border-radius: 8px;
+        overflow: hidden;
+        height: 600px;
+        background-color: #f5f5f5;
+    }
+    /* PDF viewer iframe */
+    .pdf-viewer {
+        width: 100%;
+        height: 100%;
+        border: none;
+    }
+    /* Split view container */
+    .split-view-container {
+        display: flex;
+        gap: 20px;
+    }
+    .split-view-panel {
+        flex: 1;
+        min-width: 0;  /* Allow panels to shrink below their content size */
+    }
     /* Animation keyframes */
     @keyframes fadeIn {
         0% { opacity: 0; }
         0% { transform: translateX(100%); }
         100% { transform: translateX(0); }
     }
+    /* Document management styles */
+    .document-manager {
+        background-color: #fff;
+        border-radius: 10px;
+        padding: 15px;
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+        margin-bottom: 20px;
+    }
+    .document-manager-header {
+        display: flex;
+        justify-content: space-between;
+        align-items: center;
+        margin-bottom: 15px;
+    }
+    .document-list {
+        max-height: 200px;
+        overflow-y: auto;
+        border: 1px solid #eee;
+        border-radius: 5px;
+        padding: 10px;
+    }
     /* Mobile responsiveness */
     @media (max-width: 768px) {
         .gradio-header h1 {
             width: 100%;
             margin-left: 0;
         }
+        .split-view-container {
+            flex-direction: column;
+        }
     }
 """
     with gr.Blocks(css=custom_css) as demo:
         gr.Markdown("""
             <div class="gradio-header">
+                <h1>Enhanced Multimodal Chatbot</h1>
+                <h3>Interact with text, images, voice, and multiple PDFs</h3>
             </div>
         """)
         with gr.Accordion("Click to expand for details", open=False):
             gr.Markdown("""
             ### Description:
+            This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.
+            - **Text Mode**: Ask questions or provide text for the assistant to respond.
+            - **Image Mode**: Upload an image for the assistant to analyze and discuss.
+            - **Voice Mode**: Upload or record audio that will be transcribed and processed.
+            - **PDF Mode**: Upload multiple PDFs, select which one to query, and view them side-by-side with the chat.
+            ### PDF Features:
+            - Upload and manage multiple PDFs in a single session
+            - Select which document to query from a dropdown menu
+            - View PDFs side-by-side with the chat interface
+            - Clear document library as needed
+            ### Model Options:
+            - "o1" is for image, voice, PDF and text chat
+            - "o3-mini" is for text, PDF and voice chat only
             """)
         # Store PDF content as a state variable
+        current_pdf_content = gr.State("")
         with gr.Row():
             openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
                 visible=False
             )
+            # PDF input and document selection components
             pdf_input = gr.File(
                 label="Upload your PDF",
                 file_types=[".pdf"],
                 visible=False
             )
+            # Dropdown for document selection
+            doc_selection = gr.Dropdown(
+                label="Select Document to Query",
+                choices=[],
+                interactive=True,
+                visible=False
             )
+            # PDF Viewer (initially hidden)
+            pdf_viewer = gr.HTML(
+                label="PDF Preview",
+                visible=False
             )
+        # Action buttons row
+        with gr.Row():
+            with gr.Column(scale=1):
+                reasoning_effort = gr.Dropdown(
+                    label="Reasoning Effort",
+                    choices=["low", "medium", "high"],
+                    value="medium"
+                )
+            with gr.Column(scale=1):
+                model_choice = gr.Dropdown(
+                    label="Select Model",
+                    choices=["o1", "o3-mini"],
+                    value="o1"
+                )
+            with gr.Column(scale=1):
+                submit_btn = gr.Button("Ask!", elem_id="submit-btn")
+            with gr.Column(scale=1):
+                clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")
+            with gr.Column(scale=1, visible=False) as clear_docs_col:
+                clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")
+        # Create a container for the split view layout when in PDF mode
+        with gr.Row(visible=False) as split_view_container:
+            with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
+                pdf_display = gr.HTML(
+                    """<div class="pdf-preview-panel">
+                        <iframe class="pdf-viewer" id="pdf-viewer" src="about:blank"></iframe>
+                    </div>"""
+                )
+            with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
+                chat_history = gr.Chatbot()
+        # Regular chat history display (when not in split view)
+        with gr.Row(visible=True) as regular_chat_container:
+            chat_history_regular = gr.Chatbot()
+        # Function to handle selection of a document from dropdown
+        def handle_doc_selection(doc_id):
+            if not doc_id:
+                return "", update_pdf_viewer(None)
+            content, path = get_selected_document_content(doc_id)
+            return content, update_pdf_viewer(path)
+        # Function to update the PDF viewer
+        def update_pdf_viewer(pdf_path):
+            if not pdf_path:
+                return """<div class="pdf-preview-panel">
+                            <div style="padding: 20px; text-align: center;">No PDF selected</div>
+                        </div>"""
+            # Create a data URL or temporary file path to display the PDF
+            return f"""<div class="pdf-preview-panel">
+                        <iframe class="pdf-viewer" id="pdf-viewer" src="file={pdf_path}" type="application/pdf"></iframe>
+                    </div>"""
+        # Function to toggle between split view and regular view based on input type
+        def toggle_view(choice):
+            if choice == "PDF":
+                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
+            else:
+                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         # Connect the input type selector to the update function
         input_type.change(
             fn=update_input_type,
             inputs=[input_type],
+            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer]
+        )
+        # Toggle between split view and regular view when input type changes
+        input_type.change(
+            fn=toggle_view,
+            inputs=[input_type],
+            outputs=[split_view_container, regular_chat_container, clear_docs_col]
         )
         # Process PDF when uploaded
         pdf_input.change(
+            fn=handle_pdf_upload,
             inputs=[pdf_input],
+            outputs=[doc_selection, doc_selection]
+        )
+        # Update content when document is selected
+        doc_selection.change(
+            fn=handle_doc_selection,
+            inputs=[doc_selection],
+            outputs=[current_pdf_content, pdf_display]
         )
         # Button interactions
         submit_btn.click(
             fn=chatbot,
+            inputs=[
+                input_text, image_input, audio_input, pdf_input,
+                doc_selection, openai_api_key, reasoning_effort,
+                model_choice, current_pdf_content
+            ],
+            outputs=[
+                input_text, image_input, audio_input, pdf_input,
+                doc_selection, current_pdf_content, chat_history_regular
+            ]
+        )
+        # Also update the split view chat history when submitting
+        submit_btn.click(
+            fn=lambda history: history,
+            inputs=[chat_history_regular],
+            outputs=[chat_history]
         )
+        clear_chat_btn.click(
             fn=clear_history,
             inputs=[],
+            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, current_pdf_content, chat_history_regular]
+        )
+        # Also clear the split view chat history
+        clear_chat_btn.click(
+            fn=lambda: [],
+            inputs=[],
+            outputs=[chat_history]
+        )
+        # Clear all documents
+        clear_docs_btn.click(
+            fn=clear_documents,
+            inputs=[],
+            outputs=[doc_selection, doc_selection, current_pdf_content, pdf_display]
         )
     return demo