shukdevdatta123 committed
Commit 02e9dd4 · verified · 1 Parent(s): ea2202e

Update app.py

Files changed (1):
  1. app.py +56 -289
app.py CHANGED
@@ -6,53 +6,6 @@ import io
  import os
  import tempfile
  import fitz  # PyMuPDF for PDF handling
- import uuid
- import json
-
- # Class to manage document storage
- class DocumentManager:
-     def __init__(self):
-         self.documents = {}  # Dictionary to store documents: {doc_id: {"name": name, "content": content, "path": path}}
-
-     def add_document(self, file_path, file_name=None):
-         """Add a document to the manager and return its ID"""
-         if file_name is None:
-             file_name = os.path.basename(file_path)
-
-         doc_id = str(uuid.uuid4())
-         content = extract_text_from_pdf(file_path)
-
-         self.documents[doc_id] = {
-             "name": file_name,
-             "content": content,
-             "path": file_path
-         }
-
-         return doc_id
-
-     def get_document_content(self, doc_id):
-         """Get the content of a document by its ID"""
-         if doc_id in self.documents:
-             return self.documents[doc_id]["content"]
-         return ""
-
-     def get_document_path(self, doc_id):
-         """Get the file path of a document by its ID"""
-         if doc_id in self.documents:
-             return self.documents[doc_id]["path"]
-         return None
-
-     def get_document_list(self):
-         """Get a list of document names and IDs for dropdown"""
-         return [(self.documents[doc_id]["name"], doc_id) for doc_id in self.documents]
-
-     def clear_documents(self):
-         """Clear all documents"""
-         self.documents = {}
-         return []
-
- # Initialize the document manager
- document_manager = DocumentManager()

  # Function to extract text from PDF files
  def extract_text_from_pdf(pdf_file):
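
The body of `extract_text_from_pdf` sits outside this hunk. For orientation only, here is a minimal sketch of what such a helper typically looks like with PyMuPDF (the `fitz` import above); the function name and details here are illustrative assumptions, not necessarily the implementation in app.py:

```python
import fitz  # PyMuPDF

def extract_text_from_pdf_sketch(pdf_path):
    """Illustrative only: concatenate the text of every page of a PDF."""
    parts = []
    with fitz.open(pdf_path) as doc:  # open the document from a file path
        for page in doc:
            parts.append(page.get_text())
    return "\n".join(parts)
```
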
@@ -148,76 +101,48 @@ def transcribe_audio(audio, openai_api_key):
      except Exception as e:
          return f"Error transcribing audio: {str(e)}"

- # Function to handle PDF uploads
- def handle_pdf_upload(pdf_file):
-     if pdf_file is None:
-         return [], None
-
-     # Add the PDF to the document manager
-     doc_id = document_manager.add_document(pdf_file.name)
-
-     # Return updated dropdown list and the selected document ID
-     doc_list = document_manager.get_document_list()
-     # Only set the value if the list is not empty
-     selected_value = doc_id if doc_list else None
-
-     return doc_list, selected_value
-
- # Function to get PDF content based on selected document
- def get_selected_document_content(doc_id):
-     if not doc_id:
-         return "", None
-
-     # Get the document path for the PDF viewer
-     doc_path = document_manager.get_document_path(doc_id)
-
-     # Return the document content for the AI and the path for the viewer
-     return document_manager.get_document_content(doc_id), doc_path
-
  # The function that will be used by Gradio interface
- def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key, reasoning_effort, model_choice, current_pdf_content, history=[]):
      # If there's audio, transcribe it to text
      if audio:
          input_text = transcribe_audio(audio, openai_api_key)

-     # Determine which PDF content to use
-     pdf_content_to_use = current_pdf_content

      # Generate the response
-     response = generate_response(input_text, image, pdf_content_to_use, openai_api_key, reasoning_effort, model_choice)

      # Append the response to the history
      if input_text:
-         if doc_selection:
-             # Include the document name in the history
-             doc_name = next((doc[0] for doc in document_manager.get_document_list() if doc[1] == doc_selection), "Unknown Document")
-             history.append((f"User: {input_text} [Query on: {doc_name}]", f"Assistant: {response}"))
-         else:
-             history.append((f"User: {input_text}", f"Assistant: {response}"))
      else:
          history.append((f"User: [Uploaded content]", f"Assistant: {response}"))

-     return "", None, None, None, doc_selection, current_pdf_content, history

- # Function to clear the chat history and reset selected document
  def clear_history():
-     return "", None, None, None, None, "", []

- # Function to clear all documents
- def clear_documents():
-     document_list = document_manager.clear_documents()
-     return document_list, None, "", None

  # Function to update visible components based on input type selection
  def update_input_type(choice):
      if choice == "Text":
-         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
      elif choice == "Image":
-         return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
      elif choice == "Voice":
-         return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
      elif choice == "PDF":
-         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

  # Custom CSS styles with animations and button colors
  custom_css = """
@@ -350,29 +275,6 @@ custom_css = """
      margin-left: auto;
      animation: slideInAssistant 0.5s ease-out;
  }
- /* PDF preview panel */
- .pdf-preview-panel {
-     border: 2px solid #ccc;
-     border-radius: 8px;
-     overflow: hidden;
-     height: 600px;
-     background-color: #f5f5f5;
- }
- /* PDF viewer iframe */
- .pdf-viewer {
-     width: 100%;
-     height: 100%;
-     border: none;
- }
- /* Split view container */
- .split-view-container {
-     display: flex;
-     gap: 20px;
- }
- .split-view-panel {
-     flex: 1;
-     min-width: 0; /* Allow panels to shrink below their content size */
- }
  /* Animation keyframes */
  @keyframes fadeIn {
      0% { opacity: 0; }
@@ -386,27 +288,6 @@ custom_css = """
      0% { transform: translateX(100%); }
      100% { transform: translateX(0); }
  }
- /* Document management styles */
- .document-manager {
-     background-color: #fff;
-     border-radius: 10px;
-     padding: 15px;
-     box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
-     margin-bottom: 20px;
- }
- .document-manager-header {
-     display: flex;
-     justify-content: space-between;
-     align-items: center;
-     margin-bottom: 15px;
- }
- .document-list {
-     max-height: 200px;
-     overflow-y: auto;
-     border: 1px solid #eee;
-     border-radius: 5px;
-     padding: 10px;
- }
  /* Mobile responsiveness */
  @media (max-width: 768px) {
      .gradio-header h1 {
@@ -425,9 +306,6 @@ custom_css = """
          width: 100%;
          margin-left: 0;
      }
-     .split-view-container {
-         flex-direction: column;
-     }
  }
  """
 
@@ -436,8 +314,8 @@ def create_interface():
      with gr.Blocks(css=custom_css) as demo:
          gr.Markdown("""
          <div class="gradio-header">
-             <h1>Enhanced Multimodal Chatbot</h1>
-             <h3>Interact with text, images, voice, and multiple PDFs</h3>
          </div>
          """)

@@ -445,26 +323,23 @@ def create_interface():
          with gr.Accordion("Click to expand for details", open=False):
              gr.Markdown("""
              ### Description:
-             This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.
-
-             - **Text Mode**: Ask questions or provide text for the assistant to respond.
-             - **Image Mode**: Upload an image for the assistant to analyze and discuss.
-             - **Voice Mode**: Upload or record audio that will be transcribed and processed.
-             - **PDF Mode**: Upload multiple PDFs, select which one to query, and view them side-by-side with the chat.
-
-             ### PDF Features:
-             - Upload and manage multiple PDFs in a single session
-             - Select which document to query from a dropdown menu
-             - View PDFs side-by-side with the chat interface
-             - Clear document library as needed
-
-             ### Model Options:
-             - "o1" is for image, voice, PDF and text chat
-             - "o3-mini" is for text, PDF and voice chat only
              """)

          # Store PDF content as a state variable
-         current_pdf_content = gr.State("")

          with gr.Row():
              openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
@@ -501,162 +376,54 @@ def create_interface():
                  visible=False
              )

-             # PDF input and document selection components
              pdf_input = gr.File(
                  label="Upload your PDF",
                  file_types=[".pdf"],
                  visible=False
              )
-
-             # Dropdown for document selection
-             doc_selection = gr.Dropdown(
-                 label="Select Document to Query",
-                 choices=[],
-                 interactive=True,
-                 visible=False
-             )
-
-             # PDF Viewer (initially hidden)
-             pdf_viewer = gr.HTML(
-                 label="PDF Preview",
-                 visible=False
-             )

-         # Action buttons row
          with gr.Row():
-             with gr.Column(scale=1):
-                 reasoning_effort = gr.Dropdown(
-                     label="Reasoning Effort",
-                     choices=["low", "medium", "high"],
-                     value="medium"
-                 )
-
-             with gr.Column(scale=1):
-                 model_choice = gr.Dropdown(
-                     label="Select Model",
-                     choices=["o1", "o3-mini"],
-                     value="o1"
-                 )
-
-             with gr.Column(scale=1):
-                 submit_btn = gr.Button("Ask!", elem_id="submit-btn")
-
-             with gr.Column(scale=1):
-                 clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")
-
-             with gr.Column(scale=1, visible=False) as clear_docs_col:
-                 clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")
-
-         # Create a container for the split view layout when in PDF mode
-         with gr.Row(visible=False) as split_view_container:
-             with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
-                 pdf_display = gr.HTML(
-                     """<div class="pdf-preview-panel">
-                     <iframe class="pdf-viewer" id="pdf-viewer" src="about:blank"></iframe>
-                     </div>"""
-                 )
-
-             with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
-                 chat_history = gr.Chatbot()
-
-         # Regular chat history display (when not in split view)
-         with gr.Row(visible=True) as regular_chat_container:
-             chat_history_regular = gr.Chatbot()
-
-         # Function to handle selection of a document from dropdown
-         def handle_doc_selection(doc_id):
-             if not doc_id:
-                 return "", update_pdf_viewer(None)
-
-             content, path = get_selected_document_content(doc_id)
-             return content, update_pdf_viewer(path)
-
-         # Function to update the PDF viewer
-         def update_pdf_viewer(pdf_path):
-             if not pdf_path:
-                 return """<div class="pdf-preview-panel">
-                 <div style="padding: 20px; text-align: center;">No PDF selected</div>
-                 </div>"""
-
-             # Create a data URL or temporary file path to display the PDF
-             return f"""<div class="pdf-preview-panel">
-                 <iframe class="pdf-viewer" id="pdf-viewer" src="file={pdf_path}" type="application/pdf"></iframe>
-                 </div>"""

-         # Function to toggle between split view and regular view based on input type
-         def toggle_view(choice):
-             if choice == "PDF":
-                 return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
-             else:
-                 return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

          # Connect the input type selector to the update function
          input_type.change(
              fn=update_input_type,
              inputs=[input_type],
-             outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer]
-         )
-
-         # Toggle between split view and regular view when input type changes
-         input_type.change(
-             fn=toggle_view,
-             inputs=[input_type],
-             outputs=[split_view_container, regular_chat_container, clear_docs_col]
          )

          # Process PDF when uploaded
          pdf_input.change(
-             fn=handle_pdf_upload,
              inputs=[pdf_input],
-             outputs=[doc_selection, doc_selection]
-         )
-
-         # Update content when document is selected
-         doc_selection.change(
-             fn=handle_doc_selection,
-             inputs=[doc_selection],
-             outputs=[current_pdf_content, pdf_display]
          )

          # Button interactions
          submit_btn.click(
              fn=chatbot,
-             inputs=[
-                 input_text, image_input, audio_input, pdf_input,
-                 doc_selection, openai_api_key, reasoning_effort,
-                 model_choice, current_pdf_content, chat_history_regular  # Added chat_history_regular to avoid creating new empty list
-             ],
-             outputs=[
-                 input_text, image_input, audio_input, pdf_input,
-                 doc_selection, current_pdf_content, chat_history_regular
-             ]
-         )
-
-         # Also update the split view chat history when submitting
-         submit_btn.click(
-             fn=lambda history: history,
-             inputs=[chat_history_regular],
-             outputs=[chat_history]
          )

-         clear_chat_btn.click(
              fn=clear_history,
              inputs=[],
-             outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, current_pdf_content, chat_history_regular]
-         )
-
-         # Also clear the split view chat history
-         clear_chat_btn.click(
-             fn=lambda: [],
-             inputs=[],
-             outputs=[chat_history]
-         )
-
-         # Clear all documents
-         clear_docs_btn.click(
-             fn=clear_documents,
-             inputs=[],
-             outputs=[doc_selection, doc_selection, current_pdf_content, pdf_display]
          )

      return demo
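
`create_interface()` only builds and returns the Blocks app; the entry point that launches it is not part of this diff. Assuming the usual Gradio pattern, it would look something like the following hypothetical launcher:

```python
# Hypothetical launcher; the actual call in app.py is outside the hunks shown here.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
```
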
 
app.py (updated)

  import os
  import tempfile
  import fitz  # PyMuPDF for PDF handling

  # Function to extract text from PDF files
  def extract_text_from_pdf(pdf_file):
 
      except Exception as e:
          return f"Error transcribing audio: {str(e)}"

  # The function that will be used by Gradio interface
+ def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, history=[]):
      # If there's audio, transcribe it to text
      if audio:
          input_text = transcribe_audio(audio, openai_api_key)

+     # If a new PDF is uploaded, extract its text
+     new_pdf_content = pdf_content
+     if pdf_file is not None:
+         new_pdf_content = extract_text_from_pdf(pdf_file)

      # Generate the response
+     response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)

      # Append the response to the history
      if input_text:
+         history.append((f"User: {input_text}", f"Assistant: {response}"))
      else:
          history.append((f"User: [Uploaded content]", f"Assistant: {response}"))

+     return "", None, None, None, new_pdf_content, history

+ # Function to clear the chat history and PDF content
  def clear_history():
+     return "", None, None, None, "", []

+ # Function to process a newly uploaded PDF
+ def process_pdf(pdf_file):
+     if pdf_file is None:
+         return ""
+     return extract_text_from_pdf(pdf_file)

  # Function to update visible components based on input type selection
  def update_input_type(choice):
      if choice == "Text":
+         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
      elif choice == "Image":
+         return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
      elif choice == "Voice":
+         return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
      elif choice == "PDF":
+         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)

  # Custom CSS styles with animations and button colors
  custom_css = """
 
      margin-left: auto;
      animation: slideInAssistant 0.5s ease-out;
  }
  /* Animation keyframes */
  @keyframes fadeIn {
      0% { opacity: 0; }

      0% { transform: translateX(100%); }
      100% { transform: translateX(0); }
  }
  /* Mobile responsiveness */
  @media (max-width: 768px) {
      .gradio-header h1 {

          width: 100%;
          margin-left: 0;
      }
  }
  """

      with gr.Blocks(css=custom_css) as demo:
          gr.Markdown("""
          <div class="gradio-header">
+             <h1>Multimodal Chatbot (Text + Image + Voice + PDF)</h1>
+             <h3>Interact with a chatbot using text, image, voice, or PDF inputs</h3>
          </div>
          """)

          with gr.Accordion("Click to expand for details", open=False):
              gr.Markdown("""
              ### Description:
+             This is a multimodal chatbot that can handle text, image, voice, and PDF inputs.
+             - You can ask questions or provide text, and the assistant will respond.
+             - You can upload an image, and the assistant will process it and answer questions about the image.
+             - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
+             - PDF support: Upload a PDF and ask questions about its content.
+             - Enter your OpenAI API key to start interacting with the model.
+             - You can use the 'Clear History' button to remove the conversation history.
+             - "o1" is for image, voice, PDF and text chat and "o3-mini" is for text, PDF and voice chat only.
+             ### Reasoning Effort:
+             The reasoning effort controls how complex or detailed the assistant's answers should be.
+             - **Low**: Provides quick, concise answers with minimal reasoning or details.
+             - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
+             - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
              """)

          # Store PDF content as a state variable
+         pdf_content = gr.State("")

          with gr.Row():
              openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
 
                  visible=False
              )

+             # PDF input
              pdf_input = gr.File(
                  label="Upload your PDF",
                  file_types=[".pdf"],
                  visible=False
              )

          with gr.Row():
+             reasoning_effort = gr.Dropdown(
+                 label="Reasoning Effort",
+                 choices=["low", "medium", "high"],
+                 value="medium"
+             )
+             model_choice = gr.Dropdown(
+                 label="Select Model",
+                 choices=["o1", "o3-mini"],
+                 value="o1"  # Default to 'o1' for image-related tasks
+             )
+             submit_btn = gr.Button("Ask!", elem_id="submit-btn")
+             clear_btn = gr.Button("Clear History", elem_id="clear-history")

+         chat_history = gr.Chatbot()

          # Connect the input type selector to the update function
          input_type.change(
              fn=update_input_type,
              inputs=[input_type],
+             outputs=[input_text, image_input, audio_input, pdf_input]
          )

          # Process PDF when uploaded
          pdf_input.change(
+             fn=process_pdf,
              inputs=[pdf_input],
+             outputs=[pdf_content]
          )

          # Button interactions
          submit_btn.click(
              fn=chatbot,
+             inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content],
+             outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
          )

+         clear_btn.click(
              fn=clear_history,
              inputs=[],
+             outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
          )

      return demo
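
The accordion text above explains the low/medium/high reasoning-effort levels and the o1 / o3-mini model choice, but `generate_response` itself lies outside this diff. As a hedged sketch (not the author's implementation), the two dropdown values would typically be forwarded to the OpenAI Chat Completions API roughly as follows; the helper name and prompt layout are assumptions, and image handling is omitted:

```python
from openai import OpenAI

def generate_response_sketch(input_text, pdf_content, openai_api_key, reasoning_effort, model_choice):
    """Illustrative only: how reasoning_effort and model_choice might reach the API."""
    client = OpenAI(api_key=openai_api_key)
    # Fold any extracted PDF text into the prompt alongside the user's question.
    prompt = f"{input_text}\n\nPDF content:\n{pdf_content}" if pdf_content else input_text
    completion = client.chat.completions.create(
        model=model_choice,                 # "o1" or "o3-mini"
        reasoning_effort=reasoning_effort,  # "low", "medium", or "high"
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
```
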