# Source: Hugging Face Space "shukdevdatta123/Multi-modal-o1-Chatbot" (status: Running)
# File size: 22,924 bytes

import gradio as gr
import openai
import base64
from PIL import Image
import io
import os
import tempfile
import fitz  # PyMuPDF for PDF handling
import uuid
import json

# Class to manage document storage
class DocumentManager:
    """In-memory registry of uploaded PDF documents keyed by a generated ID.

    Each entry in ``self.documents`` has the shape
    ``{doc_id: {"name": name, "content": content, "path": path}}``.
    """

    def __init__(self):
        self.documents = {}

    def add_document(self, file_path, file_name=None):
        """Register the PDF at ``file_path`` and return its new document ID.

        When ``file_name`` is omitted, the base name of ``file_path`` is used
        as the display name. The text is extracted once, at registration time.
        """
        display_name = os.path.basename(file_path) if file_name is None else file_name
        new_id = str(uuid.uuid4())
        extracted_text = extract_text_from_pdf(file_path)

        entry = {}
        entry["name"] = display_name
        entry["content"] = extracted_text
        entry["path"] = file_path
        self.documents[new_id] = entry

        return new_id

    def get_document_content(self, doc_id):
        """Return the extracted text for ``doc_id``, or ``""`` if it is unknown."""
        if doc_id not in self.documents:
            return ""
        return self.documents[doc_id]["content"]

    def get_document_path(self, doc_id):
        """Return the stored file path for ``doc_id``, or ``None`` if it is unknown."""
        if doc_id not in self.documents:
            return None
        return self.documents[doc_id]["path"]

    def get_document_list(self):
        """Return ``(name, doc_id)`` pairs, in insertion order, for a dropdown."""
        pairs = []
        for doc_id, entry in self.documents.items():
            pairs.append((entry["name"], doc_id))
        return pairs

    def clear_documents(self):
        """Forget every document and return an empty list."""
        # Rebind to a new dict (rather than clearing in place) so any outside
        # reference to the previous mapping is left untouched.
        self.documents = {}
        return []

# Single module-level DocumentManager instance. The upload, selection, chat and
# clear handlers in this module all read and write this same object, so every
# caller in the process sees the same set of documents.
document_manager = DocumentManager()

# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined in page order. If anything goes wrong
        (missing file, corrupt PDF, ...), a string starting with
        ``"Error extracting text from PDF: "`` is returned instead of raising.
    """
    try:
        # The context manager closes the document even if a page fails to
        # parse part-way through, which the manual close() call did not.
        with fitz.open(pdf_file) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        # Best-effort by design: callers store the returned string as the
        # document content, so report the failure as text.
        return f"Error extracting text from PDF: {str(e)}"

# Function to send the request to OpenAI API with an image, text or PDF input
def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort="medium", model_choice="o1"):
    """Send the user's text, image and/or PDF context to OpenAI and return the reply.

    Args:
        input_text: The user's question or prompt (may be empty or None).
        image: A PIL image to attach, or None.
        pdf_content: Extracted text of the selected PDF, or an empty value.
        openai_api_key: API key used for the request.
        reasoning_effort: Passed through to the API as ``reasoning_effort``.
        model_choice: Model name. Images are only attached for ``"o1"``;
            every other model receives text only.

    Returns:
        The assistant's reply text, or a string starting with ``"Error"``
        describing what went wrong. This function does not raise for API
        failures.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    # Build the text part. With both a document and a question, the question
    # is wrapped in a prompt that carries the document text.
    if pdf_content and input_text:
        text_part = f"Based on the following document content, please answer this question: '{input_text}'\n\nDocument content:\n{pdf_content}"
    else:
        text_part = input_text

    content_parts = []
    if text_part:
        content_parts.append({"type": "text", "text": text_part})

    if image is not None:
        if model_choice == "o1":
            # Attach the image next to any text instead of replacing it, so a
            # question typed alongside the image is no longer dropped.
            image_info = get_base64_string_from_image(image)
            content_parts.append(
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_info}"}}
            )
        elif not content_parts:
            # Never send the base64 payload as plain text to a text-only model.
            return f"Error: Model '{model_choice}' does not accept image input."

    if not content_parts:
        return "Error: No input provided."

    openai.api_key = openai_api_key
    messages = [{"role": "user", "content": content_parts}]

    try:
        # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface -
        # confirm the pinned openai version before upgrading.
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=messages,
            reasoning_effort=reasoning_effort,
            max_completion_tokens=2000
        )

        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"

# Function to convert an uploaded image to a base64 string
def get_base64_string_from_image(pil_image):
    """Encode an image as PNG and return those bytes as a base64 text string.

    ``pil_image`` only needs a ``save(fp, format=...)`` method; the result is
    the UTF-8 decoded base64 of whatever that call writes.
    """
    with io.BytesIO() as png_buffer:
        pil_image.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode("utf-8")

# Function to transcribe audio to text using OpenAI Whisper API
def transcribe_audio(audio, openai_api_key):
    """Transcribe an audio file to text with OpenAI's ``whisper-1`` model.

    Args:
        audio: Path of the audio file to transcribe.
        openai_api_key: API key used for the request.

    Returns:
        The transcribed text, or a string starting with ``"Error"`` when the
        key is missing or the file cannot be read or transcribed.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key

    try:
        # Hand the real file object to the API: its ``name`` keeps the actual
        # extension (mp3, m4a, ...), where the previous in-memory copy
        # relabelled every upload as 'audio.wav'. It also avoids reading the
        # whole file into memory first.
        with open(audio, 'rb') as audio_file:
            audio_file_transcription = openai.Audio.transcribe(file=audio_file, model="whisper-1")
        return audio_file_transcription['text']
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

# Function to handle PDF uploads
def handle_pdf_upload(pdf_file):
    """Register an uploaded PDF and refresh the document dropdown.

    Args:
        pdf_file: The value of the PDF upload widget: an object exposing the
            file path as ``.name``, a plain path string, or None when the
            widget was cleared.

    Returns:
        Two identical Gradio updates for the document dropdown (the handler is
        wired to that component twice). After an upload they set the choices
        to all known documents and select the new one. When the widget is
        cleared they only resync the choices and leave the selection alone.
    """
    if pdf_file is None:
        # Clearing the upload widget must not wipe the document library or the
        # current selection; just keep the dropdown choices in sync.
        refreshed = gr.update(choices=document_manager.get_document_list())
        return refreshed, refreshed

    # Accept both file-like upload objects and plain path strings.
    file_path = getattr(pdf_file, "name", pdf_file)
    doc_id = document_manager.add_document(file_path)

    # A raw list returned to a Dropdown is treated as its *value*; the choices
    # only change through an explicit update.
    dropdown_update = gr.update(
        choices=document_manager.get_document_list(),
        value=doc_id,
    )
    return dropdown_update, dropdown_update

# Function to get PDF content based on selected document
def get_selected_document_content(doc_id):
    """Look up the selected document's text and file path.

    Returns ``(content, path)`` from the shared document manager, or
    ``("", None)`` when ``doc_id`` is empty or None.
    """
    if doc_id:
        # Text goes to the model; the path is what the PDF viewer displays.
        text = document_manager.get_document_content(doc_id)
        location = document_manager.get_document_path(doc_id)
        return text, location

    return "", None

# The function that will be used by Gradio interface
def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key, reasoning_effort, model_choice, current_pdf_content, history=None):
    """Handle one chat turn: transcribe audio if present, query the model, log the exchange.

    Args:
        input_text: Text typed by the user; replaced by the transcription when
            ``audio`` is given.
        image: Optional image forwarded to ``generate_response``.
        audio: Optional audio file path; transcribed with ``transcribe_audio``.
        pdf_file: Value of the upload widget. Not used here; kept so the
            signature matches the inputs wired to this handler.
        doc_selection: ID of the selected document, used only to label the
            history entry with the document name.
        openai_api_key: API key passed to the OpenAI helpers.
        reasoning_effort: Passed through to ``generate_response``.
        model_choice: Passed through to ``generate_response``.
        current_pdf_content: Text of the selected PDF, sent as context.
        history: List of ``(user, assistant)`` tuples to append to. A new list
            is created when this is None.

    Returns:
        A 7-tuple ``("", None, None, None, doc_selection, current_pdf_content,
        history)``: the first four values reset the text, image, audio and
        upload inputs, and the rest carry the state forward.
    """
    # A mutable default would be shared between calls; also tolerate a chat
    # component that passes None for an empty conversation.
    if history is None:
        history = []

    # If there's audio, transcribe it to text
    if audio:
        transcript = transcribe_audio(audio, openai_api_key)
        # transcribe_audio reports failures as strings with these prefixes.
        # Show them to the user instead of sending them to the model as if
        # they were the user's question.
        if transcript.startswith(("Error: No API key provided.", "Error transcribing audio:")):
            history.append((f"User: [Uploaded content]", f"Assistant: {transcript}"))
            return "", None, None, None, doc_selection, current_pdf_content, history
        input_text = transcript

    response = generate_response(input_text, image, current_pdf_content, openai_api_key, reasoning_effort, model_choice)

    # Append the exchange to the history
    if input_text:
        if doc_selection:
            # Include the document name in the history
            doc_name = next((doc[0] for doc in document_manager.get_document_list() if doc[1] == doc_selection), "Unknown Document")
            history.append((f"User: {input_text} [Query on: {doc_name}]", f"Assistant: {response}"))
        else:
            history.append((f"User: {input_text}", f"Assistant: {response}"))
    else:
        history.append((f"User: [Uploaded content]", f"Assistant: {response}"))

    return "", None, None, None, doc_selection, current_pdf_content, history

# Function to clear the chat history and reset selected document
def clear_history():
    """Return the reset values for the inputs, the selection, the PDF state and the chat.

    The 7-tuple is, in order: empty text, no image, no audio, no uploaded
    file, no selected document, empty PDF content and a fresh empty history.
    """
    blank_text = ""
    no_image = no_audio = no_upload = no_selection = None
    blank_pdf_content = ""
    empty_chat = []
    return blank_text, no_image, no_audio, no_upload, no_selection, blank_pdf_content, empty_chat

# Function to clear all documents
def clear_documents():
    """Empty the document library and reset the document-related widgets.

    Returns:
        A 4-tuple: two identical dropdown updates (the handler is wired to the
        document dropdown twice) that remove all choices and clear the
        selection, then ``""`` for the stored PDF content and ``None`` for the
        PDF preview.
    """
    document_manager.clear_documents()

    # A raw empty list would only set the dropdown's value and leave the stale
    # document names in its choices; reset both explicitly.
    emptied_dropdown = gr.update(choices=[], value=None)
    return emptied_dropdown, emptied_dropdown, "", None

# Function to update visible components based on input type selection
def update_input_type(choice):
    """Return visibility updates for the six input widgets of the chosen mode.

    The flags are ordered as (text box, image, audio, PDF upload, document
    dropdown, PDF viewer). An unrecognised ``choice`` yields ``None``.
    """
    layouts = (
        ("Text", (True, False, False, False, False, False)),
        ("Image", (True, True, False, False, False, False)),
        ("Voice", (False, False, True, False, False, False)),
        ("PDF", (True, False, False, True, True, True)),
    )
    for mode, flags in layouts:
        if choice == mode:
            return tuple(gr.update(visible=shown) for shown in flags)
    return None
    
# Stylesheet passed to gr.Blocks(css=...) when the interface is built.
# Sections, in order: container/header, chat pane, input fields, the
# #submit-btn and #clear-history buttons (matched by elem_id), input-type
# selector, chat bubbles, PDF preview panel and split view, animation
# keyframes, document-manager panel, and a max-width: 768px mobile override.
# NOTE(review): selectors such as .gradio-chatbot, .gradio-textbox or
# .input-type-btn are assumed to match classes in Gradio's rendered markup -
# confirm against the installed Gradio version.
custom_css = """
    /* General body styles */
    .gradio-container {
        font-family: 'Arial', sans-serif;
        background-color: #f8f9fa;
        color: #333;
    }
    /* Header styles */
    .gradio-header {
        background-color: #007bff;
        color: white;
        padding: 20px;
        text-align: center;
        border-radius: 8px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        animation: fadeIn 1s ease-out;
    }
    .gradio-header h1 {
        font-size: 2.5rem;
    }
    .gradio-header h3 {
        font-size: 1.2rem;
        margin-top: 10px;
    }
    /* Chatbot container styles */
    .gradio-chatbot {
        background-color: #fff;
        border-radius: 10px;
        padding: 20px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        max-height: 500px;
        overflow-y: auto;
        animation: fadeIn 2s ease-out;
    }
    /* Input field styles */
    .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
        border-radius: 8px;
        border: 2px solid #ccc;
        padding: 10px;
        margin-bottom: 10px;
        width: 100%;
        font-size: 1rem;
        transition: all 0.3s ease;
    }
    .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus {
        border-color: #007bff;
    }
    /* Button styles */
    /* Send Button: Sky Blue */
    #submit-btn {
        background-color: #00aaff; /* Sky blue */
        color: white;
        border: none;
        border-radius: 8px;
        padding: 10px 19px;
        font-size: 1.1rem;
        cursor: pointer;
        transition: all 0.3s ease;
        margin-left: auto;
        margin-right: auto;
        display: block;
        margin-top: 10px;
    }
    #submit-btn:hover {
        background-color: #0099cc; /* Slightly darker blue */
    }
    #submit-btn:active {
        transform: scale(0.95);
    }
    #clear-history {
        background-color: #f04e4e; /* Slightly Darker red */
        color: white;
        border: none;
        border-radius: 8px;
        padding: 10px 13px;
        font-size: 1.1rem;
        cursor: pointer;
        transition: all 0.3s ease;
        margin-top: 10px;
    }
    #clear-history:hover {
        background-color: #f5a4a4; /* Light red */
    }
    #clear-history:active {
        transform: scale(0.95);
    }
    /* Input type selector buttons */
    #input-type-group {
        display: flex;
        justify-content: center;
        gap: 10px;
        margin-bottom: 20px;
    }
    .input-type-btn {
        background-color: #6c757d;
        color: white;
        border: none;
        border-radius: 8px;
        padding: 10px 15px;
        font-size: 1rem;
        cursor: pointer;
        transition: all 0.3s ease;
    }
    .input-type-btn.selected {
        background-color: #007bff;
    }
    .input-type-btn:hover {
        background-color: #5a6268;
    }
    /* Chat history styles */
    .gradio-chatbot .message {
        margin-bottom: 10px;
    }
    .gradio-chatbot .user {
        background-color: #007bff;
        color: white;
        padding: 10px;
        border-radius: 12px;
        max-width: 70%;
        animation: slideInUser 0.5s ease-out;
    }
    .gradio-chatbot .assistant {
        background-color: #f1f1f1;
        color: #333;
        padding: 10px;
        border-radius: 12px;
        max-width: 70%;
        margin-left: auto;
        animation: slideInAssistant 0.5s ease-out;
    }
    /* PDF preview panel */
    .pdf-preview-panel {
        border: 2px solid #ccc;
        border-radius: 8px;
        overflow: hidden;
        height: 600px;
        background-color: #f5f5f5;
    }
    /* PDF viewer iframe */
    .pdf-viewer {
        width: 100%;
        height: 100%;
        border: none;
    }
    /* Split view container */
    .split-view-container {
        display: flex;
        gap: 20px;
    }
    .split-view-panel {
        flex: 1;
        min-width: 0;  /* Allow panels to shrink below their content size */
    }
    /* Animation keyframes */
    @keyframes fadeIn {
        0% { opacity: 0; }
        100% { opacity: 1; }
    }
    @keyframes slideInUser {
        0% { transform: translateX(-100%); }
        100% { transform: translateX(0); }
    }
    @keyframes slideInAssistant {
        0% { transform: translateX(100%); }
        100% { transform: translateX(0); }
    }
    /* Document management styles */
    .document-manager {
        background-color: #fff;
        border-radius: 10px;
        padding: 15px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
        margin-bottom: 20px;
    }
    .document-manager-header {
        display: flex;
        justify-content: space-between;
        align-items: center;
        margin-bottom: 15px;
    }
    .document-list {
        max-height: 200px;
        overflow-y: auto;
        border: 1px solid #eee;
        border-radius: 5px;
        padding: 10px;
    }
    /* Mobile responsiveness */
    @media (max-width: 768px) {
        .gradio-header h1 {
            font-size: 1.8rem;
        }
        .gradio-header h3 {
            font-size: 1rem;
        }
        .gradio-chatbot {
            max-height: 400px;
        }
        .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
            width: 100%;
        }
        #submit-btn, #clear-history {
            width: 100%;
            margin-left: 0;
        }
        .split-view-container {
            flex-direction: column;
        }
    }
"""

# Gradio interface setup
def create_interface():
    """Build the Gradio Blocks UI for the multimodal chatbot and return it.

    The layout contains an API-key box, an input-type selector, one input
    widget per mode (text, image, audio, PDF upload plus document dropdown),
    reasoning-effort and model dropdowns, and two chat panes: a full-width
    pane for the non-PDF modes and a split view (PDF preview beside the chat)
    for PDF mode. Both panes always show the same conversation.

    Returns:
        The configured ``gr.Blocks`` app. The caller launches it.
    """
    # Imported locally: only the PDF viewer markup below needs it.
    from html import escape

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
            <div class="gradio-header">
                <h1>Enhanced Multimodal Chatbot</h1>
                <h3>Interact with text, images, voice, and multiple PDFs</h3>
            </div>
        """)

        # Add a description with an expandable accordion
        with gr.Accordion("Click to expand for details", open=False):
            gr.Markdown("""
            ### Description:
            This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.
            
            - **Text Mode**: Ask questions or provide text for the assistant to respond.
            - **Image Mode**: Upload an image for the assistant to analyze and discuss.
            - **Voice Mode**: Upload or record audio that will be transcribed and processed.
            - **PDF Mode**: Upload multiple PDFs, select which one to query, and view them side-by-side with the chat.
            
            ### PDF Features:
            - Upload and manage multiple PDFs in a single session
            - Select which document to query from a dropdown menu
            - View PDFs side-by-side with the chat interface
            - Clear document library as needed
            
            ### Model Options:
            - "o1" is for image, voice, PDF and text chat
            - "o3-mini" is for text, PDF and voice chat only
            """)

        # Store PDF content as a state variable
        current_pdf_content = gr.State("")

        with gr.Row():
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)

        # Input type selector
        with gr.Row():
            input_type = gr.Radio(
                ["Text", "Image", "Voice", "PDF"],
                label="Choose Input Type",
                value="Text"
            )

        # Create the input components (initially text is visible, others are hidden)
        with gr.Row():
            # Text input
            input_text = gr.Textbox(
                label="Enter Text Question",
                placeholder="Ask a question or provide text",
                lines=2,
                visible=True
            )

            # Image input
            image_input = gr.Image(
                label="Upload an Image",
                type="pil",
                visible=False
            )

            # Audio input
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                visible=False
            )

            # PDF input and document selection components
            pdf_input = gr.File(
                label="Upload your PDF",
                file_types=[".pdf"],
                visible=False
            )

            # Dropdown for document selection
            doc_selection = gr.Dropdown(
                label="Select Document to Query",
                choices=[],
                interactive=True,
                visible=False
            )

            # PDF Viewer (initially hidden)
            pdf_viewer = gr.HTML(
                label="PDF Preview",
                visible=False
            )

        # Action buttons row
        with gr.Row():
            with gr.Column(scale=1):
                reasoning_effort = gr.Dropdown(
                    label="Reasoning Effort",
                    choices=["low", "medium", "high"],
                    value="medium"
                )

            with gr.Column(scale=1):
                model_choice = gr.Dropdown(
                    label="Select Model",
                    choices=["o1", "o3-mini"],
                    value="o1"
                )

            with gr.Column(scale=1):
                submit_btn = gr.Button("Ask!", elem_id="submit-btn")

            with gr.Column(scale=1):
                clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")

            with gr.Column(scale=1, visible=False) as clear_docs_col:
                clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")

        # Create a container for the split view layout when in PDF mode
        with gr.Row(visible=False) as split_view_container:
            with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
                pdf_display = gr.HTML(
                    """<div class="pdf-preview-panel">
                        <iframe class="pdf-viewer" id="pdf-viewer" src="about:blank"></iframe>
                    </div>"""
                )

            with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
                chat_history = gr.Chatbot()

        # Regular chat history display (when not in split view)
        with gr.Row(visible=True) as regular_chat_container:
            chat_history_regular = gr.Chatbot()

        # Function to update the PDF viewer
        def update_pdf_viewer(pdf_path):
            """Return the preview-panel HTML for ``pdf_path`` (or a placeholder)."""
            if not pdf_path:
                return """<div class="pdf-preview-panel">
                            <div style="padding: 20px; text-align: center;">No PDF selected</div>
                        </div>"""

            # The path derives from an uploaded file name, so escape it before
            # placing it inside the src attribute.
            safe_path = escape(str(pdf_path), quote=True)
            return f"""<div class="pdf-preview-panel">
                        <iframe class="pdf-viewer" id="pdf-viewer" src="file={safe_path}" type="application/pdf"></iframe>
                    </div>"""

        # Function to handle selection of a document from dropdown
        def handle_doc_selection(doc_id):
            """Return (document text for the model, viewer HTML) for ``doc_id``."""
            if not doc_id:
                return "", update_pdf_viewer(None)

            content, path = get_selected_document_content(doc_id)
            return content, update_pdf_viewer(path)

        # Function to toggle between split view and regular view based on input type
        def toggle_view(choice):
            """Show the split view and the clear-documents column only in PDF mode."""
            if choice == "PDF":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

        def submit_and_mirror(*args):
            """Run chatbot() once and send the updated history to both chat panes."""
            *component_values, updated_history = chatbot(*args)
            return (*component_values, updated_history, updated_history)

        def clear_both_chats():
            """Reset the inputs and empty both chat panes in one step."""
            return (*clear_history(), [])

        # Connect the input type selector to the update function
        input_type.change(
            fn=update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer]
        )

        # Toggle between split view and regular view when input type changes
        input_type.change(
            fn=toggle_view,
            inputs=[input_type],
            outputs=[split_view_container, regular_chat_container, clear_docs_col]
        )

        # Process PDF when uploaded (handle_pdf_upload returns two values, both
        # routed to the document dropdown)
        pdf_input.change(
            fn=handle_pdf_upload,
            inputs=[pdf_input],
            outputs=[doc_selection, doc_selection]
        )

        # Update content when document is selected
        doc_selection.change(
            fn=handle_doc_selection,
            inputs=[doc_selection],
            outputs=[current_pdf_content, pdf_display]
        )

        # Button interactions.
        # One handler feeds both chat panes. A second, independent click
        # handler that copied the regular pane would run alongside this one and
        # read the history from before the new reply, leaving the split view
        # one turn behind.
        submit_btn.click(
            fn=submit_and_mirror,
            inputs=[
                input_text, image_input, audio_input, pdf_input,
                doc_selection, openai_api_key, reasoning_effort,
                model_choice, current_pdf_content, chat_history_regular
            ],
            outputs=[
                input_text, image_input, audio_input, pdf_input,
                doc_selection, current_pdf_content, chat_history_regular,
                chat_history
            ]
        )

        # Clear the inputs and both chat panes together
        clear_chat_btn.click(
            fn=clear_both_chats,
            inputs=[],
            outputs=[
                input_text, image_input, audio_input, pdf_input,
                doc_selection, current_pdf_content, chat_history_regular,
                chat_history
            ]
        )

        # Clear all documents
        clear_docs_btn.click(
            fn=clear_documents,
            inputs=[],
            outputs=[doc_selection, doc_selection, current_pdf_content, pdf_display]
        )

    return demo

# Script entry point: build the interface and launch it only when this file is
# executed directly, so importing the module does not start the app.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()