Spaces:

shukdevdatta123
/

Multi-modal-o1-Chatbot

Running

App Files Files Community

shukdevdatta123 committed on Mar 11

Commit

511c89a

verified ·

1 Parent(s): 7661e71

Create abc3.txt

Browse files

Files changed (1) hide show

abc3.txt +434 -0

abc3.txt ADDED Viewed

	@@ -0,0 +1,434 @@

+import gradio as gr
+import openai
+import base64
+from PIL import Image
+import io
+import os
+import tempfile
+import fitz  # PyMuPDF for PDF handling
+# Function to extract text from PDF files
+def extract_text_from_pdf(pdf_file):
+    try:
+        text = ""
+        pdf_document = fitz.open(pdf_file)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
+            text += page.get_text()
+        pdf_document.close()
+        return text
+    except Exception as e:
+        return f"Error extracting text from PDF: {str(e)}"
+# Function to send the request to OpenAI API with an image, text or PDF input
+def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort="medium", model_choice="o1"):
+    if not openai_api_key:
+        return "Error: No API key provided."
+    openai.api_key = openai_api_key
+    # Process the input depending on whether it's text, image, or a PDF-related query
+    if pdf_content and input_text:
+        # For PDF queries, we combine the PDF content with the user's question
+        prompt = f"Based on the following document content, please answer this question: '{input_text}'\n\nDocument content:\n{pdf_content}"
+        input_content = prompt
+    elif image:
+        # Convert the image to base64 string
+        image_info = get_base64_string_from_image(image)
+        input_content = f"data:image/png;base64,{image_info}"
+    else:
+        # Plain text input
+        input_content = input_text
+    # Prepare the messages for OpenAI API
+    if model_choice == "o1":
+        if image and not pdf_content:
+            messages = [
+                {"role": "user", "content": [{"type": "image_url", "image_url": {"url": input_content}}]}
+            ]
+        else:
+            messages = [
+                {"role": "user", "content": [{"type": "text", "text": input_content}]}
+            ]
+    elif model_choice == "o3-mini":
+        messages = [
+            {"role": "user", "content": [{"type": "text", "text": input_content}]}
+        ]
+    try:
+        # Call OpenAI API with the selected model
+        response = openai.ChatCompletion.create(
+            model=model_choice,
+            messages=messages,
+            reasoning_effort=reasoning_effort,
+            max_completion_tokens=2000
+        )
+        return response["choices"][0]["message"]["content"]
+    except Exception as e:
+        return f"Error calling OpenAI API: {str(e)}"
+# Function to convert an uploaded image to a base64 string
+def get_base64_string_from_image(pil_image):
+    # Convert PIL Image to bytes
+    buffered = io.BytesIO()
+    pil_image.save(buffered, format="PNG")
+    img_bytes = buffered.getvalue()
+    base64_str = base64.b64encode(img_bytes).decode("utf-8")
+    return base64_str
+# Function to transcribe audio to text using OpenAI Whisper API
+def transcribe_audio(audio, openai_api_key):
+    if not openai_api_key:
+        return "Error: No API key provided."
+    openai.api_key = openai_api_key
+    try:
+        # Open the audio file and pass it as a file object
+        with open(audio, 'rb') as audio_file:
+            audio_file_content = audio_file.read()
+        # Use the correct transcription API call
+        audio_file_obj = io.BytesIO(audio_file_content)
+        audio_file_obj.name = 'audio.wav'  # Set a name for the file object (as OpenAI expects it)
+        # Transcribe the audio to text using OpenAI's whisper model
+        audio_file_transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
+        return audio_file_transcription['text']
+    except Exception as e:
+        return f"Error transcribing audio: {str(e)}"
+# The function that will be used by Gradio interface
+def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, history=[]):
+    # If there's audio, transcribe it to text
+    if audio:
+        input_text = transcribe_audio(audio, openai_api_key)
+    # If a new PDF is uploaded, extract its text
+    new_pdf_content = pdf_content
+    if pdf_file is not None:
+        new_pdf_content = extract_text_from_pdf(pdf_file)
+    # Generate the response
+    response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
+    # Append the response to the history
+    if input_text:
+        history.append((f"User: {input_text}", f"Assistant: {response}"))
+    else:
+        history.append((f"User: [Uploaded content]", f"Assistant: {response}"))
+    return "", None, None, None, new_pdf_content, history
+# Function to clear the chat history and PDF content
+def clear_history():
+    return "", None, None, None, "", []
+# Function to process a newly uploaded PDF
+def process_pdf(pdf_file):
+    if pdf_file is None:
+        return ""
+    return extract_text_from_pdf(pdf_file)
+# Function to update visible components based on input type selection
+def update_input_type(choice):
+    if choice == "Text":
+        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+    elif choice == "Image":
+        return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+    elif choice == "Voice":
+        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
+    elif choice == "PDF":
+        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
+# Custom CSS styles with animations and button colors.
+# Passed to gr.Blocks(css=...) in create_interface(). The #submit-btn and
+# #clear-history rules target the elem_id values given to the two buttons
+# there, and .gradio-header matches the <div> in the header Markdown.
+# NOTE(review): #input-type-group and .input-type-btn are not referenced
+# anywhere else in this file, and the .gradio-* component selectors are assumed
+# to match Gradio's generated markup -- verify against the rendered page.
+custom_css = """
+    /* General body styles */
+    .gradio-container {
+        font-family: 'Arial', sans-serif;
+        background-color: #f8f9fa;
+        color: #333;
+    }
+    /* Header styles */
+    .gradio-header {
+        background-color: #007bff;
+        color: white;
+        padding: 20px;
+        text-align: center;
+        border-radius: 8px;
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+        animation: fadeIn 1s ease-out;
+    }
+    .gradio-header h1 {
+        font-size: 2.5rem;
+    }
+    .gradio-header h3 {
+        font-size: 1.2rem;
+        margin-top: 10px;
+    }
+    /* Chatbot container styles */
+    .gradio-chatbot {
+        background-color: #fff;
+        border-radius: 10px;
+        padding: 20px;
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+        max-height: 500px;
+        overflow-y: auto;
+        animation: fadeIn 2s ease-out;
+    }
+    /* Input field styles */
+    .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
+        border-radius: 8px;
+        border: 2px solid #ccc;
+        padding: 10px;
+        margin-bottom: 10px;
+        width: 100%;
+        font-size: 1rem;
+        transition: all 0.3s ease;
+    }
+    .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus {
+        border-color: #007bff;
+    }
+    /* Button styles */
+    /* Send Button: Sky Blue */
+    #submit-btn {
+        background-color: #00aaff; /* Sky blue */
+        color: white;
+        border: none;
+        border-radius: 8px;
+        padding: 10px 19px;
+        font-size: 1.1rem;
+        cursor: pointer;
+        transition: all 0.3s ease;
+        margin-left: auto;
+        margin-right: auto;
+        display: block;
+        margin-top: 10px;
+    }
+    #submit-btn:hover {
+        background-color: #0099cc; /* Slightly darker blue */
+    }
+    #submit-btn:active {
+        transform: scale(0.95);
+    }
+    #clear-history {
+        background-color: #f04e4e; /* Slightly Darker red */
+        color: white;
+        border: none;
+        border-radius: 8px;
+        padding: 10px 13px;
+        font-size: 1.1rem;
+        cursor: pointer;
+        transition: all 0.3s ease;
+        margin-top: 10px;
+    }
+    #clear-history:hover {
+        background-color: #f5a4a4; /* Light red */
+    }
+    #clear-history:active {
+        transform: scale(0.95);
+    }
+    /* Input type selector buttons */
+    #input-type-group {
+        display: flex;
+        justify-content: center;
+        gap: 10px;
+        margin-bottom: 20px;
+    }
+    .input-type-btn {
+        background-color: #6c757d;
+        color: white;
+        border: none;
+        border-radius: 8px;
+        padding: 10px 15px;
+        font-size: 1rem;
+        cursor: pointer;
+        transition: all 0.3s ease;
+    }
+    .input-type-btn.selected {
+        background-color: #007bff;
+    }
+    .input-type-btn:hover {
+        background-color: #5a6268;
+    }
+    /* Chat history styles */
+    .gradio-chatbot .message {
+        margin-bottom: 10px;
+    }
+    .gradio-chatbot .user {
+        background-color: #007bff;
+        color: white;
+        padding: 10px;
+        border-radius: 12px;
+        max-width: 70%;
+        animation: slideInUser 0.5s ease-out;
+    }
+    .gradio-chatbot .assistant {
+        background-color: #f1f1f1;
+        color: #333;
+        padding: 10px;
+        border-radius: 12px;
+        max-width: 70%;
+        margin-left: auto;
+        animation: slideInAssistant 0.5s ease-out;
+    }
+    /* Animation keyframes */
+    @keyframes fadeIn {
+        0% { opacity: 0; }
+        100% { opacity: 1; }
+    }
+    @keyframes slideInUser {
+        0% { transform: translateX(-100%); }
+        100% { transform: translateX(0); }
+    }
+    @keyframes slideInAssistant {
+        0% { transform: translateX(100%); }
+        100% { transform: translateX(0); }
+    }
+    /* Mobile responsiveness */
+    @media (max-width: 768px) {
+        .gradio-header h1 {
+            font-size: 1.8rem;
+        }
+        .gradio-header h3 {
+            font-size: 1rem;
+        }
+        .gradio-chatbot {
+            max-height: 400px;
+        }
+        .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
+            width: 100%;
+        }
+        #submit-btn, #clear-history {
+            width: 100%;
+            margin-left: 0;
+        }
+    }
+"""
+# Gradio interface setup
+def create_interface():
+    with gr.Blocks(css=custom_css) as demo:
+        gr.Markdown("""
+            <div class="gradio-header">
+                <h1>Multimodal Chatbot (Text + Image + Voice + PDF)</h1>
+                <h3>Interact with a chatbot using text, image, voice, or PDF inputs</h3>
+            </div>
+        """)
+        # Add a description with an expandable accordion
+        with gr.Accordion("Click to expand for details", open=False):
+            gr.Markdown("""
+            ### Description:
+            This is a multimodal chatbot that can handle text, image, voice, and PDF inputs.
+            - You can ask questions or provide text, and the assistant will respond.
+            - You can upload an image, and the assistant will process it and answer questions about the image.
+            - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
+            - PDF support: Upload a PDF and ask questions about its content.
+            - Enter your OpenAI API key to start interacting with the model.
+            - You can use the 'Clear History' button to remove the conversation history.
+            - "o1" is for image, voice, PDF and text chat and "o3-mini" is for text, PDF and voice chat only.
+            ### Reasoning Effort:
+            The reasoning effort controls how complex or detailed the assistant's answers should be.
+            - **Low**: Provides quick, concise answers with minimal reasoning or details.
+            - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
+            - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
+            """)
+        # Store PDF content as a state variable
+        pdf_content = gr.State("")
+        with gr.Row():
+            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
+        # Input type selector
+        with gr.Row():
+            input_type = gr.Radio(
+                ["Text", "Image", "Voice", "PDF"],
+                label="Choose Input Type",
+                value="Text"
+            )
+        # Create the input components (initially text is visible, others are hidden)
+        with gr.Row():
+            # Text input
+            input_text = gr.Textbox(
+                label="Enter Text Question",
+                placeholder="Ask a question or provide text",
+                lines=2,
+                visible=True
+            )
+            # Image input
+            image_input = gr.Image(
+                label="Upload an Image",
+                type="pil",
+                visible=False
+            )
+            # Audio input
+            audio_input = gr.Audio(
+                label="Upload or Record Audio",
+                type="filepath",
+                visible=False
+            )
+            # PDF input
+            pdf_input = gr.File(
+                label="Upload your PDF",
+                file_types=[".pdf"],
+                visible=False
+            )
+        with gr.Row():
+            reasoning_effort = gr.Dropdown(
+                label="Reasoning Effort",
+                choices=["low", "medium", "high"],
+                value="medium"
+            )
+            model_choice = gr.Dropdown(
+                label="Select Model",
+                choices=["o1", "o3-mini"],
+                value="o1"  # Default to 'o1' for image-related tasks
+            )
+            submit_btn = gr.Button("Ask!", elem_id="submit-btn")
+            clear_btn = gr.Button("Clear History", elem_id="clear-history")
+        chat_history = gr.Chatbot()
+        # Connect the input type selector to the update function
+        input_type.change(
+            fn=update_input_type,
+            inputs=[input_type],
+            outputs=[input_text, image_input, audio_input, pdf_input]
+        )
+        # Process PDF when uploaded
+        pdf_input.change(
+            fn=process_pdf,
+            inputs=[pdf_input],
+            outputs=[pdf_content]
+        )
+        # Button interactions
+        submit_btn.click(
+            fn=chatbot,
+            inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content],
+            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
+        )
+        clear_btn.click(
+            fn=clear_history,
+            inputs=[],
+            outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
+        )
+    return demo
+# Run the interface
+# Build and launch the app only when this file is executed as a script, so
+# importing the module never calls launch().
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch()