# Source: Hugging Face Space file "app.py" (22.9 kB), uploaded by shukdevdatta123.
# Commit ea2202e (verified): "Update app.py".
import gradio as gr
import openai
import base64
from PIL import Image
import io
import os
import tempfile
import fitz # PyMuPDF for PDF handling
import uuid
import json
# In-memory registry of the PDFs uploaded during a session
class DocumentManager:
    """Keep track of uploaded documents, keyed by a generated UUID string.

    Each entry in ``documents`` is a dict with three keys: ``"name"`` (display
    name), ``"content"`` (text returned by ``extract_text_from_pdf``) and
    ``"path"`` (the file path the document was registered with).
    """

    def __init__(self):
        # {doc_id: {"name": name, "content": content, "path": path}}
        self.documents = {}

    def add_document(self, file_path, file_name=None):
        """Register the file at *file_path* and return its newly generated ID.

        When *file_name* is ``None`` the base name of *file_path* is used as
        the display name.
        """
        display_name = os.path.basename(file_path) if file_name is None else file_name
        new_id = str(uuid.uuid4())
        record = {
            "name": display_name,
            "content": extract_text_from_pdf(file_path),
            "path": file_path,
        }
        self.documents[new_id] = record
        return new_id

    def get_document_content(self, doc_id):
        """Return the stored text for *doc_id*, or ``""`` for an unknown ID."""
        record = self.documents.get(doc_id)
        return "" if record is None else record["content"]

    def get_document_path(self, doc_id):
        """Return the stored file path for *doc_id*, or ``None`` for an unknown ID."""
        record = self.documents.get(doc_id)
        return None if record is None else record["path"]

    def get_document_list(self):
        """Return ``(name, doc_id)`` pairs, in insertion order, for a dropdown."""
        return [(record["name"], key) for key, record in self.documents.items()]

    def clear_documents(self):
        """Forget every document and return an empty list."""
        self.documents = {}
        return []
# Module-level DocumentManager instance shared by the upload, selection and
# clear handlers defined below in this file.
document_manager = DocumentManager()
# Function to extract text from PDF files
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in the PDF at *pdf_file*.

    Args:
        pdf_file: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined in page order. On any failure the
        function does not raise; it returns a string starting with
        ``"Error extracting text from PDF: "`` followed by the exception text.
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse, so the file handle is never leaked on the error path.
        with fitz.open(pdf_file) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
# Models that accept image parts in a chat message
_IMAGE_CAPABLE_MODELS = ("o1",)


# Build the content parts of the single user message sent to the model
def _build_user_content(input_text, image, pdf_content, model_choice):
    """Return ``(parts, error)``; exactly one of the two is ``None``."""
    if pdf_content and input_text:
        # For PDF queries, combine the PDF content with the user's question
        prompt = f"Based on the following document content, please answer this question: '{input_text}'\n\nDocument content:\n{pdf_content}"
        return [{"type": "text", "text": prompt}], None

    if image is not None:
        if model_choice in _IMAGE_CAPABLE_MODELS:
            image_info = get_base64_string_from_image(image)
            parts = []
            # Keep the typed question next to the image instead of dropping it
            if input_text:
                parts.append({"type": "text", "text": input_text})
            parts.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_info}"}})
            return parts, None
        if not input_text:
            return None, "Error: The selected model does not support image input."
        # Text-only model with a question: fall through and send just the text

    if not input_text:
        return None, "Error: No input provided."
    return [{"type": "text", "text": input_text}], None


# Function to send the request to OpenAI API with an image, text or PDF input
def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort="medium", model_choice="o1"):
    """Send one user message to the selected OpenAI model and return its reply.

    Args:
        input_text: The user's question or prompt; may be empty or ``None``.
        image: An image object accepted by ``get_base64_string_from_image``,
            or ``None``.
        pdf_content: Extracted document text; used only together with
            *input_text*.
        openai_api_key: API key assigned to ``openai.api_key`` for the call.
        reasoning_effort: Passed through as ``reasoning_effort``.
        model_choice: Model name passed through as ``model``. Only models in
            ``_IMAGE_CAPABLE_MODELS`` receive image parts; any other name is
            treated as text-only.

    Returns:
        The reply text, or a string starting with ``"Error"`` when the key or
        input is missing, the model cannot take the image, or the API call
        raises. The function itself never raises for those cases.
    """
    if not openai_api_key:
        return "Error: No API key provided."

    # Validate the input before touching the API so bad requests cost nothing
    content, error = _build_user_content(input_text, image, pdf_content, model_choice)
    if error is not None:
        return error

    openai.api_key = openai_api_key
    messages = [{"role": "user", "content": content}]
    try:
        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface;
        # confirm the pinned `openai` version before upgrading the package.
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=messages,
            reasoning_effort=reasoning_effort,
            max_completion_tokens=2000,
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return f"Error calling OpenAI API: {str(e)}"
# Function to convert an uploaded image to a base64 string
def get_base64_string_from_image(pil_image):
    """Encode *pil_image* as PNG and return those bytes as a base64 string.

    *pil_image* must provide ``save(file_obj, format=...)``; the result is the
    UTF-8 decoded base64 text of whatever that call writes.
    """
    with io.BytesIO() as png_buffer:
        pil_image.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode("utf-8")
# Function to transcribe audio to text using OpenAI Whisper API
def transcribe_audio(audio, openai_api_key):
    """Transcribe the audio file at path *audio* with the ``whisper-1`` model.

    Args:
        audio: Filesystem path of the audio file to transcribe.
        openai_api_key: API key assigned to ``openai.api_key`` for the call.

    Returns:
        The transcribed text, or a string starting with ``"Error"`` when the
        key or the audio path is missing or the request fails. The function
        does not raise for those cases.
    """
    if not openai_api_key:
        return "Error: No API key provided."
    if not audio:
        return "Error: No audio file provided."
    openai.api_key = openai_api_key
    try:
        # Hand over the real file handle: it keeps the upload's own name and
        # extension (rather than a hard-coded "audio.wav") and avoids holding
        # a second in-memory copy of the audio.
        with open(audio, 'rb') as audio_file:
            audio_file_transcription = openai.Audio.transcribe(file=audio_file, model="whisper-1")
        return audio_file_transcription['text']
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"
# Function to handle PDF uploads
def handle_pdf_upload(pdf_file):
    """Register an uploaded PDF and return the refreshed document list.

    Args:
        pdf_file: The upload, either as a path string or as an object whose
            ``name`` attribute is the path; ``None`` when nothing is uploaded.

    Returns:
        ``(doc_list, selected_id)`` where *doc_list* is the
        ``(name, doc_id)`` list from the document manager and *selected_id*
        is the ID of the document just added. ``([], None)`` when *pdf_file*
        is ``None``.
    """
    if pdf_file is None:
        return [], None
    # Accept both a file-like wrapper (has .name) and a plain path string
    pdf_path = getattr(pdf_file, "name", pdf_file)
    # Add the PDF to the document manager
    doc_id = document_manager.add_document(pdf_path)
    # The list always contains the document just added, so select it
    return document_manager.get_document_list(), doc_id
# Look up the text and file path of the document chosen in the dropdown
def get_selected_document_content(doc_id):
    """Return ``(content, path)`` for *doc_id*, or ``("", None)`` if it is falsy.

    The content is what the AI is queried against; the path is what the PDF
    viewer displays.
    """
    if doc_id:
        viewer_path = document_manager.get_document_path(doc_id)
        ai_text = document_manager.get_document_content(doc_id)
        return ai_text, viewer_path
    return "", None
# Prefixes of the failure strings returned by transcribe_audio
_TRANSCRIPTION_ERROR_PREFIXES = ("Error: ", "Error transcribing audio:")


# The function that will be used by Gradio interface
def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key, reasoning_effort, model_choice, current_pdf_content, history=None):
    """Handle one submit: optionally transcribe audio, query the model, log the turn.

    Args:
        input_text: Text typed by the user; replaced by the transcript when
            *audio* is given.
        image: Optional image forwarded to ``generate_response``.
        audio: Optional path of an audio file to transcribe first.
        pdf_file: Unused here; present so the signature matches the UI inputs.
        doc_selection: ID of the selected document, or a falsy value.
        openai_api_key: API key forwarded to the OpenAI helpers.
        reasoning_effort: Forwarded to ``generate_response``.
        model_choice: Forwarded to ``generate_response``.
        current_pdf_content: Text of the selected document (may be empty).
        history: Existing list of ``(user, assistant)`` pairs, or ``None``.

    Returns:
        A 7-tuple ``("", None, None, None, doc_selection,
        current_pdf_content, history)``: cleared text/image/audio/PDF inputs,
        the unchanged selection and PDF content, and the updated history.
    """
    # Work on a fresh list: avoids the shared mutable default and a None history
    history = list(history) if history else []

    # If there's audio, transcribe it to text
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)
        if isinstance(input_text, str) and input_text.startswith(_TRANSCRIPTION_ERROR_PREFIXES):
            # Show the failure to the user instead of sending it to the model as a prompt
            history.append(("User: [Audio input]", f"Assistant: {input_text}"))
            return "", None, None, None, doc_selection, current_pdf_content, history

    # Generate the response against the currently selected PDF content
    response = generate_response(input_text, image, current_pdf_content, openai_api_key, reasoning_effort, model_choice)

    # Append the exchange to the history
    if input_text:
        if doc_selection:
            # Include the document name in the history
            doc_name = next((doc[0] for doc in document_manager.get_document_list() if doc[1] == doc_selection), "Unknown Document")
            history.append((f"User: {input_text} [Query on: {doc_name}]", f"Assistant: {response}"))
        else:
            history.append((f"User: {input_text}", f"Assistant: {response}"))
    else:
        history.append(("User: [Uploaded content]", f"Assistant: {response}"))
    return "", None, None, None, doc_selection, current_pdf_content, history
# Reset values for the chat inputs, the document selection and the chat log
def clear_history():
    """Return the 7 reset values: text, four ``None`` slots, PDF content, history."""
    blank_text = ""
    blank_pdf_content = ""
    empty_chat = []
    return (blank_text, None, None, None, None, blank_pdf_content, empty_chat)
# Empty the shared document manager and reset the document-related outputs
def clear_documents():
    """Clear all documents; return ``(document_list, None, "", None)``.

    ``document_list`` is whatever ``document_manager.clear_documents()``
    returns.
    """
    remaining = document_manager.clear_documents()
    return (remaining, None, "", None)
# Function to update visible components based on input type selection
def update_input_type(choice):
    """Return six ``gr.update(visible=...)`` values for the chosen input type.

    The six flags appear in the same order for every choice. Any choice other
    than "Text", "Image", "Voice" or "PDF" yields ``None``.
    """
    visibility_by_choice = {
        "Text": (True, False, False, False, False, False),
        "Image": (True, True, False, False, False, False),
        "Voice": (False, False, True, False, False, False),
        "PDF": (True, False, False, True, True, True),
    }
    flags = visibility_by_choice.get(choice)
    if flags is None:
        return None
    return tuple(gr.update(visible=flag) for flag in flags)
# Custom CSS styles with animations and button colors.
# This stylesheet string is passed to gr.Blocks(css=...) in create_interface().
# The #submit-btn and #clear-history rules match the elem_id values given to
# the buttons there; .split-view-panel matches the elem_classes of the two
# split-view columns.
# NOTE(review): whether the .gradio-* class selectors match anything depends on
# the markup Gradio renders - confirm against the installed Gradio version.
custom_css = """
/* General body styles */
.gradio-container {
font-family: 'Arial', sans-serif;
background-color: #f8f9fa;
color: #333;
}
/* Header styles */
.gradio-header {
background-color: #007bff;
color: white;
padding: 20px;
text-align: center;
border-radius: 8px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
animation: fadeIn 1s ease-out;
}
.gradio-header h1 {
font-size: 2.5rem;
}
.gradio-header h3 {
font-size: 1.2rem;
margin-top: 10px;
}
/* Chatbot container styles */
.gradio-chatbot {
background-color: #fff;
border-radius: 10px;
padding: 20px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
max-height: 500px;
overflow-y: auto;
animation: fadeIn 2s ease-out;
}
/* Input field styles */
.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
border-radius: 8px;
border: 2px solid #ccc;
padding: 10px;
margin-bottom: 10px;
width: 100%;
font-size: 1rem;
transition: all 0.3s ease;
}
.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus {
border-color: #007bff;
}
/* Button styles */
/* Send Button: Sky Blue */
#submit-btn {
background-color: #00aaff; /* Sky blue */
color: white;
border: none;
border-radius: 8px;
padding: 10px 19px;
font-size: 1.1rem;
cursor: pointer;
transition: all 0.3s ease;
margin-left: auto;
margin-right: auto;
display: block;
margin-top: 10px;
}
#submit-btn:hover {
background-color: #0099cc; /* Slightly darker blue */
}
#submit-btn:active {
transform: scale(0.95);
}
#clear-history {
background-color: #f04e4e; /* Slightly Darker red */
color: white;
border: none;
border-radius: 8px;
padding: 10px 13px;
font-size: 1.1rem;
cursor: pointer;
transition: all 0.3s ease;
margin-top: 10px;
}
#clear-history:hover {
background-color: #f5a4a4; /* Light red */
}
#clear-history:active {
transform: scale(0.95);
}
/* Input type selector buttons */
#input-type-group {
display: flex;
justify-content: center;
gap: 10px;
margin-bottom: 20px;
}
.input-type-btn {
background-color: #6c757d;
color: white;
border: none;
border-radius: 8px;
padding: 10px 15px;
font-size: 1rem;
cursor: pointer;
transition: all 0.3s ease;
}
.input-type-btn.selected {
background-color: #007bff;
}
.input-type-btn:hover {
background-color: #5a6268;
}
/* Chat history styles */
.gradio-chatbot .message {
margin-bottom: 10px;
}
.gradio-chatbot .user {
background-color: #007bff;
color: white;
padding: 10px;
border-radius: 12px;
max-width: 70%;
animation: slideInUser 0.5s ease-out;
}
.gradio-chatbot .assistant {
background-color: #f1f1f1;
color: #333;
padding: 10px;
border-radius: 12px;
max-width: 70%;
margin-left: auto;
animation: slideInAssistant 0.5s ease-out;
}
/* PDF preview panel */
.pdf-preview-panel {
border: 2px solid #ccc;
border-radius: 8px;
overflow: hidden;
height: 600px;
background-color: #f5f5f5;
}
/* PDF viewer iframe */
.pdf-viewer {
width: 100%;
height: 100%;
border: none;
}
/* Split view container */
.split-view-container {
display: flex;
gap: 20px;
}
.split-view-panel {
flex: 1;
min-width: 0; /* Allow panels to shrink below their content size */
}
/* Animation keyframes */
@keyframes fadeIn {
0% { opacity: 0; }
100% { opacity: 1; }
}
@keyframes slideInUser {
0% { transform: translateX(-100%); }
100% { transform: translateX(0); }
}
@keyframes slideInAssistant {
0% { transform: translateX(100%); }
100% { transform: translateX(0); }
}
/* Document management styles */
.document-manager {
background-color: #fff;
border-radius: 10px;
padding: 15px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
margin-bottom: 20px;
}
.document-manager-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
}
.document-list {
max-height: 200px;
overflow-y: auto;
border: 1px solid #eee;
border-radius: 5px;
padding: 10px;
}
/* Mobile responsiveness */
@media (max-width: 768px) {
.gradio-header h1 {
font-size: 1.8rem;
}
.gradio-header h3 {
font-size: 1rem;
}
.gradio-chatbot {
max-height: 400px;
}
.gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file {
width: 100%;
}
#submit-btn, #clear-history {
width: 100%;
margin-left: 0;
}
.split-view-container {
flex-direction: column;
}
}
"""
# Gradio interface setup
def create_interface():
    """Build the Gradio Blocks UI, wire up its event handlers and return it.

    The layout has an API-key box, an input-type selector, the text, image,
    audio and PDF inputs, model controls, and two chat views: a regular one
    and a split view shown next to the PDF preview in PDF mode. Both chat
    views are kept in sync by the submit and clear handlers.

    Returns:
        The ``gr.Blocks`` instance, ready for ``launch()``.
    """
    import html  # local import: only used to escape the PDF path placed in the viewer markup

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
<div class="gradio-header">
<h1>Enhanced Multimodal Chatbot</h1>
<h3>Interact with text, images, voice, and multiple PDFs</h3>
</div>
""")
        # Add a description with an expandable accordion
        with gr.Accordion("Click to expand for details", open=False):
            gr.Markdown("""
### Description:
This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.
- **Text Mode**: Ask questions or provide text for the assistant to respond.
- **Image Mode**: Upload an image for the assistant to analyze and discuss.
- **Voice Mode**: Upload or record audio that will be transcribed and processed.
- **PDF Mode**: Upload multiple PDFs, select which one to query, and view them side-by-side with the chat.
### PDF Features:
- Upload and manage multiple PDFs in a single session
- Select which document to query from a dropdown menu
- View PDFs side-by-side with the chat interface
- Clear document library as needed
### Model Options:
- "o1" is for image, voice, PDF and text chat
- "o3-mini" is for text, PDF and voice chat only
""")
        # Store PDF content as a state variable
        current_pdf_content = gr.State("")
        with gr.Row():
            openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
        # Input type selector
        with gr.Row():
            input_type = gr.Radio(
                ["Text", "Image", "Voice", "PDF"],
                label="Choose Input Type",
                value="Text"
            )
        # Create the input components (initially text is visible, others are hidden)
        with gr.Row():
            # Text input
            input_text = gr.Textbox(
                label="Enter Text Question",
                placeholder="Ask a question or provide text",
                lines=2,
                visible=True
            )
            # Image input
            image_input = gr.Image(
                label="Upload an Image",
                type="pil",
                visible=False
            )
            # Audio input
            audio_input = gr.Audio(
                label="Upload or Record Audio",
                type="filepath",
                visible=False
            )
            # PDF input and document selection components
            pdf_input = gr.File(
                label="Upload your PDF",
                file_types=[".pdf"],
                visible=False
            )
            # Dropdown for document selection
            doc_selection = gr.Dropdown(
                label="Select Document to Query",
                choices=[],
                interactive=True,
                visible=False
            )
            # PDF Viewer (initially hidden)
            pdf_viewer = gr.HTML(
                label="PDF Preview",
                visible=False
            )
        # Action buttons row
        with gr.Row():
            with gr.Column(scale=1):
                reasoning_effort = gr.Dropdown(
                    label="Reasoning Effort",
                    choices=["low", "medium", "high"],
                    value="medium"
                )
            with gr.Column(scale=1):
                model_choice = gr.Dropdown(
                    label="Select Model",
                    choices=["o1", "o3-mini"],
                    value="o1"
                )
            with gr.Column(scale=1):
                submit_btn = gr.Button("Ask!", elem_id="submit-btn")
            with gr.Column(scale=1):
                clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")
            with gr.Column(scale=1, visible=False) as clear_docs_col:
                clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")
        # Create a container for the split view layout when in PDF mode
        with gr.Row(visible=False) as split_view_container:
            with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
                pdf_display = gr.HTML(
                    """<div class="pdf-preview-panel">
<iframe class="pdf-viewer" id="pdf-viewer" src="about:blank"></iframe>
</div>"""
                )
            with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
                chat_history = gr.Chatbot()
        # Regular chat history display (when not in split view)
        with gr.Row(visible=True) as regular_chat_container:
            chat_history_regular = gr.Chatbot()

        # Function to build the HTML shown in the PDF preview pane
        def update_pdf_viewer(pdf_path):
            if not pdf_path:
                return """<div class="pdf-preview-panel">
<div style="padding: 20px; text-align: center;">No PDF selected</div>
</div>"""
            # The path derives from an uploaded file name, so escape it before
            # placing it inside the src attribute
            safe_path = html.escape(str(pdf_path), quote=True)
            return f"""<div class="pdf-preview-panel">
<iframe class="pdf-viewer" id="pdf-viewer" src="file={safe_path}" type="application/pdf"></iframe>
</div>"""

        # Function to handle selection of a document from dropdown
        def handle_doc_selection(doc_id):
            if not doc_id:
                return "", update_pdf_viewer(None)
            content, path = get_selected_document_content(doc_id)
            return content, update_pdf_viewer(path)

        # Function to toggle between split view and regular view based on input type
        def toggle_view(choice):
            if choice == "PDF":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)

        # Register an upload and refresh the dropdown's choices and selection.
        # A raw list returned to a Dropdown would be taken as its value, so the
        # choices have to be sent through gr.update(choices=...).
        def on_pdf_upload(pdf_file):
            if pdf_file is None:
                # The upload widget was cleared (it is reset after each submit);
                # keep the existing library and selection untouched
                return gr.update(choices=document_manager.get_document_list())
            doc_list, selected = handle_pdf_upload(pdf_file)
            return gr.update(choices=doc_list, value=selected)

        # Run the chatbot once and feed the resulting history to both chat views,
        # so the split view never shows a stale copy
        def on_submit(*args):
            results = tuple(chatbot(*args))
            return results + (results[-1],)

        # Reset the inputs and both chat views together
        def on_clear_chat():
            return tuple(clear_history()) + ([],)

        # Empty the document library, the dropdown, the stored text and the preview
        def on_clear_documents():
            clear_documents()
            return gr.update(choices=[], value=None), "", update_pdf_viewer(None)

        # Connect the input type selector to the update function
        input_type.change(
            fn=update_input_type,
            inputs=[input_type],
            outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer]
        )
        # Toggle between split view and regular view when input type changes
        input_type.change(
            fn=toggle_view,
            inputs=[input_type],
            outputs=[split_view_container, regular_chat_container, clear_docs_col]
        )
        # Process PDF when uploaded
        pdf_input.change(
            fn=on_pdf_upload,
            inputs=[pdf_input],
            outputs=[doc_selection]
        )
        # Update content when document is selected
        doc_selection.change(
            fn=handle_doc_selection,
            inputs=[doc_selection],
            outputs=[current_pdf_content, pdf_display]
        )
        # Button interactions
        submit_btn.click(
            fn=on_submit,
            inputs=[
                input_text, image_input, audio_input, pdf_input,
                doc_selection, openai_api_key, reasoning_effort,
                model_choice, current_pdf_content, chat_history_regular
            ],
            outputs=[
                input_text, image_input, audio_input, pdf_input,
                doc_selection, current_pdf_content, chat_history_regular,
                chat_history
            ]
        )
        clear_chat_btn.click(
            fn=on_clear_chat,
            inputs=[],
            outputs=[
                input_text, image_input, audio_input, pdf_input,
                doc_selection, current_pdf_content, chat_history_regular,
                chat_history
            ]
        )
        # Clear all documents
        clear_docs_btn.click(
            fn=on_clear_documents,
            inputs=[],
            outputs=[doc_selection, current_pdf_content, pdf_display]
        )
    return demo
# Run the interface: build the Blocks app and start it when this file is
# executed as a script (nothing is launched on import).
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()