shukdevdatta123 committed on
Commit
0d11d75
·
verified ·
1 Parent(s): 238e053

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +285 -56
app.py CHANGED
@@ -6,6 +6,53 @@ import io
6
  import os
7
  import tempfile
8
  import fitz # PyMuPDF for PDF handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  # Function to extract text from PDF files
11
  def extract_text_from_pdf(pdf_file):
@@ -101,48 +148,72 @@ def transcribe_audio(audio, openai_api_key):
101
  except Exception as e:
102
  return f"Error transcribing audio: {str(e)}"
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  # The function that will be used by Gradio interface
105
- def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, history=[]):
106
  # If there's audio, transcribe it to text
107
  if audio:
108
  input_text = transcribe_audio(audio, openai_api_key)
109
 
110
- # If a new PDF is uploaded, extract its text
111
- new_pdf_content = pdf_content
112
- if pdf_file is not None:
113
- new_pdf_content = extract_text_from_pdf(pdf_file)
114
 
115
  # Generate the response
116
- response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
117
 
118
  # Append the response to the history
119
  if input_text:
120
- history.append((f"User: {input_text}", f"Assistant: {response}"))
 
 
 
 
 
121
  else:
122
  history.append((f"User: [Uploaded content]", f"Assistant: {response}"))
123
 
124
- return "", None, None, None, new_pdf_content, history
125
 
126
- # Function to clear the chat history and PDF content
127
  def clear_history():
128
- return "", None, None, None, "", []
129
 
130
- # Function to process a newly uploaded PDF
131
- def process_pdf(pdf_file):
132
- if pdf_file is None:
133
- return ""
134
- return extract_text_from_pdf(pdf_file)
135
 
136
  # Function to update visible components based on input type selection
137
  def update_input_type(choice):
138
  if choice == "Text":
139
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
140
  elif choice == "Image":
141
- return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
142
  elif choice == "Voice":
143
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
144
  elif choice == "PDF":
145
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
146
 
147
  # Custom CSS styles with animations and button colors
148
  custom_css = """
@@ -275,6 +346,29 @@ custom_css = """
275
  margin-left: auto;
276
  animation: slideInAssistant 0.5s ease-out;
277
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  /* Animation keyframes */
279
  @keyframes fadeIn {
280
  0% { opacity: 0; }
@@ -288,6 +382,27 @@ custom_css = """
288
  0% { transform: translateX(100%); }
289
  100% { transform: translateX(0); }
290
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  /* Mobile responsiveness */
292
  @media (max-width: 768px) {
293
  .gradio-header h1 {
@@ -306,6 +421,9 @@ custom_css = """
306
  width: 100%;
307
  margin-left: 0;
308
  }
 
 
 
309
  }
310
  """
311
 
@@ -314,8 +432,8 @@ def create_interface():
314
  with gr.Blocks(css=custom_css) as demo:
315
  gr.Markdown("""
316
  <div class="gradio-header">
317
- <h1>Multimodal Chatbot (Text + Image + Voice + PDF)</h1>
318
- <h3>Interact with a chatbot using text, image, voice, or PDF inputs</h3>
319
  </div>
320
  """)
321
 
@@ -323,23 +441,26 @@ def create_interface():
323
  with gr.Accordion("Click to expand for details", open=False):
324
  gr.Markdown("""
325
  ### Description:
326
- This is a multimodal chatbot that can handle text, image, voice, and PDF inputs.
327
- - You can ask questions or provide text, and the assistant will respond.
328
- - You can upload an image, and the assistant will process it and answer questions about the image.
329
- - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
330
- - PDF support: Upload a PDF and ask questions about its content.
331
- - Enter your OpenAI API key to start interacting with the model.
332
- - You can use the 'Clear History' button to remove the conversation history.
333
- - "o1" is for image, voice, PDF and text chat and "o3-mini" is for text, PDF and voice chat only.
334
- ### Reasoning Effort:
335
- The reasoning effort controls how complex or detailed the assistant's answers should be.
336
- - **Low**: Provides quick, concise answers with minimal reasoning or details.
337
- - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
338
- - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
 
 
 
339
  """)
340
 
341
  # Store PDF content as a state variable
342
- pdf_content = gr.State("")
343
 
344
  with gr.Row():
345
  openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
@@ -376,54 +497,162 @@ def create_interface():
376
  visible=False
377
  )
378
 
379
- # PDF input
380
  pdf_input = gr.File(
381
  label="Upload your PDF",
382
  file_types=[".pdf"],
383
  visible=False
384
  )
385
-
386
- with gr.Row():
387
- reasoning_effort = gr.Dropdown(
388
- label="Reasoning Effort",
389
- choices=["low", "medium", "high"],
390
- value="medium"
 
391
  )
392
- model_choice = gr.Dropdown(
393
- label="Select Model",
394
- choices=["o1", "o3-mini"],
395
- value="o1" # Default to 'o1' for image-related tasks
 
396
  )
397
- submit_btn = gr.Button("Ask!", elem_id="submit-btn")
398
- clear_btn = gr.Button("Clear History", elem_id="clear-history")
399
 
400
- chat_history = gr.Chatbot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
  # Connect the input type selector to the update function
403
  input_type.change(
404
  fn=update_input_type,
405
  inputs=[input_type],
406
- outputs=[input_text, image_input, audio_input, pdf_input]
 
 
 
 
 
 
 
407
  )
408
 
409
  # Process PDF when uploaded
410
  pdf_input.change(
411
- fn=process_pdf,
412
  inputs=[pdf_input],
413
- outputs=[pdf_content]
 
 
 
 
 
 
 
414
  )
415
 
416
  # Button interactions
417
  submit_btn.click(
418
  fn=chatbot,
419
- inputs=[input_text, image_input, audio_input, pdf_input, openai_api_key, reasoning_effort, model_choice, pdf_content],
420
- outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  )
422
 
423
- clear_btn.click(
424
  fn=clear_history,
425
  inputs=[],
426
- outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  )
428
 
429
  return demo
 
6
  import os
7
  import tempfile
8
  import fitz # PyMuPDF for PDF handling
9
+ import uuid
10
+ import json
11
+
12
+ # Class to manage document storage
13
class DocumentManager:
    """In-memory registry of uploaded PDF documents, keyed by a generated ID.

    Every entry of ``documents`` maps a ``doc_id`` string to a dict with the
    keys ``"name"``, ``"content"`` and ``"path"``.
    """

    def __init__(self):
        # {doc_id: {"name": name, "content": content, "path": path}}
        self.documents = {}

    def add_document(self, file_path, file_name=None):
        """Register the PDF at *file_path* and return its newly generated ID.

        Args:
            file_path: Location of the PDF; handed to ``extract_text_from_pdf``
                and stored as the entry's ``"path"``.
            file_name: Display name; defaults to the base name of *file_path*.
        """
        display_name = os.path.basename(file_path) if file_name is None else file_name
        doc_id = str(uuid.uuid4())
        # Text is extracted once at registration time and kept with the entry.
        extracted = extract_text_from_pdf(file_path)
        self.documents[doc_id] = {
            "name": display_name,
            "content": extracted,
            "path": file_path,
        }
        return doc_id

    def get_document_content(self, doc_id):
        """Return the stored content for *doc_id*, or ``""`` if it is unknown."""
        entry = self.documents.get(doc_id)
        return "" if entry is None else entry["content"]

    def get_document_path(self, doc_id):
        """Return the stored file path for *doc_id*, or ``None`` if it is unknown."""
        entry = self.documents.get(doc_id)
        return None if entry is None else entry["path"]

    def get_document_list(self):
        """Return ``(name, doc_id)`` pairs for every stored document."""
        return [(entry["name"], doc_id) for doc_id, entry in self.documents.items()]

    def clear_documents(self):
        """Forget every document and return the (now empty) document list."""
        self.documents = {}
        return []
53
+
54
+ # Initialize the document manager
55
+ document_manager = DocumentManager()
56
 
57
  # Function to extract text from PDF files
58
  def extract_text_from_pdf(pdf_file):
 
148
  except Exception as e:
149
  return f"Error transcribing audio: {str(e)}"
150
 
151
+ # Function to handle PDF uploads
152
def handle_pdf_upload(pdf_file):
    """Register an uploaded PDF and refresh the document dropdown.

    Args:
        pdf_file: Value of the PDF upload widget: either a path string or an
            object exposing the path as ``.name``; ``None`` when the widget
            has been cleared.

    Returns:
        Two identical dropdown updates, because the event wiring lists the
        document dropdown twice as an output. After an upload the update
        carries the refreshed choices and selects the new document.
    """
    if pdf_file is None:
        # The upload widget is also reset to None after every chat submit, so
        # an empty widget must not discard the document library or the current
        # selection: only re-sync the choices.
        refresh = gr.update(choices=document_manager.get_document_list())
        return refresh, refresh

    # Accept both a plain path string and a file wrapper with a `.name` path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    doc_id = document_manager.add_document(pdf_path)

    # A bare list returned to a Dropdown is applied as its *value*; the
    # choices have to be sent through an explicit update.
    select_new = gr.update(choices=document_manager.get_document_list(), value=doc_id)
    return select_new, select_new
161
+
162
+ # Function to get PDF content based on selected document
163
def get_selected_document_content(doc_id):
    """Return ``(content, path)`` for the selected document.

    A falsy *doc_id* (nothing selected) yields ``("", None)``. Otherwise the
    text content and the file path are both looked up in ``document_manager``;
    the content is meant for the model, the path for the PDF viewer.
    """
    if doc_id:
        viewer_path = document_manager.get_document_path(doc_id)
        return document_manager.get_document_content(doc_id), viewer_path
    return "", None
172
+
173
  # The function that will be used by Gradio interface
174
def chatbot(input_text, image, audio, pdf_file, doc_selection, openai_api_key, reasoning_effort, model_choice, current_pdf_content, history=[]):
    """Answer one user turn and append it to the chat history.

    Args:
        input_text: Text typed by the user; replaced by the transcription when
            *audio* is provided.
        image: Image input, forwarded to ``generate_response``.
        audio: Audio input; when truthy it is transcribed with
            ``transcribe_audio``.
        pdf_file: Current value of the upload widget (not used here).
        doc_selection: ID of the document selected in the dropdown, if any.
        openai_api_key: API key forwarded to the transcription and response calls.
        reasoning_effort: Forwarded to ``generate_response``.
        model_choice: Forwarded to ``generate_response``.
        current_pdf_content: Text of the selected document, forwarded to
            ``generate_response``.
        history: List of ``(user_message, assistant_message)`` tuples, mutated
            in place.

    Returns:
        A 7-tuple: cleared text, three ``None`` values that reset the image,
        audio and PDF upload inputs, the unchanged document selection, the
        unchanged PDF content, and the updated history.
    """
    # NOTE(review): `history=[]` is a mutable default. The visible click wiring
    # passes only nine inputs, so this single list is shared by every call and
    # is what makes turns accumulate; it is also never emptied by
    # `clear_history`. Replacing it means passing the Chatbot value in as an
    # input, which has to be changed together with the event wiring.
    if audio:
        input_text = transcribe_audio(audio, openai_api_key)

    response = generate_response(input_text, image, current_pdf_content, openai_api_key, reasoning_effort, model_choice)

    # Build the user-side line once, then record the turn.
    if not input_text:
        user_line = "User: [Uploaded content]"
    elif doc_selection:
        # Tag the question with the name of the document it was asked about.
        doc_name = next(
            (name for name, doc_id in document_manager.get_document_list() if doc_id == doc_selection),
            "Unknown Document",
        )
        user_line = f"User: {input_text} [Query on: {doc_name}]"
    else:
        user_line = f"User: {input_text}"
    history.append((user_line, f"Assistant: {response}"))

    return "", None, None, None, doc_selection, current_pdf_content, history
197
 
198
+ # Function to clear the chat history and reset selected document
199
def clear_history():
    """Return reset values for the chat inputs, document selection, PDF state and chat display."""
    blank_inputs = ("", None, None, None)  # text, image, audio, PDF upload
    blank_document = (None, "")  # selected document, its extracted content
    return (*blank_inputs, *blank_document, [])
201
 
202
+ # Function to clear all documents
203
def clear_documents():
    """Remove every stored document and reset the document-related widgets.

    Returns:
        A 4-tuple: two identical dropdown updates (the event wiring lists the
        document dropdown twice) that empty its choices and clear its value,
        an empty string for the PDF content state, and ``None`` for the PDF
        display.
    """
    document_manager.clear_documents()
    # Returning a bare list would only set the dropdown's value and leave the
    # stale choices selectable; send the emptied choices explicitly.
    reset_dropdown = gr.update(choices=[], value=None)
    return reset_dropdown, reset_dropdown, "", None
 
206
 
207
  # Function to update visible components based on input type selection
208
def update_input_type(choice):
    """Return visibility updates for the input widgets of the chosen mode.

    The six updates are ordered like the outputs of the event that calls this
    function: text box, image input, audio input, PDF upload, document
    dropdown, PDF viewer.

    Args:
        choice: Selected input type: "Text", "Image", "Voice" or "PDF".

    Returns:
        A tuple of six ``gr.update`` objects. For an unrecognised choice the
        updates are no-ops, so the layout stays as it is instead of the
        handler returning ``None`` for six outputs.
    """
    # Columns: text, image, audio, pdf upload, doc dropdown, pdf viewer
    layouts = {
        "Text": (True, False, False, False, False, False),
        "Image": (True, True, False, False, False, False),
        "Voice": (False, False, True, False, False, False),
        "PDF": (True, False, False, True, True, True),
    }
    flags = layouts.get(choice)
    if flags is None:
        return tuple(gr.update() for _ in range(6))
    return tuple(gr.update(visible=flag) for flag in flags)
217
 
218
  # Custom CSS styles with animations and button colors
219
  custom_css = """
 
346
  margin-left: auto;
347
  animation: slideInAssistant 0.5s ease-out;
348
  }
349
+ /* PDF preview panel */
350
+ .pdf-preview-panel {
351
+ border: 2px solid #ccc;
352
+ border-radius: 8px;
353
+ overflow: hidden;
354
+ height: 600px;
355
+ background-color: #f5f5f5;
356
+ }
357
+ /* PDF viewer iframe */
358
+ .pdf-viewer {
359
+ width: 100%;
360
+ height: 100%;
361
+ border: none;
362
+ }
363
+ /* Split view container */
364
+ .split-view-container {
365
+ display: flex;
366
+ gap: 20px;
367
+ }
368
+ .split-view-panel {
369
+ flex: 1;
370
+ min-width: 0; /* Allow panels to shrink below their content size */
371
+ }
372
  /* Animation keyframes */
373
  @keyframes fadeIn {
374
  0% { opacity: 0; }
 
382
  0% { transform: translateX(100%); }
383
  100% { transform: translateX(0); }
384
  }
385
+ /* Document management styles */
386
+ .document-manager {
387
+ background-color: #fff;
388
+ border-radius: 10px;
389
+ padding: 15px;
390
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
391
+ margin-bottom: 20px;
392
+ }
393
+ .document-manager-header {
394
+ display: flex;
395
+ justify-content: space-between;
396
+ align-items: center;
397
+ margin-bottom: 15px;
398
+ }
399
+ .document-list {
400
+ max-height: 200px;
401
+ overflow-y: auto;
402
+ border: 1px solid #eee;
403
+ border-radius: 5px;
404
+ padding: 10px;
405
+ }
406
  /* Mobile responsiveness */
407
  @media (max-width: 768px) {
408
  .gradio-header h1 {
 
421
  width: 100%;
422
  margin-left: 0;
423
  }
424
+ .split-view-container {
425
+ flex-direction: column;
426
+ }
427
  }
428
  """
429
 
 
432
  with gr.Blocks(css=custom_css) as demo:
433
  gr.Markdown("""
434
  <div class="gradio-header">
435
+ <h1>Enhanced Multimodal Chatbot</h1>
436
+ <h3>Interact with text, images, voice, and multiple PDFs</h3>
437
  </div>
438
  """)
439
 
 
441
  with gr.Accordion("Click to expand for details", open=False):
442
  gr.Markdown("""
443
  ### Description:
444
+ This enhanced multimodal chatbot handles text, image, voice, and PDF inputs with advanced document management.
445
+
446
+ - **Text Mode**: Ask questions or provide text for the assistant to respond.
447
+ - **Image Mode**: Upload an image for the assistant to analyze and discuss.
448
+ - **Voice Mode**: Upload or record audio that will be transcribed and processed.
449
+ - **PDF Mode**: Upload multiple PDFs, select which one to query, and view them side-by-side with the chat.
450
+
451
+ ### PDF Features:
452
+ - Upload and manage multiple PDFs in a single session
453
+ - Select which document to query from a dropdown menu
454
+ - View PDFs side-by-side with the chat interface
455
+ - Clear document library as needed
456
+
457
+ ### Model Options:
458
+ - "o1" is for image, voice, PDF and text chat
459
+ - "o3-mini" is for text, PDF and voice chat only
460
  """)
461
 
462
  # Store PDF content as a state variable
463
+ current_pdf_content = gr.State("")
464
 
465
  with gr.Row():
466
  openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
 
497
  visible=False
498
  )
499
 
500
+ # PDF input and document selection components
501
  pdf_input = gr.File(
502
  label="Upload your PDF",
503
  file_types=[".pdf"],
504
  visible=False
505
  )
506
+
507
+ # Dropdown for document selection
508
+ doc_selection = gr.Dropdown(
509
+ label="Select Document to Query",
510
+ choices=[],
511
+ interactive=True,
512
+ visible=False
513
  )
514
+
515
+ # PDF Viewer (initially hidden)
516
+ pdf_viewer = gr.HTML(
517
+ label="PDF Preview",
518
+ visible=False
519
  )
 
 
520
 
521
+ # Action buttons row
522
+ with gr.Row():
523
+ with gr.Column(scale=1):
524
+ reasoning_effort = gr.Dropdown(
525
+ label="Reasoning Effort",
526
+ choices=["low", "medium", "high"],
527
+ value="medium"
528
+ )
529
+
530
+ with gr.Column(scale=1):
531
+ model_choice = gr.Dropdown(
532
+ label="Select Model",
533
+ choices=["o1", "o3-mini"],
534
+ value="o1"
535
+ )
536
+
537
+ with gr.Column(scale=1):
538
+ submit_btn = gr.Button("Ask!", elem_id="submit-btn")
539
+
540
+ with gr.Column(scale=1):
541
+ clear_chat_btn = gr.Button("Clear Chat", elem_id="clear-history")
542
+
543
+ with gr.Column(scale=1, visible=False) as clear_docs_col:
544
+ clear_docs_btn = gr.Button("Clear All Documents", elem_id="clear-docs")
545
+
546
+ # Create a container for the split view layout when in PDF mode
547
+ with gr.Row(visible=False) as split_view_container:
548
+ with gr.Column(scale=1, elem_classes="split-view-panel") as pdf_panel:
549
+ pdf_display = gr.HTML(
550
+ """<div class="pdf-preview-panel">
551
+ <iframe class="pdf-viewer" id="pdf-viewer" src="about:blank"></iframe>
552
+ </div>"""
553
+ )
554
+
555
+ with gr.Column(scale=1, elem_classes="split-view-panel") as chat_panel:
556
+ chat_history = gr.Chatbot()
557
+
558
+ # Regular chat history display (when not in split view)
559
+ with gr.Row(visible=True) as regular_chat_container:
560
+ chat_history_regular = gr.Chatbot()
561
+
562
+ # Function to handle selection of a document from dropdown
563
def handle_doc_selection(doc_id):
    """Return ``(content, viewer_html)`` for the document picked in the dropdown.

    With no selection the content is ``""`` and the viewer shows its
    "no PDF" placeholder.
    """
    content, path = get_selected_document_content(doc_id) if doc_id else ("", None)
    return content, update_pdf_viewer(path)
569
+
570
+ # Function to update the PDF viewer
571
def update_pdf_viewer(pdf_path):
    """Build the HTML for the PDF preview panel.

    Args:
        pdf_path: Path of the PDF to show; a falsy value renders a
            "No PDF selected" placeholder instead of an iframe.

    Returns:
        An HTML snippet wrapped in the ``pdf-preview-panel`` container.
    """
    from html import escape

    if not pdf_path:
        return """<div class="pdf-preview-panel">
<div style="padding: 20px; text-align: center;">No PDF selected</div>
</div>"""

    # The path comes from an uploaded file's name, so escape it before placing
    # it inside the attribute; a quote or angle bracket must not break out of
    # the src value.
    safe_path = escape(str(pdf_path), quote=True)
    return f"""<div class="pdf-preview-panel">
<iframe class="pdf-viewer" id="pdf-viewer" src="file={safe_path}" type="application/pdf"></iframe>
</div>"""
581
+
582
+ # Function to toggle between split view and regular view based on input type
583
def toggle_view(choice):
    """Return visibility updates that switch between split view and regular view.

    In "PDF" mode the first and third outputs are shown and the second is
    hidden; any other mode does the opposite. The event wiring maps them to
    the split-view container, the regular chat container and the
    clear-documents column.
    """
    pdf_mode = choice == "PDF"
    return gr.update(visible=pdf_mode), gr.update(visible=not pdf_mode), gr.update(visible=pdf_mode)
588
 
589
  # Connect the input type selector to the update function
590
  input_type.change(
591
  fn=update_input_type,
592
  inputs=[input_type],
593
+ outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, pdf_viewer]
594
+ )
595
+
596
+ # Toggle between split view and regular view when input type changes
597
+ input_type.change(
598
+ fn=toggle_view,
599
+ inputs=[input_type],
600
+ outputs=[split_view_container, regular_chat_container, clear_docs_col]
601
  )
602
 
603
  # Process PDF when uploaded
604
  pdf_input.change(
605
+ fn=handle_pdf_upload,
606
  inputs=[pdf_input],
607
+ outputs=[doc_selection, doc_selection]
608
+ )
609
+
610
+ # Update content when document is selected
611
+ doc_selection.change(
612
+ fn=handle_doc_selection,
613
+ inputs=[doc_selection],
614
+ outputs=[current_pdf_content, pdf_display]
615
  )
616
 
617
  # Button interactions
618
  submit_btn.click(
619
  fn=chatbot,
620
+ inputs=[
621
+ input_text, image_input, audio_input, pdf_input,
622
+ doc_selection, openai_api_key, reasoning_effort,
623
+ model_choice, current_pdf_content
624
+ ],
625
+ outputs=[
626
+ input_text, image_input, audio_input, pdf_input,
627
+ doc_selection, current_pdf_content, chat_history_regular
628
+ ]
629
+ )
630
+
631
+ # Also update the split view chat history when submitting
632
+ submit_btn.click(
633
+ fn=lambda history: history,
634
+ inputs=[chat_history_regular],
635
+ outputs=[chat_history]
636
  )
637
 
638
+ clear_chat_btn.click(
639
  fn=clear_history,
640
  inputs=[],
641
+ outputs=[input_text, image_input, audio_input, pdf_input, doc_selection, current_pdf_content, chat_history_regular]
642
+ )
643
+
644
+ # Also clear the split view chat history
645
+ clear_chat_btn.click(
646
+ fn=lambda: [],
647
+ inputs=[],
648
+ outputs=[chat_history]
649
+ )
650
+
651
+ # Clear all documents
652
+ clear_docs_btn.click(
653
+ fn=clear_documents,
654
+ inputs=[],
655
+ outputs=[doc_selection, doc_selection, current_pdf_content, pdf_display]
656
  )
657
 
658
  return demo