arad1367 committed on
Commit
0a97d2e
·
verified ·
1 Parent(s): f21b6d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -50
app.py CHANGED
@@ -9,7 +9,7 @@ import torch
9
  import torchvision
10
  import subprocess
11
 
12
- # Run the commands from setup.sh to install poppler-utils
13
  def install_poppler():
14
  try:
15
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -34,7 +34,7 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
  # Convert the PDF to images
37
- images = convert_from_path(pdf_file.name)
38
  num_images = len(images)
39
 
40
  # Indexing the PDF in RAG
@@ -88,44 +88,9 @@ def process_pdf_and_query(pdf_file, user_query):
88
 
89
  return output_text[0], num_images
90
 
91
-
92
- pdf_input = gr.File(label="Upload PDF")
93
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
94
- output_text = gr.Textbox(label="Model Answer")
95
- output_images = gr.Textbox(label="Number of Images in PDF")
96
-
97
- # CSS styling
98
- css = """
99
- body {
100
- background-color: #282a36;
101
- font-family: Arial, sans-serif;
102
- color: #f8f8f2;
103
- }
104
- h1 {
105
- text-align: center;
106
- font-size: 2.5em;
107
- font-weight: bold;
108
- margin-bottom: 20px;
109
- }
110
- footer {
111
- margin-top: 20px;
112
- }
113
- .duplicate-button {
114
- text-align: center;
115
- background-color: #50fa7b;
116
- color: #282a36;
117
- font-weight: bold;
118
- border: none;
119
- padding: 10px;
120
- cursor: pointer;
121
- }
122
- """
123
-
124
  description = """
125
- ### About Multimodal RAG
126
- Multimodal Retrieval-Augmented Generation (RAG) integrates both images and text to provide more comprehensive and contextually accurate responses to user queries. It uses a retriever model like **ColPali** to search and retrieve relevant data and a large language model (LLM) like **Qwen/Qwen2-VL-2B-Instruct** to generate natural language answers based on the input.
127
-
128
- In this demo, **ColPali** is used as a multimodal retriever, and the **Byaldi** library from answer.ai simplifies the use of ColPali. We are utilizing **Qwen2-VL-2B-Instruct** for text generation, enabling both text and image-based queries.
129
  """
130
 
131
  footer = """
@@ -137,21 +102,24 @@ footer = """
137
  <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
138
  <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
139
  <br>
140
- Made with 💖 by <a href="https://github.com/arad1367" target="_blank">Pejman Ebrahimi</a>
141
  </div>
142
  """
143
 
144
- # Gradio Interface
145
- with gr.Blocks(theme='freddyaboulton/dracula_revamped', css=css) as demo:
146
- gr.Markdown("<h1>Multimodal RAG with Image Query</h1>")
147
  gr.Markdown(description)
148
- with gr.Row():
149
- pdf_input = gr.File(label="Upload PDF")
150
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
151
- output_text = gr.Textbox(label="Model Answer")
152
- output_images = gr.Textbox(label="Number of Images in PDF")
153
 
154
- gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
 
 
 
 
 
 
 
 
 
155
  gr.HTML(footer)
156
 
157
- demo.launch(debug=True)
 
9
  import torchvision
10
  import subprocess
11
 
12
+ # Run the commands from setup.sh to install poppler-utils -- This is necessary --> Noted by Pejman! --> I found this way instead of setup.sh
13
  def install_poppler():
14
  try:
15
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
  # Convert the PDF to images
37
+ images = convert_from_path(pdf_file.name)
38
  num_images = len(images)
39
 
40
  # Indexing the PDF in RAG
 
88
 
89
  return output_text[0], num_images
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  description = """
92
+ **Multimodal RAG** is a retrieval-augmented generation (RAG) model that works with multiple modalities, such as text and images, to retrieve relevant information from a knowledge base and generate coherent responses. In this demo, we use **ColPali**, a multimodal retriever capable of efficiently retrieving from large datasets, along with **Qwen2-VL-2B-Instruct**, a powerful large language model for answering questions based on the retrieved information.
93
+ Byaldi, developed by **Answer.ai**, is used to simplify the integration of ColPali into our pipeline.
 
 
94
  """
95
 
96
  footer = """
 
102
  <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
103
  <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
104
  <br>
105
+ Made with 💖 by Pejman Ebrahimi
106
  </div>
107
  """
108
 
109
+ with gr.Blocks(theme='freddyaboulton/dracula_revamped') as demo:
110
+ gr.Markdown("<h1 style='text-align: center; font-weight: bold;'>Multimodal RAG with Image Query - By <a href='https://github.com/arad1367' target='_blank'>Pejman Ebrahimi</a></h1>")
 
111
  gr.Markdown(description)
 
 
 
 
 
112
 
113
+ pdf_input = gr.File(label="Upload PDF")
114
+ query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
115
+ output_text = gr.Textbox(label="Model Answer")
116
+ output_images = gr.Textbox(label="Number of Images in PDF")
117
+ duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
118
+
119
+ gr.Row([pdf_input, query_input])
120
+ gr.Row([output_text, output_images])
121
+ gr.Row([duplicate_button])
122
+
123
  gr.HTML(footer)
124
 
125
+ demo.launch(debug=True)