arad1367 committed on
Commit
f21b6d3
·
verified ·
1 Parent(s): e50fed9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -57
app.py CHANGED
@@ -33,20 +33,24 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
33
 
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
 
36
  images = convert_from_path(pdf_file.name)
37
  num_images = len(images)
38
 
 
39
  RAG.index(
40
  input_path=pdf_file.name,
41
- index_name="image_index",
42
  store_collection_with_index=False,
43
  overwrite=True
44
  )
45
 
 
46
  results = RAG.search(user_query, k=1)
47
  if not results:
48
  return "No results found.", num_images
49
 
 
50
  image_index = results[0]["page_num"] - 1
51
  messages = [
52
  {
@@ -61,6 +65,7 @@ def process_pdf_and_query(pdf_file, user_query):
61
  }
62
  ]
63
 
 
64
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
65
  image_inputs, video_inputs = process_vision_info(messages)
66
  inputs = processor(
@@ -72,6 +77,7 @@ def process_pdf_and_query(pdf_file, user_query):
72
  )
73
  inputs = inputs.to("cuda")
74
 
 
75
  generated_ids = model.generate(**inputs, max_new_tokens=50)
76
  generated_ids_trimmed = [
77
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -82,37 +88,44 @@ def process_pdf_and_query(pdf_file, user_query):
82
 
83
  return output_text[0], num_images
84
 
 
 
 
 
 
 
 
85
  css = """
86
- .duplicate-button {
87
- background-color: #6272a4;
88
- color: white;
89
- font-weight: bold;
90
- border-radius: 5px;
91
- margin-top: 20px;
92
- padding: 10px;
93
- text-align: center;
94
- }
95
- .gradio-container {
96
- background-color: #282a36;
97
- color: #f8f8f2;
98
- font-family: 'Courier New', Courier, monospace;
99
- padding: 20px;
100
- border-radius: 10px;
101
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
102
- }
103
- .title {
104
- font-size: 30px;
105
- font-weight: bold;
106
- text-align: center;
107
- margin-bottom: 20px;
108
- }
109
  """
110
 
111
- explanation = """
112
- ### Multimodal RAG with Image Query
113
- This demo showcases the **Multimodal RAG (Retriever-Augmented Generation)** model. The RAG system integrates retrieval and generation, allowing it to retrieve relevant information from a multimodal database (like PDFs with text and images) and then generate detailed responses.
114
 
115
- We use **ColPali**, a state-of-the-art multimodal retriever, combined with the **Byaldi** library from **answer.ai**, which simplifies using ColPali. The language model used for generating answers is **Qwen/Qwen2-VL-2B-Instruct**, a powerful vision-language model capable of understanding both text and images.
116
  """
117
 
118
  footer = """
@@ -124,36 +137,21 @@ footer = """
124
  <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
125
  <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
126
  <br>
127
- Made with 💖 by Pejman Ebrahimi
128
  </div>
129
  """
130
 
131
- pdf_input = gr.File(label="Upload PDF") # Single PDF file input
132
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF") # User query input
133
- output_text = gr.Textbox(label="Model Answer") # Output for the model's answer
134
- output_images = gr.Textbox(label="Number of Images in PDF") # Output for number of images
135
- duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
136
-
137
- # Launch the Gradio app
138
- demo = gr.Interface(
139
- fn=process_pdf_and_query,
140
- inputs=[pdf_input, query_input], # List of inputs
141
- outputs=[output_text, output_images], # List of outputs
142
- title="",
143
- theme='freddyaboulton/dracula_revamped',
144
- css=css,
145
- description=explanation,
146
- allow_flagging="auto"
147
- )
148
-
149
- with demo:
150
- gr.HTML("""
151
- <div class='title'>
152
- Multimodal RAG with Image Query -
153
- <a href="https://github.com/arad1367" target="_blank" style="color: #ff79c6; text-decoration: none;">
154
- Pejman Ebrahimi
155
- </a>
156
- </div>
157
- """)
158
  gr.HTML(footer)
159
- duplicate_button
 
 
33
 
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
+ # Convert the PDF to images
37
  images = convert_from_path(pdf_file.name)
38
  num_images = len(images)
39
 
40
+ # Indexing the PDF in RAG
41
  RAG.index(
42
  input_path=pdf_file.name,
43
+ index_name="image_index", # index will be saved at index_root/index_name/
44
  store_collection_with_index=False,
45
  overwrite=True
46
  )
47
 
48
+ # Search the query in the RAG model
49
  results = RAG.search(user_query, k=1)
50
  if not results:
51
  return "No results found.", num_images
52
 
53
+ # Retrieve the page number and process image
54
  image_index = results[0]["page_num"] - 1
55
  messages = [
56
  {
 
65
  }
66
  ]
67
 
68
+ # Generate text with the Qwen model
69
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
70
  image_inputs, video_inputs = process_vision_info(messages)
71
  inputs = processor(
 
77
  )
78
  inputs = inputs.to("cuda")
79
 
80
+ # Generate the output response
81
  generated_ids = model.generate(**inputs, max_new_tokens=50)
82
  generated_ids_trimmed = [
83
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
88
 
89
  return output_text[0], num_images
90
 
91
+
92
+ pdf_input = gr.File(label="Upload PDF")
93
+ query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
94
+ output_text = gr.Textbox(label="Model Answer")
95
+ output_images = gr.Textbox(label="Number of Images in PDF")
96
+
97
+ # CSS styling
98
  css = """
99
+ body {
100
+ background-color: #282a36;
101
+ font-family: Arial, sans-serif;
102
+ color: #f8f8f2;
103
+ }
104
+ h1 {
105
+ text-align: center;
106
+ font-size: 2.5em;
107
+ font-weight: bold;
108
+ margin-bottom: 20px;
109
+ }
110
+ footer {
111
+ margin-top: 20px;
112
+ }
113
+ .duplicate-button {
114
+ text-align: center;
115
+ background-color: #50fa7b;
116
+ color: #282a36;
117
+ font-weight: bold;
118
+ border: none;
119
+ padding: 10px;
120
+ cursor: pointer;
121
+ }
122
  """
123
 
124
+ description = """
125
+ ### About Multimodal RAG
126
+ Multimodal Retrieval-Augmented Generation (RAG) integrates both images and text to provide more comprehensive and contextually accurate responses to user queries. It uses a retriever model like **ColPali** to search and retrieve relevant data and a large language model (LLM) like **Qwen/Qwen2-VL-2B-Instruct** to generate natural language answers based on the input.
127
 
128
+ In this demo, **ColPali** is used as a multimodal retriever, and the **Byaldi** library from answer.ai simplifies the use of ColPali. We are utilizing **Qwen2-VL-2B-Instruct** for text generation, enabling both text and image-based queries.
129
  """
130
 
131
  footer = """
 
137
  <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
138
  <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
139
  <br>
140
+ Made with 💖 by <a href="https://github.com/arad1367" target="_blank">Pejman Ebrahimi</a>
141
  </div>
142
  """
143
 
144
+ # Gradio Interface
145
+ with gr.Blocks(theme='freddyaboulton/dracula_revamped', css=css) as demo:
146
+ gr.Markdown("<h1>Multimodal RAG with Image Query</h1>")
147
+ gr.Markdown(description)
148
+ with gr.Row():
149
+ pdf_input = gr.File(label="Upload PDF")
150
+ query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
151
+ output_text = gr.Textbox(label="Model Answer")
152
+ output_images = gr.Textbox(label="Number of Images in PDF")
153
+
154
+ gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  gr.HTML(footer)
156
+
157
+ demo.launch(debug=True)