arad1367 committed on
Commit
122b92f
·
verified ·
1 Parent(s): e75727f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -80
app.py CHANGED
@@ -9,23 +9,17 @@ import torch
9
  import torchvision
10
  import subprocess
11
 
12
- # Run the commands from setup.sh to install poppler-utils
13
  def install_poppler():
14
  try:
15
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
16
  except FileNotFoundError:
17
  print("Poppler not found. Installing...")
18
- # Run the setup commands
19
  subprocess.run("apt-get update", shell=True)
20
  subprocess.run("apt-get install -y poppler-utils", shell=True)
21
 
22
- # Call the Poppler installation check
23
  install_poppler()
24
-
25
- # Install flash-attn if not already installed
26
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
27
 
28
- # Load the RAG Model and the Qwen2-VL-2B-Instruct model
29
  RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
30
  model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
31
  trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
@@ -33,24 +27,17 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
33
 
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
- # Convert the PDF to images
37
  images = convert_from_path(pdf_file.name)
38
  num_images = len(images)
39
-
40
- # Indexing the PDF in RAG
41
  RAG.index(
42
  input_path=pdf_file.name,
43
- index_name="image_index",
44
  store_collection_with_index=False,
45
  overwrite=True
46
  )
47
-
48
- # Search the query in the RAG model
49
  results = RAG.search(user_query, k=1)
50
  if not results:
51
  return "No results found.", num_images
52
-
53
- # Retrieve the page number and process image
54
  image_index = results[0]["page_num"] - 1
55
  messages = [
56
  {
@@ -64,8 +51,6 @@ def process_pdf_and_query(pdf_file, user_query):
64
  ],
65
  }
66
  ]
67
-
68
- # Generate text with the Qwen model
69
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
70
  image_inputs, video_inputs = process_vision_info(messages)
71
  inputs = processor(
@@ -76,8 +61,6 @@ def process_pdf_and_query(pdf_file, user_query):
76
  return_tensors="pt",
77
  )
78
  inputs = inputs.to("cuda")
79
-
80
- # Generate the output response
81
  generated_ids = model.generate(**inputs, max_new_tokens=50)
82
  generated_ids_trimmed = [
83
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -85,29 +68,8 @@ def process_pdf_and_query(pdf_file, user_query):
85
  output_text = processor.batch_decode(
86
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
87
  )
88
-
89
  return output_text[0], num_images
90
 
91
-
92
- pdf_input = gr.File(label="Upload PDF")
93
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
94
- output_text = gr.Textbox(label="Model Answer")
95
- output_images = gr.Textbox(label="Number of Images in PDF")
96
-
97
-
98
- explanation = """
99
- <div style="text-align: center; margin-bottom: 20px;">
100
- <h2 style="font-weight: bold; font-size: 24px;">Multimodal RAG (Retrieval-Augmented Generation)</h2>
101
- <p>
102
- This application utilizes the ColPali model as a multimodal retriever,
103
- which retrieves relevant information from documents and generates answers
104
- using the Qwen/Qwen2-VL-2B-Instruct LLM (Large Language Model)
105
- via the Byaldi library, developed by Answer.ai.
106
- </p>
107
- </div>
108
- """
109
-
110
-
111
  footer = """
112
  <div style="text-align: center; margin-top: 20px;">
113
  <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
@@ -121,49 +83,30 @@ footer = """
121
  </div>
122
  """
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  demo = gr.Interface(
126
  fn=process_pdf_and_query,
127
- inputs=[pdf_input, query_input],
128
- outputs=[output_text, output_images],
129
- title="<div style='text-align: center; font-weight: bold; font-size: 28px;'>Multimodal RAG with Image Query - By <a href='https://github.com/arad1367' style='color: blue;'>Pejman Ebrahimi</a></div>",
130
- theme='freddyaboulton/dracula_revamped',
 
131
  )
132
 
133
-
134
- with demo:
135
- gr.HTML(explanation)
136
- gr.HTML(footer)
137
- gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
138
-
139
- submit_btn = gr.Button("Submit", elem_classes="submit-button")
140
-
141
-
142
- css = """
143
- <style>
144
- .submit-button {
145
- background-color: green;
146
- color: white;
147
- border: none;
148
- border-radius: 5px;
149
- padding: 10px 20px;
150
- font-size: 16px;
151
- cursor: pointer;
152
- margin: 10px; /* Add some space between buttons */
153
- }
154
- .duplicate-button {
155
- background-color: green;
156
- color: white;
157
- border: none;
158
- border-radius: 5px;
159
- padding: 10px 20px;
160
- font-size: 16px;
161
- cursor: pointer;
162
- margin: 10px; /* Add some space between buttons */
163
- }
164
- </style>
165
- """
166
- gr.HTML(css)
167
-
168
- # Launch the Gradio app
169
- demo.launch(debug=True)
 
9
  import torchvision
10
  import subprocess
11
 
 
12
  def install_poppler():
13
  try:
14
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
15
  except FileNotFoundError:
16
  print("Poppler not found. Installing...")
 
17
  subprocess.run("apt-get update", shell=True)
18
  subprocess.run("apt-get install -y poppler-utils", shell=True)
19
 
 
20
  install_poppler()
 
 
21
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
22
 
 
23
  RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
24
  model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
25
  trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
 
27
 
28
  @spaces.GPU()
29
  def process_pdf_and_query(pdf_file, user_query):
 
30
  images = convert_from_path(pdf_file.name)
31
  num_images = len(images)
 
 
32
  RAG.index(
33
  input_path=pdf_file.name,
34
+ index_name="image_index",
35
  store_collection_with_index=False,
36
  overwrite=True
37
  )
 
 
38
  results = RAG.search(user_query, k=1)
39
  if not results:
40
  return "No results found.", num_images
 
 
41
  image_index = results[0]["page_num"] - 1
42
  messages = [
43
  {
 
51
  ],
52
  }
53
  ]
 
 
54
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
55
  image_inputs, video_inputs = process_vision_info(messages)
56
  inputs = processor(
 
61
  return_tensors="pt",
62
  )
63
  inputs = inputs.to("cuda")
 
 
64
  generated_ids = model.generate(**inputs, max_new_tokens=50)
65
  generated_ids_trimmed = [
66
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
68
  output_text = processor.batch_decode(
69
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
70
  )
 
71
  return output_text[0], num_images
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  footer = """
74
  <div style="text-align: center; margin-top: 20px;">
75
  <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
 
83
  </div>
84
  """
85
 
86
+ pdf_input = gr.File(label="Upload PDF")
87
+ query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
88
+ output_text = gr.Textbox(label="Model Answer")
89
+ output_images = gr.Textbox(label="Number of Images in PDF")
90
+ duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button", color="green")
91
+
92
+ explanation_text = """
93
+ <div style='text-align: center; font-size: 16px;'>
94
+ <h2 style='font-weight: bold;'>Multimodal RAG Overview</h2>
95
+ <p>This application utilizes a Multimodal RAG (Retrieve-and-Generate) approach, enabling users to query information from PDF documents
96
+ by extracting relevant text and images. The ColPali model serves as a multimodal retriever, while the Byaldi library simplifies
97
+ the integration of ColPali. The Qwen/Qwen2-VL-2B-Instruct LLM enhances the generation of responses based on the retrieved content.</p>
98
+ </div>
99
+ """
100
 
101
  demo = gr.Interface(
102
  fn=process_pdf_and_query,
103
+ inputs=[pdf_input, query_input],
104
+ outputs=[output_text, output_images],
105
+ title="<div style='text-align: center; font-size: 24px; font-weight: bold;'>Multimodal RAG with Image Query</div>",
106
+ description=explanation_text,
107
+ theme='freddyaboulton/dracula_revamped'
108
  )
109
 
110
+ demo.launch(debug=True)
111
+ demo.append(duplicate_button)
112
+ demo.append(gr.HTML(footer))