arad1367 committed on
Commit
ec95781
·
verified ·
1 Parent(s): 122b92f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -42
app.py CHANGED
@@ -9,17 +9,23 @@ import torch
9
  import torchvision
10
  import subprocess
11
 
 
12
  def install_poppler():
13
  try:
14
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
15
  except FileNotFoundError:
16
  print("Poppler not found. Installing...")
 
17
  subprocess.run("apt-get update", shell=True)
18
  subprocess.run("apt-get install -y poppler-utils", shell=True)
19
 
 
20
  install_poppler()
 
 
21
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
22
 
 
23
  RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
24
  model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
25
  trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
@@ -27,17 +33,24 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_rem
27
 
28
  @spaces.GPU()
29
  def process_pdf_and_query(pdf_file, user_query):
30
- images = convert_from_path(pdf_file.name)
 
31
  num_images = len(images)
 
 
32
  RAG.index(
33
  input_path=pdf_file.name,
34
- index_name="image_index",
35
  store_collection_with_index=False,
36
  overwrite=True
37
  )
 
 
38
  results = RAG.search(user_query, k=1)
39
  if not results:
40
  return "No results found.", num_images
 
 
41
  image_index = results[0]["page_num"] - 1
42
  messages = [
43
  {
@@ -51,6 +64,8 @@ def process_pdf_and_query(pdf_file, user_query):
51
  ],
52
  }
53
  ]
 
 
54
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
55
  image_inputs, video_inputs = process_vision_info(messages)
56
  inputs = processor(
@@ -61,6 +76,8 @@ def process_pdf_and_query(pdf_file, user_query):
61
  return_tensors="pt",
62
  )
63
  inputs = inputs.to("cuda")
 
 
64
  generated_ids = model.generate(**inputs, max_new_tokens=50)
65
  generated_ids_trimmed = [
66
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -68,45 +85,39 @@ def process_pdf_and_query(pdf_file, user_query):
68
  output_text = processor.batch_decode(
69
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
70
  )
 
71
  return output_text[0], num_images
72
 
73
- footer = """
74
- <div style="text-align: center; margin-top: 20px;">
75
- <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
76
- <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
77
- <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
78
- <a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct" target="_blank">Qwen/Qwen2-VL-2B-Instruct</a> |
79
- <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
80
- <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
81
- <br>
82
- Made with 💖 by Pejman Ebrahimi
83
- </div>
84
- """
85
-
86
- pdf_input = gr.File(label="Upload PDF")
87
- query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
88
- output_text = gr.Textbox(label="Model Answer")
89
- output_images = gr.Textbox(label="Number of Images in PDF")
90
- duplicate_button = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button", color="green")
91
-
92
- explanation_text = """
93
- <div style='text-align: center; font-size: 16px;'>
94
- <h2 style='font-weight: bold;'>Multimodal RAG Overview</h2>
95
- <p>This application utilizes a Multimodal RAG (Retrieve-and-Generate) approach, enabling users to query information from PDF documents
96
- by extracting relevant text and images. The ColPali model serves as a multimodal retriever, while the Byaldi library simplifies
97
- the integration of ColPali. The Qwen/Qwen2-VL-2B-Instruct LLM enhances the generation of responses based on the retrieved content.</p>
98
- </div>
99
- """
100
-
101
- demo = gr.Interface(
102
- fn=process_pdf_and_query,
103
- inputs=[pdf_input, query_input],
104
- outputs=[output_text, output_images],
105
- title="<div style='text-align: center; font-size: 24px; font-weight: bold;'>Multimodal RAG with Image Query</div>",
106
- description=explanation_text,
107
- theme='freddyaboulton/dracula_revamped'
108
- )
109
-
110
- demo.launch(debug=True)
111
- demo.append(duplicate_button)
112
- demo.append(gr.HTML(footer))
 
9
  import torchvision
10
  import subprocess
11
 
12
+ # Run the commands from setup.sh to install poppler-utils
13
  def install_poppler():
14
  try:
15
  subprocess.run(["pdfinfo"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
16
  except FileNotFoundError:
17
  print("Poppler not found. Installing...")
18
+ # Run the setup commands
19
  subprocess.run("apt-get update", shell=True)
20
  subprocess.run("apt-get install -y poppler-utils", shell=True)
21
 
22
+ # Call the Poppler installation check
23
  install_poppler()
24
+
25
+ # Install flash-attn if not already installed
26
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
27
 
28
+ # Load the RAG Model and the Qwen2-VL-2B-Instruct model
29
  RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
30
  model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
31
  trust_remote_code=True, torch_dtype=torch.bfloat16).cuda().eval()
 
33
 
34
  @spaces.GPU()
35
  def process_pdf_and_query(pdf_file, user_query):
36
+ # Convert the PDF to images
37
+ images = convert_from_path(pdf_file.name) # pdf_file.name gives the file path
38
  num_images = len(images)
39
+
40
+ # Indexing the PDF in RAG
41
  RAG.index(
42
  input_path=pdf_file.name,
43
+ index_name="image_index", # index will be saved at index_root/index_name/
44
  store_collection_with_index=False,
45
  overwrite=True
46
  )
47
+
48
+ # Search the query in the RAG model
49
  results = RAG.search(user_query, k=1)
50
  if not results:
51
  return "No results found.", num_images
52
+
53
+ # Retrieve the page number and process image
54
  image_index = results[0]["page_num"] - 1
55
  messages = [
56
  {
 
64
  ],
65
  }
66
  ]
67
+
68
+ # Generate text with the Qwen model
69
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
70
  image_inputs, video_inputs = process_vision_info(messages)
71
  inputs = processor(
 
76
  return_tensors="pt",
77
  )
78
  inputs = inputs.to("cuda")
79
+
80
+ # Generate the output response
81
  generated_ids = model.generate(**inputs, max_new_tokens=50)
82
  generated_ids_trimmed = [
83
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
85
  output_text = processor.batch_decode(
86
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
87
  )
88
+
89
  return output_text[0], num_images
90
 
91
+
92
# Build the Gradio UI: a PDF upload and a query box feed process_pdf_and_query,
# whose two return values fill the answer box and the image-count box.
with gr.Blocks(theme='freddyaboulton/dracula_revamped') as demo:
    gr.HTML("<h1 style='text-align: center; font-size: 30px;'><a href='https://github.com/arad1367'>Multimodal RAG with Image Query - By Pejman Ebrahimi</a></h1>")
    gr.Markdown("Multimodal RAG is a technique that combines both textual and visual data to provide more accurate and comprehensive results. In this application, we use ColPali, a multimodal retriever, and Byaldi, a new library by answer.ai to easily use ColPali. We also use Qwen/Qwen2-VL-2B-Instruct LLM.")

    pdf_input = gr.File(label="Upload PDF")
    query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the PDF")
    output_text = gr.Textbox(label="Model Answer")
    output_images = gr.Textbox(label="Number of Images in PDF")

    # No `.style(full_width=True)` calls: the `.style()` method is deprecated in
    # late Gradio 3.x and removed in Gradio 4, where calling it raises
    # AttributeError.
    submit_btn = gr.Button("Submit", variant="primary")
    duplicate_btn = gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    submit_btn.click(fn=process_pdf_and_query, inputs=[pdf_input, query_input], outputs=[output_text, output_images])

    # Static footer with author and project links, rendered as raw HTML.
    footer = """
    <div style="text-align: center; margin-top: 20px;">
        <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
        <a href="https://github.com/arad1367" target="_blank">GitHub</a> |
        <a href="https://arad1367.pythonanywhere.com/" target="_blank">Live demo of my PhD defense</a> |
        <a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct" target="_blank">Qwen/Qwen2-VL-2B-Instruct</a> |
        <a href="https://github.com/AnswerDotAI/byaldi" target="_blank">Byaldi</a> |
        <a href="https://github.com/illuin-tech/colpali" target="_blank">ColPali</a>
        <br>
        Made with 💖 by Pejman Ebrahimi
    </div>
    """
    gr.HTML(footer)

demo.launch(debug=True)