Muzammil6376 committed
Commit 583b178 · verified · 1 Parent(s): e735775

Update app.py

Files changed (1):
  1. app.py +85 -65
app.py CHANGED
@@ -22,11 +22,9 @@ from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
-
-# ── Globals ───────────────────────────────────────────────────────────────────
 retriever = None         # FAISS retriever for multimodal content
 current_pdf_name = None  # Name of the currently loaded PDF
-combined_texts = None    # Combined text + image captions corpus
+combined_texts: List[str] = []  # Combined text + image captions corpus
 
 # ── Setup: directories ─────────────────────────────────────────────────────────
 FIGURES_DIR = "figures"
@@ -34,76 +32,89 @@ if os.path.exists(FIGURES_DIR):
     shutil.rmtree(FIGURES_DIR)
 os.makedirs(FIGURES_DIR, exist_ok=True)
 
-# ── Models & Clients ───────────────────────────────────────────────────────────
-# Chat model (Mistral-7B-Instruct)
-chat_client = InferenceClient(model="google/gemma-3-27b-it")
-# Text embeddings (BAAI BGE)
-embeddings = HuggingFaceEmbeddings(model_name="google/gemma-3-27b-it")
-# Image captioning (BLIP)
+# ── Clients & Models ───────────────────────────────────────────────────────────
+hf = InferenceClient()  # will use HUGGINGFACEHUB_API_TOKEN from env
+
+# BLIP captioner (small local model download)
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
 
 def generate_caption(image_path: str) -> str:
-    """
-    Generates a natural-language caption for an image using BLIP.
-    """
-    image = Image.open(image_path).convert('RGB')
+    """Ask BLIP to caption a local image."""
+    image = Image.open(image_path).convert("RGB")
     inputs = blip_processor(image, return_tensors="pt")
     out = blip_model.generate(**inputs)
-    caption = blip_processor.decode(out[0], skip_special_tokens=True)
-    return caption
+    return blip_processor.decode(out[0], skip_special_tokens=True)
+
+
+def embed_texts(texts: List[str]) -> List[List[float]]:
+    """
+    Call the HF embeddings endpoint.
+    Uses `google/Gemma-Embeddings-v1.0` (or any other hosted embeddings model).
+    """
+    resp = hf.embeddings(
+        model="google/Gemma-Embeddings-v1.0",
+        inputs=texts,
+    )
+    return resp["embeddings"]
 
 
 def process_pdf(pdf_file) -> str:
     """
-    Parses the uploaded PDF into text chunks and image captions,
-    builds a FAISS index, and prepares the retriever.
-    Returns status message.
+    Parse the PDF, caption its images, combine text+captions, embed remotely,
+    build FAISS index, and prepare retriever.
     """
     global current_pdf_name, retriever, combined_texts
+
     if pdf_file is None:
         return "❌ Please upload a PDF file."
 
-    # Save PDF locally for unstructured
+    # Save and name
     pdf_path = pdf_file.name
     current_pdf_name = os.path.basename(pdf_path)
 
-    # Extract text, table, and image blocks
+    # Extract blocks
     elements = partition_pdf(
         filename=pdf_path,
         strategy=PartitionStrategy.HI_RES,
         extract_image_block_types=["Image", "Table"],
-        extract_image_block_output_dir=FIGURES_DIR
+        extract_image_block_output_dir=FIGURES_DIR,
     )
 
-    # Separate text and image elements
-    text_elements = [el.text for el in elements if el.category not in ["Image", "Table"] and el.text]
-    image_files = [os.path.join(FIGURES_DIR, f)
-                   for f in os.listdir(FIGURES_DIR)
-                   if f.lower().endswith((".png", ".jpg", ".jpeg"))]
-
-    # Generate captions for each image
-    captions = []
-    for img in image_files:
-        cap = generate_caption(img)
-        captions.append(cap)
-
-    # Combine all pieces for indexing
+    # Split text vs. images
+    text_elements = [
+        el.text for el in elements
+        if el.category not in ["Image", "Table"] and el.text
+    ]
+    image_files = [
+        os.path.join(FIGURES_DIR, f)
+        for f in os.listdir(FIGURES_DIR)
+        if f.lower().endswith((".png", ".jpg", ".jpeg"))
+    ]
+
+    # Caption images
+    captions = [generate_caption(img) for img in image_files]
+
+    # Combine
     combined_texts = text_elements + captions
 
-    # Create FAISS index and retriever
-    index = FAISS.from_texts(combined_texts, embeddings)
+    # Remote embeddings
+    vectors = embed_texts(combined_texts)
+
+    # Build FAISS
+    index = FAISS.from_embeddings(
+        texts=combined_texts,
+        embeddings=vectors,
+    )
     retriever = index.as_retriever(search_kwargs={"k": 2})
 
-    status = f"✅ Indexed '{current_pdf_name}' — {len(text_elements)} text blocks + {len(captions)} image captions"
-    return status
+    return f"✅ Indexed '{current_pdf_name}' — " \
+           f"{len(text_elements)} text blocks + {len(captions)} image captions"
 
 
 def ask_question(question: str) -> str:
-    """
-    Retrieves relevant chunks from the FAISS index and generates an answer via chat model.
-    """
+    """Retrieve top-k chunks from FAISS and call chat_completions endpoint."""
     global retriever
     if retriever is None:
         return "❌ Please upload and process a PDF first."
@@ -119,56 +130,65 @@ def ask_question(question: str) -> str:
         f"Question: {question}\n"
         "Answer:"
     )
-
-    response = chat_client.chat_completion(
+    response = hf.chat_completion(
+        model="google/gemma-3-27b-it",
         messages=[{"role": "user", "content": prompt}],
         max_tokens=128,
-        temperature=0.5
+        temperature=0.5,
     )
-    answer = response["choices"][0]["message"]["content"].strip()
-    return answer
+    return response["choices"][0]["message"]["content"].strip()
 
 
 def clear_interface():
-    """Resets global state and clears the figures directory."""
+    """Reset state and clear extracted images."""
    global retriever, current_pdf_name, combined_texts
    retriever = None
    current_pdf_name = None
-    combined_texts = None
-    shutil.rmtree(FIGURES_DIR)
+    combined_texts = []
+    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
    os.makedirs(FIGURES_DIR, exist_ok=True)
    return ""
 
+
 # ── Gradio UI ────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
 with gr.Blocks(theme=theme, css="""
     .container { border-radius: 10px; padding: 15px; }
-    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
-    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
-    .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
+    .pdf-active { border-left: 3px solid #6366f1;
+                  padding-left: 10px;
+                  background-color: rgba(99,102,241,0.1); }
+    .footer { text-align: center; margin-top: 30px;
+              font-size: 0.8em; color: #666; }
+    .main-title { text-align: center; font-size: 64px;
+                  font-weight: bold; margin-bottom: 20px; }
 """) as demo:
-    gr.Markdown("<div class='main-title'>DocQueryAI (Multimodal)</div>")
+    gr.Markdown("<div class='main-title'>DocQueryAI (Remote‐RAG)</div>")
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("## 📄 Document Input")
-            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
-            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
+            pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
             process_btn = gr.Button("📤 Process Document", variant="primary")
-            status_box = gr.Textbox(label="Status", interactive=False)
+            status_box = gr.Textbox(label="Status", interactive=False)
 
         with gr.Column():
             gr.Markdown("## ❓ Ask Questions")
-            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
-            ask_btn = gr.Button("🔍 Ask Question", variant="primary")
-            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
+            question_input = gr.Textbox(lines=3,
+                                        placeholder="Enter your question here…")
+            ask_btn = gr.Button("🔍 Ask Question", variant="primary")
+            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
 
     clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS + BLIP | Gradio</div>")
+    gr.Markdown("<div class='footer'>Powered by HF Inference + BLIP + FAISS | Gradio</div>")
 
-    process_btn.click(fn=process_pdf, inputs=[pdf_file], outputs=[status_box])
-    ask_btn.click(fn=ask_question, inputs=[question_input], outputs=[answer_output])
-    clear_btn.click(fn=clear_interface, outputs=[status_box, answer_output])
+    process_btn.click(fn=process_pdf,
+                      inputs=[pdf_file],
+                      outputs=[status_box])
+    ask_btn.click(fn=ask_question,
+                  inputs=[question_input],
+                  outputs=[answer_output])
+    clear_btn.click(fn=clear_interface,
+                    outputs=[status_box, answer_output])
 
 if __name__ == "__main__":
-    demo.launch(debug=True, share=True)
+    demo.launch(debug=True, share=True)
 
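A note on the new remote-embedding path: at the time of writing, huggingface_hub's InferenceClient exposes `feature_extraction` rather than an `embeddings` method, and LangChain's `FAISS.from_embeddings` takes `text_embeddings` as (text, vector) pairs plus an `embedding` object that it reuses to embed queries at retrieval time. The new type hints also assume `from typing import List` exists above the visible hunk. Below is a minimal sketch of wiring that would satisfy both APIs; the `RemoteEmbeddings` adapter is hypothetical, and the hosted availability of `google/Gemma-Embeddings-v1.0` on the Inference API is an assumption, not something this commit verifies.

    from typing import List

    from huggingface_hub import InferenceClient
    from langchain_community.vectorstores import FAISS
    from langchain_core.embeddings import Embeddings


    class RemoteEmbeddings(Embeddings):
        """Hypothetical adapter: embed documents/queries via the HF Inference API."""

        def __init__(self, client: InferenceClient,
                     model: str = "google/Gemma-Embeddings-v1.0"):
            self.client = client
            self.model = model

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            # feature_extraction takes one text at a time and returns a numpy
            # array; assumes the model pools to a single vector per input.
            return [
                self.client.feature_extraction(t, model=self.model).tolist()
                for t in texts
            ]

        def embed_query(self, text: str) -> List[float]:
            return self.embed_documents([text])[0]


    hf = InferenceClient()  # reads the HF token from the environment
    embedder = RemoteEmbeddings(hf)

    combined_texts = ["a text block", "an image caption"]
    vectors = embedder.embed_documents(combined_texts)

    # from_embeddings wants (text, vector) pairs and keeps `embedder` around
    # so the retriever can embed incoming questions the same way.
    index = FAISS.from_embeddings(
        text_embeddings=list(zip(combined_texts, vectors)),
        embedding=embedder,
    )
    retriever = index.as_retriever(search_kwargs={"k": 2})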
22
 
23
 
24
 
 
 
25
  retriever = None # FAISS retriever for multimodal content
26
  current_pdf_name = None # Name of the currently loaded PDF
27
+ combined_texts: List[str] = [] # Combined text + image captions corpus
28
 
29
  # ── Setup: directories ─────────────────────────────────────────────────────────
30
  FIGURES_DIR = "figures"
 
32
  shutil.rmtree(FIGURES_DIR)
33
  os.makedirs(FIGURES_DIR, exist_ok=True)
34
 
35
+ # ── Clients & Models ───────────────────────────────────────────────────────────
36
+ hf = InferenceClient() # will use HUGGINGFACEHUB_API_TOKEN from env
37
+
38
+ # BLIP captioner (small local model download)
 
 
39
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
40
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
41
 
42
 
43
  def generate_caption(image_path: str) -> str:
44
+ """Ask BLIP to caption a local image."""
45
+ image = Image.open(image_path).convert("RGB")
 
 
46
  inputs = blip_processor(image, return_tensors="pt")
47
  out = blip_model.generate(**inputs)
48
+ return blip_processor.decode(out[0], skip_special_tokens=True)
49
+
50
+
51
+ def embed_texts(texts: List[str]) -> List[List[float]]:
52
+ """
53
+ Call the HF embeddings endpoint.
54
+ Uses `google/Gemma-Embeddings-v1.0` (or any other hosted embeddings model).
55
+ """
56
+ resp = hf.embeddings(
57
+ model="google/Gemma-Embeddings-v1.0",
58
+ inputs=texts,
59
+ )
60
+ return resp["embeddings"]
61
 
62
 
63
  def process_pdf(pdf_file) -> str:
64
  """
65
+ Parse the PDF, caption its images, combine text+captions, embed remotely,
66
+ build FAISS index, and prepare retriever.
 
67
  """
68
  global current_pdf_name, retriever, combined_texts
69
+
70
  if pdf_file is None:
71
  return "❌ Please upload a PDF file."
72
 
73
+ # Save and name
74
  pdf_path = pdf_file.name
75
  current_pdf_name = os.path.basename(pdf_path)
76
 
77
+ # Extract blocks
78
  elements = partition_pdf(
79
  filename=pdf_path,
80
  strategy=PartitionStrategy.HI_RES,
81
  extract_image_block_types=["Image", "Table"],
82
+ extract_image_block_output_dir=FIGURES_DIR,
83
  )
84
 
85
+ # Split text vs. images
86
+ text_elements = [
87
+ el.text for el in elements
88
+ if el.category not in ["Image", "Table"] and el.text
89
+ ]
90
+ image_files = [
91
+ os.path.join(FIGURES_DIR, f)
92
+ for f in os.listdir(FIGURES_DIR)
93
+ if f.lower().endswith((".png", ".jpg", ".jpeg"))
94
+ ]
95
+
96
+ # Caption images
97
+ captions = [generate_caption(img) for img in image_files]
98
+
99
+ # Combine
100
  combined_texts = text_elements + captions
101
 
102
+ # Remote embeddings
103
+ vectors = embed_texts(combined_texts)
104
+
105
+ # Build FAISS
106
+ index = FAISS.from_embeddings(
107
+ texts=combined_texts,
108
+ embeddings=vectors,
109
+ )
110
  retriever = index.as_retriever(search_kwargs={"k": 2})
111
 
112
+ return f"βœ… Indexed '{current_pdf_name}' β€” " \
113
+ f"{len(text_elements)} text blocks + {len(captions)} image captions"
114
 
115
 
116
  def ask_question(question: str) -> str:
117
+ """Retrieve top-k chunks from FAISS and call chat_completions endpoint."""
 
 
118
  global retriever
119
  if retriever is None:
120
  return "❌ Please upload and process a PDF first."
 
130
  f"Question: {question}\n"
131
  "Answer:"
132
  )
133
+ response = hf.chat_completion(
134
+ model="google/gemma-3-27b-it",
135
  messages=[{"role": "user", "content": prompt}],
136
  max_tokens=128,
137
+ temperature=0.5,
138
  )
139
+ return response["choices"][0]["message"]["content"].strip()
 
140
 
141
 
142
  def clear_interface():
143
+ """Reset state and clear extracted images."""
144
  global retriever, current_pdf_name, combined_texts
145
  retriever = None
146
  current_pdf_name = None
147
+ combined_texts = []
148
+ shutil.rmtree(FIGURES_DIR, ignore_errors=True)
149
  os.makedirs(FIGURES_DIR, exist_ok=True)
150
  return ""
151
 
152
+
153
  # ── Gradio UI ────────────────────────────────────────────────────────────────
154
  theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
155
  with gr.Blocks(theme=theme, css="""
156
  .container { border-radius: 10px; padding: 15px; }
157
+ .pdf-active { border-left: 3px solid #6366f1;
158
+ padding-left: 10px;
159
+ background-color: rgba(99,102,241,0.1); }
160
+ .footer { text-align: center; margin-top: 30px;
161
+ font-size: 0.8em; color: #666; }
162
+ .main-title { text-align: center; font-size: 64px;
163
+ font-weight: bold; margin-bottom: 20px; }
164
  """) as demo:
165
+ gr.Markdown("<div class='main-title'>DocQueryAI (Remote‐RAG)</div>")
166
 
167
  with gr.Row():
168
  with gr.Column():
169
  gr.Markdown("## πŸ“„ Document Input")
170
+ pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
 
171
  process_btn = gr.Button("πŸ“€ Process Document", variant="primary")
172
+ status_box = gr.Textbox(label="Status", interactive=False)
173
 
174
  with gr.Column():
175
  gr.Markdown("## ❓ Ask Questions")
176
+ question_input = gr.Textbox(lines=3,
177
+ placeholder="Enter your question here…")
178
+ ask_btn = gr.Button("πŸ” Ask Question", variant="primary")
179
+ answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
180
 
181
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
182
+ gr.Markdown("<div class='footer'>Powered by HF Inference + BLIP + FAISS | Gradio</div>")
183
 
184
+ process_btn.click(fn=process_pdf,
185
+ inputs=[pdf_file],
186
+ outputs=[status_box])
187
+ ask_btn.click(fn=ask_question,
188
+ inputs=[question_input],
189
+ outputs=[answer_output])
190
+ clear_btn.click(fn=clear_interface,
191
+ outputs=[status_box, answer_output])
192
 
193
  if __name__ == "__main__":
194
+ demo.launch(debug=True, share=True)
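On the answer path: `chat_completion` is a current InferenceClient method, and the dict-style indexing used in the commit works because huggingface_hub's output types behave like dicts; attribute access is the documented form. A short usage sketch under that assumption, reusing the same Gemma chat model:

    from huggingface_hub import InferenceClient

    hf = InferenceClient()  # reads the HF token from the environment

    response = hf.chat_completion(
        model="google/gemma-3-27b-it",
        messages=[{"role": "user", "content": "What does FAISS index?"}],
        max_tokens=128,
        temperature=0.5,
    )
    # Equivalent to response["choices"][0]["message"]["content"]
    print(response.choices[0].message.content.strip())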