Muzammil6376 committed on
Commit
2a4ba68
·
verified ·
1 Parent(s): 7133a05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -52
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import os
2
  import shutil
3
- import PyPDF2
 
4
  import gradio as gr
5
  from PIL import Image
6
- from typing import List
7
  # Unstructured for rich PDF parsing
8
  from unstructured.partition.pdf import partition_pdf
9
  from unstructured.partition.utils.constants import PartitionStrategy
@@ -14,18 +15,14 @@ from transformers import BlipProcessor, BlipForConditionalGeneration
14
  # Hugging Face Inference client
15
  from huggingface_hub import InferenceClient
16
 
17
- # LangChain vectorstore and embeddings
18
- from langchain_community.vectorstores import FAISS
19
- from langchain_huggingface import HuggingFaceEmbeddings
20
-
21
-
22
-
23
-
24
 
25
  # ── Globals ───────────────────────────────────────────────────────────────────
26
  retriever = None # FAISS retriever for multimodal content
27
  current_pdf_name = None # Name of the currently loaded PDF
28
  combined_texts: List[str] = [] # Combined text + image captions corpus
 
29
 
30
  # ── Setup: directories ─────────────────────────────────────────────────────────
31
  FIGURES_DIR = "figures"
@@ -38,7 +35,7 @@ hf = InferenceClient() # uses HUGGINGFACEHUB_API_TOKEN env var
38
 
39
  # BLIP captioner
40
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
41
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
42
 
43
 
44
  def generate_caption(image_path: str) -> str:
@@ -55,60 +52,68 @@ def embed_texts(texts: List[str]) -> List[List[float]]:
55
  return resp["embeddings"]
56
 
57
 
58
- def process_pdf(pdf_file) -> str:
59
  """
60
- Parse PDF, extract text and images, caption images,
61
- embed all chunks remotely, build FAISS index.
 
62
  """
63
- global retriever, current_pdf_name, combined_texts
64
 
65
  if pdf_file is None:
66
- return "❌ Please upload a PDF file."
67
 
68
- pdf_path = pdf_file.name
69
- current_pdf_name = os.path.basename(pdf_path)
 
 
 
 
70
 
71
- # Attempt rich parsing
72
  try:
73
- from pdf2image.exceptions import PDFInfoNotInstalledError
74
  elements = partition_pdf(
75
- filename=pdf_path,
76
  strategy=PartitionStrategy.HI_RES,
77
- extract_image_block_types=["Image","Table"],
78
  extract_image_block_output_dir=FIGURES_DIR,
79
  )
80
  text_elements = [el.text for el in elements if el.category not in ["Image","Table"] and el.text]
81
- image_files = [os.path.join(FIGURES_DIR, f) for f in os.listdir(FIGURES_DIR)
82
- if f.lower().endswith((".png",".jpg",".jpeg"))]
83
  except Exception:
84
- # Fallback to text-only
85
- from pypdf import PdfReader
86
- reader = PdfReader(pdf_path)
87
- text_elements = [page.extract_text() or "" for page in reader.pages]
88
  image_files = []
89
 
90
  captions = [generate_caption(img) for img in image_files]
91
- combined_texts = text_elements + captions
 
 
 
 
 
 
92
 
93
  vectors = embed_texts(combined_texts)
94
  index = FAISS.from_embeddings(texts=combined_texts, embeddings=vectors)
95
  retriever = index.as_retriever(search_kwargs={"k":2})
96
 
97
- return f"βœ… Indexed '{current_pdf_name}' β€” {len(text_elements)} text blocks + {len(captions)} image captions"
 
98
 
99
 
100
- def ask_question(question: str) -> str:
101
- """Retrieve from FAISS and call chat completion."""
102
  global retriever
103
  if retriever is None:
104
- return "❌ Please process a PDF first."
105
  if not question.strip():
106
  return "❌ Please enter a question."
107
 
108
  docs = retriever.get_relevant_documents(question)
109
  context = "\n\n".join(doc.page_content for doc in docs)
110
  prompt = (
111
- "Use the following excerpts to answer the question:\n\n"
112
  f"{context}\n\nQuestion: {question}\nAnswer:"
113
  )
114
  response = hf.chat_completion(
@@ -120,34 +125,81 @@ def ask_question(question: str) -> str:
120
  return response["choices"][0]["message"]["content"].strip()
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def clear_interface():
124
- """Reset all state and clear extracted images."""
125
- global retriever, current_pdf_name, combined_texts
126
  retriever = None
127
  current_pdf_name = None
128
  combined_texts = []
 
129
  shutil.rmtree(FIGURES_DIR, ignore_errors=True)
130
  os.makedirs(FIGURES_DIR, exist_ok=True)
131
- return ""
132
-
133
- # Gradio UI
134
- with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")) as demo:
135
- gr.Markdown("# DocQueryAI (Remote‐RAG)")
 
 
 
 
 
 
136
  with gr.Row():
137
  with gr.Column():
138
- pdf_file = gr.File(file_types=[".pdf"], type="filepath")
139
- process_btn = gr.Button("Process PDF")
140
- status_box = gr.Textbox(interactive=False)
 
 
141
  with gr.Column():
142
- question_input = gr.Textbox(lines=3)
143
- ask_btn = gr.Button("Ask")
144
- answer_output = gr.Textbox(interactive=False)
145
- clear_btn = gr.Button("Clear All")
146
-
147
- process_btn.click(fn=process_pdf, inputs=[pdf_file], outputs=[status_box])
148
- ask_btn.click(fn=ask_question, inputs=[question_input], outputs=[answer_output])
149
- clear_btn.click(fn=clear_interface, outputs=[status_box, answer_output])
 
 
 
 
 
 
 
 
 
150
 
151
  if __name__ == "__main__":
152
- demo.launch()
 
153
 
 
1
  import os
2
  import shutil
3
+ from typing import List
4
+
5
  import gradio as gr
6
  from PIL import Image
7
+
8
  # Unstructured for rich PDF parsing
9
  from unstructured.partition.pdf import partition_pdf
10
  from unstructured.partition.utils.constants import PartitionStrategy
 
15
  # Hugging Face Inference client
16
  from huggingface_hub import InferenceClient
17
 
18
+ # FAISS vectorstore
19
+ from langchain.vectorstores.faiss import FAISS
 
 
 
 
 
20
 
21
  # ── Globals ───────────────────────────────────────────────────────────────────
22
  retriever = None # FAISS retriever for multimodal content
23
  current_pdf_name = None # Name of the currently loaded PDF
24
  combined_texts: List[str] = [] # Combined text + image captions corpus
25
+ pdf_text: str = "" # Full PDF text for summary/keywords
26
 
27
  # ── Setup: directories ─────────────────────────────────────────────────────────
28
  FIGURES_DIR = "figures"
 
35
 
36
  # BLIP captioner
37
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
38
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
39
 
40
 
41
  def generate_caption(image_path: str) -> str:
 
52
  return resp["embeddings"]
53
 
54
 
55
def process_pdf(pdf_file):
    """
    Read & extract text and images from the uploaded PDF, caption images,
    split & embed chunks, build the FAISS index, and store the full text.

    Returns:
        (filename, status message, gr.update enabling the question box).
    """
    global retriever, current_pdf_name, combined_texts, pdf_text

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)

    # BUG FIX: wipe leftover figures from any previously processed PDF.
    # Without this, os.listdir(FIGURES_DIR) below picks up images extracted
    # from earlier documents, which then get captioned and indexed into the
    # corpus of the current one.
    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
    os.makedirs(FIGURES_DIR, exist_ok=True)

    # Extract the full text (consumed by generate_summary / extract_keywords).
    from pypdf import PdfReader
    reader = PdfReader(pdf_file.name)
    pages = [page.extract_text() or "" for page in reader.pages]
    pdf_text = "\n\n".join(pages)

    # Rich parse with unstructured to pull out embedded images/tables.
    try:
        elements = partition_pdf(
            filename=pdf_file.name,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=FIGURES_DIR,
        )
        text_elements = [el.text for el in elements if el.category not in ["Image", "Table"] and el.text]
        image_files = [os.path.join(FIGURES_DIR, f) for f in os.listdir(FIGURES_DIR)
                       if f.lower().endswith((".png", ".jpg", ".jpeg"))]
    except Exception:
        # Fallback: text-only corpus from pypdf when HI_RES parsing fails
        # (e.g. poppler/tesseract not installed on the host).
        text_elements = pages
        image_files = []

    captions = [generate_caption(img) for img in image_files]

    # Split text into overlapping chunks for retrieval.
    from langchain.text_splitter import CharacterTextSplitter
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = []
    for t in text_elements:
        chunks.extend(splitter.split_text(t))
    combined_texts = chunks + captions

    vectors = embed_texts(combined_texts)
    # NOTE(review): LangChain's FAISS.from_embeddings normally expects
    # text_embeddings=[(text, vector), ...] plus an `embedding` callable used
    # to encode queries at retrieval time — confirm this keyword form against
    # the installed langchain version.
    index = FAISS.from_embeddings(texts=combined_texts, embeddings=vectors)
    retriever = index.as_retriever(search_kwargs={"k": 2})

    status = f"βœ… Indexed '{current_pdf_name}' β€” {len(chunks)} text chunks + {len(captions)} image captions"
    return current_pdf_name, status, gr.update(interactive=True)
103
 
104
 
105
+ def ask_question(pdf_name, question):
106
+ """Retrieve relevant chunks and generate answer via remote LLM."""
107
  global retriever
108
  if retriever is None:
109
+ return "❌ Please upload and index a PDF first."
110
  if not question.strip():
111
  return "❌ Please enter a question."
112
 
113
  docs = retriever.get_relevant_documents(question)
114
  context = "\n\n".join(doc.page_content for doc in docs)
115
  prompt = (
116
+ "Use the following document excerpts to answer the question.\n\n"
117
  f"{context}\n\nQuestion: {question}\nAnswer:"
118
  )
119
  response = hf.chat_completion(
 
125
  return response["choices"][0]["message"]["content"].strip()
126
 
127
 
128
def generate_summary():
    """Produce a concise summary of the loaded PDF via the remote LLM."""
    if not pdf_text:
        return "❌ Please upload and index a PDF first."
    # Only the first 2000 characters are sent, to keep the prompt small.
    excerpt = pdf_text[:2000]
    user_message = {"role": "user", "content": f"Summarize concisely:\n\n{excerpt}..."}
    result = hf.chat_completion(
        model="google/gemma-3-27b-it",
        messages=[user_message],
        max_tokens=150,
        temperature=0.5,
    )
    first_choice = result["choices"][0]
    return first_choice["message"]["content"].strip()
140
+
141
+
142
def extract_keywords():
    """Pull a list of key terms out of the loaded PDF via the remote LLM."""
    if not pdf_text:
        return "❌ Please upload and index a PDF first."
    # Truncate to the first 2000 characters to bound prompt size.
    excerpt = pdf_text[:2000]
    user_message = {"role": "user", "content": f"Extract 10-15 key terms:\n\n{excerpt}..."}
    result = hf.chat_completion(
        model="google/gemma-3-27b-it",
        messages=[user_message],
        max_tokens=60,
        temperature=0.5,
    )
    first_choice = result["choices"][0]
    return first_choice["message"]["content"].strip()
154
+
155
+
156
def clear_interface():
    """Reset all global state and wipe any extracted figure images."""
    global retriever, current_pdf_name, combined_texts, pdf_text
    # Drop every piece of in-memory state.
    retriever, current_pdf_name = None, None
    combined_texts, pdf_text = [], ""
    # Recreate an empty figures directory.
    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
    os.makedirs(FIGURES_DIR, exist_ok=True)
    # Clear the filename display and status box; disable the question input.
    return None, "", gr.update(interactive=False)
166
+
167
# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
.main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
""") as demo:
    gr.Markdown("<div class='main-title'>DocQueryAI (Multimodal RAG)</div>")
    with gr.Row():
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
            upload_button = gr.Button("πŸ“€ Process Document", variant="primary")
            status_box = gr.Textbox(label="Status", interactive=False)
        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            # question_input starts disabled and is enabled by process_pdf's
            # third return value (a gr.update).
            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…", interactive=False)
            # BUG FIX: ask/summary/keywords buttons were created with
            # interactive=False but no handler output ever re-enabled them
            # (process_pdf only targets question_input), leaving them
            # permanently dead. They now start enabled; each callback already
            # rejects calls made before a PDF has been processed.
            ask_button = gr.Button("πŸ” Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
            with gr.Row():
                summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
                summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
                keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
                keywords_output = gr.Textbox(label="Keywords", lines=4, interactive=False)
            clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
    gr.Markdown("<div class='footer'>Powered by HF Inference + FAISS + BLIP | Gradio</div>")

    # Event wiring: each click maps a handler's returns onto output components.
    upload_button.click(process_pdf, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary, [], summary_output)
    keywords_button.click(extract_keywords, [], keywords_output)
    clear_button.click(clear_interface, [], [pdf_display, status_box, question_input])

if __name__ == "__main__":
    demo.launch(debug=True)
204
+
205