Muzammil6376 committed
Commit 7f824f1 · verified · 1 Parent(s): cd322c7

Update app.py

Files changed (1)
  1. app.py +108 -183
app.py CHANGED
--- a/app.py (before)
@@ -1,252 +1,177 @@
 # app.py
 import os
 import tempfile
-from pathlib import Path
 import base64
-import fitz  # PyMuPDF
-from PIL import Image
 import io

 import gradio as gr
 from huggingface_hub import InferenceClient
-
-# Import vectorstore and embeddings from updated packages
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

-# ── Globals ───────────────────────────────────────────────────────────────────
 index = None
 retriever = None
-current_pdf_name = None
 extracted_content = None
-extracted_images = []

-# ── Single Multimodal Model ──────────────────────────────────────────────────
 multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

-# Create temp dirs
-temp_dir = tempfile.mkdtemp()
-figures_dir = os.path.join(temp_dir, "figures")
-os.makedirs(figures_dir, exist_ok=True)
 
 def encode_image_to_base64(image_path):
-    with open(image_path, "rb") as image_file:
-        return base64.b64encode(image_file.read()).decode('utf-8')

-def extract_images_from_pdf_pymupdf(pdf_path):
-    extracted_images = []
-    image_descriptions = []
     try:
-        pdf_document = fitz.open(pdf_path)
-        for page_num in range(len(pdf_document)):
-            page = pdf_document.load_page(page_num)
-            for img_index, img in enumerate(page.get_images()):
                 xref = img[0]
-                pix = fitz.Pixmap(pdf_document, xref)
                 if pix.n - pix.alpha < 4:
-                    img_data = pix.tobytes("png")
-                    img_pil = Image.open(io.BytesIO(img_data))
-                    image_filename = f"page_{page_num}_img_{img_index}.png"
-                    image_path = os.path.join(figures_dir, image_filename)
-                    img_pil.save(image_path)
-                    desc = analyze_image_with_multimodal_model(image_path)
-                    extracted_images.append(image_path)
-                    image_descriptions.append(desc)
                 pix = None
-        pdf_document.close()
-        return extracted_images, image_descriptions
     except Exception as e:
-        print(f"Error extracting images: {e}")
-        return [], []
 
-def analyze_image_with_multimodal_model(image_path):
     try:
         b64 = encode_image_to_base64(image_path)
         prompt = (
-            "Analyze this image and provide a detailed description. Include any text, data, "
-            "charts, diagrams, tables, or important visual elements you can see.\n"
-            "Image: [Image data provided]\nDescription:"
         )
-        resp = multimodal_client.text_generation(
             prompt=prompt, max_new_tokens=200, temperature=0.3
         )
-        return "[IMAGE CONTENT]: " + resp.strip()
     except Exception as e:
-        return f"[IMAGE CONTENT]: Could not analyze image - {e}"
 
-def process_pdf_multimodal(pdf_file):
-    global current_pdf_name, index, retriever, extracted_content, extracted_images
-    if pdf_file is None:
-        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

-    current_pdf_name = os.path.basename(pdf_file.name)
-    extracted_images.clear()
-    for f in os.listdir(figures_dir):
-        os.remove(os.path.join(figures_dir, f))

     try:
-        # Text extraction
-        pdf_document = fitz.open(pdf_file.name)
-        text_elements = []
-        for i in range(len(pdf_document)):
-            p = pdf_document.load_page(i)
-            t = p.get_text().strip()
-            if t:
-                text_elements.append(f"[PAGE {i+1}]\n{t}")
-        pdf_document.close()
-
-        # Image extraction & analysis
-        imgs, img_descs = extract_images_from_pdf_pymupdf(pdf_file.name)
-        extracted_images.extend(imgs)
-
-        # Combine content and split
-        all_content = text_elements + img_descs
         extracted_content = "\n\n".join(all_content)
         if not extracted_content:
-            return current_pdf_name, "❌ No content extracted.", gr.update(interactive=False)

         splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000, chunk_overlap=200, add_start_index=True
         )
         chunks = splitter.split_text(extracted_content)
-
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})

-        status = (
-            f"✅ Processed '{current_pdf_name}' — "
-            f"{len(chunks)} chunks "
-            f"({len(text_elements)} pages, {len(img_descs)} images analyzed)"
-        )
-        return current_pdf_name, status, gr.update(interactive=True)

     except Exception as e:
-        return current_pdf_name, f"❌ Error processing PDF: {e}", gr.update(interactive=False)
 
-def ask_multimodal_question(pdf_name, question):
     global retriever
     if not retriever:
-        return "❌ Please upload and process a PDF first."
     if not question.strip():
-        return "❌ Please enter a question."

     try:
         docs = retriever.invoke(question)
-        context = "\n\n".join(d.page_content for d in docs)
-        prompt = (
-            "You are an AI assistant analyzing a document that contains both text and visual elements.\n\n"
-            f"RETRIEVED CONTEXT:\n{context}\n\n"
-            f"QUESTION: {question}\n"
-            "Please provide a comprehensive answer based on the retrieved context above. "
-            "If you reference visual elements, mention them explicitly.\nANSWER:"
-        )
-        resp = multimodal_client.text_generation(
-            prompt=prompt, max_new_tokens=300, temperature=0.5
-        )
-        return resp.strip()
-    except Exception as e:
-        return f"❌ Error generating answer: {e}"
-
-def generate_multimodal_summary():
-    if not extracted_content:
-        return "❌ Please upload and process a PDF first."
     try:
-        preview = extracted_content[:4000]
-        messages = [
-            {"role": "user", "content": [{"type": "text", "text":
-                "Please provide a comprehensive summary of this document content. The content includes both textual "
-                f"information and descriptions of visual elements.\n\nDOCUMENT CONTENT:\n{preview}\n\nSUMMARY:"
-            }]}
-        ]
-        resp = multimodal_client.chat_completion(
-            messages=messages, max_tokens=250, temperature=0.3
         )
-        return resp["choices"][0]["message"]["content"].strip()
     except Exception as e:
-        return f"❌ Error generating summary: {e}"
 
-def extract_multimodal_keywords():
-    if not extracted_content:
-        return "❌ Please upload and process a PDF first."
-    try:
-        preview = extracted_content[:3000]
-        messages = [
-            {"role": "user", "content": [{"type": "text", "text":
-                "Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. "
-                f"DOCUMENT CONTENT:\n{preview}\n\nKEY TERMS:"
-            }]}
-        ]
-        resp = multimodal_client.chat_completion(
-            messages=messages, max_tokens=120, temperature=0.3
-        )
-        return resp["choices"][0]["message"]["content"].strip()
-    except Exception as e:
-        return f"❌ Error extracting keywords: {e}"
-
-def clear_multimodal_interface():
-    global index, retriever, current_pdf_name, extracted_content, extracted_images
-    for f in os.listdir(figures_dir):
-        try: os.remove(os.path.join(figures_dir, f))
-        except: pass
-    index = retriever = None
-    current_pdf_name = extracted_content = None
-    extracted_images.clear()
-    return None, "", gr.update(interactive=False)
-
-# ── Gradio UI ────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
-
-with gr.Blocks(theme=theme, css="""
-.container { border-radius: 10px; padding: 15px; }
-.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
-.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
-.main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
-.multimodal-badge { background: linear-gradient(45deg, #6366f1, #8b5cf6); color: white; padding: 5px 15px; border-radius: 20px; font-size: 14px; display: inline-block; margin: 10px auto; }
-.model-info { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px; margin: 10px 0; font-size: 12px; color: #64748b; }
-""") as demo:
-    gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
-    gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
-    gr.Markdown("""
-    <div class='model-info'>
-    <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
-    </div>
-    """)
-
     with gr.Row():
         with gr.Column():
-            gr.Markdown("## 📄 Document Input")
-            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
-            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
-            upload_button = gr.Button("🔄 Process with Multimodal AI", variant="primary")
-            status_box = gr.Textbox(label="Processing Status", interactive=False)
         with gr.Column():
-            gr.Markdown("## ❓ Ask Questions")
-            question_input = gr.Textbox(lines=3, placeholder="Ask about text or visual content...", interactive=False)
-            ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
-            answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

-    with gr.Row():
-        with gr.Column():
-            summary_button = gr.Button("📋 Generate Summary", variant="secondary")
-            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
-        with gr.Column():
-            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
-            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)
-
-    clear_button = gr.Button("🗑️ Clear All", variant="secondary")
-    gr.Markdown("""
-    <div class='footer'>
-    <strong>Unified Multimodal Pipeline:</strong> One model handles text, images, charts, tables, diagrams, and mixed content queries
-    </div>
-    """)
-
-    upload_button.click(process_pdf_multimodal, [pdf_file], [pdf_display, status_box, question_input])
-    ask_button.click(ask_multimodal_question, [pdf_display, question_input], answer_output)
-    summary_button.click(generate_multimodal_summary, [], summary_output)
-    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
-    clear_button.click(clear_multimodal_interface, [], [pdf_file, pdf_display, question_input])

 if __name__ == "__main__":
-    demo.launch(debug=True)
+++ b/app.py (after)

 # app.py
 import os
 import tempfile
 import base64
+from pathlib import Path
 import io

 import gradio as gr
 from huggingface_hub import InferenceClient

 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

+# ── Globals ───────────────────────────────────────────────────────────────
 index = None
 retriever = None
 extracted_content = None

+# ── Inference & Embeddings ─────────────────────────────────────────────────
 multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

+# Temporary dirs for image extraction
+TMP_DIR = tempfile.mkdtemp()
+FIGURES_DIR = os.path.join(TMP_DIR, "figures")
+os.makedirs(FIGURES_DIR, exist_ok=True)

+# ── Helpers ─────────────────────────────────────────────────────────────────
 def encode_image_to_base64(image_path):
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode()
+
+def extract_images_from_pdf(pdf_path):
+    import fitz  # PyMuPDF
+    from PIL import Image
+
+    extracted = []
+    descriptions = []
     try:
+        doc = fitz.open(pdf_path)
+        for p in range(len(doc)):
+            page = doc.load_page(p)
+            for img in page.get_images():
                 xref = img[0]
+                pix = fitz.Pixmap(doc, xref)
                 if pix.n - pix.alpha < 4:
+                    png = pix.tobytes("png")
+                    img_pil = Image.open(io.BytesIO(png))
+                    fname = f"page_{p}_img_{xref}.png"
+                    path = os.path.join(FIGURES_DIR, fname)
+                    img_pil.save(path)
+                    desc = analyze_image(path)
+                    extracted.append(path)
+                    descriptions.append(desc)
                 pix = None
+        doc.close()
     except Exception as e:
+        print(f"Image extraction error: {e}")
+    return extracted, descriptions
+
+def analyze_image(image_path):
     try:
         b64 = encode_image_to_base64(image_path)
         prompt = (
+            "Analyze this image and provide a detailed description. "
+            "Include any text, charts, tables, or important visual elements.\n"
+            "Image: [data]\nDescription:"
         )
+        raw = multimodal_client.text_generation(
             prompt=prompt, max_new_tokens=200, temperature=0.3
         )
+        # Handle dict or list wrapping
+        if isinstance(raw, dict):
+            out = raw.get("generated_text", str(raw))
+        elif isinstance(raw, list) and raw and isinstance(raw[0], dict):
+            out = raw[0].get("generated_text", str(raw))
+        else:
+            out = str(raw)
+        return f"[IMAGE]: {out.strip()}"
     except Exception as e:
+        return f"[IMAGE ERROR]: {e}"
+
+def process_pdf(pdf_file):
+    global index, retriever, extracted_content
+    if not pdf_file:
+        return None, "❌ Upload a PDF.", gr.update(interactive=False)

+    # clear images left over from any previous document
+    for f in os.listdir(FIGURES_DIR):
+        os.remove(os.path.join(FIGURES_DIR, f))

+    # Gradio may hand us a file object (with .name) or a plain path string
+    path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
+    name = os.path.basename(path)
     try:
+        import fitz  # PyMuPDF
+        doc = fitz.open(path)
+        pages = []
+        for i in range(len(doc)):
+            txt = doc.load_page(i).get_text().strip()
+            if txt:
+                pages.append(f"[Page {i+1}]\n" + txt)
+        doc.close()
+
+        imgs, descs = extract_images_from_pdf(path)
+        all_content = pages + descs
         extracted_content = "\n\n".join(all_content)
         if not extracted_content:
+            return name, "❌ No content extracted.", gr.update(interactive=False)

         splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000, chunk_overlap=200, add_start_index=True
         )
         chunks = splitter.split_text(extracted_content)
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})

+        msg = f"✅ Processed {name} — {len(chunks)} chunks."
+        return name, msg, gr.update(interactive=True)

     except Exception as e:
+        return name, f"❌ PDF error: {e}", gr.update(interactive=False)
+
+def ask_question(doc_name, question):
     global retriever
     if not retriever:
+        return "❌ Process a PDF first."
     if not question.strip():
+        return "❌ Enter a question."

+    # retrieve; fall back to the older LangChain API if invoke() is unavailable
     try:
         docs = retriever.invoke(question)
+    except Exception:
+        docs = retriever.get_relevant_documents(question)
+
+    context = "\n\n".join(d.page_content for d in docs)
+    prompt = (
+        "You are an AI assistant with both text and visual context.\n"
+        f"CONTEXT:\n{context}\nQUESTION: {question}\nAnswer:"
+    )

     try:
+        raw = multimodal_client.text_generation(
+            prompt=prompt, max_new_tokens=300, temperature=0.5
         )
+        if isinstance(raw, dict):
+            out = raw.get("generated_text", str(raw))
+        elif isinstance(raw, list) and raw and isinstance(raw[0], dict):
+            out = raw[0].get("generated_text", str(raw))
+        else:
+            out = str(raw)
+        return out.strip()
     except Exception as e:
+        return f"❌ Generation error: {e}"
 
+# ── Gradio UI ───────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
+with gr.Blocks(theme=theme) as demo:
+    gr.Markdown("## 🧠 Unified MultiModal RAG")
     with gr.Row():
         with gr.Column():
+            pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
+            proc_btn = gr.Button("🔄 Process PDF", variant="primary")
+            pdf_disp = gr.Textbox(label="Active Doc", interactive=False)
+            status = gr.Textbox(label="Status", interactive=False)
         with gr.Column():
+            q_in = gr.Textbox(label="Ask your question…", lines=3, interactive=False)
+            ask_btn = gr.Button("🔍 Ask", variant="primary", interactive=False)
+            ans_out = gr.Textbox(label="Answer", lines=6, interactive=False)

+    proc_btn.click(process_pdf, [pdf_in], [pdf_disp, status, q_in])
+    # enable the ask button once a document has been processed
+    proc_btn.click(lambda: gr.update(interactive=True), None, [ask_btn])
+    ask_btn.click(ask_question, [pdf_disp, q_in], ans_out)

 if __name__ == "__main__":
+    demo.launch(debug=True)
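
Review note: in `analyze_image` (before and after this commit), the image is base64-encoded but then sent through `text_generation`, a text-only endpoint, so the encoded bytes never reach Phi-3.5-vision; the model only sees the literal prompt string. Below is a minimal sketch of one way to pass the image itself, reusing the OpenAI-style `chat_completion` call that the removed summary helper already used. The data-URI message format is an assumption about the serving endpoint, not something this commit exercises, and `analyze_image_vision` is a hypothetical name.

import base64
from huggingface_hub import InferenceClient

client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")

def analyze_image_vision(image_path):
    # Hypothetical helper: embed the image as a data URI so the vision
    # model actually receives the pixels, not just a text prompt.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{b64}"}},
            {"type": "text",
             "text": "Describe this image, including any text, charts, or tables."},
        ],
    }]
    resp = client.chat_completion(messages=messages, max_tokens=200, temperature=0.3)
    return resp["choices"][0]["message"]["content"].strip()

If the endpoint rejects the image_url content form, the practical fallback is the text-only description flow, which is effectively what the committed code does.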