Muzammil6376 committed
Commit 919ab87 · verified · 1 Parent(s): 71a558b

Update app.py

Files changed (1): app.py +183 -92
app.py CHANGED
@@ -2,13 +2,14 @@ import os
 import gradio as gr
 import tempfile
 from pathlib import Path
+import base64
 
 # Import vectorstore and embeddings from langchain community package
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 # Text splitter to break large documents into manageable chunks
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-# HF Inference client for running multimodal models
+# HF Inference client for multimodal model
 from huggingface_hub import InferenceClient
 # Unstructured for PDF processing with image extraction
 from unstructured.partition.pdf import partition_pdf
@@ -19,54 +20,75 @@ index = None  # FAISS index storing document embeddings
 retriever = None  # Retriever to fetch relevant chunks
 current_pdf_name = None  # Name of the currently loaded PDF
 extracted_content = None  # Combined text and image descriptions
+extracted_images = []  # Store image paths for multimodal queries
 
-# ── HF Inference clients ─────────────────────────────────────────────────────
-# Text generation client (using a good open model)
-text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
-# Vision client for image analysis
-vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
+# ── Single Multimodal Model ──────────────────────────────────────────────────
+# Using a single multimodal model that can handle both text and images
+multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 
-# ── Embeddings ───────────────────────────────────────────────────────────────
-# Use BGE embeddings for vectorizing text chunks
-embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+# ── Multimodal Embeddings ────────────────────────────────────────────────────
+# Using CLIP-based embeddings that can handle both text and images
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
 
 # Create temporary directories for processing
 temp_dir = tempfile.mkdtemp()
 figures_dir = os.path.join(temp_dir, "figures")
 os.makedirs(figures_dir, exist_ok=True)
 
-def extract_image_description(image_path):
+def encode_image_to_base64(image_path):
+    """Convert image to base64 for API calls"""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def analyze_image_with_multimodal_model(image_path):
     """
-    Analyze an extracted image using vision model to get text description.
+    Analyze an extracted image using the multimodal model.
     Args:
         image_path: Path to the extracted image file
     Returns:
         Text description of the image content
     """
     try:
-        # Read image and send to vision model
-        with open(image_path, "rb") as img_file:
-            # Use vision client to analyze the image
-            response = vision_client.text_to_image_generation(
-                prompt="Describe what you see in this image in detail, including any text, charts, diagrams, or important visual elements.",
-                image=img_file.read()
-            )
-            return f"Image content: {response}"
+        # Encode image to base64
+        image_base64 = encode_image_to_base64(image_path)
+
+        # Create multimodal prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_base64}"
+                        }
+                    }
+                ]
+            }
+        ]
+
+        # Use multimodal model for image analysis
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=200,
+            temperature=0.3
+        )
+
+        description = response["choices"][0]["message"]["content"].strip()
+        return f"[IMAGE CONTENT]: {description}"
+
     except Exception as e:
-        return f"Image content: [Could not analyze image - {str(e)}]"
+        return f"[IMAGE CONTENT]: Could not analyze image - {str(e)}"
 
 def process_pdf_multimodal(pdf_file):
     """
-    1. Extracts text and images from PDF using unstructured
-    2. Analyzes extracted images with vision model
-    3. Combines text and image descriptions
-    4. Creates FAISS index for retrieval
-    Args:
-        pdf_file: Uploaded PDF file
-    Returns:
-        - PDF filename, status message, and UI updates
+    Process PDF with single multimodal model for both text and images.
     """
-    global current_pdf_name, index, retriever, extracted_content
+    global current_pdf_name, index, retriever, extracted_content, extracted_images
 
     if pdf_file is None:
         return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
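
Note on the new image-analysis path: the OpenAI-style `image_url` content block with a base64 data URI is how local images are passed to `InferenceClient.chat_completion`. A minimal standalone sketch of the same call, assuming an `HF_TOKEN` in the environment, serverless availability of this model, and a hypothetical local file `figure.png`:

```python
import base64
from huggingface_hub import InferenceClient

client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")

# Read a local image and wrap it in a data URI, as the commit does.
with open("figure.png", "rb") as f:  # hypothetical test image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat_completion(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }],
    max_tokens=100,
)
print(response.choices[0].message.content)
```

One small wrinkle in the committed code: the data URI hard-codes `image/jpeg` even though the loop below also accepts `.png` files. Providers generally sniff the bytes rather than trust the declared MIME type, but matching it (as in the sketch) is the safer habit.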
@@ -74,7 +96,8 @@ def process_pdf_multimodal(pdf_file):
     current_pdf_name = os.path.basename(pdf_file.name)
 
     try:
-        # Clear previous figures
+        # Clear previous data
+        extracted_images.clear()
         for file in os.listdir(figures_dir):
             os.remove(os.path.join(figures_dir, file))
 
@@ -91,22 +114,27 @@
         text_elements = []
         for element in elements:
             if element.category not in ["Image", "Table"]:
-                text_elements.append(element.text)
+                if element.text.strip():  # Only add non-empty text
+                    text_elements.append(element.text.strip())
 
-        # Process extracted images
+        # Process extracted images with multimodal model
         image_descriptions = []
         if os.path.exists(figures_dir):
             for image_file in os.listdir(figures_dir):
                 if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                     image_path = os.path.join(figures_dir, image_file)
-                    description = extract_image_description(image_path)
+                    extracted_images.append(image_path)  # Store for later use
+                    description = analyze_image_with_multimodal_model(image_path)
                     image_descriptions.append(description)
 
-        # Combine text and image descriptions
+        # Combine all content
         all_content = text_elements + image_descriptions
         extracted_content = "\n\n".join(all_content)
 
-        # Split into chunks
+        if not extracted_content.strip():
+            return current_pdf_name, "❌ No content could be extracted from the PDF.", gr.update(interactive=False)
+
+        # Split into chunks for embedding
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=200,
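
The `partition_pdf` call that populates `elements` and writes figures into `figures_dir` sits in unchanged context this diff does not show. For orientation only, a typical invocation consistent with the surrounding code might look like the sketch below; the kwargs are assumptions based on unstructured's PDF image-extraction options, not the commit's literal code:

```python
from unstructured.partition.pdf import partition_pdf

# "hi_res" runs the layout model needed for image extraction; extracted
# figures are written into figures_dir, where the loop above picks them up.
elements = partition_pdf(
    filename=pdf_file.name,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_output_dir=figures_dir,
)
```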
@@ -114,13 +142,14 @@
         )
         chunks = text_splitter.split_text(extracted_content)
 
-        # Create FAISS index
+        # Create FAISS index with multimodal embeddings
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})
 
         # Status message
         num_images = len(image_descriptions)
-        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} text chunks, {num_images} images analyzed"
+        num_text_elements = len(text_elements)
+        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_elements} text sections, {num_images} images analyzed)"
 
         return current_pdf_name, status, gr.update(interactive=True)
 
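
One caveat on the "multimodal embeddings" comment: `HuggingFaceEmbeddings` calls the sentence-transformers text encoder, so even with `clip-ViT-B-32` the FAISS index only ever embeds strings; images enter the index indirectly, via their `[IMAGE CONTENT]` descriptions. A self-contained sketch of the retrieval round trip this hunk sets up, with invented sample chunks:

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
chunks = [
    "[IMAGE CONTENT]: A bar chart of quarterly revenue, Q4 highest.",  # invented
    "The report discusses staffing changes in the Berlin office.",     # invented
]
index = FAISS.from_texts(chunks, embeddings)          # embed each chunk, build index
retriever = index.as_retriever(search_kwargs={"k": 1})
docs = retriever.get_relevant_documents("What does the chart show?")
print(docs[0].page_content)  # expected: the bar-chart description
```

Newer LangChain releases deprecate `get_relevant_documents` in favor of `retriever.invoke(question)`, though the older call used throughout this file still works.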
@@ -130,14 +159,9 @@
 
 def ask_multimodal_question(pdf_name, question):
     """
-    Answer questions using both text and image content from the PDF.
-    Args:
-        pdf_name: Display name (unused)
-        question: User's question
-    Returns:
-        Generated answer combining text and visual information
+    Answer questions using the single multimodal model with retrieved context.
     """
-    global retriever
+    global retriever, extracted_images
 
     if index is None or retriever is None:
         return "❌ Please upload and process a PDF first."
@@ -146,26 +170,41 @@ def ask_multimodal_question(pdf_name, question):
         return "❌ Please enter a question."
 
     try:
-        # Retrieve relevant chunks (text + image descriptions)
+        # Retrieve relevant chunks
         docs = retriever.get_relevant_documents(question)
         context = "\n\n".join(doc.page_content for doc in docs)
 
-        # Enhanced prompt for multimodal content
-        prompt = (
-            "You are an AI assistant analyzing a document that contains both text and images. "
-            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
-            "to answer the question comprehensively.\n\n"
-            f"Document Content:\n{context}\n\n"
-            f"Question: {question}\n\n"
-            "Provide a detailed answer based on both the textual information and visual elements described above. "
-            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
-            "Answer:"
-        )
+        # Create messages for multimodal model
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""You are an AI assistant analyzing a document that contains both text and visual elements.
+
+RETRIEVED CONTEXT:
+{context}
+
+QUESTION: {question}
+
+Please provide a comprehensive answer based on the retrieved context above. The context includes both textual information and descriptions of images, charts, tables, and other visual elements from the document.
+
+If your answer references visual elements (charts, graphs, images, tables), mention that explicitly. Keep your response focused and informative.
+
+ANSWER:"""
+                    }
+                ]
+            }
+        ]
 
-        # Generate response
-        response = text_client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=256,
+        # If question seems to be about images and we have extracted images,
+        # we could potentially include an image in the query (for advanced use cases)
+
+        # Generate response with multimodal model
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=300,
             temperature=0.5
         )
 
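
A note on the return value used here and in the next hunks: in current `huggingface_hub` releases, `chat_completion` returns a `ChatCompletionOutput` dataclass that also supports dict-style subscripting, so the two access forms below (continuing from the `response` above) read the same field:

```python
answer = response["choices"][0]["message"]["content"].strip()  # style used in this file
answer = response.choices[0].message.content.strip()           # equivalent attribute access
```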
@@ -177,25 +216,41 @@ def ask_multimodal_question(pdf_name, question):
 
 def generate_multimodal_summary():
     """
-    Generate a summary considering both text and visual elements.
+    Generate summary using the multimodal model.
     """
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
 
     try:
-        # Use first 3000 characters for summary
-        content_preview = extracted_content[:3000]
+        # Use first 4000 characters for summary
+        content_preview = extracted_content[:4000]
 
-        prompt = (
-            "Provide a comprehensive summary of this document that contains both text and visual elements "
-            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
-            f"{content_preview}..."
-        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Please provide a comprehensive summary of this document content. The content includes both textual information and descriptions of visual elements (images, charts, tables, diagrams).
+
+DOCUMENT CONTENT:
+{content_preview}
+
+Create a well-structured summary that captures:
+1. Main topics and key points from the text
+2. Important information from visual elements (charts, images, tables)
+3. Overall document purpose and conclusions
+
+SUMMARY:"""
+                    }
+                ]
+            }
+        ]
 
-        response = text_client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=200,
-            temperature=0.5
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=250,
+            temperature=0.3
         )
 
         return response["choices"][0]["message"]["content"].strip()
@@ -205,7 +260,7 @@ def generate_multimodal_summary():
 
 def extract_multimodal_keywords():
     """
-    Extract keywords from both text and visual content.
+    Extract keywords using the multimodal model.
     """
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
@@ -213,16 +268,35 @@
     try:
         content_preview = extracted_content[:3000]
 
-        prompt = (
-            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
-            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
-            f"{content_preview}..."
-        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. The content includes both text and descriptions of visual elements.
+
+DOCUMENT CONTENT:
+{content_preview}
+
+Extract key terms that represent:
+- Main topics and concepts
+- Important technical terms
+- Key findings or data points
+- Visual elements mentioned (chart types, image subjects)
+
+Format as a comma-separated list.
+
+KEY TERMS:"""
+                    }
+                ]
+            }
+        ]
 
-        response = text_client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=100,
-            temperature=0.5
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=120,
+            temperature=0.3
         )
 
         return response["choices"][0]["message"]["content"].strip()
@@ -234,7 +308,7 @@ def clear_multimodal_interface():
     """
    Reset all global state and clear UI.
     """
-    global index, retriever, current_pdf_name, extracted_content
+    global index, retriever, current_pdf_name, extracted_content, extracted_images
 
     # Clear figures directory
     try:
@@ -246,6 +320,7 @@ def clear_multimodal_interface():
     # Reset globals
     index = retriever = None
     current_pdf_name = extracted_content = None
+    extracted_images.clear()
 
     return None, "", gr.update(interactive=False)
 
@@ -271,30 +346,46 @@ with gr.Blocks(theme=theme, css="""
         display: inline-block;
         margin: 10px auto;
     }
+    .model-info {
+        background: #f8fafc;
+        border: 1px solid #e2e8f0;
+        border-radius: 8px;
+        padding: 10px;
+        margin: 10px 0;
+        font-size: 12px;
+        color: #64748b;
+    }
 """) as demo:
 
     # Application title with multimodal badge
-    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
-    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>🖼️ Text + Images + Charts</span></div>")
+    gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
+    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
+
+    # Model information
+    gr.Markdown("""
+    <div class='model-info'>
+    <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision (Multimodal) + CLIP Embeddings (Text+Image) + Unstructured (PDF Processing)
+    </div>
+    """)
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("## 📄 Document Input")
             pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
             pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
-            upload_button = gr.Button("🔄 Process Document (Extract Text + Images)", variant="primary")
+            upload_button = gr.Button("🔄 Process with Multimodal AI", variant="primary")
             status_box = gr.Textbox(label="Processing Status", interactive=False)
 
         with gr.Column():
             gr.Markdown("## ❓ Ask Questions")
-            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
+            gr.Markdown("*Single AI model understands both text and visual content*")
             question_input = gr.Textbox(
                 lines=3,
-                placeholder="Ask about text, images, charts, or any content in the PDF...",
+                placeholder="Ask about text content, images, charts, tables, or any visual elements...",
                 interactive=False
             )
-            ask_button = gr.Button("🔍 Ask Question", variant="primary")
-            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
+            ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
+            answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)
 
     # Analysis tools
     with gr.Row():
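
The button-to-callback wiring lives in unchanged context this diff does not show. Given the component names and the callbacks' signatures, the bindings presumably look something like this sketch (not the commit's literal code):

```python
upload_button.click(
    process_pdf_multimodal,
    inputs=[pdf_file],
    outputs=[pdf_display, status_box, question_input],  # matches the 3-tuple returns
)
ask_button.click(
    ask_multimodal_question,
    inputs=[pdf_display, question_input],
    outputs=[answer_output],
)
```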
@@ -310,8 +401,8 @@ with gr.Blocks(theme=theme, css="""
 
     gr.Markdown("""
     <div class='footer'>
-        Powered by LangChain + Unstructured + Vision AI + FAISS |
-        Supports: Text, Images, Charts, Tables, Diagrams
+        <strong>Unified Multimodal Pipeline:</strong> One model handles text analysis, image understanding, and question answering<br>
+        Supports: Text • Images • Charts • Tables • Diagrams • Mixed Content Queries
     </div>
     """)
 
 