Muzammil6376 committed
Commit b42840f · verified · 1 Parent(s): 3b36cb3

Update app.py

Files changed (1):
  1. app.py +83 -69

app.py CHANGED
@@ -40,6 +40,55 @@ def encode_image_to_base64(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
 
+def extract_images_from_pdf_pymupdf(pdf_path):
+    """
+    Extract images from PDF using PyMuPDF (works on HF Spaces)
+    Args:
+        pdf_path: Path to the PDF file
+    Returns:
+        List of image paths and their descriptions
+    """
+    extracted_images = []
+    image_descriptions = []
+
+    try:
+        # Open PDF with PyMuPDF
+        pdf_document = fitz.open(pdf_path)
+
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            image_list = page.get_images()
+
+            for img_index, img in enumerate(image_list):
+                # Get image data
+                xref = img[0]
+                pix = fitz.Pixmap(pdf_document, xref)
+
+                # Convert to PIL Image
+                if pix.n - pix.alpha < 4:  # GRAY or RGB
+                    img_data = pix.tobytes("png")
+                    img_pil = Image.open(io.BytesIO(img_data))
+
+                    # Save image
+                    image_filename = f"page_{page_num}_img_{img_index}.png"
+                    image_path = os.path.join(figures_dir, image_filename)
+                    img_pil.save(image_path)
+
+                    # Analyze image with multimodal model
+                    description = analyze_image_with_multimodal_model(image_path)
+
+                    extracted_images.append(image_path)
+                    image_descriptions.append(description)
+
+                pix = None  # Free memory
+
+        pdf_document.close()
+        return extracted_images, image_descriptions
+
+    except Exception as e:
+        print(f"Error extracting images: {e}")
+        return [], []
+
 def analyze_image_with_multimodal_model(image_path):
     """
     Analyze an extracted image using the multimodal model.
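
The new `extract_images_from_pdf_pymupdf` silently drops CMYK images: the `pix.n - pix.alpha < 4` guard only passes GRAY and RGB pixmaps. A minimal sketch, assuming the same `fitz` (PyMuPDF) import app.py already uses, that converts other colorspaces to RGB instead of skipping them; the helper name and output path are illustrative, not part of the commit:

```python
import fitz  # PyMuPDF

def save_xref_as_png(pdf_document, xref, out_path):
    """Save any image xref as a PNG, converting CMYK/other colorspaces to RGB."""
    pix = fitz.Pixmap(pdf_document, xref)
    if pix.n - pix.alpha >= 4:              # CMYK or similar colorspace
        pix = fitz.Pixmap(fitz.csRGB, pix)  # convert rather than skip
    pix.save(out_path)                      # Pixmap.save infers PNG from the extension
    pix = None                              # release the pixmap buffer
```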
@@ -52,33 +101,22 @@ def analyze_image_with_multimodal_model(image_path):
         # Encode image to base64
         image_base64 = encode_image_to_base64(image_path)
 
-        # Create multimodal prompt
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive."
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{image_base64}"
-                        }
-                    }
-                ]
-            }
-        ]
+        # Simple text-based prompt for HF Inference API
+        prompt = f"""Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive.
+
+Image: [Image data provided]
+
+Description:"""
 
         # Use multimodal model for image analysis
-        response = multimodal_client.chat_completion(
-            messages=messages,
-            max_tokens=200,
+        # Note: Simplified for HF Spaces compatibility
+        response = multimodal_client.text_generation(
+            prompt=prompt,
+            max_new_tokens=200,
             temperature=0.3
         )
 
-        description = response["choices"][0]["message"]["content"].strip()
+        description = response.strip()
        return f"[IMAGE CONTENT]: {description}"
 
     except Exception as e:
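
After this change the model never receives the image: `image_base64` is still computed, but `text_generation` sends only the prompt string, so the `Image: [Image data provided]` placeholder carries no pixels. A hedged alternative, assuming `multimodal_client` is a `huggingface_hub.InferenceClient`, that ships the actual bytes through the image-to-text task; the return-type handling covers both older (plain string) and newer (`.generated_text`) `huggingface_hub` versions:

```python
from huggingface_hub import InferenceClient

client = InferenceClient()  # stand-in for the app's multimodal_client

def describe_image(image_path):
    # image_to_text accepts a file path, raw bytes, or a file-like object
    result = client.image_to_text(image_path)
    # newer huggingface_hub returns an object with .generated_text; older, a str
    text = getattr(result, "generated_text", result)
    return f"[IMAGE CONTENT]: {text.strip()}"
```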
@@ -86,7 +124,7 @@ def analyze_image_with_multimodal_model(image_path):
 
 def process_pdf_multimodal(pdf_file):
     """
-    Process PDF with single multimodal model for both text and images.
+    Process PDF using PyMuPDF (HF Spaces compatible).
     """
     global current_pdf_name, index, retriever, extracted_content, extracted_images
 
@@ -101,31 +139,21 @@ def process_pdf_multimodal(pdf_file):
         for file in os.listdir(figures_dir):
             os.remove(os.path.join(figures_dir, file))
 
-        # Extract elements from PDF including images
-        elements = partition_pdf(
-            pdf_file.name,
-            strategy=PartitionStrategy.HI_RES,
-            extract_image_block_types=["Image", "Table"],
-            extract_image_block_output_dir=figures_dir,
-            extract_image_block_to_payload=False
-        )
-
-        # Separate text elements
+        # Extract text using PyMuPDF
+        pdf_document = fitz.open(pdf_file.name)
         text_elements = []
-        for element in elements:
-            if element.category not in ["Image", "Table"]:
-                if element.text.strip():  # Only add non-empty text
-                    text_elements.append(element.text.strip())
 
-        # Process extracted images with multimodal model
-        image_descriptions = []
-        if os.path.exists(figures_dir):
-            for image_file in os.listdir(figures_dir):
-                if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
-                    image_path = os.path.join(figures_dir, image_file)
-                    extracted_images.append(image_path)  # Store for later use
-                    description = analyze_image_with_multimodal_model(image_path)
-                    image_descriptions.append(description)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            text = page.get_text()
+            if text.strip():
+                text_elements.append(f"[PAGE {page_num + 1}]\n{text.strip()}")
+
+        pdf_document.close()
+
+        # Extract images using PyMuPDF
+        image_paths, image_descriptions = extract_images_from_pdf_pymupdf(pdf_file.name)
+        extracted_images.extend(image_paths)
 
         # Combine all content
         all_content = text_elements + image_descriptions
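
`page.get_text()` returns one blob per page in the document's internal block order, which can scramble multi-column layouts. A sketch of the same loop using `get_text("blocks")` sorted by position, assuming plain PyMuPDF: block tuples are `(x0, y0, x1, y1, text, block_no, block_type)` with `block_type == 0` for text blocks:

```python
import fitz  # PyMuPDF

def page_texts_in_reading_order(pdf_path):
    texts = []
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # sort blocks top-to-bottom, then left-to-right
            blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
            body = "\n".join(b[4].strip() for b in blocks
                             if b[6] == 0 and b[4].strip())
            if body:
                texts.append(f"[PAGE {page_num + 1}]\n{body}")
    return texts
```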
@@ -148,8 +176,8 @@
 
         # Status message
         num_images = len(image_descriptions)
-        num_text_elements = len(text_elements)
-        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_elements} text sections, {num_images} images analyzed)"
+        num_text_pages = len(text_elements)
+        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_pages} pages, {num_images} images analyzed)"
 
         return current_pdf_name, status, gr.update(interactive=True)
 
@@ -174,14 +202,8 @@ def ask_multimodal_question(pdf_name, question):
         docs = retriever.get_relevant_documents(question)
         context = "\n\n".join(doc.page_content for doc in docs)
 
-        # Create messages for multimodal model
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"""You are an AI assistant analyzing a document that contains both text and visual elements.
+        # Create prompt for text generation
+        prompt = f"""You are an AI assistant analyzing a document that contains both text and visual elements.
 
 RETRIEVED CONTEXT:
 {context}
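
Unchanged here, but worth noting: `get_relevant_documents` is deprecated in newer LangChain releases in favor of the Runnable interface. A one-line sketch, assuming the app's existing `retriever` object:

```python
docs = retriever.invoke(question)  # preferred over get_relevant_documents on langchain>=0.1
context = "\n\n".join(doc.page_content for doc in docs)
```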
@@ -193,23 +215,15 @@ Please provide a comprehensive answer based on the retrieved context above. The
 If your answer references visual elements (charts, graphs, images, tables), mention that explicitly. Keep your response focused and informative.
 
 ANSWER:"""
-                    }
-                ]
-            }
-        ]
-
-        # If question seems to be about images and we have extracted images,
-        # we could potentially include an image in the query (for advanced use cases)
 
         # Generate response with multimodal model
-        response = multimodal_client.chat_completion(
-            messages=messages,
-            max_tokens=300,
+        response = multimodal_client.text_generation(
+            prompt=prompt,
+            max_new_tokens=300,
             temperature=0.5
         )
 
-        answer = response["choices"][0]["message"]["content"].strip()
-        return answer
+        return response.strip()
 
     except Exception as e:
         return f"❌ Error generating answer: {str(e)}"
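
Some hosted vision/chat models expose only the chat route, so a raw `text_generation` call can fail at runtime with an unsupported-task error. A guarded sketch, assuming `multimodal_client` is the app's `InferenceClient`; the fallback reuses the same prompt as a single user message:

```python
def generate(prompt, max_tokens=300, temperature=0.5):
    try:
        out = multimodal_client.text_generation(
            prompt=prompt, max_new_tokens=max_tokens, temperature=temperature
        )
        return out.strip()
    except Exception:
        # fall back to the chat route with the same prompt
        response = multimodal_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content.strip()
```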
@@ -364,7 +378,7 @@ with gr.Blocks(theme=theme, css="""
     # Model information
     gr.Markdown("""
     <div class='model-info'>
-        <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision (Multimodal) + CLIP Embeddings (Text+Image) + Unstructured (PDF Processing)
+        <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
     </div>
     """)
 
384