Muzammil6376 committed
Commit 7154bdc · verified · 1 Parent(s): 87baec5

Update app.py

Files changed (1)
  1. app.py +62 -27
app.py CHANGED
@@ -44,11 +44,48 @@ vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
  # multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium")  # For conversational AI
  # multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b")  # For instruction following
 
- # ── Embeddings ───────────────────────────────────────────────────────────────
- # Use BGE embeddings from BAAI for vectorizing text chunks
- embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
- def extract_image_description_advanced(image_path):
+ # ── Multimodal Embeddings ───────────────────────────────────────────────────
+ # Primary: CLIP embeddings for excellent text-image alignment
+ try:
+     embeddings = HuggingFaceEmbeddings(
+         model_name="sentence-transformers/clip-ViT-B-32",
+         model_kwargs={'device': 'cpu'},  # Ensure CPU usage for HF Spaces
+         encode_kwargs={'normalize_embeddings': True}
+     )
+     print("✅ Using CLIP embeddings for multimodal support")
+ except Exception as e:
+     print(f"⚠️ CLIP failed, falling back to BGE: {e}")
+     # Fallback to BGE embeddings
+     embeddings = HuggingFaceEmbeddings(
+         model_name="BAAI/bge-base-en-v1.5",
+         model_kwargs={'device': 'cpu'},
+         encode_kwargs={'normalize_embeddings': True}
+     )
+
+ def create_multimodal_embeddings(text_chunks, image_descriptions):
+     """
+     Create embeddings that combine text and visual information
+     """
+     try:
+         all_chunks = []
+
+         # Process text chunks
+         for chunk in text_chunks:
+             # Add context markers for better embedding
+             enhanced_chunk = f"Document text: {chunk}"
+             all_chunks.append(enhanced_chunk)
+
+         # Process image descriptions with special formatting
+         for img_desc in image_descriptions:
+             # Mark visual content for better embedding alignment
+             enhanced_desc = f"Visual content: {img_desc}"
+             all_chunks.append(enhanced_desc)
+
+         return all_chunks
+
+     except Exception as e:
+         print(f"Error creating multimodal embeddings: {e}")
+         return text_chunks + image_descriptions
  """
  Enhanced image description using multiple vision models
  """
@@ -167,29 +204,27 @@ def process_pdf_multimodal_advanced(pdf_file):
      # Combine all content
      all_content = text_elements + visual_descriptions
 
-     # Advanced text splitting
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=800,  # Smaller chunks for better retrieval
-         chunk_overlap=150,
-         add_start_index=True,
-         separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
-     )
-
-     combined_content = "\n\n".join(all_content)
-     chunks = text_splitter.split_text(combined_content)
-
-     # Create FAISS index with better retrieval settings
-     index = FAISS.from_texts(chunks, embeddings)
-     retriever = index.as_retriever(
-         search_type="mmr",  # Maximum marginal relevance
-         search_kwargs={
-             "k": 4,
-             "fetch_k": 8,
-             "lambda_mult": 0.7
-         }
-     )
+     # Combine text and visual content with enhanced embedding strategy
+     text_chunks = text_splitter.split_text(pdf_text) if pdf_text else []
+
+     # Create multimodal embeddings
+     all_chunks = create_multimodal_embeddings(text_chunks, visual_descriptions)
+
+     # Create FAISS index with optimized settings for multimodal content
+     if all_chunks:
+         index = FAISS.from_texts(all_chunks, embeddings)
+         retriever = index.as_retriever(
+             search_type="mmr",  # Maximum marginal relevance for diverse results
+             search_kwargs={
+                 "k": 5,  # Get more results for multimodal content
+                 "fetch_k": 10,  # Broader initial search
+                 "lambda_mult": 0.6  # Balance between relevance and diversity
+             }
+         )
+     else:
+         raise Exception("No content extracted from PDF")
 
-     status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(chunks)} searchable chunks"
+     status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(all_chunks)} total searchable chunks"
 
      return current_pdf_name, status, gr.update(interactive=True)
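For the retriever settings in the hunk above: MMR (maximal marginal relevance) first fetches fetch_k candidates by plain similarity, then greedily keeps k of them, with lambda_mult weighting relevance (1.0) against diversity (0.0). Below is a small self-contained sketch, assuming the embeddings object from the first hunk plus langchain_community and faiss-cpu installed; the sample chunks are invented for illustration.

from langchain_community.vectorstores import FAISS

# Invented sample chunks, marked the way create_multimodal_embeddings marks them.
chunks = [
    "Document text: The model was trained for 10 epochs on one GPU.",
    "Document text: Training loss plateaued after epoch 7.",
    "Visual content: line chart of training loss decreasing over epochs",
]

index = FAISS.from_texts(chunks, embeddings)
retriever = index.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "fetch_k": 3, "lambda_mult": 0.6},
)

# lambda_mult below 1.0 penalizes near-duplicates among the fetched candidates,
# which helps a visual chunk surface alongside closely related text chunks.
# (Older langchain versions use retriever.get_relevant_documents(query) instead.)
for doc in retriever.invoke("How did the training loss behave?"):
    print(doc.page_content)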
 
 