Update app.py
app.py CHANGED
@@ -44,11 +44,48 @@ vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
 # multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium") # For conversational AI
 # multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b") # For instruction following
 
-# ── Embeddings ──────────────────────────────────────────────────────────────
-#
-
-
-
+# ── Multimodal Embeddings ───────────────────────────────────────────────────
+# Primary: CLIP embeddings for excellent text-image alignment
+try:
+    embeddings = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/clip-ViT-B-32",
+        model_kwargs={'device': 'cpu'},  # Ensure CPU usage for HF Spaces
+        encode_kwargs={'normalize_embeddings': True}
+    )
+    print("✅ Using CLIP embeddings for multimodal support")
+except Exception as e:
+    print(f"⚠️ CLIP failed, falling back to BGE: {e}")
+    # Fallback to BGE embeddings
+    embeddings = HuggingFaceEmbeddings(
+        model_name="BAAI/bge-base-en-v1.5",
+        model_kwargs={'device': 'cpu'},
+        encode_kwargs={'normalize_embeddings': True}
+    )
+
+def create_multimodal_embeddings(text_chunks, image_descriptions):
+    """
+    Create embeddings that combine text and visual information
+    """
+    try:
+        all_chunks = []
+
+        # Process text chunks
+        for chunk in text_chunks:
+            # Add context markers for better embedding
+            enhanced_chunk = f"Document text: {chunk}"
+            all_chunks.append(enhanced_chunk)
+
+        # Process image descriptions with special formatting
+        for img_desc in image_descriptions:
+            # Mark visual content for better embedding alignment
+            enhanced_desc = f"Visual content: {img_desc}"
+            all_chunks.append(enhanced_desc)
+
+        return all_chunks
+
+    except Exception as e:
+        print(f"Error creating multimodal embeddings: {e}")
+        return text_chunks + image_descriptions
 """
 Enhanced image description using multiple vision models
 """
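For readers skimming the hunk above: the commit swaps the old text-only embedding setup for CLIP (with BGE as a fallback) and adds create_multimodal_embeddings, which tags each chunk with its modality before embedding. Below is a minimal sketch of how those pieces combine. It is not part of the commit: the sample chunk and caption strings are hypothetical, and it assumes sentence-transformers is installed and a LangChain version where HuggingFaceEmbeddings still lives in langchain_community (newer releases move it to langchain_huggingface).

from langchain_community.embeddings import HuggingFaceEmbeddings

# Same CLIP model the diff configures; ViT-B/32 maps text into a 512-dim space.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/clip-ViT-B-32",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Hypothetical inputs standing in for real PDF chunks and BLIP-2 captions.
text_chunks = ["Revenue grew 12% year over year in Q3."]
image_descriptions = ["Bar chart comparing quarterly revenue."]

# Mirrors create_multimodal_embeddings: prefix markers tell the encoder
# which modality each chunk came from.
all_chunks = [f"Document text: {c}" for c in text_chunks]
all_chunks += [f"Visual content: {d}" for d in image_descriptions]

vectors = embeddings.embed_documents(all_chunks)
print(len(vectors), len(vectors[0]))  # -> 2 512

One caveat worth noting: CLIP's text encoder truncates input at roughly 77 tokens, so this scheme suits the short chunks and captions this app produces; the BGE fallback accepts much longer inputs (up to 512 tokens).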
@@ -167,29 +204,27 @@ def process_pdf_multimodal_advanced(pdf_file):
     # Combine all content
     all_content = text_elements + visual_descriptions
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            }
-        )
+    # Combine text and visual content with enhanced embedding strategy
+    text_chunks = text_splitter.split_text(pdf_text) if pdf_text else []
+
+    # Create multimodal embeddings
+    all_chunks = create_multimodal_embeddings(text_chunks, visual_descriptions)
+
+    # Create FAISS index with optimized settings for multimodal content
+    if all_chunks:
+        index = FAISS.from_texts(all_chunks, embeddings)
+        retriever = index.as_retriever(
+            search_type="mmr",  # Maximum marginal relevance for diverse results
+            search_kwargs={
+                "k": 5,  # Get more results for multimodal content
+                "fetch_k": 10,  # Broader initial search
+                "lambda_mult": 0.6  # Balance between relevance and diversity
+            }
+        )
+    else:
+        raise Exception("No content extracted from PDF")
 
-    status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(
+    status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(all_chunks)} total searchable chunks"
 
     return current_pdf_name, status, gr.update(interactive=True)
 
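Similarly, here is a minimal sketch of what the new MMR retriever does with the tagged chunks. It is not part of the commit: the chunks are hypothetical, faiss-cpu is assumed to be installed, and .invoke() requires a recent LangChain (older versions use get_relevant_documents()).

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

# Hypothetical tagged chunks, as produced by create_multimodal_embeddings.
all_chunks = [
    "Document text: Revenue grew 12% year over year in Q3.",
    "Document text: Q3 revenue rose 12% compared to last year.",
    "Visual content: Bar chart comparing quarterly revenue.",
]

index = FAISS.from_texts(all_chunks, embeddings)
retriever = index.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "fetch_k": 3, "lambda_mult": 0.6},
)

# Plain similarity search would likely return the two near-duplicate text
# chunks; MMR penalizes redundancy, so the chart description can surface too.
for doc in retriever.invoke("How did revenue change in Q3?"):
    print(doc.page_content)

In LangChain's MMR, lambda_mult near 1 scores for pure relevance and near 0 for maximum diversity, so the 0.6 used in this commit leans slightly toward relevance while still discouraging near-duplicate hits.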