Update app.py
app.py CHANGED
@@ -44,11 +44,48 @@ vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
 # multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium") # For conversational AI
 # multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b") # For instruction following
 
-# ── Embeddings ──────────────────────────────────────────────────────────────
-#
-
-
-
+# ── Multimodal Embeddings ───────────────────────────────────────────────────
+# Primary: CLIP embeddings for excellent text-image alignment
+try:
+    embeddings = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/clip-ViT-B-32",
+        model_kwargs={'device': 'cpu'},  # Ensure CPU usage for HF Spaces
+        encode_kwargs={'normalize_embeddings': True}
+    )
+    print("✅ Using CLIP embeddings for multimodal support")
+except Exception as e:
+    print(f"⚠️ CLIP failed, falling back to BGE: {e}")
+    # Fallback to BGE embeddings
+    embeddings = HuggingFaceEmbeddings(
+        model_name="BAAI/bge-base-en-v1.5",
+        model_kwargs={'device': 'cpu'},
+        encode_kwargs={'normalize_embeddings': True}
+    )
+
+def create_multimodal_embeddings(text_chunks, image_descriptions):
+    """
+    Create embeddings that combine text and visual information
+    """
+    try:
+        all_chunks = []
+
+        # Process text chunks
+        for chunk in text_chunks:
+            # Add context markers for better embedding
+            enhanced_chunk = f"Document text: {chunk}"
+            all_chunks.append(enhanced_chunk)
+
+        # Process image descriptions with special formatting
+        for img_desc in image_descriptions:
+            # Mark visual content for better embedding alignment
+            enhanced_desc = f"Visual content: {img_desc}"
+            all_chunks.append(enhanced_desc)
+
+        return all_chunks
+
+    except Exception as e:
+        print(f"Error creating multimodal embeddings: {e}")
+        return text_chunks + image_descriptions
 """
 Enhanced image description using multiple vision models
 """
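For readers skimming the hunk above: the commit swaps the old text-only embedding setup for CLIP (with BGE as a fallback) and adds create_multimodal_embeddings, which tags each chunk with its modality before embedding. Below is a minimal sketch of how those pieces combine. It is not part of the commit: the sample chunk and caption strings are hypothetical, and it assumes sentence-transformers is installed and a LangChain version where HuggingFaceEmbeddings still lives in langchain_community (newer releases move it to langchain_huggingface).

from langchain_community.embeddings import HuggingFaceEmbeddings

# Same CLIP model the diff configures; ViT-B/32 maps text into a 512-dim space.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/clip-ViT-B-32",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Hypothetical inputs standing in for real PDF chunks and BLIP-2 captions.
text_chunks = ["Revenue grew 12% year over year in Q3."]
image_descriptions = ["Bar chart comparing quarterly revenue."]

# Mirrors create_multimodal_embeddings: prefix markers tell the encoder
# which modality each chunk came from.
all_chunks = [f"Document text: {c}" for c in text_chunks]
all_chunks += [f"Visual content: {d}" for d in image_descriptions]

vectors = embeddings.embed_documents(all_chunks)
print(len(vectors), len(vectors[0]))  # -> 2 512

One caveat worth noting: CLIP's text encoder truncates input at roughly 77 tokens, so this scheme suits the short chunks and captions this app produces; the BGE fallback accepts much longer inputs (up to 512 tokens).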
@@ -167,29 +204,27 @@ def process_pdf_multimodal_advanced(pdf_file):
     # Combine all content
     all_content = text_elements + visual_descriptions
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            }
-        )
+    # Combine text and visual content with enhanced embedding strategy
+    text_chunks = text_splitter.split_text(pdf_text) if pdf_text else []
+
+    # Create multimodal embeddings
+    all_chunks = create_multimodal_embeddings(text_chunks, visual_descriptions)
+
+    # Create FAISS index with optimized settings for multimodal content
+    if all_chunks:
+        index = FAISS.from_texts(all_chunks, embeddings)
+        retriever = index.as_retriever(
+            search_type="mmr",  # Maximum marginal relevance for diverse results
+            search_kwargs={
+                "k": 5,  # Get more results for multimodal content
+                "fetch_k": 10,  # Broader initial search
+                "lambda_mult": 0.6  # Balance between relevance and diversity
+            }
+        )
+    else:
+        raise Exception("No content extracted from PDF")
 
-    status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(
+    status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(all_chunks)} total searchable chunks"
 
     return current_pdf_name, status, gr.update(interactive=True)
 
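Similarly, here is a minimal sketch of what the new MMR retriever does with the tagged chunks. It is not part of the commit: the chunks are hypothetical, faiss-cpu is assumed to be installed, and .invoke() requires a recent LangChain (older versions use get_relevant_documents()).

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

# Hypothetical tagged chunks, as produced by create_multimodal_embeddings.
all_chunks = [
    "Document text: Revenue grew 12% year over year in Q3.",
    "Document text: Q3 revenue rose 12% compared to last year.",
    "Visual content: Bar chart comparing quarterly revenue.",
]

index = FAISS.from_texts(all_chunks, embeddings)
retriever = index.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "fetch_k": 3, "lambda_mult": 0.6},
)

# Plain similarity search would likely return the two near-duplicate text
# chunks; MMR penalizes redundancy, so the chart description can surface too.
for doc in retriever.invoke("How did revenue change in Q3?"):
    print(doc.page_content)

In LangChain's MMR, lambda_mult near 1 scores for pure relevance and near 0 for maximum diversity, so the 0.6 used in this commit leans slightly toward relevance while still discouraging near-duplicate hits.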