Muzammil6376 committed
Commit 919ab87 · verified · 1 Parent(s): 71a558b

Update app.py

Files changed (1): app.py +183 -92
app.py CHANGED
@@ -2,13 +2,14 @@ import os
 import gradio as gr
 import tempfile
 from pathlib import Path
+import base64
 
 # Import vectorstore and embeddings from langchain community package
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 # Text splitter to break large documents into manageable chunks
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-# HF Inference client for running multimodal models
+# HF Inference client for multimodal model
 from huggingface_hub import InferenceClient
 # Unstructured for PDF processing with image extraction
 from unstructured.partition.pdf import partition_pdf
@@ -19,54 +20,75 @@ index = None  # FAISS index storing document embeddings
 retriever = None  # Retriever to fetch relevant chunks
 current_pdf_name = None  # Name of the currently loaded PDF
 extracted_content = None  # Combined text and image descriptions
+extracted_images = []  # Store image paths for multimodal queries
 
-# ── HF Inference clients ─────────────────────────────────────────────────────
-# Text generation client (using a good open model)
-text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
-# Vision client for image analysis
-vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
+# ── Single Multimodal Model ──────────────────────────────────────────────────
+# Using a single multimodal model that can handle both text and images
+multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 
-# ── Embeddings ───────────────────────────────────────────────────────────────
-# Use BGE embeddings for vectorizing text chunks
-embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+# ── Multimodal Embeddings ────────────────────────────────────────────────────
+# Using CLIP-based embeddings that can handle both text and images
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
 
 # Create temporary directories for processing
 temp_dir = tempfile.mkdtemp()
 figures_dir = os.path.join(temp_dir, "figures")
 os.makedirs(figures_dir, exist_ok=True)
 
-def extract_image_description(image_path):
+def encode_image_to_base64(image_path):
+    """Convert image to base64 for API calls"""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def analyze_image_with_multimodal_model(image_path):
     """
-    Analyze an extracted image using vision model to get text description.
+    Analyze an extracted image using the multimodal model.
     Args:
         image_path: Path to the extracted image file
     Returns:
         Text description of the image content
     """
     try:
-        # Read image and send to vision model
-        with open(image_path, "rb") as img_file:
-            # Use vision client to analyze the image
-            response = vision_client.text_to_image_generation(
-                prompt="Describe what you see in this image in detail, including any text, charts, diagrams, or important visual elements.",
-                image=img_file.read()
-            )
-            return f"Image content: {response}"
+        # Encode image to base64
+        image_base64 = encode_image_to_base64(image_path)
+
+        # Create multimodal prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_base64}"
+                        }
+                    }
+                ]
+            }
+        ]
+
+        # Use multimodal model for image analysis
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=200,
+            temperature=0.3
+        )
+
+        description = response["choices"][0]["message"]["content"].strip()
+        return f"[IMAGE CONTENT]: {description}"
+
     except Exception as e:
-        return f"Image content: [Could not analyze image - {str(e)}]"
+        return f"[IMAGE CONTENT]: Could not analyze image - {str(e)}"
 
 def process_pdf_multimodal(pdf_file):
     """
-    1. Extracts text and images from PDF using unstructured
-    2. Analyzes extracted images with vision model
-    3. Combines text and image descriptions
-    4. Creates FAISS index for retrieval
-    Args:
-        pdf_file: Uploaded PDF file
-    Returns:
-        - PDF filename, status message, and UI updates
+    Process PDF with single multimodal model for both text and images.
     """
-    global current_pdf_name, index, retriever, extracted_content
+    global current_pdf_name, index, retriever, extracted_content, extracted_images
 
     if pdf_file is None:
         return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
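
Note on the new image-analysis path: the OpenAI-style `image_url` content block with a base64 data URI is how local images are passed to `InferenceClient.chat_completion`. A minimal standalone sketch of the same call, assuming an `HF_TOKEN` in the environment, serverless availability of this model, and a hypothetical local file `figure.png`:

```python
import base64
from huggingface_hub import InferenceClient

client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")

# Read a local image and wrap it in a data URI, as the commit does.
with open("figure.png", "rb") as f:  # hypothetical test image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat_completion(
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }],
    max_tokens=100,
)
print(response.choices[0].message.content)
```

One small wrinkle in the committed code: the data URI hard-codes `image/jpeg` even though the loop below also accepts `.png` files. Providers generally sniff the bytes rather than trust the declared MIME type, but matching it (as in the sketch) is the safer habit.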
@@ -74,7 +96,8 @@ def process_pdf_multimodal(pdf_file):
     current_pdf_name = os.path.basename(pdf_file.name)
 
     try:
-        # Clear previous figures
+        # Clear previous data
+        extracted_images.clear()
         for file in os.listdir(figures_dir):
             os.remove(os.path.join(figures_dir, file))
 
@@ -91,22 +114,27 @@
         text_elements = []
         for element in elements:
             if element.category not in ["Image", "Table"]:
-                text_elements.append(element.text)
+                if element.text.strip():  # Only add non-empty text
+                    text_elements.append(element.text.strip())
 
-        # Process extracted images
+        # Process extracted images with multimodal model
         image_descriptions = []
         if os.path.exists(figures_dir):
             for image_file in os.listdir(figures_dir):
                 if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                     image_path = os.path.join(figures_dir, image_file)
-                    description = extract_image_description(image_path)
+                    extracted_images.append(image_path)  # Store for later use
+                    description = analyze_image_with_multimodal_model(image_path)
                     image_descriptions.append(description)
 
-        # Combine text and image descriptions
+        # Combine all content
         all_content = text_elements + image_descriptions
         extracted_content = "\n\n".join(all_content)
 
-        # Split into chunks
+        if not extracted_content.strip():
+            return current_pdf_name, "❌ No content could be extracted from the PDF.", gr.update(interactive=False)
+
+        # Split into chunks for embedding
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=200,
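
The `partition_pdf` call that populates `elements` and writes figures into `figures_dir` sits in unchanged context this diff does not show. For orientation only, a typical invocation consistent with the surrounding code might look like the sketch below; the kwargs are assumptions based on unstructured's PDF image-extraction options, not the commit's literal code:

```python
from unstructured.partition.pdf import partition_pdf

# "hi_res" runs the layout model needed for image extraction; extracted
# figures are written into figures_dir, where the loop above picks them up.
elements = partition_pdf(
    filename=pdf_file.name,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_output_dir=figures_dir,
)
```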
@@ -114,13 +142,14 @@
         )
         chunks = text_splitter.split_text(extracted_content)
 
-        # Create FAISS index
+        # Create FAISS index with multimodal embeddings
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})
 
         # Status message
         num_images = len(image_descriptions)
-        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} text chunks, {num_images} images analyzed"
+        num_text_elements = len(text_elements)
+        status = f"✅ Processed '{current_pdf_name}' — {len(chunks)} chunks ({num_text_elements} text sections, {num_images} images analyzed)"
 
         return current_pdf_name, status, gr.update(interactive=True)
 
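
One caveat on the "multimodal embeddings" comment: `HuggingFaceEmbeddings` calls the sentence-transformers text encoder, so even with `clip-ViT-B-32` the FAISS index only ever embeds strings; images enter the index indirectly, via their `[IMAGE CONTENT]` descriptions. A self-contained sketch of the retrieval round trip this hunk sets up, with invented sample chunks:

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
chunks = [
    "[IMAGE CONTENT]: A bar chart of quarterly revenue, Q4 highest.",  # invented
    "The report discusses staffing changes in the Berlin office.",     # invented
]
index = FAISS.from_texts(chunks, embeddings)          # embed each chunk, build index
retriever = index.as_retriever(search_kwargs={"k": 1})
docs = retriever.get_relevant_documents("What does the chart show?")
print(docs[0].page_content)  # expected: the bar-chart description
```

Newer LangChain releases deprecate `get_relevant_documents` in favor of `retriever.invoke(question)`, though the older call used throughout this file still works.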
@@ -130,14 +159,9 @@
 
 def ask_multimodal_question(pdf_name, question):
     """
-    Answer questions using both text and image content from the PDF.
-    Args:
-        pdf_name: Display name (unused)
-        question: User's question
-    Returns:
-        Generated answer combining text and visual information
+    Answer questions using the single multimodal model with retrieved context.
     """
-    global retriever
+    global retriever, extracted_images
 
     if index is None or retriever is None:
         return "❌ Please upload and process a PDF first."
@@ -146,26 +170,41 @@ def ask_multimodal_question(pdf_name, question):
         return "❌ Please enter a question."
 
     try:
-        # Retrieve relevant chunks (text + image descriptions)
+        # Retrieve relevant chunks
         docs = retriever.get_relevant_documents(question)
         context = "\n\n".join(doc.page_content for doc in docs)
 
-        # Enhanced prompt for multimodal content
-        prompt = (
-            "You are an AI assistant analyzing a document that contains both text and images. "
-            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
-            "to answer the question comprehensively.\n\n"
-            f"Document Content:\n{context}\n\n"
-            f"Question: {question}\n\n"
-            "Provide a detailed answer based on both the textual information and visual elements described above. "
-            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
-            "Answer:"
-        )
+        # Create messages for multimodal model
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""You are an AI assistant analyzing a document that contains both text and visual elements.
+
+RETRIEVED CONTEXT:
+{context}
+
+QUESTION: {question}
+
+Please provide a comprehensive answer based on the retrieved context above. The context includes both textual information and descriptions of images, charts, tables, and other visual elements from the document.
+
+If your answer references visual elements (charts, graphs, images, tables), mention that explicitly. Keep your response focused and informative.
+
+ANSWER:"""
+                    }
+                ]
+            }
+        ]
 
-        # Generate response
-        response = text_client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=256,
+        # If question seems to be about images and we have extracted images,
+        # we could potentially include an image in the query (for advanced use cases)
+
+        # Generate response with multimodal model
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=300,
             temperature=0.5
         )
 
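
A note on the return value used here and in the next hunks: in current `huggingface_hub` releases, `chat_completion` returns a `ChatCompletionOutput` dataclass that also supports dict-style subscripting, so the two access forms below (continuing from the `response` above) read the same field:

```python
answer = response["choices"][0]["message"]["content"].strip()  # style used in this file
answer = response.choices[0].message.content.strip()           # equivalent attribute access
```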
@@ -177,25 +216,41 @@ def ask_multimodal_question(pdf_name, question):
 
 def generate_multimodal_summary():
     """
-    Generate a summary considering both text and visual elements.
+    Generate summary using the multimodal model.
     """
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
 
     try:
-        # Use first 3000 characters for summary
-        content_preview = extracted_content[:3000]
+        # Use first 4000 characters for summary
+        content_preview = extracted_content[:4000]
 
-        prompt = (
-            "Provide a comprehensive summary of this document that contains both text and visual elements "
-            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
-            f"{content_preview}..."
-        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Please provide a comprehensive summary of this document content. The content includes both textual information and descriptions of visual elements (images, charts, tables, diagrams).
+
+DOCUMENT CONTENT:
+{content_preview}
+
+Create a well-structured summary that captures:
+1. Main topics and key points from the text
+2. Important information from visual elements (charts, images, tables)
+3. Overall document purpose and conclusions
+
+SUMMARY:"""
+                    }
+                ]
+            }
+        ]
 
-        response = text_client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=200,
-            temperature=0.5
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=250,
+            temperature=0.3
         )
 
         return response["choices"][0]["message"]["content"].strip()
@@ -205,7 +260,7 @@ def generate_multimodal_summary():
 
 def extract_multimodal_keywords():
     """
-    Extract keywords from both text and visual content.
+    Extract keywords using the multimodal model.
     """
     if not extracted_content:
         return "❌ Please upload and process a PDF first."
@@ -213,16 +268,35 @@
     try:
         content_preview = extracted_content[:3000]
 
-        prompt = (
-            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
-            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
-            f"{content_preview}..."
-        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. The content includes both text and descriptions of visual elements.
+
+DOCUMENT CONTENT:
+{content_preview}
+
+Extract key terms that represent:
+- Main topics and concepts
+- Important technical terms
+- Key findings or data points
+- Visual elements mentioned (chart types, image subjects)
+
+Format as a comma-separated list.
+
+KEY TERMS:"""
+                    }
+                ]
+            }
+        ]
 
-        response = text_client.chat_completion(
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=100,
-            temperature=0.5
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=120,
+            temperature=0.3
         )
 
         return response["choices"][0]["message"]["content"].strip()
@@ -234,7 +308,7 @@ def clear_multimodal_interface():
     """
    Reset all global state and clear UI.
     """
-    global index, retriever, current_pdf_name, extracted_content
+    global index, retriever, current_pdf_name, extracted_content, extracted_images
 
     # Clear figures directory
     try:
@@ -246,6 +320,7 @@ def clear_multimodal_interface():
     # Reset globals
     index = retriever = None
     current_pdf_name = extracted_content = None
+    extracted_images.clear()
 
     return None, "", gr.update(interactive=False)
 
@@ -271,30 +346,46 @@ with gr.Blocks(theme=theme, css="""
         display: inline-block;
         margin: 10px auto;
     }
+    .model-info {
+        background: #f8fafc;
+        border: 1px solid #e2e8f0;
+        border-radius: 8px;
+        padding: 10px;
+        margin: 10px 0;
+        font-size: 12px;
+        color: #64748b;
+    }
 """) as demo:
 
     # Application title with multimodal badge
-    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
-    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>🖼️ Text + Images + Charts</span></div>")
+    gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
+    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
+
+    # Model information
+    gr.Markdown("""
+    <div class='model-info'>
+    <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision (Multimodal) + CLIP Embeddings (Text+Image) + Unstructured (PDF Processing)
+    </div>
+    """)
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("## 📄 Document Input")
             pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
             pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
-            upload_button = gr.Button("🔄 Process Document (Extract Text + Images)", variant="primary")
+            upload_button = gr.Button("🔄 Process with Multimodal AI", variant="primary")
             status_box = gr.Textbox(label="Processing Status", interactive=False)
 
         with gr.Column():
             gr.Markdown("## ❓ Ask Questions")
-            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
+            gr.Markdown("*Single AI model understands both text and visual content*")
             question_input = gr.Textbox(
                 lines=3,
-                placeholder="Ask about text, images, charts, or any content in the PDF...",
+                placeholder="Ask about text content, images, charts, tables, or any visual elements...",
                 interactive=False
             )
-            ask_button = gr.Button("🔍 Ask Question", variant="primary")
-            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
+            ask_button = gr.Button("🔍 Ask Multimodal AI", variant="primary")
+            answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)
 
     # Analysis tools
     with gr.Row():
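
The button-to-callback wiring lives in unchanged context this diff does not show. Given the component names and the callbacks' signatures, the bindings presumably look something like this sketch (not the commit's literal code):

```python
upload_button.click(
    process_pdf_multimodal,
    inputs=[pdf_file],
    outputs=[pdf_display, status_box, question_input],  # matches the 3-tuple returns
)
ask_button.click(
    ask_multimodal_question,
    inputs=[pdf_display, question_input],
    outputs=[answer_output],
)
```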
@@ -310,8 +401,8 @@ with gr.Blocks(theme=theme, css="""
 
     gr.Markdown("""
     <div class='footer'>
-        Powered by LangChain + Unstructured + Vision AI + FAISS |
-        Supports: Text, Images, Charts, Tables, Diagrams
+        <strong>Unified Multimodal Pipeline:</strong> One model handles text analysis, image understanding, and question answering<br>
+        Supports: Text • Images • Charts • Tables • Diagrams • Mixed Content Queries
     </div>
     """)
 
 