Update app.py

app.py
CHANGED
@@ -1,132 +1,508 @@
import os
import io
import base64

import requests
import gradio as gr
from PIL import Image

# Vector store and embeddings from the langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running chat completions
from huggingface_hub import InferenceClient
# Unstructured for advanced PDF processing with image/table extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy
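# Rough dependency sketch (assumptions, not a pinned list): this file expects
# gradio, huggingface_hub, langchain, langchain-community, faiss-cpu,
# sentence-transformers (backing HuggingFaceEmbeddings), unstructured[all-docs]
# plus its PDF system dependencies (e.g. poppler and tesseract for the HI_RES
# strategy), pillow, and requests.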
# ── Globals ────────────────────────────────────────────────────────────────
index = None              # FAISS index storing document embeddings
retriever = None          # Retriever that fetches relevant chunks
current_pdf_name = None   # Name of the currently loaded PDF
pdf_text = None           # Full text of the uploaded PDF
extracted_images = []     # Extracted images/tables and their descriptions

# Directory for storing extracted figures
FIGURES_DIR = "extracted_figures/"
os.makedirs(FIGURES_DIR, exist_ok=True)

# ── HF Inference clients for different models ──────────────────────────────
# Text generation model
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")

# Vision-language model (choose one based on your needs and HF availability)
# Option 1: BLIP-2 for general image understanding
vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")

# Option 2: alternative image-captioning models:
# vision_client = InferenceClient(model="microsoft/git-base-coco")
# vision_client = InferenceClient(model="nlpconnect/vit-gpt2-image-captioning")
# vision_client = InferenceClient(model="Salesforce/blip-image-captioning-large")

# Other text models you could route answers through (despite the variable
# name, these are text-only, not multimodal, models):
# multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium")  # conversational
# multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b")   # instruction following

# ── Embeddings ─────────────────────────────────────────────────────────────
# BGE embeddings from BAAI for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
def extract_image_description_advanced(image_path):
    """Generate an enhanced image description by chaining vision and text models."""
    try:
        # Load the raw image bytes
        with open(image_path, "rb") as f:
            image_bytes = f.read()

        # Step 1: caption the image with the vision model (BLIP-2)
        try:
            description = vision_client.image_to_text(image_bytes)
            base_description = description if isinstance(description, str) else description.get("generated_text", "")
        except Exception as e:
            print(f"BLIP-2 failed: {e}")
            base_description = "Image could not be processed with vision model"

        # Step 2: enrich the caption with a text-model analysis
        enhancement_prompt = f"""
Analyze this image description and provide a detailed analysis focusing on:
1. Any text, numbers, or data visible
2. Charts, graphs, or tables
3. Key visual elements and their significance
4. Context and meaning

Description: {base_description}

Provide a comprehensive analysis:
"""

        try:
            response = text_client.chat_completion(
                messages=[{"role": "user", "content": enhancement_prompt}],
                max_tokens=300,
                temperature=0.3,
            )
            enhanced_description = response["choices"][0]["message"]["content"].strip()
        except Exception as e:
            print(f"Text enhancement failed: {e}")
            enhanced_description = base_description

        return f"Visual Element Analysis:\n{enhanced_description}"

    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return f"Visual element detected: {os.path.basename(image_path)} (processing failed)"
|
98 |
+
def process_pdf_multimodal_advanced(pdf_file):
|
99 |
+
"""
|
100 |
+
Advanced multimodal PDF processing with enhanced vision capabilities
|
101 |
+
"""
|
102 |
+
global current_pdf_name, index, retriever, pdf_text, extracted_images
|
103 |
+
|
104 |
+
if pdf_file is None:
|
105 |
+
return None, "β Please upload a PDF file.", gr.update(interactive=False)
|
106 |
+
|
107 |
+
current_pdf_name = os.path.basename(pdf_file.name)
|
108 |
+
extracted_images = []
|
109 |
+
|
110 |
+
# Clear existing figures directory
|
111 |
+
for file in os.listdir(FIGURES_DIR):
|
112 |
+
try:
|
113 |
+
os.remove(os.path.join(FIGURES_DIR, file))
|
114 |
+
except:
|
115 |
+
pass
|
116 |
+
|
117 |
+
try:
|
118 |
+
# Process PDF with unstructured
|
119 |
+
elements = partition_pdf(
|
120 |
+
pdf_file.name,
|
121 |
+
strategy=PartitionStrategy.HI_RES,
|
122 |
+
extract_image_block_types=["Image", "Table"],
|
123 |
+
extract_image_block_output_dir=FIGURES_DIR,
|
124 |
+
extract_image_block_to_payload=False,
|
125 |
+
# Additional parameters for better extraction
|
126 |
+
infer_table_structure=True,
|
127 |
+
chunking_strategy="by_title",
|
128 |
+
max_characters=1000,
|
129 |
+
combine_text_under_n_chars=100
|
130 |
+
)
|
131 |
+
|
132 |
+
# Process elements
|
133 |
+
text_elements = []
|
134 |
+
visual_descriptions = []
|
135 |
+
|
136 |
+
for element in elements:
|
137 |
+
if element.category in ["Image", "Table"]:
|
138 |
+
# Handle image/table elements
|
139 |
+
continue
|
140 |
+
elif element.category == "Title":
|
141 |
+
text_elements.append(f"TITLE: {element.text}")
|
142 |
+
elif element.category == "Header":
|
143 |
+
text_elements.append(f"HEADER: {element.text}")
|
144 |
+
else:
|
145 |
+
if hasattr(element, 'text') and element.text.strip():
|
146 |
+
text_elements.append(element.text)
|
147 |
+
|
148 |
+
pdf_text = "\n\n".join(text_elements)
|
149 |
+
|
150 |
+
# Process extracted visual elements
|
151 |
+
if os.path.exists(FIGURES_DIR):
|
152 |
+
for filename in sorted(os.listdir(FIGURES_DIR)):
|
153 |
+
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
|
154 |
+
image_path = os.path.join(FIGURES_DIR, filename)
|
155 |
+
|
156 |
+
# Get enhanced description
|
157 |
+
description = extract_image_description_advanced(image_path)
|
158 |
+
visual_descriptions.append(description)
|
159 |
+
|
160 |
+
extracted_images.append({
|
161 |
+
'path': image_path,
|
162 |
+
'description': description,
|
163 |
+
'filename': filename,
|
164 |
+
'type': 'table' if 'table' in filename.lower() else 'image'
|
165 |
+
})
|
166 |
+
|
167 |
+
# Combine all content
|
168 |
+
all_content = text_elements + visual_descriptions
|
169 |
+
|
170 |
+
# Advanced text splitting
|
171 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
172 |
+
chunk_size=800, # Smaller chunks for better retrieval
|
173 |
+
chunk_overlap=150,
|
174 |
+
add_start_index=True,
|
175 |
+
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
|
176 |
+
)
|
177 |
+
|
178 |
+
combined_content = "\n\n".join(all_content)
|
179 |
+
chunks = text_splitter.split_text(combined_content)
|
180 |
+
|
181 |
+
# Create FAISS index with better retrieval settings
|
182 |
+
index = FAISS.from_texts(chunks, embeddings)
|
183 |
+
retriever = index.as_retriever(
|
184 |
+
search_type="mmr", # Maximum marginal relevance
|
185 |
+
search_kwargs={
|
186 |
+
"k": 4,
|
187 |
+
"fetch_k": 8,
|
188 |
+
"lambda_mult": 0.7
|
189 |
+
}
|
190 |
+
)
|
191 |
+
|
192 |
+
status = f"β
Advanced processing complete for '{current_pdf_name}'\nπ {len(text_elements)} text sections\nπΌοΈ {len(extracted_images)} visual elements\nπ¦ {len(chunks)} searchable chunks"
|
193 |
+
|
194 |
+
return current_pdf_name, status, gr.update(interactive=True)
|
195 |
+
|
196 |
+
except Exception as e:
|
197 |
+
error_msg = f"β Processing error: {str(e)}"
|
198 |
+
return current_pdf_name, error_msg, gr.update(interactive=False)
|
199 |
+
|
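# Retrieval note: with search_type="mmr", the retriever first fetches the
# fetch_k=8 most similar chunks, then greedily keeps k=4 of them, trading
# relevance against diversity via lambda_mult (1.0 = pure similarity,
# 0.0 = maximum diversity).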
def ask_question_multimodal_advanced(pdf_name, question):
    """Multimodal question answering with keyword-based routing to visual context."""
    global retriever, extracted_images

    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."

    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Retrieve relevant chunks
        docs = retriever.get_relevant_documents(question)
        context = "\n\n".join([doc.page_content for doc in docs])

        # Detect questions that target visual content
        visual_keywords = [
            "image", "figure", "chart", "graph", "table", "diagram", "picture",
            "visual", "show", "display", "plot", "data", "visualization",
            "illustration", "screenshot", "photo", "drawing",
        ]

        is_visual_query = any(keyword in question.lower() for keyword in visual_keywords)

        # Put visual descriptions first when the question is about visuals
        if is_visual_query and extracted_images:
            visual_context = "\n\n".join([img["description"] for img in extracted_images])
            enhanced_context = f"{visual_context}\n\nAdditional Context:\n{context}"
        else:
            enhanced_context = context

        # Pick a system prompt that matches the query type
        if is_visual_query:
            system_prompt = """You are an expert document analyst specializing in multimodal content analysis.
You excel at interpreting charts, graphs, tables, images, and visual data alongside textual information.
When answering questions about visual elements, be specific about what you observe and provide detailed insights."""
        else:
            system_prompt = """You are an expert document analyst. Provide accurate, comprehensive answers based on the document content.
Use the context provided to give detailed and helpful responses."""

        prompt = f"""{system_prompt}

Context: {enhanced_context}

Question: {question}

Provide a detailed, accurate answer based on the context above. If the question relates to visual elements, describe what you can understand from the visual descriptions provided."""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=400,
            temperature=0.4,
        )

        answer = response["choices"][0]["message"]["content"].strip()
        return answer

    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"
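# Routing note: visual-query detection above is a plain keyword match, so
# broad terms such as "data" or "show" will occasionally route purely textual
# questions through the visual context as well; usually harmless, but worth
# knowing when tuning the keyword list.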
def analyze_document_structure():
    """Analyze the overall structure and organization of the document."""
    global pdf_text, extracted_images

    if not pdf_text and not extracted_images:
        return "❌ Please upload and process a PDF first."

    try:
        structure_prompt = f"""
Analyze the structure and organization of this document. Provide insights about:
1. Document type and purpose
2. Main sections and topics
3. Visual elements present ({len(extracted_images)} images/tables/charts)
4. Key information hierarchy
5. Overall document quality and completeness

Text content sample: {(pdf_text or "")[:1000]}
Visual elements: {len(extracted_images)} items detected

Provide a structural analysis:
"""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": structure_prompt}],
            max_tokens=300,
            temperature=0.3,
        )

        return response["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error analyzing structure: {str(e)}"
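# Scope note: analyze_document_structure() only samples the first 1,000
# characters of text plus element counts, so its output is a heuristic
# overview rather than a full structural parse.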
# ── Summary, keyword, and utility helpers ───────────────────────────────────

def generate_summary_multimodal():
    """Generate a summary that covers both text and visual content."""
    global pdf_text, extracted_images

    if not pdf_text and not extracted_images:
        return "❌ Please upload and process a PDF first."

    try:
        content_parts = []

        if pdf_text:
            content_parts.append(f"Text Content:\n{pdf_text[:2000]}")

        if extracted_images:
            visual_summary = "\n".join([img["description"][:200] for img in extracted_images[:3]])
            content_parts.append(f"Visual Content:\n{visual_summary}")

        combined_content = "\n\n".join(content_parts)

        prompt = f"""Provide a comprehensive summary of this document that includes both textual and visual elements.
Focus on key findings, main topics, and insights from charts, tables, or images.

Content: {combined_content}

Summary:"""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
            temperature=0.5,
        )

        return response["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"
def extract_keywords_multimodal():
    """Extract key terms and concepts from both text and visual content."""
    global pdf_text, extracted_images

    if not pdf_text and not extracted_images:
        return "❌ Please upload and process a PDF first."

    try:
        content_parts = []

        if pdf_text:
            content_parts.append(f"Text: {pdf_text[:1500]}")

        if extracted_images:
            visual_content = "\n".join([img["description"][:150] for img in extracted_images])
            content_parts.append(f"Visual Content: {visual_content}")

        combined_content = "\n\n".join(content_parts)

        prompt = f"""Extract key terms, concepts, and topics from this document content.
Include technical terms, important concepts, and themes from both text and visual elements.

Content: {combined_content}

Key terms and concepts:"""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=120,
            temperature=0.5,
        )

        return response["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"
def show_extracted_images():
    """List the extracted visual elements and their descriptions."""
    global extracted_images

    if not extracted_images:
        return "No visual elements extracted from the current document."

    info = f"📄 Extracted {len(extracted_images)} visual elements:\n\n"
    for i, img in enumerate(extracted_images, 1):
        element_type = "📊 Table" if img["type"] == "table" else "🖼️ Image"
        info += f"{i}. {element_type}: {img['filename']}\n"
        info += f"   Description: {img['description'][:150]}...\n\n"

        if i >= 5:  # Limit the display to the first 5 elements
            remaining = len(extracted_images) - 5
            if remaining > 0:
                info += f"... and {remaining} more visual elements."
            break

    return info
def clear_interface_multimodal():
    """Reset all state and clear the extracted-figures directory."""
    global index, retriever, current_pdf_name, pdf_text, extracted_images

    index = retriever = None
    current_pdf_name = pdf_text = None
    extracted_images = []

    if os.path.exists(FIGURES_DIR):
        for file in os.listdir(FIGURES_DIR):
            try:
                os.remove(os.path.join(FIGURES_DIR, file))
            except OSError:
                pass

    return None, "", gr.update(interactive=False), "", "", "", "", ""
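# Wiring note: the 8-tuple returned by clear_interface_multimodal() must stay
# aligned, position for position, with the outputs list passed to
# clear_button.click() below.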
# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    .main-title {
        text-align: center;
        font-size: 56px;
        font-weight: bold;
        margin-bottom: 20px;
        background: linear-gradient(45deg, #6366f1, #8b5cf6, #ec4899);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .feature-badge {
        background: linear-gradient(45deg, #10b981, #3b82f6);
        color: white;
        padding: 4px 12px;
        border-radius: 15px;
        font-size: 11px;
        margin: 2px;
        display: inline-block;
    }
""") as demo:

    gr.Markdown("<div class='main-title'>🤖 DocQueryAI Pro</div>")
    gr.Markdown("""
    <div style='text-align: center; margin-bottom: 25px;'>
        <span class='feature-badge'>🔍 Advanced RAG</span>
        <span class='feature-badge'>🖼️ Vision AI</span>
        <span class='feature-badge'>📊 Table Analysis</span>
        <span class='feature-badge'>📈 Chart Understanding</span>
        <span class='feature-badge'>🧠 Smart Retrieval</span>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📄 Document Processing")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF Document")
            upload_button = gr.Button("🚀 Process with AI Vision", variant="primary", size="lg")
            status_box = gr.Textbox(label="Processing Status", interactive=False, lines=3)

        with gr.Column():
            gr.Markdown("## 💬 Intelligent Q&A")
            gr.Markdown("*Ask about any content: text, images, charts, tables, or data visualizations*")
            question_input = gr.Textbox(
                lines=3,
                placeholder="Examples:\n• What does the chart show?\n• Summarize the table data\n• Explain the main findings",
                label="Your Question",
            )
            ask_button = gr.Button("🔍 Get AI Answer", variant="primary", size="lg")
            answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("📝 Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=5, interactive=False)

        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Concepts", lines=5, interactive=False)

    with gr.Row():
        with gr.Column():
            structure_button = gr.Button("🏗️ Analyze Structure", variant="secondary")
            structure_output = gr.Textbox(label="Document Structure Analysis", lines=5, interactive=False)

        with gr.Column():
            images_button = gr.Button("🖼️ Show Visual Elements", variant="secondary")
            images_output = gr.Textbox(label="Extracted Visual Elements", lines=5, interactive=False)

    with gr.Row():
        clear_button = gr.Button("🗑️ Clear All", variant="secondary", size="sm")

    gr.Markdown("""
    <div class='footer'>
        🚀 <strong>Powered by Advanced AI</strong><br>
        🔧 HuggingFace Transformers • LangChain • FAISS • Unstructured<br>
        🎯 Multimodal RAG: Text + Vision + Tables + Charts
    </div>
    """)
    # Event bindings
    upload_button.click(process_pdf_multimodal_advanced, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question_multimodal_advanced, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary_multimodal, [], summary_output)
    keywords_button.click(extract_keywords_multimodal, [], keywords_output)
    structure_button.click(analyze_document_structure, [], structure_output)
    images_button.click(show_extracted_images, [], images_output)
    clear_button.click(clear_interface_multimodal, [], [pdf_file, pdf_display, question_input, answer_output, summary_output, keywords_output, structure_output, images_output])

if __name__ == "__main__":
    demo.launch(debug=True, share=True)
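# Launch note: on Hugging Face Spaces, share=True is not supported and is
# ignored (the Space itself is the public URL); debug=True surfaces errors
# in the Space logs while iterating.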