File size: 13,431 Bytes
67a56f6
 
a6c0d87
 
3fdd093
87baec5
3fdd093
87baec5
 
ced2810
a6c0d87
87baec5
a6c0d87
40696fb
 
225229c
87baec5
 
 
 
a6c0d87
87baec5
a6c0d87
 
87baec5
a6c0d87
 
87baec5
a6c0d87
 
 
009e0ad
a6c0d87
 
 
 
87baec5
a6c0d87
87baec5
a6c0d87
 
 
 
 
87baec5
 
a6c0d87
 
 
 
 
 
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
 
 
 
 
 
 
 
87baec5
a6c0d87
87baec5
 
 
 
 
 
 
a6c0d87
 
 
 
 
87baec5
 
 
 
a6c0d87
 
87baec5
 
a6c0d87
87baec5
 
a6c0d87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7154bdc
a6c0d87
 
 
87baec5
a6c0d87
 
 
87baec5
 
 
 
a6c0d87
87baec5
 
a6c0d87
87baec5
a6c0d87
 
 
 
 
 
87baec5
a6c0d87
 
87baec5
 
 
 
 
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
 
 
 
 
 
 
 
 
 
 
 
 
87baec5
a6c0d87
87baec5
 
a6c0d87
 
87baec5
 
 
 
 
 
 
 
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
 
 
a6c0d87
 
87baec5
a6c0d87
 
 
 
87baec5
 
 
 
a6c0d87
87baec5
 
 
 
 
 
 
 
a6c0d87
 
 
 
 
87baec5
 
 
a6c0d87
87baec5
a6c0d87
 
 
 
 
87baec5
 
 
a6c0d87
87baec5
 
 
 
 
 
 
 
a6c0d87
 
 
 
 
87baec5
a6c0d87
 
 
 
 
 
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
a6c0d87
87baec5
 
 
 
 
 
 
 
a6c0d87
87baec5
 
 
a6c0d87
 
87baec5
a6c0d87
 
 
87baec5
a6c0d87
87baec5
 
 
a6c0d87
 
 
7fdd092
87baec5
 
a6c0d87
87baec5
a6c0d87
 
 
87baec5
 
a6c0d87
 
87baec5
 
a6c0d87
 
87baec5
a6c0d87
 
7fdd092
a6c0d87
d179e57
87baec5
 
a6c0d87
87baec5
 
a6c0d87
87baec5
a6c0d87
 
 
87baec5
 
a6c0d87
 
87baec5
 
 
 
a6c0d87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d179e57
3fdd093
87baec5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import os
import gradio as gr
import tempfile
from pathlib import Path

# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running multimodal models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy

# ── Globals ───────────────────────────────────────────────────────────────────
# Mutable module state; written by process_pdf_multimodal() and reset by
# clear_multimodal_interface().
index = None               # FAISS index storing document embeddings
retriever = None           # Retriever to fetch relevant chunks
current_pdf_name = None    # Name of the currently loaded PDF
extracted_content = None   # Combined text and image descriptions

# ── HF Inference clients ─────────────────────────────────────────────────────
# Text generation client (using a good open model)
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision client for image analysis
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")

# ── Embeddings ───────────────────────────────────────────────────────────────
# Use BGE embeddings for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Create temporary directories for processing.
# NOTE(review): mkdtemp() is never cleaned up on process exit — presumably
# fine for a demo Space; confirm for long-running deployments.
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)

def extract_image_description(image_path):
    """
    Analyze an extracted image with the vision model and return a text description.

    Args:
        image_path: Path to the extracted image file.

    Returns:
        A string of the form "Image content: <description>"; on any failure,
        a fallback string embedding the error message is returned instead of
        raising, so PDF processing can continue past a bad image.
    """
    try:
        # Load the raw image bytes before calling the remote model.
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        # BUG FIX: InferenceClient has no `text_to_image_generation` method,
        # and text-to-image would *generate* a picture anyway. Describing an
        # existing image is the `image_to_text` (captioning) task.
        response = vision_client.image_to_text(image_bytes)
        # Newer huggingface_hub versions return an ImageToTextOutput object
        # with a `generated_text` field; older ones return a plain string.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        # Best-effort by design: report the failure inline rather than abort.
        return f"Image content: [Could not analyze image - {str(e)}]"

def process_pdf_multimodal(pdf_file):
    """
    Build a multimodal FAISS index from an uploaded PDF.

    Pipeline:
      1. Extract text plus image/table blocks with unstructured.
      2. Describe each exported figure via the vision model.
      3. Chunk the combined text and descriptions.
      4. Index the chunks in FAISS and expose a retriever.

    Args:
        pdf_file: Uploaded PDF file object (or None when nothing was uploaded).

    Returns:
        Tuple of (display name, status message, gr.update toggling the
        question input's interactivity).
    """
    global current_pdf_name, index, retriever, extracted_content

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)

    try:
        # Drop figures left over from any previously processed document.
        for leftover in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, leftover))

        # High-resolution partitioning so images and tables are exported as
        # standalone files into figures_dir rather than inlined payloads.
        elements = partition_pdf(
            pdf_file.name,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False
        )

        # Keep the plain-text elements; visual elements are handled below.
        text_elements = [
            el.text for el in elements if el.category not in ["Image", "Table"]
        ]

        # Run every exported figure through the vision model.
        image_descriptions = []
        if os.path.exists(figures_dir):
            image_descriptions = [
                extract_image_description(os.path.join(figures_dir, fname))
                for fname in os.listdir(figures_dir)
                if fname.lower().endswith(('.png', '.jpg', '.jpeg'))
            ]

        # Merge textual and visual content into one searchable body.
        extracted_content = "\n\n".join(text_elements + image_descriptions)

        # Chunk for embedding; overlap preserves context across boundaries.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True
        )
        chunks = splitter.split_text(extracted_content)

        # Build the vector index and a top-3 retriever over it.
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})

        num_images = len(image_descriptions)
        status = f"βœ… Processed '{current_pdf_name}' β€” {len(chunks)} text chunks, {num_images} images analyzed"
        return current_pdf_name, status, gr.update(interactive=True)

    except Exception as e:
        return current_pdf_name, f"❌ Error processing PDF: {str(e)}", gr.update(interactive=False)

def ask_multimodal_question(pdf_name, question):
    """
    Answer a question using both text chunks and image descriptions
    retrieved from the indexed PDF.

    Args:
        pdf_name: Display name of the active PDF (unused).
        question: The user's question.

    Returns:
        The model-generated answer, or an error message string.
    """
    global retriever

    # Guard clauses: need a processed document and a non-blank question.
    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."

    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Pull the most relevant chunks; these may mix plain text with
        # vision-model descriptions of images/charts/tables.
        relevant_docs = retriever.get_relevant_documents(question)
        context = "\n\n".join(d.page_content for d in relevant_docs)

        # Prompt explicitly tells the model that visual content appears
        # as textual descriptions inside the context.
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            "to answer the question comprehensively.\n\n"
            f"Document Content:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
            "Answer:"
        )

        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"

def generate_multimodal_summary():
    """
    Summarize the processed document, covering both its textual content and
    the extracted descriptions of visual elements.

    Returns:
        Summary text from the model, or an error message string.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Cap the prompt at the first 3000 characters of extracted content.
        snippet = extracted_content[:3000]

        prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
            f"{snippet}..."
        )

        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"

def extract_multimodal_keywords():
    """
    Extract key terms from the processed document, drawing on both the text
    and the descriptions of its visual elements.

    Returns:
        Keyword list text from the model, or an error message string.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Same 3000-character preview budget as the summary tool.
        snippet = extracted_content[:3000]

        prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
            f"{snippet}..."
        )

        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"

def clear_multimodal_interface():
    """
    Reset all global state, delete extracted figures, and clear the UI.

    Returns:
        Tuple of (None for the file input, "" for the PDF display, and a
        gr.update disabling the question input).
    """
    global index, retriever, current_pdf_name, extracted_content

    # Best-effort cleanup of the figures directory. BUG FIX: the original
    # bare `except:` also swallowed KeyboardInterrupt/SystemExit and hid
    # programming errors; only filesystem failures should be ignored here.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        pass

    # Reset globals so subsequent questions require a fresh upload.
    index = retriever = None
    current_pdf_name = extracted_content = None

    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
# Soft indigo/blue theme shared by all components below.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    .main-title {
        text-align: center;
        font-size: 64px;
        font-weight: bold;
        margin-bottom: 20px;
    }
    .multimodal-badge {
        background: linear-gradient(45deg, #6366f1, #8b5cf6);
        color: white;
        padding: 5px 15px;
        border-radius: 20px;
        font-size: 14px;
        display: inline-block;
        margin: 10px auto;
    }
""") as demo:
    
    # Application title with multimodal badge
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πŸ–ΌοΈ Text + Images + Charts</span></div>")

    # Top row: document upload on the left, Q&A on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
            upload_button = gr.Button("πŸ”„ Process Document (Extract Text + Images)", variant="primary")
            status_box = gr.Textbox(label="Processing Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
            # Starts disabled; process_pdf_multimodal() enables it on success.
            question_input = gr.Textbox(
                lines=3, 
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False
            )
            ask_button = gr.Button("πŸ” Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Analysis tools: whole-document summary and keyword extraction.
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

    # Clear button resets globals and the UI via clear_multimodal_interface().
    clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
    
    gr.Markdown("""
    <div class='footer'>
        Powered by LangChain + Unstructured + Vision AI + FAISS | 
        Supports: Text, Images, Charts, Tables, Diagrams
    </div>
    """)

    # Event bindings: each click maps (inputs) -> handler -> (outputs).
    upload_button.click(
        process_pdf_multimodal, 
        [pdf_file], 
        [pdf_display, status_box, question_input]
    )
    ask_button.click(
        ask_multimodal_question, 
        [pdf_display, question_input], 
        answer_output
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface, 
        [], 
        [pdf_file, pdf_display, question_input]
    )

if __name__ == "__main__":
    # debug=True surfaces tracebacks in the console; share=True requests a
    # public Gradio tunnel URL.
    demo.launch(debug=True, share=True)