Multimodal / app.py
Muzammil6376's picture
Update app.py
a6c0d87 verified
raw
history blame
13.4 kB
import os
import gradio as gr
import tempfile
from pathlib import Path
# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running multimodal models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy
# ── Globals ───────────────────────────────────────────────────────────────────
# Mutable module-level state shared by all Gradio callbacks below; populated
# by process_pdf_multimodal() and reset by clear_multimodal_interface().
index = None # FAISS index storing document embeddings
retriever = None # Retriever to fetch relevant chunks
current_pdf_name = None # Name of the currently loaded PDF
extracted_content = None # Combined text and image descriptions
# ── HF Inference clients ─────────────────────────────────────────────────────
# Text generation client (used for Q&A, summaries, and keyword extraction)
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision client for describing images extracted from the PDF
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
# ── Embeddings ───────────────────────────────────────────────────────────────
# Use BGE embeddings for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
# Create temporary directories for processing; extracted figures from the PDF
# are written into figures_dir and cleared on each new upload.
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
def extract_image_description(image_path):
    """
    Analyze an extracted image with the vision model and return a text description.

    Args:
        image_path: Path to the extracted image file.

    Returns:
        A string of the form "Image content: <description>". On any failure the
        description is an inline error note instead of raising, so processing
        of the remaining images/text can continue.
    """
    try:
        # Read the raw image bytes to send to the inference endpoint
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        # BUG FIX: InferenceClient has no `text_to_image_generation` method
        # (the original call always raised AttributeError). `image_to_text`
        # is the captioning task that produces a description of an image.
        response = vision_client.image_to_text(image_bytes)
        # Newer huggingface_hub versions return an ImageToTextOutput object
        # with a `.generated_text` field; older versions return a plain str.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        return f"Image content: [Could not analyze image - {str(e)}]"
def process_pdf_multimodal(pdf_file):
    """
    Process an uploaded PDF for multimodal retrieval.

    1. Extracts text and images from the PDF using unstructured.
    2. Analyzes extracted images with the vision model.
    3. Combines text and image descriptions into one corpus.
    4. Builds a FAISS index and a top-k retriever over the chunks.

    Args:
        pdf_file: Uploaded PDF — either a filepath string (what
            gr.File(type="filepath") actually delivers) or a file-like
            object exposing a `.name` attribute.

    Returns:
        Tuple of (PDF filename, status message, gr.update for the question box).
    """
    global current_pdf_name, index, retriever, extracted_content

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    # BUG FIX: gr.File(type="filepath") passes a plain string, which has no
    # `.name` attribute — accept both the string and file-object shapes.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    current_pdf_name = os.path.basename(pdf_path)

    try:
        # Clear figures left over from a previously processed document
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))

        # Extract elements from the PDF; image/table crops are written to
        # figures_dir as files (extract_image_block_to_payload=False)
        elements = partition_pdf(
            pdf_path,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False
        )

        # Keep only non-empty text from non-image/table elements
        # (ROBUSTNESS: elements can carry None/blank text, which would
        # pollute the joined corpus with empty chunks)
        text_elements = [
            element.text
            for element in elements
            if element.category not in ["Image", "Table"]
            and element.text and element.text.strip()
        ]

        # Describe each extracted image via the vision model
        image_descriptions = []
        if os.path.exists(figures_dir):
            for image_file in os.listdir(figures_dir):
                if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    image_path = os.path.join(figures_dir, image_file)
                    image_descriptions.append(extract_image_description(image_path))

        # Combine textual and visual content into one searchable corpus
        all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)

        # ROBUSTNESS: FAISS.from_texts raises on an empty input — report a
        # clear status instead of a raw traceback for content-free PDFs.
        if not extracted_content.strip():
            return current_pdf_name, "❌ No extractable content found in the PDF.", gr.update(interactive=False)

        # Split into overlapping chunks; overlap preserves context that
        # straddles chunk boundaries
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True
        )
        chunks = text_splitter.split_text(extracted_content)

        # Build the vector index and a top-3 retriever over it
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})

        # Status message
        num_images = len(image_descriptions)
        status = f"βœ… Processed '{current_pdf_name}' β€” {len(chunks)} text chunks, {num_images} images analyzed"
        return current_pdf_name, status, gr.update(interactive=True)

    except Exception as e:
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return current_pdf_name, error_msg, gr.update(interactive=False)
def ask_multimodal_question(pdf_name, question):
    """
    Answer a question using retrieved text chunks and image descriptions.

    Args:
        pdf_name: Display name of the active PDF (unused).
        question: The user's question string.

    Returns:
        The model-generated answer, or an error message string.
    """
    global retriever

    # Guard clauses: need a processed PDF and a non-blank question
    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Pull the most relevant chunks — a mix of text excerpts and
        # vision-generated image descriptions — and join them as context.
        relevant_docs = retriever.get_relevant_documents(question)
        context = "\n\n".join([doc.page_content for doc in relevant_docs])

        # Single prompt covering both textual and visual evidence
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            f"to answer the question comprehensively.\n\nDocument Content:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\nAnswer:"
        )

        # Ask the text model for the final answer
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"
def generate_multimodal_summary():
    """
    Summarize the processed document, covering both text and visual elements.

    Returns:
        The model-generated summary string, or an error message.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Cap the prompt at the first 3000 characters of extracted content
        snippet = extracted_content[:3000]
        summary_prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            f"(images, charts, tables). Mention key textual information as well as important visual content.\n\n{snippet}..."
        )
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": summary_prompt}],
            max_tokens=200,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"
def extract_multimodal_keywords():
    """
    Extract key terms from both the textual and visual content of the document.

    Returns:
        The model-generated keyword list as a string, or an error message.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Same 3000-character preview cap used for summarization
        snippet = extracted_content[:3000]
        keyword_prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            f"Include important terms from both textual content and visual elements like charts, images, and tables.\n\n{snippet}..."
        )
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": keyword_prompt}],
            max_tokens=100,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"
def clear_multimodal_interface():
    """
    Reset all global state and clear the UI.

    Returns:
        Tuple of (None for the file widget, "" for the active-document display,
        gr.update disabling the question box) — matching the outputs wired to
        the clear button.
    """
    global index, retriever, current_pdf_name, extracted_content

    # Best-effort cleanup of extracted figure files.
    # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt
    # and SystemExit; OSError covers listdir/remove filesystem failures.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        pass

    # Reset globals so the next upload starts from a clean slate
    index = retriever = None
    current_pdf_name = extracted_content = None

    return None, "", gr.update(interactive=False)
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Soft indigo/blue theme applied to every component in the layout below.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
.main-title {
text-align: center;
font-size: 64px;
font-weight: bold;
margin-bottom: 20px;
}
.multimodal-badge {
background: linear-gradient(45deg, #6366f1, #8b5cf6);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-size: 14px;
display: inline-block;
margin: 10px auto;
}
""") as demo:
    # Application title with multimodal badge
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πŸ–ΌοΈ Text + Images + Charts</span></div>")

    with gr.Row():
        # Left column: document upload and processing status
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
            upload_button = gr.Button("πŸ”„ Process Document (Extract Text + Images)", variant="primary")
            status_box = gr.Textbox(label="Processing Status", interactive=False)
        # Right column: question input and generated answer
        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
            # Disabled until a PDF is processed; re-enabled via the
            # gr.update returned by process_pdf_multimodal
            question_input = gr.Textbox(
                lines=3,
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False
            )
            ask_button = gr.Button("πŸ” Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Analysis tools: one-click summary and keyword extraction
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

    # Clear button — resets the module globals and the three widgets below
    clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")

    gr.Markdown("""
<div class='footer'>
Powered by LangChain + Unstructured + Vision AI + FAISS |
Supports: Text, Images, Charts, Tables, Diagrams
</div>
""")

    # Event bindings: each button routes its inputs to the matching handler
    upload_button.click(
        process_pdf_multimodal,
        [pdf_file],
        [pdf_display, status_box, question_input]
    )
    ask_button.click(
        ask_multimodal_question,
        [pdf_display, question_input],
        answer_output
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface,
        [],
        [pdf_file, pdf_display, question_input]
    )

# Launch with debug logging and a public share link when run as a script.
if __name__ == "__main__":
    demo.launch(debug=True, share=True)