Multimodal / app.py
Muzammil6376's picture
Update app.py
a6c0d87 verified
raw
history blame
13.4 kB
import os
import gradio as gr
import tempfile
from pathlib import Path
# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running multimodal models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy
# ── Globals ───────────────────────────────────────────────────────────────────
# Mutable module-level state shared by all Gradio callbacks below; populated
# by process_pdf_multimodal() and reset by clear_multimodal_interface().
index = None # FAISS index storing document embeddings
retriever = None # Retriever to fetch relevant chunks
current_pdf_name = None # Name of the currently loaded PDF
extracted_content = None # Combined text and image descriptions
# ── HF Inference clients ─────────────────────────────────────────────────────
# Text generation client (used for Q&A, summaries, and keyword extraction)
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision client for describing images extracted from the PDF
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
# ── Embeddings ───────────────────────────────────────────────────────────────
# Use BGE embeddings for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
# Create temporary directories for processing; extracted figures from the PDF
# are written into figures_dir and cleared on each new upload.
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
def extract_image_description(image_path):
    """
    Analyze an extracted image with the vision model and return a text description.

    Args:
        image_path: Path to the extracted image file.

    Returns:
        A string of the form "Image content: <description>". On any failure the
        description is an inline error note instead of raising, so processing
        of the remaining images/text can continue.
    """
    try:
        # Read the raw image bytes to send to the inference endpoint
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        # BUG FIX: InferenceClient has no `text_to_image_generation` method
        # (the original call always raised AttributeError). `image_to_text`
        # is the captioning task that produces a description of an image.
        response = vision_client.image_to_text(image_bytes)
        # Newer huggingface_hub versions return an ImageToTextOutput object
        # with a `.generated_text` field; older versions return a plain str.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        return f"Image content: [Could not analyze image - {str(e)}]"
def process_pdf_multimodal(pdf_file):
    """
    Process an uploaded PDF for multimodal retrieval.

    1. Extracts text and images from the PDF using unstructured.
    2. Analyzes extracted images with the vision model.
    3. Combines text and image descriptions into one corpus.
    4. Builds a FAISS index and a top-k retriever over the chunks.

    Args:
        pdf_file: Uploaded PDF — either a filepath string (what
            gr.File(type="filepath") actually delivers) or a file-like
            object exposing a `.name` attribute.

    Returns:
        Tuple of (PDF filename, status message, gr.update for the question box).
    """
    global current_pdf_name, index, retriever, extracted_content

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    # BUG FIX: gr.File(type="filepath") passes a plain string, which has no
    # `.name` attribute — accept both the string and file-object shapes.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    current_pdf_name = os.path.basename(pdf_path)

    try:
        # Clear figures left over from a previously processed document
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))

        # Extract elements from the PDF; image/table crops are written to
        # figures_dir as files (extract_image_block_to_payload=False)
        elements = partition_pdf(
            pdf_path,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False
        )

        # Keep only non-empty text from non-image/table elements
        # (ROBUSTNESS: elements can carry None/blank text, which would
        # pollute the joined corpus with empty chunks)
        text_elements = [
            element.text
            for element in elements
            if element.category not in ["Image", "Table"]
            and element.text and element.text.strip()
        ]

        # Describe each extracted image via the vision model
        image_descriptions = []
        if os.path.exists(figures_dir):
            for image_file in os.listdir(figures_dir):
                if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    image_path = os.path.join(figures_dir, image_file)
                    image_descriptions.append(extract_image_description(image_path))

        # Combine textual and visual content into one searchable corpus
        all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)

        # ROBUSTNESS: FAISS.from_texts raises on an empty input — report a
        # clear status instead of a raw traceback for content-free PDFs.
        if not extracted_content.strip():
            return current_pdf_name, "❌ No extractable content found in the PDF.", gr.update(interactive=False)

        # Split into overlapping chunks; overlap preserves context that
        # straddles chunk boundaries
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True
        )
        chunks = text_splitter.split_text(extracted_content)

        # Build the vector index and a top-3 retriever over it
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})

        # Status message
        num_images = len(image_descriptions)
        status = f"βœ… Processed '{current_pdf_name}' β€” {len(chunks)} text chunks, {num_images} images analyzed"
        return current_pdf_name, status, gr.update(interactive=True)

    except Exception as e:
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return current_pdf_name, error_msg, gr.update(interactive=False)
def ask_multimodal_question(pdf_name, question):
    """
    Answer a question using retrieved text chunks and image descriptions.

    Args:
        pdf_name: Display name of the active PDF (unused).
        question: The user's question string.

    Returns:
        The model-generated answer, or an error message string.
    """
    global retriever

    # Guard clauses: need a processed PDF and a non-blank question
    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Pull the most relevant chunks — a mix of text excerpts and
        # vision-generated image descriptions — and join them as context.
        relevant_docs = retriever.get_relevant_documents(question)
        context = "\n\n".join([doc.page_content for doc in relevant_docs])

        # Single prompt covering both textual and visual evidence
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            f"to answer the question comprehensively.\n\nDocument Content:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\nAnswer:"
        )

        # Ask the text model for the final answer
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"
def generate_multimodal_summary():
    """
    Summarize the processed document, covering both text and visual elements.

    Returns:
        The model-generated summary string, or an error message.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Cap the prompt at the first 3000 characters of extracted content
        snippet = extracted_content[:3000]
        summary_prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            f"(images, charts, tables). Mention key textual information as well as important visual content.\n\n{snippet}..."
        )
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": summary_prompt}],
            max_tokens=200,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"
def extract_multimodal_keywords():
    """
    Extract key terms from both the textual and visual content of the document.

    Returns:
        The model-generated keyword list as a string, or an error message.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Same 3000-character preview cap used for summarization
        snippet = extracted_content[:3000]
        keyword_prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            f"Include important terms from both textual content and visual elements like charts, images, and tables.\n\n{snippet}..."
        )
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": keyword_prompt}],
            max_tokens=100,
            temperature=0.5
        )
        return completion["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"
def clear_multimodal_interface():
    """
    Reset all global state and clear the UI.

    Returns:
        Tuple of (None for the file widget, "" for the active-document display,
        gr.update disabling the question box) — matching the outputs wired to
        the clear button.
    """
    global index, retriever, current_pdf_name, extracted_content

    # Best-effort cleanup of extracted figure files.
    # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt
    # and SystemExit; OSError covers listdir/remove filesystem failures.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        pass

    # Reset globals so the next upload starts from a clean slate
    index = retriever = None
    current_pdf_name = extracted_content = None

    return None, "", gr.update(interactive=False)
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Soft indigo/blue theme applied to every component in the layout below.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
.main-title {
text-align: center;
font-size: 64px;
font-weight: bold;
margin-bottom: 20px;
}
.multimodal-badge {
background: linear-gradient(45deg, #6366f1, #8b5cf6);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-size: 14px;
display: inline-block;
margin: 10px auto;
}
""") as demo:
    # Application title with multimodal badge
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πŸ–ΌοΈ Text + Images + Charts</span></div>")

    with gr.Row():
        # Left column: document upload and processing status
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
            upload_button = gr.Button("πŸ”„ Process Document (Extract Text + Images)", variant="primary")
            status_box = gr.Textbox(label="Processing Status", interactive=False)
        # Right column: question input and generated answer
        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
            # Disabled until a PDF is processed; re-enabled via the
            # gr.update returned by process_pdf_multimodal
            question_input = gr.Textbox(
                lines=3,
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False
            )
            ask_button = gr.Button("πŸ” Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Analysis tools: one-click summary and keyword extraction
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

    # Clear button — resets the module globals and the three widgets below
    clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")

    gr.Markdown("""
<div class='footer'>
Powered by LangChain + Unstructured + Vision AI + FAISS |
Supports: Text, Images, Charts, Tables, Diagrams
</div>
""")

    # Event bindings: each button routes its inputs to the matching handler
    upload_button.click(
        process_pdf_multimodal,
        [pdf_file],
        [pdf_display, status_box, question_input]
    )
    ask_button.click(
        ask_multimodal_question,
        [pdf_display, question_input],
        answer_output
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface,
        [],
        [pdf_file, pdf_display, question_input]
    )

# Launch with debug logging and a public share link when run as a script.
if __name__ == "__main__":
    demo.launch(debug=True, share=True)