Spaces:
Sleeping
Sleeping
File size: 13,431 Bytes
67a56f6 a6c0d87 3fdd093 87baec5 3fdd093 87baec5 ced2810 a6c0d87 87baec5 a6c0d87 40696fb 225229c 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 009e0ad a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 7154bdc a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 7fdd092 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 7fdd092 a6c0d87 d179e57 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 87baec5 a6c0d87 d179e57 3fdd093 87baec5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 |
import os
import gradio as gr
import tempfile
from pathlib import Path
# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running multimodal models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy
# -- Globals ------------------------------------------------------------------
# Module-level mutable state shared by all Gradio callbacks below.
index = None  # FAISS vector index built from the current PDF's chunks
retriever = None  # Retriever view over `index` (top-k similarity search)
current_pdf_name = None  # Basename of the currently loaded PDF, for display
extracted_content = None  # Text + image descriptions joined into one corpus string
# -- HF Inference clients -----------------------------------------------------
# Remote text-generation client (chat-style completions).
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Remote vision client used to describe images extracted from the PDF.
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
# -- Embeddings ---------------------------------------------------------------
# Sentence embeddings used to vectorize text chunks for the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
# Per-process scratch space; `figures_dir` receives the image files that
# unstructured extracts from the PDF (cleared on every new upload).
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
def extract_image_description(image_path):
    """
    Describe one extracted image with the vision model.

    Args:
        image_path: Path to an image file extracted from the PDF.

    Returns:
        A string of the form "Image content: <description>". On any failure
        (missing file, network error, model error) the description is an
        error note instead, so callers always receive a usable string.
    """
    try:
        # NOTE(fix): the previous code called `text_to_image_generation`,
        # which is not a method of InferenceClient (and text-to-image goes
        # the wrong direction anyway). `image_to_text` runs captioning on
        # the configured vision model.
        response = vision_client.image_to_text(image_path)
        # Newer huggingface_hub versions return an ImageToTextOutput object
        # with a `generated_text` field; older ones return a plain string.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        # Best-effort: a single unreadable image must not abort PDF processing.
        return f"Image content: [Could not analyze image - {str(e)}]"
def process_pdf_multimodal(pdf_file):
    """
    Process an uploaded PDF end-to-end and build the retrieval index.

    Steps:
      1. Extract text and image/table blocks from the PDF with unstructured.
      2. Describe each extracted image with the vision model.
      3. Combine text and image descriptions into one corpus.
      4. Chunk the corpus and index it in FAISS for retrieval.

    Args:
        pdf_file: Gradio file object for the uploaded PDF, or None.

    Returns:
        Tuple of (display name, status message, gr.update for the question
        box — enabled only on success).
    """
    global current_pdf_name, index, retriever, extracted_content
    if pdf_file is None:
        return None, "β Please upload a PDF file.", gr.update(interactive=False)
    current_pdf_name = os.path.basename(pdf_file.name)
    try:
        # Drop figures from any previously processed document so their
        # descriptions cannot leak into this one.
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
        # HI_RES strategy performs layout detection so that image/table
        # blocks can be rendered out as standalone image files.
        elements = partition_pdf(
            pdf_file.name,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False
        )
        # Keep textual elements only; image/table blocks are handled below.
        # Filter falsy text so a None/empty element cannot break the join.
        text_elements = [
            element.text
            for element in elements
            if element.category not in ["Image", "Table"] and element.text
        ]
        # Describe every image file the partitioner wrote out.
        image_descriptions = []
        if os.path.exists(figures_dir):
            for image_file in os.listdir(figures_dir):
                if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    image_path = os.path.join(figures_dir, image_file)
                    image_descriptions.append(extract_image_description(image_path))
        # One corpus covering both modalities.
        all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)
        # Overlapping chunks keep sentences intact across chunk boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True
        )
        chunks = text_splitter.split_text(extracted_content)
        # Build the vector index and a top-3 similarity retriever over it.
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})
        num_images = len(image_descriptions)
        # NOTE(fix): this f-string was split across two physical lines in the
        # original source, which is a syntax error; rejoined here.
        status = (
            f"β Processed '{current_pdf_name}' β {len(chunks)} text chunks, "
            f"{num_images} images analyzed"
        )
        return current_pdf_name, status, gr.update(interactive=True)
    except Exception as e:
        error_msg = f"β Error processing PDF: {str(e)}"
        return current_pdf_name, error_msg, gr.update(interactive=False)
def ask_multimodal_question(pdf_name, question):
    """
    Answer a question using retrieved text and image-description chunks.

    Args:
        pdf_name: Display name of the active PDF (unused; kept for Gradio wiring).
        question: User's natural-language question.

    Returns:
        The model's answer, or an error string when no PDF is loaded, the
        question is blank, or generation fails.
    """
    global retriever
    # Guard clauses: the index must exist and the question must be non-blank.
    if index is None or retriever is None:
        return "β Please upload and process a PDF first."
    if not question.strip():
        return "β Please enter a question."
    try:
        # Fetch the top-k chunks (text excerpts and image descriptions alike)
        # and merge them into a single context string for the prompt.
        relevant_docs = retriever.get_relevant_documents(question)
        combined_context = "\n\n".join(d.page_content for d in relevant_docs)
        # Prompt explicitly tells the model the context mixes modalities.
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            "to answer the question comprehensively.\n\n"
            f"Document Content:\n{combined_context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
            "Answer:"
        )
        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5
        )
        first_choice = response["choices"][0]
        return first_choice["message"]["content"].strip()
    except Exception as e:
        return f"β Error generating answer: {str(e)}"
def generate_multimodal_summary():
    """
    Summarize the loaded document, covering text and visual elements alike.

    Returns:
        The model-generated summary, or an error string when no PDF has
        been processed or the generation call fails.
    """
    if not extracted_content:
        return "β Please upload and process a PDF first."
    try:
        # Cap the prompt at the first 3000 characters of the corpus to stay
        # well within the model's context window.
        preview = extracted_content[:3000]
        prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
            f"{preview}..."
        )
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.5
        )
        summary_text = completion["choices"][0]["message"]["content"]
        return summary_text.strip()
    except Exception as e:
        return f"β Error generating summary: {str(e)}"
def extract_multimodal_keywords():
    """
    Extract key terms from the loaded document's text and visual content.

    Returns:
        The model-generated keyword list, or an error string when no PDF
        has been processed or the generation call fails.
    """
    if not extracted_content:
        return "β Please upload and process a PDF first."
    try:
        # Same 3000-character preview cap as the summary path.
        preview = extracted_content[:3000]
        prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
            f"{preview}..."
        )
        completion = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.5
        )
        keywords_text = completion["choices"][0]["message"]["content"]
        return keywords_text.strip()
    except Exception as e:
        return f"β Error extracting keywords: {str(e)}"
def clear_multimodal_interface():
    """
    Reset all module state and clear the UI for a fresh upload.

    Returns:
        Values for (pdf_file, pdf_display, question_input): the file widget
        cleared, the name box emptied, and the question box disabled.
    """
    global index, retriever, current_pdf_name, extracted_content
    # Best-effort cleanup of extracted figures; a vanished temp dir or a
    # locked file must not prevent the state reset below.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        # NOTE(fix): was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only filesystem errors are expected.
        pass
    # Reset globals so the guards in the question/summary paths trip again.
    index = retriever = None
    current_pdf_name = extracted_content = None
    return None, "", gr.update(interactive=False)
# -- Gradio UI ----------------------------------------------------------------
# Soft indigo theme plus custom CSS for the title, badge, and status styling.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
.main-title {
text-align: center;
font-size: 64px;
font-weight: bold;
margin-bottom: 20px;
}
.multimodal-badge {
background: linear-gradient(45deg, #6366f1, #8b5cf6);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-size: 14px;
display: inline-block;
margin: 10px auto;
}
""") as demo:
    # Title banner and "multimodal" badge.
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>πΌοΈ Text + Images + Charts</span></div>")
    # Top row: left column uploads/processes the PDF, right column asks questions.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## π Document Input")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
            upload_button = gr.Button("π Process Document (Extract Text + Images)", variant="primary")
            status_box = gr.Textbox(label="Processing Status", interactive=False)
        with gr.Column():
            gr.Markdown("## β Ask Questions")
            gr.Markdown("*Ask about text content, images, charts, tables, or any visual elements in your PDF*")
            # Disabled until a PDF has been processed successfully
            # (process_pdf_multimodal enables it via gr.update).
            question_input = gr.Textbox(
                lines=3,
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False
            )
            ask_button = gr.Button("π Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
    # Secondary analysis tools: document summary and keyword extraction.
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("π Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
        with gr.Column():
            keywords_button = gr.Button("π·οΈ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)
    # Resets all state back to the initial "no PDF loaded" condition.
    clear_button = gr.Button("ποΈ Clear All", variant="secondary")
    gr.Markdown("""
<div class='footer'>
Powered by LangChain + Unstructured + Vision AI + FAISS |
Supports: Text, Images, Charts, Tables, Diagrams
</div>
""")
    # Event bindings: wire each button to its handler; inputs/outputs map to
    # the handler's parameters and return tuple positionally.
    upload_button.click(
        process_pdf_multimodal,
        [pdf_file],
        [pdf_display, status_box, question_input]
    )
    ask_button.click(
        ask_multimodal_question,
        [pdf_display, question_input],
        answer_output
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface,
        [],
        [pdf_file, pdf_display, question_input]
    )
if __name__ == "__main__":
    # debug=True surfaces tracebacks in the console; share=True exposes a
    # public Gradio link. NOTE(fix): removed a stray trailing "|" artifact
    # that made this line a syntax error.
    demo.launch(debug=True, share=True)