Update app.py

app.py
CHANGED
@@ -1,132 +1,508 @@
import os
import io
import base64

import requests
import gradio as gr
from PIL import Image

# Vector store and embeddings from the langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running chat completions
from huggingface_hub import InferenceClient
# Unstructured for advanced PDF processing with image/table extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy
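# Rough dependency sketch (assumptions, not a pinned list): this file expects
# gradio, huggingface_hub, langchain, langchain-community, faiss-cpu,
# sentence-transformers (backing HuggingFaceEmbeddings), unstructured[all-docs]
# plus its PDF system dependencies (e.g. poppler and tesseract for the HI_RES
# strategy), pillow, and requests.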
# ── Globals ────────────────────────────────────────────────────────────────
index = None              # FAISS index storing document embeddings
retriever = None          # Retriever that fetches relevant chunks
current_pdf_name = None   # Name of the currently loaded PDF
pdf_text = None           # Full text of the uploaded PDF
extracted_images = []     # Extracted images/tables and their descriptions

# Directory for storing extracted figures
FIGURES_DIR = "extracted_figures/"
os.makedirs(FIGURES_DIR, exist_ok=True)

# ── HF Inference clients for different models ──────────────────────────────
# Text generation model
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")

# Vision-language model (choose one based on your needs and HF availability)
# Option 1: BLIP-2 for general image understanding
vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")

# Option 2: alternative image-captioning models:
# vision_client = InferenceClient(model="microsoft/git-base-coco")
# vision_client = InferenceClient(model="nlpconnect/vit-gpt2-image-captioning")
# vision_client = InferenceClient(model="Salesforce/blip-image-captioning-large")

# Other text models you could route answers through (despite the variable
# name, these are text-only, not multimodal, models):
# multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium")  # conversational
# multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b")   # instruction following

# ── Embeddings ─────────────────────────────────────────────────────────────
# BGE embeddings from BAAI for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
def extract_image_description_advanced(image_path):
    """Generate an enhanced image description by chaining vision and text models."""
    try:
        # Load the raw image bytes
        with open(image_path, "rb") as f:
            image_bytes = f.read()

        # Step 1: caption the image with the vision model (BLIP-2)
        try:
            description = vision_client.image_to_text(image_bytes)
            base_description = description if isinstance(description, str) else description.get("generated_text", "")
        except Exception as e:
            print(f"BLIP-2 failed: {e}")
            base_description = "Image could not be processed with vision model"

        # Step 2: enrich the caption with a text-model analysis
        enhancement_prompt = f"""
Analyze this image description and provide a detailed analysis focusing on:
1. Any text, numbers, or data visible
2. Charts, graphs, or tables
3. Key visual elements and their significance
4. Context and meaning

Description: {base_description}

Provide a comprehensive analysis:
"""

        try:
            response = text_client.chat_completion(
                messages=[{"role": "user", "content": enhancement_prompt}],
                max_tokens=300,
                temperature=0.3,
            )
            enhanced_description = response["choices"][0]["message"]["content"].strip()
        except Exception as e:
            print(f"Text enhancement failed: {e}")
            enhanced_description = base_description

        return f"Visual Element Analysis:\n{enhanced_description}"

    except Exception as e:
        print(f"Error processing image {image_path}: {str(e)}")
        return f"Visual element detected: {os.path.basename(image_path)} (processing failed)"
|
98 |
+
def process_pdf_multimodal_advanced(pdf_file):
|
99 |
+
"""
|
100 |
+
Advanced multimodal PDF processing with enhanced vision capabilities
|
101 |
+
"""
|
102 |
+
global current_pdf_name, index, retriever, pdf_text, extracted_images
|
103 |
+
|
104 |
+
if pdf_file is None:
|
105 |
+
return None, "β Please upload a PDF file.", gr.update(interactive=False)
|
106 |
+
|
107 |
+
current_pdf_name = os.path.basename(pdf_file.name)
|
108 |
+
extracted_images = []
|
109 |
+
|
110 |
+
# Clear existing figures directory
|
111 |
+
for file in os.listdir(FIGURES_DIR):
|
112 |
+
try:
|
113 |
+
os.remove(os.path.join(FIGURES_DIR, file))
|
114 |
+
except:
|
115 |
+
pass
|
116 |
+
|
117 |
+
try:
|
118 |
+
# Process PDF with unstructured
|
119 |
+
elements = partition_pdf(
|
120 |
+
pdf_file.name,
|
121 |
+
strategy=PartitionStrategy.HI_RES,
|
122 |
+
extract_image_block_types=["Image", "Table"],
|
123 |
+
extract_image_block_output_dir=FIGURES_DIR,
|
124 |
+
extract_image_block_to_payload=False,
|
125 |
+
# Additional parameters for better extraction
|
126 |
+
infer_table_structure=True,
|
127 |
+
chunking_strategy="by_title",
|
128 |
+
max_characters=1000,
|
129 |
+
combine_text_under_n_chars=100
|
130 |
+
)
|
131 |
+
|
132 |
+
# Process elements
|
133 |
+
text_elements = []
|
134 |
+
visual_descriptions = []
|
135 |
+
|
136 |
+
for element in elements:
|
137 |
+
if element.category in ["Image", "Table"]:
|
138 |
+
# Handle image/table elements
|
139 |
+
continue
|
140 |
+
elif element.category == "Title":
|
141 |
+
text_elements.append(f"TITLE: {element.text}")
|
142 |
+
elif element.category == "Header":
|
143 |
+
text_elements.append(f"HEADER: {element.text}")
|
144 |
+
else:
|
145 |
+
if hasattr(element, 'text') and element.text.strip():
|
146 |
+
text_elements.append(element.text)
|
147 |
+
|
148 |
+
pdf_text = "\n\n".join(text_elements)
|
149 |
+
|
150 |
+
# Process extracted visual elements
|
151 |
+
if os.path.exists(FIGURES_DIR):
|
152 |
+
for filename in sorted(os.listdir(FIGURES_DIR)):
|
153 |
+
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
|
154 |
+
image_path = os.path.join(FIGURES_DIR, filename)
|
155 |
+
|
156 |
+
# Get enhanced description
|
157 |
+
description = extract_image_description_advanced(image_path)
|
158 |
+
visual_descriptions.append(description)
|
159 |
+
|
160 |
+
extracted_images.append({
|
161 |
+
'path': image_path,
|
162 |
+
'description': description,
|
163 |
+
'filename': filename,
|
164 |
+
'type': 'table' if 'table' in filename.lower() else 'image'
|
165 |
+
})
|
166 |
+
|
167 |
+
# Combine all content
|
168 |
+
all_content = text_elements + visual_descriptions
|
169 |
+
|
170 |
+
# Advanced text splitting
|
171 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
172 |
+
chunk_size=800, # Smaller chunks for better retrieval
|
173 |
+
chunk_overlap=150,
|
174 |
+
add_start_index=True,
|
175 |
+
separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
|
176 |
+
)
|
177 |
+
|
178 |
+
combined_content = "\n\n".join(all_content)
|
179 |
+
chunks = text_splitter.split_text(combined_content)
|
180 |
+
|
181 |
+
# Create FAISS index with better retrieval settings
|
182 |
+
index = FAISS.from_texts(chunks, embeddings)
|
183 |
+
retriever = index.as_retriever(
|
184 |
+
search_type="mmr", # Maximum marginal relevance
|
185 |
+
search_kwargs={
|
186 |
+
"k": 4,
|
187 |
+
"fetch_k": 8,
|
188 |
+
"lambda_mult": 0.7
|
189 |
+
}
|
190 |
+
)
|
191 |
+
|
192 |
+
status = f"β
Advanced processing complete for '{current_pdf_name}'\nπ {len(text_elements)} text sections\nπΌοΈ {len(extracted_images)} visual elements\nπ¦ {len(chunks)} searchable chunks"
|
193 |
+
|
194 |
+
return current_pdf_name, status, gr.update(interactive=True)
|
195 |
+
|
196 |
+
except Exception as e:
|
197 |
+
error_msg = f"β Processing error: {str(e)}"
|
198 |
+
return current_pdf_name, error_msg, gr.update(interactive=False)
|
199 |
+
|
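# Retrieval note: with search_type="mmr", the retriever first fetches the
# fetch_k=8 most similar chunks, then greedily keeps k=4 of them, trading
# relevance against diversity via lambda_mult (1.0 = pure similarity,
# 0.0 = maximum diversity).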
def ask_question_multimodal_advanced(pdf_name, question):
    """Multimodal question answering with keyword-based routing to visual context."""
    global retriever, extracted_images

    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."

    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Retrieve relevant chunks
        docs = retriever.get_relevant_documents(question)
        context = "\n\n".join([doc.page_content for doc in docs])

        # Detect questions that target visual content
        visual_keywords = [
            "image", "figure", "chart", "graph", "table", "diagram", "picture",
            "visual", "show", "display", "plot", "data", "visualization",
            "illustration", "screenshot", "photo", "drawing",
        ]

        is_visual_query = any(keyword in question.lower() for keyword in visual_keywords)

        # Put visual descriptions first when the question is about visuals
        if is_visual_query and extracted_images:
            visual_context = "\n\n".join([img["description"] for img in extracted_images])
            enhanced_context = f"{visual_context}\n\nAdditional Context:\n{context}"
        else:
            enhanced_context = context

        # Pick a system prompt that matches the query type
        if is_visual_query:
            system_prompt = """You are an expert document analyst specializing in multimodal content analysis.
You excel at interpreting charts, graphs, tables, images, and visual data alongside textual information.
When answering questions about visual elements, be specific about what you observe and provide detailed insights."""
        else:
            system_prompt = """You are an expert document analyst. Provide accurate, comprehensive answers based on the document content.
Use the context provided to give detailed and helpful responses."""

        prompt = f"""{system_prompt}

Context: {enhanced_context}

Question: {question}

Provide a detailed, accurate answer based on the context above. If the question relates to visual elements, describe what you can understand from the visual descriptions provided."""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=400,
            temperature=0.4,
        )

        answer = response["choices"][0]["message"]["content"].strip()
        return answer

    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"
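# Routing note: visual-query detection above is a plain keyword match, so
# broad terms such as "data" or "show" will occasionally route purely textual
# questions through the visual context as well; usually harmless, but worth
# knowing when tuning the keyword list.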
def analyze_document_structure():
    """Analyze the overall structure and organization of the document."""
    global pdf_text, extracted_images

    if not pdf_text and not extracted_images:
        return "❌ Please upload and process a PDF first."

    try:
        structure_prompt = f"""
Analyze the structure and organization of this document. Provide insights about:
1. Document type and purpose
2. Main sections and topics
3. Visual elements present ({len(extracted_images)} images/tables/charts)
4. Key information hierarchy
5. Overall document quality and completeness

Text content sample: {(pdf_text or "")[:1000]}
Visual elements: {len(extracted_images)} items detected

Provide a structural analysis:
"""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": structure_prompt}],
            max_tokens=300,
            temperature=0.3,
        )

        return response["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error analyzing structure: {str(e)}"
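# Scope note: analyze_document_structure() only samples the first 1,000
# characters of text plus element counts, so its output is a heuristic
# overview rather than a full structural parse.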
# ── Summary, keyword, and utility helpers ───────────────────────────────────

def generate_summary_multimodal():
    """Generate a summary that covers both text and visual content."""
    global pdf_text, extracted_images

    if not pdf_text and not extracted_images:
        return "❌ Please upload and process a PDF first."

    try:
        content_parts = []

        if pdf_text:
            content_parts.append(f"Text Content:\n{pdf_text[:2000]}")

        if extracted_images:
            visual_summary = "\n".join([img["description"][:200] for img in extracted_images[:3]])
            content_parts.append(f"Visual Content:\n{visual_summary}")

        combined_content = "\n\n".join(content_parts)

        prompt = f"""Provide a comprehensive summary of this document that includes both textual and visual elements.
Focus on key findings, main topics, and insights from charts, tables, or images.

Content: {combined_content}

Summary:"""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=250,
            temperature=0.5,
        )

        return response["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"
def extract_keywords_multimodal():
    """Extract key terms and concepts from both text and visual content."""
    global pdf_text, extracted_images

    if not pdf_text and not extracted_images:
        return "❌ Please upload and process a PDF first."

    try:
        content_parts = []

        if pdf_text:
            content_parts.append(f"Text: {pdf_text[:1500]}")

        if extracted_images:
            visual_content = "\n".join([img["description"][:150] for img in extracted_images])
            content_parts.append(f"Visual Content: {visual_content}")

        combined_content = "\n\n".join(content_parts)

        prompt = f"""Extract key terms, concepts, and topics from this document content.
Include technical terms, important concepts, and themes from both text and visual elements.

Content: {combined_content}

Key terms and concepts:"""

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=120,
            temperature=0.5,
        )

        return response["choices"][0]["message"]["content"].strip()

    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"
def show_extracted_images():
    """List the extracted visual elements and their descriptions."""
    global extracted_images

    if not extracted_images:
        return "No visual elements extracted from the current document."

    info = f"📄 Extracted {len(extracted_images)} visual elements:\n\n"
    for i, img in enumerate(extracted_images, 1):
        element_type = "📊 Table" if img["type"] == "table" else "🖼️ Image"
        info += f"{i}. {element_type}: {img['filename']}\n"
        info += f"   Description: {img['description'][:150]}...\n\n"

        if i >= 5:  # Limit the display to the first 5 elements
            remaining = len(extracted_images) - 5
            if remaining > 0:
                info += f"... and {remaining} more visual elements."
            break

    return info
def clear_interface_multimodal():
    """Reset all state and clear the extracted-figures directory."""
    global index, retriever, current_pdf_name, pdf_text, extracted_images

    index = retriever = None
    current_pdf_name = pdf_text = None
    extracted_images = []

    if os.path.exists(FIGURES_DIR):
        for file in os.listdir(FIGURES_DIR):
            try:
                os.remove(os.path.join(FIGURES_DIR, file))
            except OSError:
                pass

    return None, "", gr.update(interactive=False), "", "", "", "", ""
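# Wiring note: the 8-tuple returned by clear_interface_multimodal() must stay
# aligned, position for position, with the outputs list passed to
# clear_button.click() below.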
# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    .main-title {
        text-align: center;
        font-size: 56px;
        font-weight: bold;
        margin-bottom: 20px;
        background: linear-gradient(45deg, #6366f1, #8b5cf6, #ec4899);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    .feature-badge {
        background: linear-gradient(45deg, #10b981, #3b82f6);
        color: white;
        padding: 4px 12px;
        border-radius: 15px;
        font-size: 11px;
        margin: 2px;
        display: inline-block;
    }
""") as demo:

    gr.Markdown("<div class='main-title'>🤖 DocQueryAI Pro</div>")
    gr.Markdown("""
    <div style='text-align: center; margin-bottom: 25px;'>
        <span class='feature-badge'>🔍 Advanced RAG</span>
        <span class='feature-badge'>🖼️ Vision AI</span>
        <span class='feature-badge'>📊 Table Analysis</span>
        <span class='feature-badge'>📈 Chart Understanding</span>
        <span class='feature-badge'>🧠 Smart Retrieval</span>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📄 Document Processing")
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF Document")
            upload_button = gr.Button("🚀 Process with AI Vision", variant="primary", size="lg")
            status_box = gr.Textbox(label="Processing Status", interactive=False, lines=3)

        with gr.Column():
            gr.Markdown("## 💬 Intelligent Q&A")
            gr.Markdown("*Ask about any content: text, images, charts, tables, or data visualizations*")
            question_input = gr.Textbox(
                lines=3,
                placeholder="Examples:\n• What does the chart show?\n• Summarize the table data\n• Explain the main findings",
                label="Your Question",
            )
            ask_button = gr.Button("🔍 Get AI Answer", variant="primary", size="lg")
            answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("📝 Generate Summary", variant="secondary")
            summary_output = gr.Textbox(label="Document Summary", lines=5, interactive=False)

        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Concepts", lines=5, interactive=False)

    with gr.Row():
        with gr.Column():
            structure_button = gr.Button("🏗️ Analyze Structure", variant="secondary")
            structure_output = gr.Textbox(label="Document Structure Analysis", lines=5, interactive=False)

        with gr.Column():
            images_button = gr.Button("🖼️ Show Visual Elements", variant="secondary")
            images_output = gr.Textbox(label="Extracted Visual Elements", lines=5, interactive=False)

    with gr.Row():
        clear_button = gr.Button("🗑️ Clear All", variant="secondary", size="sm")

    gr.Markdown("""
    <div class='footer'>
        🚀 <strong>Powered by Advanced AI</strong><br>
        🔧 HuggingFace Transformers • LangChain • FAISS • Unstructured<br>
        🎯 Multimodal RAG: Text + Vision + Tables + Charts
    </div>
    """)
    # Event bindings
    upload_button.click(process_pdf_multimodal_advanced, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question_multimodal_advanced, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary_multimodal, [], summary_output)
    keywords_button.click(extract_keywords_multimodal, [], keywords_output)
    structure_button.click(analyze_document_structure, [], structure_output)
    images_button.click(show_extracted_images, [], images_output)
    clear_button.click(clear_interface_multimodal, [], [pdf_file, pdf_display, question_input, answer_output, summary_output, keywords_output, structure_output, images_output])

if __name__ == "__main__":
    demo.launch(debug=True, share=True)
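# Launch note: on Hugging Face Spaces, share=True is not supported and is
# ignored (the Space itself is the public URL); debug=True surfaces errors
# in the Space logs while iterating.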