Muzammil6376 committed
Commit 87baec5 · verified · 1 Parent(s): 3341c6c

Update app.py

Files changed (1):
  1. app.py +489 -113
app.py CHANGED
@@ -1,132 +1,508 @@
-
- # app.py
-
  import os
- from pathlib import Path
-
  import gradio as gr
  from PIL import Image
- from huggingface_hub import InferenceClient
-
- from langchain_community.embeddings import HuggingFaceEmbeddings
  from langchain_community.vectorstores import FAISS
- from langchain_community.llms import HuggingFaceEndpoint
-
  from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.chains import RetrievalQA
- from langchain.prompts import PromptTemplate
-
  from unstructured.partition.pdf import partition_pdf
  from unstructured.partition.utils.constants import PartitionStrategy

- # ————— Config & Folders —————
- PDF_DIR = Path("pdfs")
- FIG_DIR = Path("figures")
- PDF_DIR.mkdir(exist_ok=True)
- FIG_DIR.mkdir(exist_ok=True)
-
- # ————— Read your HF_TOKEN secret —————
- hf_token = os.environ["HF_TOKEN"]
-
- # ————— Embeddings & LLM Setup —————
- embedding_model = HuggingFaceEmbeddings(
-     model_name="sentence-transformers/all-MiniLM-L6-v2"
- )
-
- llm = HuggingFaceEndpoint(
-     endpoint_url="https://api-inference.huggingface.co/models/google/flan-t5-base",
-     huggingfacehub_api_token=hf_token,
-     temperature=0.5,
-     max_length=512,
- )
-
- TEMPLATE = """
- Use the following context to answer the question. If unknown, say so.
- Context: {context}
- Question: {question}
- Answer (up to 3 sentences):
- """
- prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
-
- # ————— Inference client for image captioning —————
- vision_client = InferenceClient(
-     model="Salesforce/blip-image-captioning-base",
-     token=hf_token,
- )
-
- # Globals (initialized after processing)
- vector_store = None
- qa_chain = None
-
-
- def extract_image_caption(path: str) -> str:
-     with Image.open(path) as img:
-         return vision_client.image_to_text(img)

- def process_pdf(local_path: str) -> str:
-     """Ingest a local PDF file, extract text & images, chunk, embed, and index."""
-     global vector_store, qa_chain
-
-     # Move the uploaded PDF into our PDFs folder
-     src = Path(local_path)
-     if src.suffix.lower() != ".pdf":
-         return "❗ Error: Uploaded file is not a PDF."
-     dest = PDF_DIR / src.name
-     src.rename(dest)
-
-     # Partition PDF into text + image blocks
-     elems = partition_pdf(
-         str(dest),
-         strategy=PartitionStrategy.HI_RES,
-         extract_image_block_types=["Image", "Table"],
-         extract_image_block_output_dir=str(FIG_DIR),
-     )
-
-     # Collect text blocks
-     texts = [el.text for el in elems if el.category not in ("Image", "Table")]
-
-     # Generate captions for each extracted image
-     for img_file in FIG_DIR.iterdir():
-         texts.append(extract_image_caption(str(img_file)))
-
-     # Chunk and embed
-     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-     docs = splitter.split_text("\n\n".join(texts))
-
-     vector_store = FAISS.from_texts(docs, embedding_model)
-     qa_chain = RetrievalQA.from_chain_type(
-         llm=llm,
-         retriever=vector_store.as_retriever(),
-         chain_type_kwargs={"prompt": prompt},
-     )
-
-     return f"✅ Processed `{dest.name}` into {len(docs)} chunks."
-
-
- def answer_query(question: str) -> str:
-     if qa_chain is None:
-         return "❗ Please upload and process a PDF first."
-     return qa_chain.run(question)
-

- # ————— Gradio UI —————
- with gr.Blocks() as demo:
-     gr.Markdown("## 📄📷 Multimodal RAG — Hugging Face Spaces")

      with gr.Row():
-         # just accept any local file; we check for .pdf in process_pdf()
-         pdf_in = gr.File(label="Upload PDF", type="filepath")
-         btn_proc = gr.Button("Process PDF")
-         status = gr.Textbox(label="Status")

      with gr.Row():
-         q_in = gr.Textbox(label="Your Question")
-         btn_ask = gr.Button("Ask")
-         ans_out = gr.Textbox(label="Answer")

-     btn_proc.click(fn=process_pdf, inputs=pdf_in, outputs=status)
-     btn_ask.click(fn=answer_query, inputs=q_in, outputs=ans_out)

  if __name__ == "__main__":
-     demo.launch()
  import os
  import gradio as gr
+ import base64
  from PIL import Image
+ import io
+ import requests

+ # Import vectorstore and embeddings from langchain community package
  from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ # Text splitter to break large documents into manageable chunks
  from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # HF Inference client for running chat completions
+ from huggingface_hub import InferenceClient
+ # Unstructured for advanced PDF processing with image/table extraction
  from unstructured.partition.pdf import partition_pdf
  from unstructured.partition.utils.constants import PartitionStrategy

+ # ── Globals ───────────────────────────────────────────────────────────────────
+ index = None             # FAISS index storing document embeddings
+ retriever = None         # Retriever to fetch relevant chunks
+ current_pdf_name = None  # Name of the currently loaded PDF
+ pdf_text = None          # Full text of the uploaded PDF
+ extracted_images = []    # List to store extracted images and their descriptions
+
+ # Create directories for storing extracted figures
+ FIGURES_DIR = "extracted_figures/"
+ os.makedirs(FIGURES_DIR, exist_ok=True)
+
+ # ── HF Inference clients for different models ─────────────────────────────────
+ # Text generation model
+ text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
+
+ # Vision-Language Models (choose one based on your needs and HF availability)
+ # Option 1: BLIP-2 for general image understanding
+ vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
+
+ # Option 2: Alternative vision models you can use:
+ # vision_client = InferenceClient(model="microsoft/git-base-coco")
+ # vision_client = InferenceClient(model="nlpconnect/vit-gpt2-image-captioning")
+ # vision_client = InferenceClient(model="Salesforce/blip-image-captioning-large")
+
+ # For more advanced multimodal tasks, you can use:
+ # multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium")  # For conversational AI
+ # multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b")   # For instruction following
+
+ # ── Embeddings ───────────────────────────────────────────────────────────────
+ # Use BGE embeddings from BAAI for vectorizing text chunks
+ embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+
+ def extract_image_description_advanced(image_path):
+     """
+     Enhanced image description using multiple vision models
+     """
+     try:
+         # Load and process image
+         with open(image_path, "rb") as f:
+             image_bytes = f.read()
+
+         # Method 1: Use BLIP-2 for detailed image captioning
+         try:
+             description = vision_client.image_to_text(image_bytes)
+             base_description = description if isinstance(description, str) else description.get('generated_text', '')
+         except Exception as e:
+             print(f"BLIP-2 failed: {e}")
+             base_description = "Image could not be processed with vision model"
+
+         # Method 2: Enhance with text-based analysis using the text model
+         enhancement_prompt = f"""
+ Analyze this image description and provide a detailed analysis focusing on:
+ 1. Any text, numbers, or data visible
+ 2. Charts, graphs, or tables
+ 3. Key visual elements and their significance
+ 4. Context and meaning
+
+ Description: {base_description}
+
+ Provide a comprehensive analysis:
+ """
+
+         try:
+             response = text_client.chat_completion(
+                 messages=[{"role": "user", "content": enhancement_prompt}],
+                 max_tokens=300,
+                 temperature=0.3
+             )
+             enhanced_description = response["choices"][0]["message"]["content"].strip()
+         except Exception as e:
+             print(f"Text enhancement failed: {e}")
+             enhanced_description = base_description
+
+         return f"Visual Element Analysis:\n{enhanced_description}"
+
+     except Exception as e:
+         print(f"Error processing image {image_path}: {str(e)}")
+         return f"Visual element detected: {os.path.basename(image_path)} (processing failed)"
+
+ def process_pdf_multimodal_advanced(pdf_file):
+     """
+     Advanced multimodal PDF processing with enhanced vision capabilities
+     """
+     global current_pdf_name, index, retriever, pdf_text, extracted_images
+
+     if pdf_file is None:
+         return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
+
+     current_pdf_name = os.path.basename(pdf_file.name)
+     extracted_images = []
+
+     # Clear existing figures directory
+     for file in os.listdir(FIGURES_DIR):
+         try:
+             os.remove(os.path.join(FIGURES_DIR, file))
+         except:
+             pass
+
+     try:
+         # Process PDF with unstructured
+         elements = partition_pdf(
+             pdf_file.name,
+             strategy=PartitionStrategy.HI_RES,
+             extract_image_block_types=["Image", "Table"],
+             extract_image_block_output_dir=FIGURES_DIR,
+             extract_image_block_to_payload=False,
+             # Additional parameters for better extraction
+             infer_table_structure=True,
+             chunking_strategy="by_title",
+             max_characters=1000,
+             combine_text_under_n_chars=100
+         )
+
+         # Process elements
+         text_elements = []
+         visual_descriptions = []
+
+         for element in elements:
+             if element.category in ["Image", "Table"]:
+                 # Handle image/table elements
+                 continue
+             elif element.category == "Title":
+                 text_elements.append(f"TITLE: {element.text}")
+             elif element.category == "Header":
+                 text_elements.append(f"HEADER: {element.text}")
+             else:
+                 if hasattr(element, 'text') and element.text.strip():
+                     text_elements.append(element.text)
+
+         pdf_text = "\n\n".join(text_elements)
+
+         # Process extracted visual elements
+         if os.path.exists(FIGURES_DIR):
+             for filename in sorted(os.listdir(FIGURES_DIR)):
+                 if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
+                     image_path = os.path.join(FIGURES_DIR, filename)
+
+                     # Get enhanced description
+                     description = extract_image_description_advanced(image_path)
+                     visual_descriptions.append(description)
+
+                     extracted_images.append({
+                         'path': image_path,
+                         'description': description,
+                         'filename': filename,
+                         'type': 'table' if 'table' in filename.lower() else 'image'
+                     })
+
+         # Combine all content
+         all_content = text_elements + visual_descriptions
+
+         # Advanced text splitting
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=800,  # Smaller chunks for better retrieval
+             chunk_overlap=150,
+             add_start_index=True,
+             separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+         )
+
+         combined_content = "\n\n".join(all_content)
+         chunks = text_splitter.split_text(combined_content)
+
+         # Create FAISS index with better retrieval settings
+         index = FAISS.from_texts(chunks, embeddings)
+         retriever = index.as_retriever(
+             search_type="mmr",  # Maximum marginal relevance
+             search_kwargs={
+                 "k": 4,
+                 "fetch_k": 8,
+                 "lambda_mult": 0.7
+             }
+         )
+
+         status = f"✅ Advanced processing complete for '{current_pdf_name}'\n📄 {len(text_elements)} text sections\n🖼️ {len(extracted_images)} visual elements\n📦 {len(chunks)} searchable chunks"
+
+         return current_pdf_name, status, gr.update(interactive=True)
+
+     except Exception as e:
+         error_msg = f"❌ Processing error: {str(e)}"
+         return current_pdf_name, error_msg, gr.update(interactive=False)
+
+ def ask_question_multimodal_advanced(pdf_name, question):
+     """
+     Advanced multimodal question answering with smart routing
+     """
+     global retriever, extracted_images
+
+     if index is None or retriever is None:
+         return "❌ Please upload and process a PDF first."
+
+     if not question.strip():
+         return "❌ Please enter a question."
+
+     try:
+         # Retrieve relevant chunks
+         docs = retriever.get_relevant_documents(question)
+         context = "\n\n".join([doc.page_content for doc in docs])
+
+         # Enhanced visual query detection
+         visual_keywords = [
+             'image', 'figure', 'chart', 'graph', 'table', 'diagram', 'picture',
+             'visual', 'show', 'display', 'plot', 'data', 'visualization',
+             'illustration', 'screenshot', 'photo', 'drawing'
+         ]
+
+         is_visual_query = any(keyword in question.lower() for keyword in visual_keywords)
+
+         # Smart context enhancement
+         if is_visual_query and extracted_images:
+             # Prioritize visual content for visual queries
+             visual_context = "\n\n".join([img['description'] for img in extracted_images])
+             enhanced_context = f"{visual_context}\n\nAdditional Context:\n{context}"
+         else:
+             enhanced_context = context
+
+         # Advanced prompting based on query type
+         if is_visual_query:
+             system_prompt = """You are an expert document analyst specializing in multimodal content analysis.
+ You excel at interpreting charts, graphs, tables, images, and visual data alongside textual information.
+ When answering questions about visual elements, be specific about what you observe and provide detailed insights."""
+         else:
+             system_prompt = """You are an expert document analyst. Provide accurate, comprehensive answers based on the document content.
+ Use the context provided to give detailed and helpful responses."""
+
+         prompt = f"""{system_prompt}
+
+ Context: {enhanced_context}
+
+ Question: {question}
+
+ Provide a detailed, accurate answer based on the context above. If the question relates to visual elements, describe what you can understand from the visual descriptions provided."""
+
+         response = text_client.chat_completion(
+             messages=[{"role": "user", "content": prompt}],
+             max_tokens=400,
+             temperature=0.4
+         )
+
+         answer = response["choices"][0]["message"]["content"].strip()
+         return answer
+
+     except Exception as e:
+         return f"❌ Error generating answer: {str(e)}"
+
+ def analyze_document_structure():
+     """
+     New feature: Analyze the overall structure of the document
+     """
+     global pdf_text, extracted_images
+
+     if not pdf_text and not extracted_images:
+         return "❌ Please upload and process a PDF first."
+
+     try:
+         structure_prompt = f"""
+ Analyze the structure and organization of this document. Provide insights about:
+ 1. Document type and purpose
+ 2. Main sections and topics
+ 3. Visual elements present ({len(extracted_images)} images/tables/charts)
+ 4. Key information hierarchy
+ 5. Overall document quality and completeness
+
+ Text content sample: {pdf_text[:1000]}
+ Visual elements: {len(extracted_images)} items detected
+
+ Provide a structural analysis:
+ """
+
+         response = text_client.chat_completion(
+             messages=[{"role": "user", "content": structure_prompt}],
+             max_tokens=300,
+             temperature=0.3
+         )
+
+         return response["choices"][0]["message"]["content"].strip()
+
+     except Exception as e:
+         return f"❌ Error analyzing structure: {str(e)}"
+
+ # [Previous functions remain the same: generate_summary_multimodal, extract_keywords_multimodal, show_extracted_images, clear_interface_multimodal]
+
+ def generate_summary_multimodal():
+     """Enhanced summary generation considering both text and visual content"""
+     global pdf_text, extracted_images
+
+     if not pdf_text and not extracted_images:
+         return "❌ Please upload and process a PDF first."
+
+     try:
+         content_parts = []
+
+         if pdf_text:
+             content_parts.append(f"Text Content:\n{pdf_text[:2000]}")
+
+         if extracted_images:
+             visual_summary = "\n".join([img['description'][:200] for img in extracted_images[:3]])
+             content_parts.append(f"Visual Content:\n{visual_summary}")
+
+         combined_content = "\n\n".join(content_parts)
+
+         prompt = f"""Provide a comprehensive summary of this document that includes both textual and visual elements.
+ Focus on key findings, main topics, and insights from charts, tables, or images.
+
+ Content: {combined_content}
+
+ Summary:"""
+
+         response = text_client.chat_completion(
+             messages=[{"role": "user", "content": prompt}],
+             max_tokens=250,
+             temperature=0.5
+         )
+
+         return response["choices"][0]["message"]["content"].strip()
+
+     except Exception as e:
+         return f"❌ Error generating summary: {str(e)}"
+
+ def extract_keywords_multimodal():
+     """Enhanced keyword extraction from both text and visual content"""
+     global pdf_text, extracted_images
+
+     if not pdf_text and not extracted_images:
+         return "❌ Please upload and process a PDF first."
+
+     try:
+         content_parts = []
+
+         if pdf_text:
+             content_parts.append(f"Text: {pdf_text[:1500]}")
+
+         if extracted_images:
+             visual_content = "\n".join([img['description'][:150] for img in extracted_images])
+             content_parts.append(f"Visual Content: {visual_content}")
+
+         combined_content = "\n\n".join(content_parts)
+
+         prompt = f"""Extract key terms, concepts, and topics from this document content.
+ Include technical terms, important concepts, and themes from both text and visual elements.
+
+ Content: {combined_content}
+
+ Key terms and concepts:"""
+
+         response = text_client.chat_completion(
+             messages=[{"role": "user", "content": prompt}],
+             max_tokens=120,
+             temperature=0.5
+         )
+
+         return response["choices"][0]["message"]["content"].strip()
+
+     except Exception as e:
+         return f"❌ Error extracting keywords: {str(e)}"
+
+ def show_extracted_images():
+     """Display information about extracted images"""
+     global extracted_images
+
+     if not extracted_images:
+         return "No visual elements extracted from the current document."
+
+     info = f"📊 Extracted {len(extracted_images)} visual elements:\n\n"
+     for i, img in enumerate(extracted_images, 1):
+         element_type = "📊 Table" if img['type'] == 'table' else "🖼️ Image"
+         info += f"{i}. {element_type}: {img['filename']}\n"
+         info += f" Description: {img['description'][:150]}...\n\n"
+
+         if i >= 5:  # Limit display to first 5
+             remaining = len(extracted_images) - 5
+             if remaining > 0:
+                 info += f"... and {remaining} more visual elements."
+             break
+
+     return info
+
+ def clear_interface_multimodal():
+     """Enhanced clear function for multimodal system"""
+     global index, retriever, current_pdf_name, pdf_text, extracted_images
+
+     index = retriever = None
+     current_pdf_name = pdf_text = None
+     extracted_images = []
+
+     if os.path.exists(FIGURES_DIR):
+         for file in os.listdir(FIGURES_DIR):
+             try:
+                 os.remove(os.path.join(FIGURES_DIR, file))
+             except:
+                 pass
+
+     return None, "", gr.update(interactive=False), "", "", "", "", ""
+
+ # Enhanced Gradio UI
+ theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
+
+ with gr.Blocks(theme=theme, css="""
+ .container { border-radius: 10px; padding: 15px; }
+ .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
+ .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
+ .main-title {
+     text-align: center;
+     font-size: 56px;
+     font-weight: bold;
+     margin-bottom: 20px;
+     background: linear-gradient(45deg, #6366f1, #8b5cf6, #ec4899);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+ }
+ .feature-badge {
+     background: linear-gradient(45deg, #10b981, #3b82f6);
+     color: white;
+     padding: 4px 12px;
+     border-radius: 15px;
+     font-size: 11px;
+     margin: 2px;
+     display: inline-block;
+ }
+ """) as demo:
+
+     gr.Markdown("<div class='main-title'>🤖 DocQueryAI Pro</div>")
+     gr.Markdown("""
+ <div style='text-align: center; margin-bottom: 25px;'>
+ <span class='feature-badge'>🔍 Advanced RAG</span>
+ <span class='feature-badge'>🖼️ Vision AI</span>
+ <span class='feature-badge'>📊 Table Analysis</span>
+ <span class='feature-badge'>📈 Chart Understanding</span>
+ <span class='feature-badge'>🧠 Smart Retrieval</span>
+ </div>
+ """)

+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("## 📄 Document Processing")
+             pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
+             pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF Document")
+             upload_button = gr.Button("🚀 Process with AI Vision", variant="primary", size="lg")
+             status_box = gr.Textbox(label="Processing Status", interactive=False, lines=3)
+
+         with gr.Column():
+             gr.Markdown("## 💬 Intelligent Q&A")
+             gr.Markdown("*Ask about any content: text, images, charts, tables, or data visualizations*")
+             question_input = gr.Textbox(
+                 lines=3,
+                 placeholder="Examples:\n• What does the chart show?\n• Summarize the table data\n• Explain the main findings",
+                 label="Your Question"
+             )
+             ask_button = gr.Button("🔍 Get AI Answer", variant="primary", size="lg")
+             answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

      with gr.Row():
+         with gr.Column():
+             summary_button = gr.Button("📋 Generate Summary", variant="secondary")
+             summary_output = gr.Textbox(label="Document Summary", lines=5, interactive=False)
+
+         with gr.Column():
+             keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
+             keywords_output = gr.Textbox(label="Key Concepts", lines=5, interactive=False)

      with gr.Row():
+         with gr.Column():
+             structure_button = gr.Button("🏗️ Analyze Structure", variant="secondary")
+             structure_output = gr.Textbox(label="Document Structure Analysis", lines=5, interactive=False)
+
+         with gr.Column():
+             images_button = gr.Button("🖼️ Show Visual Elements", variant="secondary")
+             images_output = gr.Textbox(label="Extracted Visual Elements", lines=5, interactive=False)

+     with gr.Row():
+         clear_button = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
+
+     gr.Markdown("""
+ <div class='footer'>
+ 🚀 <strong>Powered by Advanced AI</strong><br>
+ 🔧 HuggingFace Transformers • LangChain • FAISS • Unstructured<br>
+ 🎯 Multimodal RAG: Text + Vision + Tables + Charts
+ </div>
+ """)
+
+     # Event bindings
+     upload_button.click(process_pdf_multimodal_advanced, [pdf_file], [pdf_display, status_box, question_input])
+     ask_button.click(ask_question_multimodal_advanced, [pdf_display, question_input], answer_output)
+     summary_button.click(generate_summary_multimodal, [], summary_output)
+     keywords_button.click(extract_keywords_multimodal, [], keywords_output)
+     structure_button.click(analyze_document_structure, [], structure_output)
+     images_button.click(show_extracted_images, [], images_output)
+     clear_button.click(clear_interface_multimodal, [], [pdf_file, pdf_display, question_input, answer_output, summary_output, keywords_output, structure_output, images_output])

  if __name__ == "__main__":
+     demo.launch(debug=True, share=True)