Update app.py
app.py
CHANGED
@@ -2,13 +2,14 @@ import os
 import gradio as gr
 import tempfile
 from pathlib import Path
+import base64
 
 # Import vectorstore and embeddings from langchain community package
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 # Text splitter to break large documents into manageable chunks
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-# HF Inference client for
+# HF Inference client for multimodal model
 from huggingface_hub import InferenceClient
 # Unstructured for PDF processing with image extraction
 from unstructured.partition.pdf import partition_pdf
@@ -19,54 +20,75 @@ index = None  # FAISS index storing document embeddings
 retriever = None  # Retriever to fetch relevant chunks
 current_pdf_name = None  # Name of the currently loaded PDF
 extracted_content = None  # Combined text and image descriptions
+extracted_images = []  # Store image paths for multimodal queries
 
-# ──
-#
-
-# Vision client for image analysis
-vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")
+# ── Single Multimodal Model ──────────────────────────────────────────────────
+# Using a single multimodal model that can handle both text and images
+multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 
-# ── Embeddings
-#
-embeddings = HuggingFaceEmbeddings(model_name="
+# ── Multimodal Embeddings ────────────────────────────────────────────────────
+# Using CLIP-based embeddings that can handle both text and images
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")
 
 # Create temporary directories for processing
 temp_dir = tempfile.mkdtemp()
 figures_dir = os.path.join(temp_dir, "figures")
 os.makedirs(figures_dir, exist_ok=True)
 
-def
+def encode_image_to_base64(image_path):
+    """Convert image to base64 for API calls"""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+def analyze_image_with_multimodal_model(image_path):
     """
-    Analyze an extracted image using
+    Analyze an extracted image using the multimodal model.
     Args:
         image_path: Path to the extracted image file
     Returns:
         Text description of the image content
     """
     try:
-        #
-
-
-
-
-
-
-
+        # Encode image to base64
+        image_base64 = encode_image_to_base64(image_path)
+
+        # Create multimodal prompt
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Analyze this image and provide a detailed description. Include any text, data, charts, diagrams, tables, or important visual elements you can see. Be specific and comprehensive."
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_base64}"
+                        }
+                    }
+                ]
+            }
+        ]
+
+        # Use multimodal model for image analysis
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=200,
+            temperature=0.3
+        )
+
+        description = response["choices"][0]["message"]["content"].strip()
+        return f"[IMAGE CONTENT]: {description}"
+
     except Exception as e:
-        return f"
+        return f"[IMAGE CONTENT]: Could not analyze image - {str(e)}"
 
 def process_pdf_multimodal(pdf_file):
     """
-
-    2. Analyzes extracted images with vision model
-    3. Combines text and image descriptions
-    4. Creates FAISS index for retrieval
-    Args:
-        pdf_file: Uploaded PDF file
-    Returns:
-        - PDF filename, status message, and UI updates
+    Process PDF with single multimodal model for both text and images.
     """
-    global current_pdf_name, index, retriever, extracted_content
+    global current_pdf_name, index, retriever, extracted_content, extracted_images
 
     if pdf_file is None:
         return None, "β Please upload a PDF file.", gr.update(interactive=False)
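The two helpers added above are self-contained, so they can be smoke-tested outside the Gradio app. A minimal sketch, assuming the definitions above are in scope and a local test image exists at figures/sample_chart.png (the path is illustrative, not part of the commit):

# Standalone check of the image helpers added in this hunk (the image path is hypothetical).
sample_path = "figures/sample_chart.png"                    # assumed test image, not in the repo
b64 = encode_image_to_base64(sample_path)                   # raw base64 payload
data_url = f"data:image/png;base64,{b64}"                   # same data-URL shape the prompt uses
print(data_url[:60], "...")
print(analyze_image_with_multimodal_model(sample_path))     # "[IMAGE CONTENT]: ..." or an error string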
@@ -74,7 +96,8 @@ def process_pdf_multimodal(pdf_file):
     current_pdf_name = os.path.basename(pdf_file.name)
 
     try:
-        # Clear previous
+        # Clear previous data
+        extracted_images.clear()
         for file in os.listdir(figures_dir):
             os.remove(os.path.join(figures_dir, file))
 
@@ -91,22 +114,27 @@ def process_pdf_multimodal(pdf_file):
         text_elements = []
         for element in elements:
             if element.category not in ["Image", "Table"]:
-
+                if element.text.strip():  # Only add non-empty text
+                    text_elements.append(element.text.strip())
 
-        # Process extracted images
+        # Process extracted images with multimodal model
         image_descriptions = []
         if os.path.exists(figures_dir):
             for image_file in os.listdir(figures_dir):
                 if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                     image_path = os.path.join(figures_dir, image_file)
-
+                    extracted_images.append(image_path)  # Store for later use
+                    description = analyze_image_with_multimodal_model(image_path)
                     image_descriptions.append(description)
 
-        # Combine
+        # Combine all content
         all_content = text_elements + image_descriptions
         extracted_content = "\n\n".join(all_content)
 
-
+        if not extracted_content.strip():
+            return current_pdf_name, "β No content could be extracted from the PDF.", gr.update(interactive=False)
+
+        # Split into chunks for embedding
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=200,
@@ -114,13 +142,14 @@ def process_pdf_multimodal(pdf_file):
         )
         chunks = text_splitter.split_text(extracted_content)
 
-        # Create FAISS index
+        # Create FAISS index with multimodal embeddings
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})
 
         # Status message
         num_images = len(image_descriptions)
-
+        num_text_elements = len(text_elements)
+        status = f"β Processed '{current_pdf_name}' β {len(chunks)} chunks ({num_text_elements} text sections, {num_images} images analyzed)"
 
         return current_pdf_name, status, gr.update(interactive=True)
 
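For reference, the chunk-to-FAISS-to-retriever path built in this hunk can be exercised on a hardcoded string, independent of PDF processing. A minimal sketch, reusing the module-level CLIP embeddings declared earlier; the sample content and variable names are illustrative only:

# Minimal sketch of the indexing/retrieval path above (sample content is made up).
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

sample_content = (
    "Revenue grew 12% year over year, driven by the new product line.\n\n"
    "[IMAGE CONTENT]: Bar chart comparing 2022 and 2023 quarterly revenue."
)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
sample_chunks = splitter.split_text(sample_content)

demo_index = FAISS.from_texts(sample_chunks, embeddings)        # `embeddings` is the CLIP model defined above
demo_retriever = demo_index.as_retriever(search_kwargs={"k": 3})
for doc in demo_retriever.get_relevant_documents("What does the revenue chart show?"):
    print(doc.page_content)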
@@ -130,14 +159,9 @@ def process_pdf_multimodal(pdf_file):
 
 def ask_multimodal_question(pdf_name, question):
     """
-    Answer questions using
-    Args:
-        pdf_name: Display name (unused)
-        question: User's question
-    Returns:
-        Generated answer combining text and visual information
+    Answer questions using the single multimodal model with retrieved context.
     """
-    global retriever
+    global retriever, extracted_images
 
     if index is None or retriever is None:
         return "β Please upload and process a PDF first."
@@ -146,26 +170,41 @@ def ask_multimodal_question(pdf_name, question):
         return "β Please enter a question."
 
     try:
-        # Retrieve relevant chunks
+        # Retrieve relevant chunks
        docs = retriever.get_relevant_documents(question)
        context = "\n\n".join(doc.page_content for doc in docs)
 
-        #
-
-
-
-
-
-
-
-
-
-
+        # Create messages for multimodal model
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""You are an AI assistant analyzing a document that contains both text and visual elements.
+
+RETRIEVED CONTEXT:
+{context}
+
+QUESTION: {question}
+
+Please provide a comprehensive answer based on the retrieved context above. The context includes both textual information and descriptions of images, charts, tables, and other visual elements from the document.
+
+If your answer references visual elements (charts, graphs, images, tables), mention that explicitly. Keep your response focused and informative.
+
+ANSWER:"""
+                    }
+                ]
+            }
+        ]
 
-        #
-
-
-
+        # If question seems to be about images and we have extracted images,
+        # we could potentially include an image in the query (for advanced use cases)
+
+        # Generate response with multimodal model
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=300,
             temperature=0.5
         )
 
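The added block above leaves its image idea as a comment only ("we could potentially include an image in the query"). One way that could look, as a sketch rather than part of this commit, reusing the data-URL format from analyze_image_with_multimodal_model; whether Phi-3.5-vision handles this mixed text-plus-image payload well is an assumption here:

# Hypothetical extension of the comment above: attach the first extracted image
# to the question payload before calling chat_completion (not part of this commit).
if extracted_images:
    messages[0]["content"].append({
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encode_image_to_base64(extracted_images[0])}"
        },
    })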
@@ -177,25 +216,41 @@ def ask_multimodal_question(pdf_name, question):
 
 def generate_multimodal_summary():
     """
-    Generate
+    Generate summary using the multimodal model.
     """
     if not extracted_content:
         return "β Please upload and process a PDF first."
 
     try:
-        # Use first
-        content_preview = extracted_content[:
+        # Use first 4000 characters for summary
+        content_preview = extracted_content[:4000]
 
-
-
-
-
-
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Please provide a comprehensive summary of this document content. The content includes both textual information and descriptions of visual elements (images, charts, tables, diagrams).
+
+DOCUMENT CONTENT:
+{content_preview}
+
+Create a well-structured summary that captures:
+1. Main topics and key points from the text
+2. Important information from visual elements (charts, images, tables)
+3. Overall document purpose and conclusions
+
+SUMMARY:"""
+                    }
+                ]
+            }
+        ]
 
-        response =
-        messages=
-        max_tokens=
-        temperature=0.
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=250,
+            temperature=0.3
         )
 
         return response["choices"][0]["message"]["content"].strip()
@@ -205,7 +260,7 @@ def generate_multimodal_summary():
 
 def extract_multimodal_keywords():
     """
-    Extract keywords
+    Extract keywords using the multimodal model.
     """
     if not extracted_content:
         return "β Please upload and process a PDF first."
@@ -213,16 +268,35 @@ def extract_multimodal_keywords():
     try:
         content_preview = extracted_content[:3000]
 
-
-
-
-
-
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. The content includes both text and descriptions of visual elements.
+
+DOCUMENT CONTENT:
+{content_preview}
+
+Extract key terms that represent:
+- Main topics and concepts
+- Important technical terms
+- Key findings or data points
+- Visual elements mentioned (chart types, image subjects)
+
+Format as a comma-separated list.
+
+KEY TERMS:"""
+                    }
+                ]
+            }
+        ]
 
-        response =
-        messages=
-        max_tokens=
-        temperature=0.
+        response = multimodal_client.chat_completion(
+            messages=messages,
+            max_tokens=120,
+            temperature=0.3
         )
 
         return response["choices"][0]["message"]["content"].strip()
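Because the keyword prompt asks for a comma-separated list, the raw completion can be post-processed into a Python list in one line; a small sketch (the variable names are illustrative):

# Turn the comma-separated completion from extract_multimodal_keywords() into a clean list.
raw_keywords = extract_multimodal_keywords()
keywords = [term.strip() for term in raw_keywords.split(",") if term.strip()]
print(keywords)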
@@ -234,7 +308,7 @@ def clear_multimodal_interface():
     """
     Reset all global state and clear UI.
     """
-    global index, retriever, current_pdf_name, extracted_content
+    global index, retriever, current_pdf_name, extracted_content, extracted_images
 
     # Clear figures directory
     try:
@@ -246,6 +320,7 @@ def clear_multimodal_interface():
     # Reset globals
     index = retriever = None
     current_pdf_name = extracted_content = None
+    extracted_images.clear()
 
     return None, "", gr.update(interactive=False)
 
@@ -271,30 +346,46 @@ with gr.Blocks(theme=theme, css="""
         display: inline-block;
         margin: 10px auto;
     }
+    .model-info {
+        background: #f8fafc;
+        border: 1px solid #e2e8f0;
+        border-radius: 8px;
+        padding: 10px;
+        margin: 10px 0;
+        font-size: 12px;
+        color: #64748b;
+    }
 """) as demo:
 
     # Application title with multimodal badge
-    gr.Markdown("<div class='main-title'>MultiModal
-    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'
+    gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
+    gr.Markdown("<div style='text-align: center;'><span class='multimodal-badge'>π§ Single Model • Text + Vision</span></div>")
+
+    # Model information
+    gr.Markdown("""
+    <div class='model-info'>
+    <strong>π€ Powered by:</strong> Microsoft Phi-3.5-Vision (Multimodal) + CLIP Embeddings (Text+Image) + Unstructured (PDF Processing)
+    </div>
+    """)
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("## π Document Input")
             pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
             pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF (with images/charts)")
-            upload_button = gr.Button("π Process
+            upload_button = gr.Button("π Process with Multimodal AI", variant="primary")
             status_box = gr.Textbox(label="Processing Status", interactive=False)
 
         with gr.Column():
             gr.Markdown("## β Ask Questions")
-            gr.Markdown("*
+            gr.Markdown("*Single AI model understands both text and visual content*")
             question_input = gr.Textbox(
                 lines=3,
-                placeholder="Ask about text, images, charts, or any
+                placeholder="Ask about text content, images, charts, tables, or any visual elements...",
                 interactive=False
             )
-            ask_button = gr.Button("π Ask
-            answer_output = gr.Textbox(label="
+            ask_button = gr.Button("π Ask Multimodal AI", variant="primary")
+            answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)
 
     # Analysis tools
     with gr.Row():
@@ -310,8 +401,8 @@ with gr.Blocks(theme=theme, css="""
 
     gr.Markdown("""
     <div class='footer'>
-
-    Supports: Text
+    <strong>Unified Multimodal Pipeline:</strong> One model handles text analysis, image understanding, and question answering<br>
+    Supports: Text • Images • Charts • Tables • Diagrams • Mixed Content Queries
     </div>
     """)
 
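The click wiring for the buttons sits outside the hunks shown in this diff. A plausible hookup, consistent with the handler signatures above but written here as an assumption rather than the app's actual code:

# Hypothetical event wiring (the real wiring lives in an unchanged part of app.py).
# These calls must run inside the `with gr.Blocks(...) as demo:` context above.
upload_button.click(
    fn=process_pdf_multimodal,
    inputs=[pdf_file],
    outputs=[pdf_display, status_box, question_input],   # matches the handler's 3-value return
)
ask_button.click(
    fn=ask_multimodal_question,
    inputs=[pdf_display, question_input],
    outputs=[answer_output],
)
demo.launch()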