Update app.py
app.py
CHANGED
@@ -1,252 +1,177 @@
 # app.py
 import os
 import tempfile
-from pathlib import Path
 import base64
-import fitz
-from PIL import Image
 import io

 import gradio as gr
 from huggingface_hub import InferenceClient
-
-# Import vectorstore and embeddings from updated packages
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

-# ── Globals ──────────────────────────────────────────────────────────────────
 index = None
 retriever = None
-current_pdf_name = None
 extracted_content = None
-extracted_images = []

-# ── Inference & Embeddings ───────────────────────────────────────────────────
 multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

-# Temp dirs for extracted figures
-temp_dir = tempfile.mkdtemp()
-figures_dir = os.path.join(temp_dir, "figures")
-os.makedirs(figures_dir, exist_ok=True)

 def encode_image_to_base64(image_path):
-    with open(image_path, "rb") as f:
-        return base64.b64encode(f.read()).decode()

-def extract_images_from_pdf_pymupdf(pdf_path):
-    extracted_images = []
-    image_descriptions = []
     try:
-        doc = fitz.open(pdf_path)
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            for img in page.get_images():
                 xref = img[0]
-                pix = fitz.Pixmap(doc, xref)
                 if pix.n - pix.alpha < 4:
-                    img_data = pix.tobytes("png")
-                    img_pil = Image.open(io.BytesIO(img_data))
-                    img_path = os.path.join(figures_dir, f"page_{page_num}_img_{xref}.png")
-                    img_pil.save(img_path)
-                    desc = analyze_image(img_path)
-                    extracted_images.append(img_path)
-                    image_descriptions.append(desc)
                 pix = None
-        doc.close()
-        return extracted_images, image_descriptions
     except Exception as e:
-        print(f"Image extraction error: {e}")
-        return [], []

-def analyze_image(image_path):
     try:
         b64 = encode_image_to_base64(image_path)
         prompt = (
-            "Analyze this image and provide a detailed description. "
-            "Include any text, charts, tables, or important visual elements.\n"
-            "Image: [data]\nDescription:"
         )
-        resp = multimodal_client.text_generation(
             prompt=prompt, max_new_tokens=200, temperature=0.3
         )
-        return f"[IMAGE]: {resp.strip()}"
     except Exception as e:
-        return f"[IMAGE ERROR]: {e}"

-def process_pdf_multimodal(pdf_file):
-    global index, retriever, current_pdf_name, extracted_content, extracted_images
-    if pdf_file is None:
-        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

-    current_pdf_name = os.path.basename(pdf_file.name)
-    extracted_images.clear()
-    for f in os.listdir(figures_dir):
-        os.remove(os.path.join(figures_dir, f))

     try:
-        doc = fitz.open(pdf_file.name)
-        text_elements = []
-        for i in range(len(doc)):
-            page_text = doc.load_page(i).get_text().strip()
-            if page_text:
-                text_elements.append(f"[Page {i+1}]\n{page_text}")
-        doc.close()

-        imgs, img_descs = extract_images_from_pdf_pymupdf(pdf_file.name)
-        extracted_images.extend(imgs)

-        # Combine content and split
-        all_content = text_elements + img_descs
         extracted_content = "\n\n".join(all_content)
         if not extracted_content:
-            return current_pdf_name, "❌ No content extracted.", gr.update(interactive=False)

         splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000, chunk_overlap=200, add_start_index=True
         )
         chunks = splitter.split_text(extracted_content)

         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})

-        status = (
-            f"✅ Processed {current_pdf_name}: "
-            f"{len(chunks)} chunks "
-            f"({len(text_elements)} pages, {len(img_descs)} images analyzed)"
-        )
-        return current_pdf_name, status, gr.update(interactive=True)

     except Exception as e:
-        return None, f"❌ Error processing PDF: {e}", gr.update(interactive=False)

-def ask_multimodal_question(pdf_name, question):
     global retriever
     if not retriever:
-        return "❌ Please upload and process a PDF first."
     if not question.strip():
-        return "❌ Please enter a question."

     try:
         docs = retriever.invoke(question)
-        context = "\n\n".join(d.page_content for d in docs)
-        prompt = (
-            "You are an AI assistant analyzing a document with text and visual content.\n"
-            f"CONTEXT:\n{context}\n\nQUESTION: {question}\n\nANSWER:"
-        )
-        resp = multimodal_client.text_generation(
-            prompt=prompt, max_new_tokens=300, temperature=0.5
-        )
-        return resp.strip()
-    except Exception as e:
-        return f"❌ Error generating answer: {e}"

-def generate_multimodal_summary():
-    if not extracted_content:
-        return "❌ Please upload and process a PDF first."
     try:
-        preview = extracted_content[:3000]
-        messages = [
-            {"role": "user", "content": [{"type": "text", "text":
-                "Please provide a comprehensive summary of this document content. The content includes both textual "
-                f"information and descriptions of visual elements.\n\nDOCUMENT CONTENT:\n{preview}\n\nSUMMARY:"
-            }]}
-        ]
-        resp = multimodal_client.chat_completion(
-            messages=messages, max_tokens=250, temperature=0.3
         )
-        return resp["choices"][0]["message"]["content"].strip()
     except Exception as e:
-        return f"❌ Error generating summary: {e}"

-def extract_multimodal_keywords():
-    if not extracted_content:
-        return "❌ Please upload and process a PDF first."
-    try:
-        preview = extracted_content[:3000]
-        messages = [
-            {"role": "user", "content": [{"type": "text", "text":
-                "Analyze the following document content and extract 12-15 key terms, concepts, and important phrases. "
-                f"DOCUMENT CONTENT:\n{preview}\n\nKEY TERMS:"
-            }]}
-        ]
-        resp = multimodal_client.chat_completion(
-            messages=messages, max_tokens=120, temperature=0.3
-        )
-        return resp["choices"][0]["message"]["content"].strip()
-    except Exception as e:
-        return f"❌ Error extracting keywords: {e}"

-def clear_multimodal_interface():
-    global index, retriever, current_pdf_name, extracted_content, extracted_images
-    for f in os.listdir(figures_dir):
-        try: os.remove(os.path.join(figures_dir, f))
-        except: pass
-    index = retriever = None
-    current_pdf_name = extracted_content = None
-    extracted_images.clear()
-    return None, "", gr.update(interactive=False)

-# ── Gradio UI ────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

-with gr.Blocks(theme=theme, css="""
-    .container { border-radius: 10px; padding: 15px; }
-    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
-    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
-    .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
-    .multimodal-badge { background: linear-gradient(45deg, #6366f1, #8b5cf6); color: white; padding: 5px 15px; border-radius: 20px; font-size: 14px; display: inline-block; margin: 10px auto; }
-    .model-info { background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 10px; margin: 10px 0; font-size: 12px; color: #64748b; }
-""") as demo:
-    gr.Markdown("<div class='main-title'>Unified MultiModal RAG</div>")
-    gr.Markdown("<div style='text-align:center;'><span class='multimodal-badge'>🧠 Single Model • Text + Vision</span></div>")
-    gr.Markdown("""
-    <div class='model-info'>
-    <strong>🤖 Powered by:</strong> Microsoft Phi-3.5-Vision + CLIP Embeddings + PyMuPDF (HF Spaces Compatible)
-    </div>
-    """)

     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Document Input")
-            pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
-            upload_button = gr.Button("Process PDF", variant="primary")
-            pdf_display = gr.Textbox(label="Active Document", interactive=False)
             status_box = gr.Textbox(label="Processing Status", interactive=False)
         with gr.Column():
-            gr.Markdown("### Ask Questions")
-            question_input = gr.Textbox(label="Your question", lines=3, interactive=False)
-            ask_button = gr.Button("Ask", variant="primary")
             answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)

-    with gr.Row():
-        with gr.Column():
-            summary_button = gr.Button("Generate Summary", variant="secondary")
-            summary_output = gr.Textbox(label="Document Summary", lines=4, interactive=False)
-        with gr.Column():
-            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
-            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

-    clear_button = gr.Button("🗑️ Clear All", variant="secondary")
-    gr.Markdown("""
-    <div class='footer'>
-    <strong>Unified Multimodal Pipeline:</strong> One model handles text, images, charts, tables, diagrams, and mixed content queries
-    </div>
-    """)

-    upload_button.click(process_pdf_multimodal, [pdf_file], [pdf_display, status_box, question_input])
-    ask_button.click(ask_multimodal_question, [pdf_display, question_input], answer_output)
-    summary_button.click(generate_multimodal_summary, [], summary_output)
-    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
-    clear_button.click(clear_multimodal_interface, [], [pdf_file, pdf_display, question_input])

 if __name__ == "__main__":
-    demo.launch(debug=True)
 # app.py
 import os
 import tempfile
 import base64
+from pathlib import Path
 import io

 import gradio as gr
 from huggingface_hub import InferenceClient
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

+# ── Globals ──────────────────────────────────────────────────────────────────
 index = None
 retriever = None
 extracted_content = None

+# ── Inference & Embeddings ───────────────────────────────────────────────────
 multimodal_client = InferenceClient(model="microsoft/Phi-3.5-vision-instruct")
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

+# Temporary dirs for image extraction
+TMP_DIR = tempfile.mkdtemp()
+FIGURES_DIR = os.path.join(TMP_DIR, "figures")
+os.makedirs(FIGURES_DIR, exist_ok=True)

+# ── Helpers ──────────────────────────────────────────────────────────────────
 def encode_image_to_base64(image_path):
+    with open(image_path, "rb") as f:
+        return base64.b64encode(f.read()).decode()


+def extract_images_from_pdf(pdf_path):
+    from fitz import open as fitz_open
+    from PIL import Image
+    import fitz
+
+    extracted = []
+    descriptions = []
     try:
+        doc = fitz_open(pdf_path)
+        for p in range(len(doc)):
+            page = doc.load_page(p)
+            for img in page.get_images():
                 xref = img[0]
+                pix = fitz.Pixmap(doc, xref)
                 if pix.n - pix.alpha < 4:
+                    png = pix.tobytes("png")
+                    img_pil = Image.open(io.BytesIO(png))
+                    fname = f"page_{p}_img_{xref}.png"
+                    path = os.path.join(FIGURES_DIR, fname)
+                    img_pil.save(path)
+                    desc = analyze_image(path)
+                    extracted.append(path)
+                    descriptions.append(desc)
                 pix = None
+        doc.close()
     except Exception as e:
+        print(f"Image extraction error: {e}")
+    return extracted, descriptions


+def analyze_image(image_path):
     try:
         b64 = encode_image_to_base64(image_path)
         prompt = (
+            "Analyze this image and provide a detailed description. "
+            "Include any text, charts, tables, or important visual elements.\n"
+            "Image: [data]\nDescription:"
         )
+        raw = multimodal_client.text_generation(
             prompt=prompt, max_new_tokens=200, temperature=0.3
         )
+        # Handle dict or list wrapping
+        if isinstance(raw, dict):
+            out = raw.get("generated_text", str(raw))
+        elif isinstance(raw, list) and raw and isinstance(raw[0], dict):
+            out = raw[0].get("generated_text", str(raw))
+        else:
+            out = str(raw)
+        return f"[IMAGE]: {out.strip()}"
     except Exception as e:
+        return f"[IMAGE ERROR]: {e}"


+def process_pdf(pdf_file):
+    global index, retriever, extracted_content
+    if not pdf_file:
+        return None, "❌ Upload a PDF.", gr.update(interactive=False)

+    # clear old images
+    for f in os.listdir(FIGURES_DIR):
+        os.remove(os.path.join(FIGURES_DIR, f))

+    # Gradio may hand us a file wrapper or a plain path string
+    path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
     try:
+        import fitz
+        doc = fitz.open(path)
+        pages = []
+        for i in range(len(doc)):
+            txt = doc.load_page(i).get_text().strip()
+            if txt:
+                pages.append(f"[Page {i+1}]\n" + txt)
+        doc.close()

+        imgs, descs = extract_images_from_pdf(path)
+        all_content = pages + descs
         extracted_content = "\n\n".join(all_content)
         if not extracted_content:
+            return pdf_file.name, "❌ No content extracted.", gr.update(interactive=False)

         splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000, chunk_overlap=200, add_start_index=True
         )
         chunks = splitter.split_text(extracted_content)
         index = FAISS.from_texts(chunks, embeddings)
         retriever = index.as_retriever(search_kwargs={"k": 3})

+        msg = f"✅ Processed {pdf_file.name} → {len(chunks)} chunks."
+        return pdf_file.name, msg, gr.update(interactive=True)

     except Exception as e:
+        return pdf_file.name if pdf_file else None, f"❌ PDF error: {e}", gr.update(interactive=False)


+def ask_question(doc_name, question):
     global retriever
     if not retriever:
+        return "❌ Process a PDF first."
     if not question.strip():
+        return "❌ Enter a question."

+    # retrieve
     try:
         docs = retriever.invoke(question)
+    except Exception:
+        docs = retriever.get_relevant_documents(question)
+
+    context = "\n\n".join(d.page_content for d in docs)
+    prompt = (
+        "You are an AI assistant with both text and visual context.\n"
+        f"CONTEXT:\n{context}\nQUESTION: {question}\nAnswer:"
+    )
     try:
+        raw = multimodal_client.text_generation(
+            prompt=prompt, max_new_tokens=300, temperature=0.5
         )
+        if isinstance(raw, dict):
+            out = raw.get("generated_text", str(raw))
+        elif isinstance(raw, list) and raw and isinstance(raw[0], dict):
+            out = raw[0].get("generated_text", str(raw))
+        else:
+            out = str(raw)
+        return out.strip()
     except Exception as e:
+        return f"❌ Generation error: {e}"

+# ── Gradio UI ─────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
+with gr.Blocks(theme=theme) as demo:
+    gr.Markdown("## 🧠 Unified MultiModal RAG")
     with gr.Row():
         with gr.Column():
+            pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"], type="file")
+            proc_btn = gr.Button("🔄 Process PDF", variant="primary")
+            pdf_disp = gr.Textbox(label="Active Doc", interactive=False)
+            status = gr.Textbox(label="Status", interactive=False)
         with gr.Column():
+            q_in = gr.Textbox(label="Ask your question…", lines=3, interactive=False)
+            ask_btn = gr.Button("🔍 Ask", variant="primary", interactive=False)
+            ans_out = gr.Textbox(label="Answer", lines=6, interactive=False)

+    proc_btn.click(process_pdf, [pdf_in], [pdf_disp, status, q_in])
+    # enable ask button only after processing
+    proc_btn.click(lambda *_: gr.update(interactive=True), [], [ask_btn])
+    ask_btn.click(ask_question, [pdf_disp, q_in], ans_out)

 if __name__ == "__main__":
+    demo.launch(debug=True)
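
For reference, the indexing and retrieval path that `process_pdf` and `ask_question` share can be exercised without the Gradio UI. A minimal sketch, assuming the same dependencies as `app.py`; the `content` string is a made-up stand-in for the text-plus-image-description blob the app builds:

# Standalone sketch of the split → embed → index → retrieve path (no Gradio, no PDF).
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/clip-ViT-B-32")

# Placeholder content in the same "[Page N]" / "[IMAGE]:" format the app emits.
content = (
    "[Page 1]\nRevenue grew 12% year over year.\n\n"
    "[IMAGE]: A bar chart comparing quarterly revenue across 2023."
)
chunks = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
).split_text(content)

index = FAISS.from_texts(chunks, embeddings)            # build the vector store
retriever = index.as_retriever(search_kwargs={"k": 3})  # same k as the app
for doc in retriever.invoke("What does the chart show?"):
    print(doc.page_content)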