Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,6 +22,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|
22 |
|
23 |
|
24 |
|
|
|
25 |
retriever = None # FAISS retriever for multimodal content
|
26 |
current_pdf_name = None # Name of the currently loaded PDF
|
27 |
combined_texts: List[str] = [] # Combined text + image captions corpus
|
@@ -50,8 +51,7 @@ def generate_caption(image_path: str) -> str:
|
|
50 |
|
51 |
def embed_texts(texts: List[str]) -> List[List[float]]:
|
52 |
"""
|
53 |
-
Call the HF embeddings endpoint.
|
54 |
-
Uses `google/Gemma-Embeddings-v1.0` (or any other hosted embeddings model).
|
55 |
"""
|
56 |
resp = hf.embeddings(
|
57 |
model="google/Gemma-Embeddings-v1.0",
|
@@ -62,7 +62,47 @@ def embed_texts(texts: List[str]) -> List[List[float]]:
|
|
62 |
|
63 |
def process_pdf(pdf_file) -> str:
|
64 |
"""
|
65 |
-
Parse the PDF, caption images, combine text+captions, embed remotely,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
build FAISS index, and prepare retriever.
|
67 |
"""
|
68 |
global current_pdf_name, retriever, combined_texts
|
@@ -109,12 +149,13 @@ def process_pdf(pdf_file) -> str:
|
|
109 |
)
|
110 |
retriever = index.as_retriever(search_kwargs={"k": 2})
|
111 |
|
112 |
-
return f"✅ Indexed '{current_pdf_name}' — "
|
113 |
-
f"{len(text_elements)} text blocks + {len(captions)} image captions"
|
114 |
|
115 |
|
116 |
def ask_question(question: str) -> str:
|
117 |
-
"""
|
|
|
|
|
118 |
global retriever
|
119 |
if retriever is None:
|
120 |
return "❌ Please upload and process a PDF first."
|
@@ -149,7 +190,6 @@ def clear_interface():
|
|
149 |
os.makedirs(FIGURES_DIR, exist_ok=True)
|
150 |
return ""
|
151 |
|
152 |
-
|
153 |
# ── Gradio UI ─────────────────────────────────────────────────────────────────
|
154 |
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
|
155 |
with gr.Blocks(theme=theme, css="""
|
@@ -191,4 +231,4 @@ with gr.Blocks(theme=theme, css="""
|
|
191 |
outputs=[status_box, answer_output])
|
192 |
|
193 |
if __name__ == "__main__":
|
194 |
-
demo.launch(debug=True, share=True)
|
|
|
22 |
|
23 |
|
24 |
|
25 |
+
# ── Globals ───────────────────────────────────────────────────────────────────
|
26 |
retriever = None # FAISS retriever for multimodal content
|
27 |
current_pdf_name = None # Name of the currently loaded PDF
|
28 |
combined_texts: List[str] = [] # Combined text + image captions corpus
|
|
|
51 |
|
52 |
def embed_texts(texts: List[str]) -> List[List[float]]:
|
53 |
"""
|
54 |
+
Call the HF embeddings endpoint using google/Gemma-Embeddings-v1.0.
|
|
|
55 |
"""
|
56 |
resp = hf.embeddings(
|
57 |
model="google/Gemma-Embeddings-v1.0",
|
|
|
62 |
|
63 |
def process_pdf(pdf_file) -> str:
|
64 |
"""
|
65 |
+
Parse the PDF, caption images, combine text+captions, embed remotely,
|
66 |
+
build FAISS index, and prepare retriever. Falls back to text-only if poppler is missing.
|
67 |
+
"""
|
68 |
+
from pdf2image.exceptions import PDFInfoNotInstalledError
|
69 |
+
global current_pdf_name, retriever, combined_texts
|
70 |
+
|
71 |
+
if pdf_file is None:
|
72 |
+
return "❌ Please upload a PDF file."
|
73 |
+
|
74 |
+
pdf_path = pdf_file.name
|
75 |
+
current_pdf_name = os.path.basename(pdf_path)
|
76 |
+
|
77 |
+
# Try rich parsing; fallback if poppler/pdfinfo is unavailable
|
78 |
+
try:
|
79 |
+
elements = partition_pdf(
|
80 |
+
filename=pdf_path,
|
81 |
+
strategy=PartitionStrategy.HI_RES,
|
82 |
+
extract_image_block_types=["Image", "Table"],
|
83 |
+
extract_image_block_output_dir=FIGURES_DIR,
|
84 |
+
)
|
85 |
+
text_elements = [el.text for el in elements if el.category not in ["Image","Table"] and el.text]
|
86 |
+
image_files = [os.path.join(FIGURES_DIR, f) for f in os.listdir(FIGURES_DIR)
|
87 |
+
if f.lower().endswith((".png",".jpg",".jpeg"))]
|
88 |
+
except PDFInfoNotInstalledError:
|
89 |
+
# Fallback: text-only extraction
|
90 |
+
from PyPDF2 import PdfReader
|
91 |
+
reader = PdfReader(pdf_path)
|
92 |
+
text_elements = [page.extract_text() or "" for page in reader.pages]
|
93 |
+
image_files = []
|
94 |
+
|
95 |
+
# Caption images if any
|
96 |
+
captions = [generate_caption(img) for img in image_files]
|
97 |
+
|
98 |
+
combined_texts = text_elements + captions
|
99 |
+
vectors = embed_texts(combined_texts)
|
100 |
+
index = FAISS.from_embeddings(texts=combined_texts, embeddings=vectors)
|
101 |
+
retriever = index.as_retriever(search_kwargs={"k": 2})
|
102 |
+
|
103 |
+
return f"✅ Indexed '{current_pdf_name}' — {len(text_elements)} text blocks + {len(captions)} image captions"
|
104 |
+
"""
|
105 |
+
Parse the PDF, caption images, combine text+captions, embed remotely,
|
106 |
build FAISS index, and prepare retriever.
|
107 |
"""
|
108 |
global current_pdf_name, retriever, combined_texts
|
|
|
149 |
)
|
150 |
retriever = index.as_retriever(search_kwargs={"k": 2})
|
151 |
|
152 |
+
return f"✅ Indexed '{current_pdf_name}' — {len(text_elements)} text blocks + {len(captions)} image captions"
|
|
|
153 |
|
154 |
|
155 |
def ask_question(question: str) -> str:
|
156 |
+
"""
|
157 |
+
Retrieve top-k chunks from FAISS and call chat_completions endpoint.
|
158 |
+
"""
|
159 |
global retriever
|
160 |
if retriever is None:
|
161 |
return "❌ Please upload and process a PDF first."
|
|
|
190 |
os.makedirs(FIGURES_DIR, exist_ok=True)
|
191 |
return ""
|
192 |
|
|
|
193 |
# ── Gradio UI ─────────────────────────────────────────────────────────────────
|
194 |
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
|
195 |
with gr.Blocks(theme=theme, css="""
|
|
|
231 |
outputs=[status_box, answer_output])
|
232 |
|
233 |
if __name__ == "__main__":
|
234 |
+
demo.launch(debug=True, share=True)
|