Spaces:

Muzammil6376
/

Multimodal

Sleeping

App Files Files Community

Muzammil6376 committed on May 21

Commit

7fdd092

verified ·

1 Parent(s): cb3c155

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -14

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 # app.py
 import os
@@ -19,14 +20,18 @@ from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.utils.constants import PartitionStrategy
 # ————— Config & Folders —————
-PDF_DIR = Path("pdfs"); FIG_DIR = Path("figures")
-PDF_DIR.mkdir(exist_ok=True); FIG_DIR.mkdir(exist_ok=True)
 # ————— Read your HF_TOKEN secret —————
 hf_token = os.environ["HF_TOKEN"]
 # ————— Embeddings & LLM Setup —————
-embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 llm = HuggingFaceEndpoint(
     endpoint_url="https://api-inference.huggingface.co/models/google/flan-t5-base",
@@ -43,37 +48,47 @@ Answer (up to 3 sentences):
 """
 prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
-# ————— FIXED: correct keyword for InferenceClient —————
 vision_client = InferenceClient(
     model="Salesforce/blip-image-captioning-base",
     token=hf_token,
 )
 vector_store = None
 qa_chain = None
 def extract_image_caption(path: str) -> str:
     """Caption the image stored at ``path`` using the remote vision client.

     Opens the file with ``Image.open`` and passes the opened image to
     ``vision_client.image_to_text``, returning whatever that call yields.
     """
     # The context manager closes the image file once the call returns.
     with Image.open(path) as img:
         # NOTE(review): the annotation promises ``str``; confirm the client
         # returns plain text here rather than a structured output object.
         return vision_client.image_to_text(img)
-def process_pdf(pdf_file) -> str:
     global vector_store, qa_chain
-    out_path = PDF_DIR / pdf_file.name
-    with open(out_path, "wb") as f:
-        f.write(pdf_file.read())
     elems = partition_pdf(
-        str(out_path),
         strategy=PartitionStrategy.HI_RES,
         extract_image_block_types=["Image", "Table"],
         extract_image_block_output_dir=str(FIG_DIR),
     )
     texts = [el.text for el in elems if el.category not in ("Image", "Table")]
     for img_file in FIG_DIR.iterdir():
         texts.append(extract_image_caption(str(img_file)))
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     docs = splitter.split_text("\n\n".join(texts))
@@ -84,21 +99,33 @@ def process_pdf(pdf_file) -> str:
         chain_type_kwargs={"prompt": prompt},
     )
-    return f"✅ Processed `{pdf_file.name}` into {len(docs)} chunks."
 def answer_query(question: str) -> str:
     """Answer ``question`` with the module-level QA chain.

     Returns a reminder to upload and process a PDF while ``qa_chain`` is
     still ``None``; otherwise returns the result of ``qa_chain.run(question)``.
     """
     # ``qa_chain`` starts as ``None`` at module level, so guard against
     # questions asked before any chain has been assigned.
     if qa_chain is None:
         return "❗ Please upload and process a PDF first."
     return qa_chain.run(question)
 with gr.Blocks() as demo:
-    gr.Markdown("## 📄📷 Multimodal RAG — HF Spaces")
     with gr.Row():
-        pdf_in = gr.File(label="Upload PDF", type="file")
-        btn_proc = gr.Button("Process PDF"); status = gr.Textbox(label="Status")
     with gr.Row():
         q_in = gr.Textbox(label="Your Question")
-        btn_ask = gr.Button("Ask"); ans_out = gr.Textbox(label="Answer")
     btn_proc.click(fn=process_pdf, inputs=pdf_in, outputs=status)
     btn_ask.click(fn=answer_query, inputs=q_in, outputs=ans_out)

 # app.py
 import os
 from unstructured.partition.utils.constants import PartitionStrategy
 # ————— Config & Folders —————
+PDF_DIR = Path("pdfs")
+FIG_DIR = Path("figures")
+PDF_DIR.mkdir(exist_ok=True)
+FIG_DIR.mkdir(exist_ok=True)
 # ————— Read your HF_TOKEN secret —————
 hf_token = os.environ["HF_TOKEN"]
 # ————— Embeddings & LLM Setup —————
+embedding_model = HuggingFaceEmbeddings(
+    model_name="sentence-transformers/all-MiniLM-L6-v2"
+)
 llm = HuggingFaceEndpoint(
     endpoint_url="https://api-inference.huggingface.co/models/google/flan-t5-base",
 """
 prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])
+# ————— Inference client for image captioning —————
 vision_client = InferenceClient(
     model="Salesforce/blip-image-captioning-base",
     token=hf_token,
 )
+# Globals (initialized after processing)
 vector_store = None
 qa_chain = None
 def extract_image_caption(path: str) -> str:
     """Caption the image stored at ``path`` using the remote vision client.

     Opens the file with ``Image.open`` and passes the opened image to
     ``vision_client.image_to_text``, returning whatever that call yields.
     """
     # The context manager closes the image file once the call returns.
     with Image.open(path) as img:
         # NOTE(review): the annotation promises ``str``; confirm the client
         # returns plain text here rather than a structured output object.
         return vision_client.image_to_text(img)
+def process_pdf(pdf_path: str) -> str:
+    """Ingest a local PDF file, extract text & images, chunk, embed, and index."""
     global vector_store, qa_chain
+    # Move the uploaded PDF into our PDFs folder
+    src = Path(pdf_path)
+    dest = PDF_DIR / src.name
+    src.rename(dest)
+    # Partition PDF into text + image blocks
     elems = partition_pdf(
+        str(dest),
         strategy=PartitionStrategy.HI_RES,
         extract_image_block_types=["Image", "Table"],
         extract_image_block_output_dir=str(FIG_DIR),
     )
+    # Collect text blocks
     texts = [el.text for el in elems if el.category not in ("Image", "Table")]
+    # Generate captions for each extracted image
     for img_file in FIG_DIR.iterdir():
         texts.append(extract_image_caption(str(img_file)))
+    # Chunk and embed
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     docs = splitter.split_text("\n\n".join(texts))
         chain_type_kwargs={"prompt": prompt},
     )
+    return f"✅ Processed `{dest.name}` into {len(docs)} chunks."
 def answer_query(question: str) -> str:
     """Answer ``question`` with the module-level QA chain.

     Returns a reminder to upload and process a PDF while ``qa_chain`` is
     still ``None``; otherwise returns the result of ``qa_chain.run(question)``.
     """
     # ``qa_chain`` starts as ``None`` at module level, so guard against
     # questions asked before any chain has been assigned.
     if qa_chain is None:
         return "❗ Please upload and process a PDF first."
     return qa_chain.run(question)
+# ————— Gradio UI —————
 with gr.Blocks() as demo:
+    gr.Markdown("## 📄📷 Multimodal RAG — Hugging Face Spaces")
     with gr.Row():
+        pdf_in = gr.File(
+            label="Upload PDF",
+            file_types=["pdf"],
+            type="filepath"
+        )
+        btn_proc = gr.Button("Process PDF")
+        status = gr.Textbox(label="Status")
     with gr.Row():
         q_in = gr.Textbox(label="Your Question")
+        btn_ask = gr.Button("Ask")
+        ans_out = gr.Textbox(label="Answer")
     btn_proc.click(fn=process_pdf, inputs=pdf_in, outputs=status)
     btn_ask.click(fn=answer_query, inputs=q_in, outputs=ans_out)