Muzammil6376 committed
Commit 583b178 · verified · 1 Parent(s): e735775

Update app.py

Files changed (1):
  1. app.py +85 -65
app.py CHANGED
@@ -22,11 +22,9 @@ from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
-
-# ── Globals ───────────────────────────────────────────────────────────────────
 retriever = None         # FAISS retriever for multimodal content
 current_pdf_name = None  # Name of the currently loaded PDF
-combined_texts = None    # Combined text + image captions corpus
+combined_texts: List[str] = []  # Combined text + image captions corpus
 
 # ── Setup: directories ─────────────────────────────────────────────────────────
 FIGURES_DIR = "figures"
@@ -34,76 +32,89 @@ if os.path.exists(FIGURES_DIR):
     shutil.rmtree(FIGURES_DIR)
 os.makedirs(FIGURES_DIR, exist_ok=True)
 
-# ── Models & Clients ───────────────────────────────────────────────────────────
-# Chat model (Mistral-7B-Instruct)
-chat_client = InferenceClient(model="google/gemma-3-27b-it")
-# Text embeddings (BAAI BGE)
-embeddings = HuggingFaceEmbeddings(model_name="google/gemma-3-27b-it")
-# Image captioning (BLIP)
+# ── Clients & Models ───────────────────────────────────────────────────────────
+hf = InferenceClient()  # will use HUGGINGFACEHUB_API_TOKEN from env
+
+# BLIP captioner (small local model download)
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
 
 def generate_caption(image_path: str) -> str:
-    """
-    Generates a natural-language caption for an image using BLIP.
-    """
-    image = Image.open(image_path).convert('RGB')
+    """Ask BLIP to caption a local image."""
+    image = Image.open(image_path).convert("RGB")
     inputs = blip_processor(image, return_tensors="pt")
     out = blip_model.generate(**inputs)
-    caption = blip_processor.decode(out[0], skip_special_tokens=True)
-    return caption
+    return blip_processor.decode(out[0], skip_special_tokens=True)
+
+
+def embed_texts(texts: List[str]) -> List[List[float]]:
+    """
+    Call the HF embeddings endpoint.
+    Uses `google/Gemma-Embeddings-v1.0` (or any other hosted embeddings model).
+    """
+    resp = hf.embeddings(
+        model="google/Gemma-Embeddings-v1.0",
+        inputs=texts,
+    )
+    return resp["embeddings"]
 
 
 def process_pdf(pdf_file) -> str:
     """
-    Parses the uploaded PDF into text chunks and image captions,
-    builds a FAISS index, and prepares the retriever.
-    Returns status message.
+    Parse the PDF, caption its images, combine text+captions, embed remotely,
+    build FAISS index, and prepare retriever.
     """
     global current_pdf_name, retriever, combined_texts
+
     if pdf_file is None:
         return "❌ Please upload a PDF file."
 
-    # Save PDF locally for unstructured
+    # Save and name
     pdf_path = pdf_file.name
     current_pdf_name = os.path.basename(pdf_path)
 
-    # Extract text, table, and image blocks
+    # Extract blocks
     elements = partition_pdf(
         filename=pdf_path,
         strategy=PartitionStrategy.HI_RES,
         extract_image_block_types=["Image", "Table"],
-        extract_image_block_output_dir=FIGURES_DIR
+        extract_image_block_output_dir=FIGURES_DIR,
     )
 
-    # Separate text and image elements
-    text_elements = [el.text for el in elements if el.category not in ["Image", "Table"] and el.text]
-    image_files = [os.path.join(FIGURES_DIR, f)
-                   for f in os.listdir(FIGURES_DIR)
-                   if f.lower().endswith((".png", ".jpg", ".jpeg"))]
-
-    # Generate captions for each image
-    captions = []
-    for img in image_files:
-        cap = generate_caption(img)
-        captions.append(cap)
-
-    # Combine all pieces for indexing
+    # Split text vs. images
+    text_elements = [
+        el.text for el in elements
+        if el.category not in ["Image", "Table"] and el.text
+    ]
+    image_files = [
+        os.path.join(FIGURES_DIR, f)
+        for f in os.listdir(FIGURES_DIR)
+        if f.lower().endswith((".png", ".jpg", ".jpeg"))
+    ]
+
+    # Caption images
+    captions = [generate_caption(img) for img in image_files]
+
+    # Combine
     combined_texts = text_elements + captions
 
-    # Create FAISS index and retriever
-    index = FAISS.from_texts(combined_texts, embeddings)
+    # Remote embeddings
+    vectors = embed_texts(combined_texts)
+
+    # Build FAISS
+    index = FAISS.from_embeddings(
+        texts=combined_texts,
+        embeddings=vectors,
+    )
     retriever = index.as_retriever(search_kwargs={"k": 2})
 
-    status = f"✅ Indexed '{current_pdf_name}' — {len(text_elements)} text blocks + {len(captions)} image captions"
-    return status
+    return f"✅ Indexed '{current_pdf_name}' — " \
+           f"{len(text_elements)} text blocks + {len(captions)} image captions"
 
 
 def ask_question(question: str) -> str:
-    """
-    Retrieves relevant chunks from the FAISS index and generates an answer via chat model.
-    """
+    """Retrieve top-k chunks from FAISS and call chat_completions endpoint."""
     global retriever
     if retriever is None:
         return "❌ Please upload and process a PDF first."
@@ -119,56 +130,65 @@ def ask_question(question: str) -> str:
         f"Question: {question}\n"
         "Answer:"
     )
-
-    response = chat_client.chat_completion(
+    response = hf.chat_completion(
+        model="google/gemma-3-27b-it",
         messages=[{"role": "user", "content": prompt}],
         max_tokens=128,
-        temperature=0.5
+        temperature=0.5,
     )
-    answer = response["choices"][0]["message"]["content"].strip()
-    return answer
+    return response["choices"][0]["message"]["content"].strip()
 
 
 def clear_interface():
-    """Resets global state and clears the figures directory."""
+    """Reset state and clear extracted images."""
    global retriever, current_pdf_name, combined_texts
    retriever = None
    current_pdf_name = None
-    combined_texts = None
-    shutil.rmtree(FIGURES_DIR)
+    combined_texts = []
+    shutil.rmtree(FIGURES_DIR, ignore_errors=True)
    os.makedirs(FIGURES_DIR, exist_ok=True)
    return ""
 
+
 # ── Gradio UI ────────────────────────────────────────────────────────────────
 theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
 with gr.Blocks(theme=theme, css="""
     .container { border-radius: 10px; padding: 15px; }
-    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
-    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
-    .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
+    .pdf-active { border-left: 3px solid #6366f1;
+                  padding-left: 10px;
+                  background-color: rgba(99,102,241,0.1); }
+    .footer { text-align: center; margin-top: 30px;
+              font-size: 0.8em; color: #666; }
+    .main-title { text-align: center; font-size: 64px;
+                  font-weight: bold; margin-bottom: 20px; }
 """) as demo:
-    gr.Markdown("<div class='main-title'>DocQueryAI (Multimodal)</div>")
+    gr.Markdown("<div class='main-title'>DocQueryAI (Remote‐RAG)</div>")
 
     with gr.Row():
         with gr.Column():
             gr.Markdown("## 📄 Document Input")
-            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
-            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
+            pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
             process_btn = gr.Button("📤 Process Document", variant="primary")
-            status_box = gr.Textbox(label="Status", interactive=False)
+            status_box = gr.Textbox(label="Status", interactive=False)
 
         with gr.Column():
             gr.Markdown("## ❓ Ask Questions")
-            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
-            ask_btn = gr.Button("🔍 Ask Question", variant="primary")
-            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
+            question_input = gr.Textbox(lines=3,
+                                        placeholder="Enter your question here…")
+            ask_btn = gr.Button("🔍 Ask Question", variant="primary")
+            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
 
     clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS + BLIP | Gradio</div>")
+    gr.Markdown("<div class='footer'>Powered by HF Inference + BLIP + FAISS | Gradio</div>")
 
-    process_btn.click(fn=process_pdf, inputs=[pdf_file], outputs=[status_box])
-    ask_btn.click(fn=ask_question, inputs=[question_input], outputs=[answer_output])
-    clear_btn.click(fn=clear_interface, outputs=[status_box, answer_output])
+    process_btn.click(fn=process_pdf,
+                      inputs=[pdf_file],
+                      outputs=[status_box])
+    ask_btn.click(fn=ask_question,
+                  inputs=[question_input],
+                  outputs=[answer_output])
+    clear_btn.click(fn=clear_interface,
+                    outputs=[status_box, answer_output])
 
 if __name__ == "__main__":
-    demo.launch(debug=True, share=True)
+    demo.launch(debug=True, share=True)
 
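A note on the new remote-embedding path: at the time of writing, huggingface_hub's InferenceClient exposes `feature_extraction` rather than an `embeddings` method, and LangChain's `FAISS.from_embeddings` takes `text_embeddings` as (text, vector) pairs plus an `embedding` object that it reuses to embed queries at retrieval time. The new type hints also assume `from typing import List` exists above the visible hunk. Below is a minimal sketch of wiring that would satisfy both APIs; the `RemoteEmbeddings` adapter is hypothetical, and the hosted availability of `google/Gemma-Embeddings-v1.0` on the Inference API is an assumption, not something this commit verifies.

    from typing import List

    from huggingface_hub import InferenceClient
    from langchain_community.vectorstores import FAISS
    from langchain_core.embeddings import Embeddings


    class RemoteEmbeddings(Embeddings):
        """Hypothetical adapter: embed documents/queries via the HF Inference API."""

        def __init__(self, client: InferenceClient,
                     model: str = "google/Gemma-Embeddings-v1.0"):
            self.client = client
            self.model = model

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            # feature_extraction takes one text at a time and returns a numpy
            # array; assumes the model pools to a single vector per input.
            return [
                self.client.feature_extraction(t, model=self.model).tolist()
                for t in texts
            ]

        def embed_query(self, text: str) -> List[float]:
            return self.embed_documents([text])[0]


    hf = InferenceClient()  # reads the HF token from the environment
    embedder = RemoteEmbeddings(hf)

    combined_texts = ["a text block", "an image caption"]
    vectors = embedder.embed_documents(combined_texts)

    # from_embeddings wants (text, vector) pairs and keeps `embedder` around
    # so the retriever can embed incoming questions the same way.
    index = FAISS.from_embeddings(
        text_embeddings=list(zip(combined_texts, vectors)),
        embedding=embedder,
    )
    retriever = index.as_retriever(search_kwargs={"k": 2})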
22
 
23
 
24
 
 
 
25
  retriever = None # FAISS retriever for multimodal content
26
  current_pdf_name = None # Name of the currently loaded PDF
27
+ combined_texts: List[str] = [] # Combined text + image captions corpus
28
 
29
  # ── Setup: directories ─────────────────────────────────────────────────────────
30
  FIGURES_DIR = "figures"
 
32
  shutil.rmtree(FIGURES_DIR)
33
  os.makedirs(FIGURES_DIR, exist_ok=True)
34
 
35
+ # ── Clients & Models ───────────────────────────────────────────────────────────
36
+ hf = InferenceClient() # will use HUGGINGFACEHUB_API_TOKEN from env
37
+
38
+ # BLIP captioner (small local model download)
 
 
39
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
40
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
41
 
42
 
43
  def generate_caption(image_path: str) -> str:
44
+ """Ask BLIP to caption a local image."""
45
+ image = Image.open(image_path).convert("RGB")
 
 
46
  inputs = blip_processor(image, return_tensors="pt")
47
  out = blip_model.generate(**inputs)
48
+ return blip_processor.decode(out[0], skip_special_tokens=True)
49
+
50
+
51
+ def embed_texts(texts: List[str]) -> List[List[float]]:
52
+ """
53
+ Call the HF embeddings endpoint.
54
+ Uses `google/Gemma-Embeddings-v1.0` (or any other hosted embeddings model).
55
+ """
56
+ resp = hf.embeddings(
57
+ model="google/Gemma-Embeddings-v1.0",
58
+ inputs=texts,
59
+ )
60
+ return resp["embeddings"]
61
 
62
 
63
  def process_pdf(pdf_file) -> str:
64
  """
65
+ Parse the PDF, caption its images, combine text+captions, embed remotely,
66
+ build FAISS index, and prepare retriever.
 
67
  """
68
  global current_pdf_name, retriever, combined_texts
69
+
70
  if pdf_file is None:
71
  return "❌ Please upload a PDF file."
72
 
73
+ # Save and name
74
  pdf_path = pdf_file.name
75
  current_pdf_name = os.path.basename(pdf_path)
76
 
77
+ # Extract blocks
78
  elements = partition_pdf(
79
  filename=pdf_path,
80
  strategy=PartitionStrategy.HI_RES,
81
  extract_image_block_types=["Image", "Table"],
82
+ extract_image_block_output_dir=FIGURES_DIR,
83
  )
84
 
85
+ # Split text vs. images
86
+ text_elements = [
87
+ el.text for el in elements
88
+ if el.category not in ["Image", "Table"] and el.text
89
+ ]
90
+ image_files = [
91
+ os.path.join(FIGURES_DIR, f)
92
+ for f in os.listdir(FIGURES_DIR)
93
+ if f.lower().endswith((".png", ".jpg", ".jpeg"))
94
+ ]
95
+
96
+ # Caption images
97
+ captions = [generate_caption(img) for img in image_files]
98
+
99
+ # Combine
100
  combined_texts = text_elements + captions
101
 
102
+ # Remote embeddings
103
+ vectors = embed_texts(combined_texts)
104
+
105
+ # Build FAISS
106
+ index = FAISS.from_embeddings(
107
+ texts=combined_texts,
108
+ embeddings=vectors,
109
+ )
110
  retriever = index.as_retriever(search_kwargs={"k": 2})
111
 
112
+ return f"βœ… Indexed '{current_pdf_name}' β€” " \
113
+ f"{len(text_elements)} text blocks + {len(captions)} image captions"
114
 
115
 
116
  def ask_question(question: str) -> str:
117
+ """Retrieve top-k chunks from FAISS and call chat_completions endpoint."""
 
 
118
  global retriever
119
  if retriever is None:
120
  return "❌ Please upload and process a PDF first."
 
130
  f"Question: {question}\n"
131
  "Answer:"
132
  )
133
+ response = hf.chat_completion(
134
+ model="google/gemma-3-27b-it",
135
  messages=[{"role": "user", "content": prompt}],
136
  max_tokens=128,
137
+ temperature=0.5,
138
  )
139
+ return response["choices"][0]["message"]["content"].strip()
 
140
 
141
 
142
  def clear_interface():
143
+ """Reset state and clear extracted images."""
144
  global retriever, current_pdf_name, combined_texts
145
  retriever = None
146
  current_pdf_name = None
147
+ combined_texts = []
148
+ shutil.rmtree(FIGURES_DIR, ignore_errors=True)
149
  os.makedirs(FIGURES_DIR, exist_ok=True)
150
  return ""
151
 
152
+
153
  # ── Gradio UI ────────────────────────────────────────────────────────────────
154
  theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
155
  with gr.Blocks(theme=theme, css="""
156
  .container { border-radius: 10px; padding: 15px; }
157
+ .pdf-active { border-left: 3px solid #6366f1;
158
+ padding-left: 10px;
159
+ background-color: rgba(99,102,241,0.1); }
160
+ .footer { text-align: center; margin-top: 30px;
161
+ font-size: 0.8em; color: #666; }
162
+ .main-title { text-align: center; font-size: 64px;
163
+ font-weight: bold; margin-bottom: 20px; }
164
  """) as demo:
165
+ gr.Markdown("<div class='main-title'>DocQueryAI (Remote‐RAG)</div>")
166
 
167
  with gr.Row():
168
  with gr.Column():
169
  gr.Markdown("## πŸ“„ Document Input")
170
+ pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"], type="filepath")
 
171
  process_btn = gr.Button("πŸ“€ Process Document", variant="primary")
172
+ status_box = gr.Textbox(label="Status", interactive=False)
173
 
174
  with gr.Column():
175
  gr.Markdown("## ❓ Ask Questions")
176
+ question_input = gr.Textbox(lines=3,
177
+ placeholder="Enter your question here…")
178
+ ask_btn = gr.Button("πŸ” Ask Question", variant="primary")
179
+ answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
180
 
181
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
182
+ gr.Markdown("<div class='footer'>Powered by HF Inference + BLIP + FAISS | Gradio</div>")
183
 
184
+ process_btn.click(fn=process_pdf,
185
+ inputs=[pdf_file],
186
+ outputs=[status_box])
187
+ ask_btn.click(fn=ask_question,
188
+ inputs=[question_input],
189
+ outputs=[answer_output])
190
+ clear_btn.click(fn=clear_interface,
191
+ outputs=[status_box, answer_output])
192
 
193
  if __name__ == "__main__":
194
+ demo.launch(debug=True, share=True)
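On the answer path: `chat_completion` is a current InferenceClient method, and the dict-style indexing used in the commit works because huggingface_hub's output types behave like dicts; attribute access is the documented form. A short usage sketch under that assumption, reusing the same Gemma chat model:

    from huggingface_hub import InferenceClient

    hf = InferenceClient()  # reads the HF token from the environment

    response = hf.chat_completion(
        model="google/gemma-3-27b-it",
        messages=[{"role": "user", "content": "What does FAISS index?"}],
        max_tokens=128,
        temperature=0.5,
    )
    # Equivalent to response["choices"][0]["message"]["content"]
    print(response.choices[0].message.content.strip())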