Spaces:

Muzamil305
/

DocQueryAI

Sleeping

App Files Files Community

Muzamil305 committed 28 days ago

Commit

9da1dd9

verified ·

1 Parent(s): e86b027

Create app.py

Browse files

Files changed (1) hide show

app.py +221 -0

app.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import os
+import PyPDF2
+import gradio as gr
+# Import vectorstore and embeddings from langchain community package
+from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+# Text splitter to break large documents into manageable chunks
+from langchain.text_splitter import CharacterTextSplitter
+# HF Inference client for running Mistral-7B chat completions
+from huggingface_hub import InferenceClient
+# ── Globals ───────────────────────────────────────────────────────────────────
+# Module-level state shared by all callbacks below. process_pdf() fills these
+# in and clear_interface() resets them to None; they are not per-session.
+index = None               # FAISS index storing document embeddings
+retriever = None           # Retriever to fetch relevant chunks
+current_pdf_name = None    # Name of the currently loaded PDF
+pdf_text = None            # Full text of the uploaded PDF
+# ── HF Inference client ───────────────────────────────────────────────────────
+# Client bound to the Mistral-7B-Instruct-v0.3 model; the callbacks use it for
+# chat completions.
+# NOTE(review): no token is passed here — presumably it is picked up from the
+# environment (e.g. a Spaces secret); confirm before relying on it.
+client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
+# ── Embeddings ───────────────────────────────────────────────────────────────
+# BAAI/bge-base-en-v1.5 embedding model, used to vectorize text chunks when
+# the FAISS index is built. Instantiated once at import time.
+embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+def process_pdf(pdf_file):
+    """
+    1. Reads and extracts text from each page of the uploaded PDF.
+    2. Splits the combined text into overlapping chunks for retrieval.
+    3. Builds a FAISS index over those chunks and initializes a retriever.
+    Args:
+        pdf_file: Filepath to the uploaded PDF.
+    Returns:
+        - PDF filename shown in UI
+        - Status message with number of chunks
+        - Enables the question input field
+    """
+    global current_pdf_name, index, retriever, pdf_text
+    # If no file uploaded, prompt the user
+    if pdf_file is None:
+        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
+    # Save current filename for display and context
+    current_pdf_name = os.path.basename(pdf_file.name)
+    # Extract text from all pages
+    with open(pdf_file.name, "rb") as f:
+        reader = PyPDF2.PdfReader(f)
+        pages = [page.extract_text() or "" for page in reader.pages]
+    pdf_text = "\n\n".join(pages)  # Combine page texts
+    # Break text into 1,000-character chunks with 100-char overlap
+    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    chunks = splitter.split_text(pdf_text)
+    # Build and store FAISS index for similarity search
+    index = FAISS.from_texts(chunks, embeddings)
+    # Create retriever configured to return top-2 most relevant chunks
+    retriever = index.as_retriever(search_kwargs={"k": 2})
+    # Return filename, success status, and enable the question box
+    status = f"✅ Indexed '{current_pdf_name}' — {len(chunks)} chunks"
+    return current_pdf_name, status, gr.update(interactive=True)
+def ask_question(pdf_name, question):
+    """
+    1. Retrieves the top-k most relevant text chunks from the FAISS index.
+    2. Constructs a prompt combining those excerpts with the user question.
+    3. Calls the HF chat endpoint to generate an answer.
+    Args:
+        pdf_name: The displayed PDF filename (unused internally).
+        question: The user's question about the document.
+    Returns:
+        The generated answer as a string.
+    """
+    global retriever
+    # Ensure a PDF is loaded first
+    if index is None or retriever is None:
+        return "❌ Please upload and index a PDF first."
+    # Prompt user to type something if empty
+    if not question.strip():
+        return "❌ Please enter a question."
+    # Fetch relevant document chunks
+    docs = retriever.get_relevant_documents(question)
+    context = "\n\n".join(doc.page_content for doc in docs)
+    # Prepare the conversational prompt
+    prompt = (
+        "Use the following document excerpts to answer the question.\n\n"
+        f"{context}\n\n"
+        f"Question: {question}\n"
+        "Answer:"
+    )
+    # Run chat completion with the prompt as the user's message
+    response = client.chat_completion(
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=128,
+        temperature=0.5
+    )
+    # Parse assistant reply from the choices
+    answer = response["choices"][0]["message"]["content"].strip()
+    return answer
+def generate_summary():
+    """
+    Uses the first 2,000 characters of the loaded PDF text to ask the model for a concise summary.
+    """
+    if not pdf_text:
+        return "❌ Please upload and index a PDF first."
+    # Shorten long docs to 2k chars for summarization
+    prompt = (
+        "Please provide a concise summary of the following document:\n\n"
+        f"{pdf_text[:2000]}..."
+    )
+    response = client.chat_completion(
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=150,
+        temperature=0.5
+    )
+    return response["choices"][0]["message"]["content"].strip()
+def extract_keywords():
+    """
+    Uses the first 2,000 characters to ask the model to extract key terms or concepts.
+    """
+    if not pdf_text:
+        return "❌ Please upload and index a PDF first."
+    prompt = (
+        "Extract 10–15 key terms or concepts from the following document:\n\n"
+        f"{pdf_text[:2000]}..."
+    )
+    response = client.chat_completion(
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=60,
+        temperature=0.5
+    )
+    return response["choices"][0]["message"]["content"].strip()
+def clear_interface():
+    """
+    Resets all global state back to None, and clears inputs in the UI.
+    """
+    global index, retriever, current_pdf_name, pdf_text
+    index = retriever = None
+    current_pdf_name = pdf_text = None
+    # Clear displayed filename and re-disable question input
+    return None, "", gr.update(interactive=False)
+# ── Gradio UI ────────────────────────────────────────────────────────────────
+theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
+with gr.Blocks(theme=theme, css="""
+    .container { border-radius: 10px; padding: 15px; }
+    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
+    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
+    /* Center and enlarge the main heading */
+    .main-title {
+        text-align: center;
+        font-size: 64px;
+        font-weight: bold;
+        margin-bottom: 20px;
+    }
+""") as demo:
+    # Application title centered and bold
+    gr.Markdown("<div class='main-title'>DocQueryAI</div>")
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## 📄 Document Input")
+            # Display the name of the active PDF
+            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
+            # File upload widget for PDFs
+            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
+            # Button to start processing
+            upload_button = gr.Button("📤 Process Document", variant="primary")
+            # Status text below the button
+            status_box = gr.Textbox(label="Status", interactive=False)
+        with gr.Column():
+            gr.Markdown("## ❓ Ask Questions")
+            # Text area for user questions
+            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
+            # Button to trigger Q&A
+            ask_button = gr.Button("🔍 Ask Question", variant="primary")
+            # Output textbox for the generated answer
+            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
+    # Footer section with summary and keywords extraction
+    with gr.Row():
+        summary_button = gr.Button("📋 Generate Summary", variant="secondary")
+        summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
+        keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
+        keywords_output = gr.Textbox(label="Keywords", lines=4, interactive=False)
+    # Clear everything
+    clear_button = gr.Button("🗑️ Clear All", variant="secondary")
+    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS | Gradio</div>")
+    # Bind events to functions
+    upload_button.click(process_pdf, [pdf_file], [pdf_display, status_box, question_input])
+    ask_button.click(ask_question, [pdf_display, question_input], answer_output)
+    summary_button.click(generate_summary, [], summary_output)
+    keywords_button.click(extract_keywords, [], keywords_output)
+    clear_button.click(clear_interface, [], [pdf_file, pdf_display, question_input])
+if __name__ == "__main__":
+    # Launch the Gradio app, share=True exposes a public URL in Spaces
+    demo.launch(debug=True, share=True)