import os
import PyPDF2
import gradio as gr

# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import CharacterTextSplitter
# HF Inference client for running Mistral-7B chat completions
from huggingface_hub import InferenceClient

# ── Globals ───────────────────────────────────────────────────────────────────
index = None               # FAISS index storing document embeddings
retriever = None           # Retriever to fetch relevant chunks
current_pdf_name = None    # Name of the currently loaded PDF
pdf_text = None            # Full text of the uploaded PDF

# ── HF Inference client (token injected via Spaces secrets) ─────────────────────
# Instantiate client for conversational endpoint (Mistral-7B-Instruct)
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
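# The client also picks up the HF_TOKEN environment variable automatically;
# passing it explicitly is equivalent (the env var name here assumes that's
# what the Space secret is called):
# client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3",
#                          token=os.environ.get("HF_TOKEN"))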

# ── Embeddings ───────────────────────────────────────────────────────────────
# Use BGE embeddings from BAAI for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
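# BGE models are commonly used with L2-normalized embeddings so similarity
# scores behave like cosine similarity; a sketch of that option, using the
# encode_kwargs parameter HuggingFaceEmbeddings already supports:
# embeddings = HuggingFaceEmbeddings(
#     model_name="BAAI/bge-base-en-v1.5",
#     encode_kwargs={"normalize_embeddings": True},
# )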

def process_pdf(pdf_file):
    """
    1. Reads and extracts text from each page of the uploaded PDF.
    2. Splits the combined text into overlapping chunks for retrieval.
    3. Builds a FAISS index over those chunks and initializes a retriever.
    Args:
        pdf_file: Filepath to the uploaded PDF.
    Returns:
        - The PDF filename to display in the UI
        - A status message with the number of chunks
        - A gr.update that enables the question input field
    """
    global current_pdf_name, index, retriever, pdf_text

    # If no file uploaded, prompt the user
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    # Save current filename for display and context (pdf_file is a filepath string,
    # since the gr.File component below uses type="filepath")
    current_pdf_name = os.path.basename(pdf_file)

    # Extract text from all pages
    with open(pdf_file, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        pages = [page.extract_text() or "" for page in reader.pages]
    pdf_text = "\n\n".join(pages)  # Combine page texts

    # Break text into 1,000-character chunks with 100-char overlap
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(pdf_text)
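    # Note: CharacterTextSplitter splits on "\n\n" by default, so individual
    # chunks can exceed chunk_size when a paragraph runs long. A commonly used,
    # stricter alternative with the same parameters:
    # from langchain.text_splitter import RecursiveCharacterTextSplitter
    # splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)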

    # Build and store FAISS index for similarity search
    index = FAISS.from_texts(chunks, embeddings)

    # Create retriever configured to return top-2 most relevant chunks
    retriever = index.as_retriever(search_kwargs={"k": 2})
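    # k=2 keeps the prompt small for the 128-token completions used below;
    # raising k (e.g. search_kwargs={"k": 4}) trades latency for more context.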

    # Return filename, success status, and enable the question box
    status = f"βœ… Indexed '{current_pdf_name}' β€” {len(chunks)} chunks"
    return current_pdf_name, status, gr.update(interactive=True)


def ask_question(pdf_name, question):
    """
    1. Retrieves the top-k most relevant text chunks from the FAISS index.
    2. Constructs a prompt combining those excerpts with the user question.
    3. Calls the HF chat endpoint to generate an answer.
    Args:
        pdf_name: The displayed PDF filename (unused internally).
        question: The user's question about the document.
    Returns:
        The generated answer as a string.
    """
    global retriever

    # Ensure a PDF is loaded first
    if index is None or retriever is None:
        return "❌ Please upload and index a PDF first."
    # Prompt user to type something if empty
    if not question.strip():
        return "❌ Please enter a question."

    # Fetch the most relevant document chunks (invoke() is the current retriever
    # API; get_relevant_documents() is deprecated in recent LangChain releases)
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)

    # Prepare the conversational prompt
    prompt = (
        "Use the following document excerpts to answer the question.\n\n"
        f"{context}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

    # Run chat completion with the prompt as the user's message
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128,
        temperature=0.5
    )

    # chat_completion returns a ChatCompletionOutput; read the reply via attributes
    answer = response.choices[0].message.content.strip()
    return answer


def generate_summary():
    """
    Uses the first 2,000 characters of the loaded PDF text to ask the model for a concise summary.
    """
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Shorten long docs to 2k chars for summarization
    prompt = (
        "Please provide a concise summary of the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.5
    )
    return response["choices"][0]["message"]["content"].strip()


def extract_keywords():
    """
    Uses the first 2,000 characters to ask the model to extract key terms or concepts.
    """
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    prompt = (
        "Extract 10–15 key terms or concepts from the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=60,
        temperature=0.5
    )
    return response["choices"][0]["message"]["content"].strip()


def clear_interface():
    """
    Resets all global state back to None, and clears inputs in the UI.
    """
    global index, retriever, current_pdf_name, pdf_text
    index = retriever = None
    current_pdf_name = pdf_text = None
    # Clear displayed filename and re-disable question input
    return None, "", gr.update(interactive=False)

# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    /* Center and enlarge the main heading */
    .main-title {
        text-align: center;
        font-size: 64px;
        font-weight: bold;
        margin-bottom: 20px;
    }
""") as demo:
    # Application title centered and bold
    gr.Markdown("<div class='main-title'>DocQueryAI</div>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## πŸ“„ Document Input")
            # Display the name of the active PDF
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            # File upload widget for PDFs
            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
            # Button to start processing
            upload_button = gr.Button("πŸ“€ Process Document", variant="primary")
            # Status text below the button
            status_box = gr.Textbox(label="Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            # Text area for user questions
            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
            # Button to trigger Q&A
            ask_button = gr.Button("πŸ” Ask Question", variant="primary")
            # Output textbox for the generated answer
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Footer section with summary and keywords extraction
    with gr.Row():
        summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
        summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
        keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
        keywords_output = gr.Textbox(label="Keywords", lines=4, interactive=False)

    # Clear everything
    clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS | Gradio</div>")

    # Bind events to functions
    upload_button.click(process_pdf, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary, [], summary_output)
    keywords_button.click(extract_keywords, [], keywords_output)
    clear_button.click(clear_interface, [], [pdf_file, pdf_display, question_input])

if __name__ == "__main__":
    # Launch the Gradio app; share=True requests a public tunnel link (useful
    # when running locally; on Spaces the app is already publicly hosted)
    demo.launch(debug=True, share=True)