Muzamil305 committed on
Commit
9da1dd9
Β·
verified Β·
1 Parent(s): e86b027

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +221 -0
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import PyPDF2
import gradio as gr

# Vector store + embeddings from the langchain community package.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks.
from langchain.text_splitter import CharacterTextSplitter
# HF Inference client for running Mistral-7B chat completions.
from huggingface_hub import InferenceClient

# ── Globals (mutated by process_pdf / clear_interface) ───────────────────────
index = None             # FAISS index storing document embeddings
retriever = None         # Retriever to fetch relevant chunks
current_pdf_name = None  # Name of the currently loaded PDF
pdf_text = None          # Full text of the uploaded PDF

# ── HF Inference client (token injected via Spaces secrets) ──────────────────
# Conversational endpoint for Mistral-7B-Instruct chat completions.
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")

# ── Embeddings ───────────────────────────────────────────────────────────────
# BGE embeddings from BAAI for vectorizing text chunks.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
26
+
27
def process_pdf(pdf_file):
    """
    Index an uploaded PDF for retrieval-augmented Q&A.

    1. Reads and extracts text from each page of the uploaded PDF.
    2. Splits the combined text into overlapping chunks for retrieval.
    3. Builds a FAISS index over those chunks and initializes a retriever.

    Args:
        pdf_file: Path to the uploaded PDF. With ``gr.File(type="filepath")``
            this is a plain ``str``; older Gradio versions pass a
            tempfile-like object exposing ``.name``.

    Returns:
        tuple: (PDF filename for the UI, status message,
        ``gr.update`` enabling the question input).
    """
    global current_pdf_name, index, retriever, pdf_text

    # If no file uploaded, prompt the user and keep the question box disabled.
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    # BUG FIX: gr.File(type="filepath") yields a str, which has no `.name`
    # attribute — the original `pdf_file.name` raised AttributeError.
    # Accept both a str path and a legacy file-like object.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    current_pdf_name = os.path.basename(pdf_path)

    # Extract text from all pages; extract_text() may return None for
    # image-only pages, so fall back to "".
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        pages = [page.extract_text() or "" for page in reader.pages]
    pdf_text = "\n\n".join(pages)  # Combine page texts

    # 1,000-character chunks with 100-char overlap preserve context at cuts.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(pdf_text)

    # Build FAISS index, then a retriever returning the top-2 relevant chunks.
    index = FAISS.from_texts(chunks, embeddings)
    retriever = index.as_retriever(search_kwargs={"k": 2})

    # Return filename, success status, and enable the question box.
    status = f"✅ Indexed '{current_pdf_name}' — {len(chunks)} chunks"
    return current_pdf_name, status, gr.update(interactive=True)
67
+
68
+
69
def ask_question(pdf_name, question):
    """
    Answer a question about the indexed PDF via retrieval-augmented generation.

    Fetches the most relevant chunks from the FAISS retriever, combines them
    with the question into a prompt, and asks the HF chat endpoint.

    Args:
        pdf_name: Displayed PDF filename (not used by the lookup itself).
        question: The user's question about the document.

    Returns:
        str: The generated answer, or an error message when preconditions fail.
    """
    global retriever

    # Guard clauses: need an indexed document and a non-empty question.
    if index is None or retriever is None:
        return "❌ Please upload and index a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    # Gather the top-k chunks and join them into one context section.
    relevant = retriever.get_relevant_documents(question)
    excerpts = "\n\n".join(chunk.page_content for chunk in relevant)

    # Build the single-turn conversational prompt.
    user_prompt = (
        "Use the following document excerpts to answer the question.\n\n"
        f"{excerpts}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

    # One chat-completion call; the prompt is the user's message.
    reply = client.chat_completion(
        messages=[{"role": "user", "content": user_prompt}],
        max_tokens=128,
        temperature=0.5
    )

    # Extract and return the assistant's reply text.
    return reply["choices"][0]["message"]["content"].strip()
111
+
112
+
113
def generate_summary():
    """
    Summarize the currently loaded PDF.

    Sends the first 2,000 characters of the extracted text to the model and
    asks for a concise summary.

    Returns:
        str: The model's summary, or an error message if no PDF is loaded.
    """
    # Nothing indexed yet — ask the user to upload first.
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Truncate long documents to 2k chars to keep the request small.
    summary_request = (
        "Please provide a concise summary of the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    reply = client.chat_completion(
        messages=[{"role": "user", "content": summary_request}],
        max_tokens=150,
        temperature=0.5
    )
    return reply["choices"][0]["message"]["content"].strip()
131
+
132
+
133
def extract_keywords():
    """
    Extract key terms from the currently loaded PDF.

    Sends the first 2,000 characters of the extracted text to the model and
    asks for 10–15 key terms or concepts.

    Returns:
        str: The model's keyword list, or an error message if no PDF is loaded.
    """
    # Nothing indexed yet — ask the user to upload first.
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Truncate long documents to 2k chars to keep the request small.
    keyword_request = (
        "Extract 10–15 key terms or concepts from the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    reply = client.chat_completion(
        messages=[{"role": "user", "content": keyword_request}],
        max_tokens=60,
        temperature=0.5
    )
    return reply["choices"][0]["message"]["content"].strip()
150
+
151
+
152
def clear_interface():
    """
    Reset all module-level state and clear the UI.

    Returns:
        tuple: (None to clear the file widget, "" to clear the filename
        display, ``gr.update`` re-disabling the question input).
    """
    global index, retriever, current_pdf_name, pdf_text

    # Drop every piece of per-document state.
    index = None
    retriever = None
    current_pdf_name = None
    pdf_text = None

    # Clear displayed filename and re-disable question input.
    return None, "", gr.update(interactive=False)
161
+
162
# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
/* Center and enlarge the main heading */
.main-title {
    text-align: center;
    font-size: 64px;
    font-weight: bold;
    margin-bottom: 20px;
}
""") as demo:
    # Application title, centered and bold via the .main-title CSS class.
    gr.Markdown("<div class='main-title'>DocQueryAI</div>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📄 Document Input")
            # Display the name of the active PDF.
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            # File upload widget for PDFs; type="filepath" passes a str path.
            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
            # Button to start processing.
            upload_button = gr.Button("📤 Process Document", variant="primary")
            # Status text below the button.
            status_box = gr.Textbox(label="Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            # Text area for user questions; enabled only after indexing.
            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
            # Button to trigger Q&A.
            ask_button = gr.Button("🔍 Ask Question", variant="primary")
            # Output textbox for the generated answer.
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Footer section with summary and keywords extraction.
    with gr.Row():
        summary_button = gr.Button("📋 Generate Summary", variant="secondary")
        summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
        keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
        keywords_output = gr.Textbox(label="Keywords", lines=4, interactive=False)

    # Clear everything.
    clear_button = gr.Button("🗑️ Clear All", variant="secondary")
    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS | Gradio</div>")

    # Bind events to functions.
    upload_button.click(process_pdf, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary, [], summary_output)
    keywords_button.click(extract_keywords, [], keywords_output)
    # NOTE(review): status_box, answer_output, summary_output and
    # keywords_output are not among the clear outputs, so their old text
    # survives "Clear All" — confirm this is intended.
    clear_button.click(clear_interface, [], [pdf_file, pdf_display, question_input])

if __name__ == "__main__":
    # Launch the Gradio app; share=True exposes a public URL in Spaces.
    demo.launch(debug=True, share=True)