Spaces:
Sleeping
Sleeping
File size: 9,214 Bytes
9da1dd9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
import os
import PyPDF2
import gradio as gr
# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import CharacterTextSplitter
# HF Inference client for running Mistral-7B chat completions
from huggingface_hub import InferenceClient
# ── Globals ──────────────────────────────────────────────────────────────────
# Module-level state shared by every event handler below; populated by
# process_pdf() and reset by clear_interface().
index = None # FAISS index storing document embeddings
retriever = None # Retriever to fetch relevant chunks
current_pdf_name = None # Name of the currently loaded PDF
pdf_text = None # Full text of the uploaded PDF
# ── HF Inference client ──────────────────────────────────────────────────────
# Conversational endpoint for Mistral-7B-Instruct. No token is passed here —
# presumably the HF token is injected via Spaces secrets (HF_TOKEN env var).
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# ── Embeddings ───────────────────────────────────────────────────────────────
# BGE embeddings from BAAI, used to vectorize text chunks for the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
def process_pdf(pdf_file):
    """
    Load a PDF, index it for retrieval, and enable the question input.

    1. Reads and extracts text from each page of the uploaded PDF.
    2. Splits the combined text into overlapping chunks for retrieval.
    3. Builds a FAISS index over those chunks and initializes a retriever.

    Args:
        pdf_file: Uploaded PDF — a filepath string (from
            ``gr.File(type="filepath")``) or a file-like object with ``.name``.

    Returns:
        tuple: (filename shown in the UI, status message,
        ``gr.update`` toggling the question box's interactivity).
    """
    global current_pdf_name, index, retriever, pdf_text

    # If no file uploaded, prompt the user and keep the question box disabled.
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    # gr.File(type="filepath") passes a plain string; older Gradio versions
    # pass a tempfile wrapper exposing .name. Accept both.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    current_pdf_name = os.path.basename(path)

    # Extract text from all pages. extract_text() can return None for
    # image-only pages, hence the `or ""` fallback.
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        pages = [page.extract_text() or "" for page in reader.pages]
    pdf_text = "\n\n".join(pages)

    # Break text into 1,000-character chunks with 100-char overlap so content
    # spanning a chunk boundary remains retrievable.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(pdf_text)

    # Build the FAISS index and a retriever returning the top-2 chunks.
    index = FAISS.from_texts(chunks, embeddings)
    retriever = index.as_retriever(search_kwargs={"k": 2})

    status = f"✅ Indexed '{current_pdf_name}' — {len(chunks)} chunks"
    return current_pdf_name, status, gr.update(interactive=True)
def ask_question(pdf_name, question):
    """
    Answer a question about the currently indexed PDF via RAG.

    1. Retrieves the top-k most relevant text chunks from the FAISS index.
    2. Builds a prompt combining those excerpts with the user question.
    3. Calls the HF chat endpoint to generate an answer.

    Args:
        pdf_name: The displayed PDF filename (unused internally).
        question: The user's question about the document.

    Returns:
        str: The generated answer, or an error message if no PDF is
        indexed / the question is empty.
    """
    # Ensure a PDF is loaded and indexed first (module globals are read-only
    # here, so no `global` declaration is needed).
    if index is None or retriever is None:
        return "❌ Please upload and index a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    # Fetch the relevant document chunks and join them as context.
    docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(doc.page_content for doc in docs)

    # Prepare the conversational prompt.
    prompt = (
        "Use the following document excerpts to answer the question.\n\n"
        f"{context}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

    # Run chat completion with the prompt as the user's message.
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128,
        temperature=0.5
    )
    # Parse the assistant reply from the first choice.
    return response["choices"][0]["message"]["content"].strip()
def generate_summary():
    """
    Summarize the loaded PDF.

    Sends the first 2,000 characters of the extracted PDF text to the model
    and asks for a concise summary.

    Returns:
        str: The generated summary, or an error message if no PDF is loaded.
    """
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Truncate long documents to 2k chars to keep the prompt small.
    prompt = (
        "Please provide a concise summary of the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.5
    )
    return response["choices"][0]["message"]["content"].strip()
def extract_keywords():
    """
    Extract key terms from the loaded PDF.

    Sends the first 2,000 characters of the extracted PDF text to the model
    and asks for 10-15 key terms or concepts.

    Returns:
        str: The extracted keywords, or an error message if no PDF is loaded.
    """
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Truncate long documents to 2k chars to keep the prompt small.
    prompt = (
        "Extract 10-15 key terms or concepts from the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=60,
        temperature=0.5
    )
    return response["choices"][0]["message"]["content"].strip()
def clear_interface():
    """
    Reset all module-level state and clear the UI.

    Returns:
        tuple: (None to clear the file widget, "" to blank the displayed
        filename, ``gr.update`` re-disabling the question input).
    """
    global index, retriever, current_pdf_name, pdf_text
    # Drop the index, retriever, and cached document text.
    index = None
    retriever = None
    current_pdf_name = None
    pdf_text = None
    # Clear the displayed filename and re-disable the question box.
    return None, "", gr.update(interactive=False)
# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

# Custom CSS: rounded containers, highlighted active-PDF box, footer styling,
# and a centered, enlarged main heading.
with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
/* Center and enlarge the main heading */
.main-title {
text-align: center;
font-size: 64px;
font-weight: bold;
margin-bottom: 20px;
}
""") as demo:
    # Application title, centered and bold via .main-title CSS
    gr.Markdown("<div class='main-title'>DocQueryAI</div>")
    with gr.Row():
        # Left column: document upload and indexing controls
        with gr.Column():
            gr.Markdown("## π Document Input")
            # Display the name of the active PDF
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            # File upload widget for PDFs; type="filepath" delivers a path string
            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
            # Button to start processing/indexing
            upload_button = gr.Button("π€ Process Document", variant="primary")
            # Status text below the button
            status_box = gr.Textbox(label="Status", interactive=False)
        # Right column: question/answer interaction
        with gr.Column():
            gr.Markdown("## β Ask Questions")
            # Text area for user questions (enabled only after indexing)
            question_input = gr.Textbox(lines=3, placeholder="Enter your question hereβ¦")
            # Button to trigger Q&A
            ask_button = gr.Button("π Ask Question", variant="primary")
            # Output textbox for the generated answer
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
    # Footer section with summary and keyword-extraction utilities
    with gr.Row():
        summary_button = gr.Button("π Generate Summary", variant="secondary")
        summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
        keywords_button = gr.Button("π·οΈ Extract Keywords", variant="secondary")
        keywords_output = gr.Textbox(label="Keywords", lines=4, interactive=False)
    # Clear everything back to the initial state
    clear_button = gr.Button("ποΈ Clear All", variant="secondary")
    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS | Gradio</div>")

    # Bind events to handler functions
    upload_button.click(process_pdf, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary, [], summary_output)
    keywords_button.click(extract_keywords, [], keywords_output)
    clear_button.click(clear_interface, [], [pdf_file, pdf_display, question_input])
if __name__ == "__main__":
    # Launch the Gradio app; share=True exposes a public URL (useful on
    # Spaces), debug=True surfaces tracebacks while developing.
    demo.launch(debug=True, share=True)
|