# DocQueryAI — Hugging Face Space: retrieval-augmented PDF question answering.
# (The original first lines were Space status-banner text, "Spaces: Sleeping",
# captured by page extraction — not part of the program.)
import os

import gradio as gr
import PyPDF2
# HF Inference client for running Mistral-7B chat completions
from huggingface_hub import InferenceClient
# Vectorstore and embeddings from the langchain community package
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import CharacterTextSplitter
# ── Globals ──────────────────────────────────────────────────────────────────
index = None             # FAISS index storing document embeddings
retriever = None         # Retriever that fetches relevant chunks from `index`
current_pdf_name = None  # Name of the currently loaded PDF (for UI display)
pdf_text = None          # Full extracted text of the uploaded PDF

# ── HF Inference client (token injected via Spaces secrets) ──────────────────
# Conversational endpoint for Mistral-7B-Instruct chat completions.
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")

# ── Embeddings ───────────────────────────────────────────────────────────────
# BGE embeddings from BAAI, used to vectorize text chunks for the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
def process_pdf(pdf_file):
    """
    Index an uploaded PDF for retrieval-augmented question answering.

    1. Reads and extracts text from each page of the uploaded PDF.
    2. Splits the combined text into overlapping chunks for retrieval.
    3. Builds a FAISS index over those chunks and initializes a retriever.

    Args:
        pdf_file: Uploaded file object (Gradio `File` with a `.name` path),
            or None when nothing was uploaded.

    Returns:
        tuple: (PDF filename for display, status message, `gr.update`
        toggling the question input field's interactivity).
    """
    global current_pdf_name, index, retriever, pdf_text

    # If no file uploaded, prompt the user and keep the question box disabled.
    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    # Save current filename for display and context.
    current_pdf_name = os.path.basename(pdf_file.name)

    # Extract text from all pages. extract_text() may return None for
    # image-only pages, hence the `or ""` fallback.
    with open(pdf_file.name, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        pages = [page.extract_text() or "" for page in reader.pages]
    pdf_text = "\n\n".join(pages)  # Combine page texts

    # Break text into 1,000-character chunks with 100-char overlap.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(pdf_text)

    # Build and store the FAISS index, then a retriever configured to
    # return the top-2 most relevant chunks per query.
    index = FAISS.from_texts(chunks, embeddings)
    retriever = index.as_retriever(search_kwargs={"k": 2})

    # Return filename, success status, and enable the question box.
    status = f"✅ Indexed '{current_pdf_name}' — {len(chunks)} chunks"
    return current_pdf_name, status, gr.update(interactive=True)
def ask_question(pdf_name, question):
    """
    Answer a question about the currently indexed PDF.

    1. Retrieves the top-k most relevant text chunks from the FAISS index.
    2. Constructs a prompt combining those excerpts with the user question.
    3. Calls the HF chat endpoint to generate an answer.

    Args:
        pdf_name: The displayed PDF filename (unused internally).
        question: The user's question about the document.

    Returns:
        str: The generated answer, or an error message when no PDF is
        indexed or the question is empty.
    """
    global retriever

    # Ensure a PDF is loaded and indexed first.
    if index is None or retriever is None:
        return "❌ Please upload and index a PDF first."

    # Prompt the user to type something if the question is blank.
    if not question.strip():
        return "❌ Please enter a question."

    # Fetch the relevant document chunks and join them as context.
    docs = retriever.get_relevant_documents(question)
    context = "\n\n".join(doc.page_content for doc in docs)

    # Prepare the conversational prompt.
    prompt = (
        "Use the following document excerpts to answer the question.\n\n"
        f"{context}\n\n"
        f"Question: {question}\n"
        "Answer:"
    )

    # Run chat completion with the prompt as the user's message.
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=128,
        temperature=0.5,
    )

    # Parse the assistant reply from the first choice.
    return response["choices"][0]["message"]["content"].strip()
def generate_summary():
    """
    Summarize the loaded PDF.

    Sends the first 2,000 characters of the extracted PDF text to the model
    and asks for a concise summary.

    Returns:
        str: The generated summary, or an error message when no PDF is loaded.
    """
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Shorten long documents to 2k chars for summarization.
    prompt = (
        "Please provide a concise summary of the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150,
        temperature=0.5,
    )
    return response["choices"][0]["message"]["content"].strip()
def extract_keywords():
    """
    Extract key terms from the loaded PDF.

    Sends the first 2,000 characters of the extracted PDF text to the model
    and asks for 10–15 key terms or concepts.

    Returns:
        str: The extracted keywords, or an error message when no PDF is loaded.
    """
    if not pdf_text:
        return "❌ Please upload and index a PDF first."

    # Shorten long documents to 2k chars for keyword extraction.
    prompt = (
        "Extract 10–15 key terms or concepts from the following document:\n\n"
        f"{pdf_text[:2000]}..."
    )
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=60,
        temperature=0.5,
    )
    return response["choices"][0]["message"]["content"].strip()
def clear_interface():
    """
    Reset all global state and clear the UI inputs.

    Returns:
        tuple: (None to clear the file widget, "" to clear the displayed
        filename, `gr.update` re-disabling the question input).
    """
    global index, retriever, current_pdf_name, pdf_text
    index = retriever = None
    current_pdf_name = pdf_text = None
    # Clear displayed filename and re-disable the question input.
    return None, "", gr.update(interactive=False)
# ── Gradio UI ────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    /* Center and enlarge the main heading */
    .main-title {
        text-align: center;
        font-size: 64px;
        font-weight: bold;
        margin-bottom: 20px;
    }
""") as demo:
    # Application title, centered and bold.
    gr.Markdown("<div class='main-title'>DocQueryAI</div>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📄 Document Input")
            # Display the name of the active PDF.
            pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
            # File upload widget for PDFs.
            pdf_file = gr.File(file_types=[".pdf"], type="filepath")
            # Button to start processing.
            upload_button = gr.Button("📤 Process Document", variant="primary")
            # Status text below the button.
            status_box = gr.Textbox(label="Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            # Text area for user questions.
            question_input = gr.Textbox(lines=3, placeholder="Enter your question here…")
            # Button to trigger Q&A.
            ask_button = gr.Button("🔍 Ask Question", variant="primary")
            # Output textbox for the generated answer.
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Footer section with summary and keyword extraction.
    with gr.Row():
        summary_button = gr.Button("📝 Generate Summary", variant="secondary")
        summary_output = gr.Textbox(label="Summary", lines=4, interactive=False)
        keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
        keywords_output = gr.Textbox(label="Keywords", lines=4, interactive=False)

    # Clear everything.
    clear_button = gr.Button("🗑️ Clear All", variant="secondary")
    gr.Markdown("<div class='footer'>Powered by LangChain + Mistral 7B + FAISS | Gradio</div>")

    # Bind events to functions.
    upload_button.click(process_pdf, [pdf_file], [pdf_display, status_box, question_input])
    ask_button.click(ask_question, [pdf_display, question_input], answer_output)
    summary_button.click(generate_summary, [], summary_output)
    keywords_button.click(extract_keywords, [], keywords_output)
    clear_button.click(clear_interface, [], [pdf_file, pdf_display, question_input])

if __name__ == "__main__":
    # Launch the Gradio app; share=True exposes a public URL in Spaces.
    demo.launch(debug=True, share=True)