Spaces:
Sleeping
Sleeping
import streamlit as st | |
from transformers import pipeline | |
from sentence_transformers import SentenceTransformer, util | |
import pdfplumber | |
# ---- App Setup ---- | |
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded') | |
st.title("Chatbot for Gender Strategy Document") | |
# ---- Helper Functions ---- | |
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages that contain text, joined by a blank line.
        An empty string is returned when no page yields any text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Pages without a text layer (e.g. scanned images) can yield None
            # or an empty string; appending None to a str would raise
            # TypeError, so such pages are skipped.
            if page_text:
                page_texts.append(page_text)
    # An explicit separator keeps the last word of one page from being fused
    # with the first word of the next page.
    return "\n\n".join(page_texts)
def preprocess_text(document_text):
    """Normalise paragraph breaks so the text can be split on ``"\\n\\n"``.

    A run of one or more blank lines is treated as a paragraph break. Inside
    a paragraph, line breaks and repeated whitespace are collapsed to single
    spaces. Empty paragraphs are dropped.

    Args:
        document_text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The paragraphs joined by exactly one blank line (``"\\n\\n"``), or an
        empty string when the input is empty or contains only whitespace.
    """
    import re  # local import: keeps this block independent of the file header

    if not document_text:
        return ""
    # Turning every space into a paragraph break would make each word its own
    # chunk; only blank lines are treated as paragraph boundaries here.
    raw_paragraphs = re.split(r"\n\s*\n", document_text)
    # str.split() with no argument also strips stray "\r" and tabs.
    paragraphs = (" ".join(paragraph.split()) for paragraph in raw_paragraphs)
    return "\n\n".join(paragraph for paragraph in paragraphs if paragraph)
def semantic_search(query, corpus, model, corpus_embeddings=None):
    """Return the corpus passage that is most similar to the query.

    Args:
        query: Question text to search for.
        corpus: Non-empty sequence of candidate text passages.
        model: Object whose ``encode(..., convert_to_tensor=True)`` returns
            embeddings (a ``SentenceTransformer`` in this app).
        corpus_embeddings: Optional embeddings of ``corpus`` computed
            beforehand with the same model. When omitted, the corpus is
            encoded on every call.

    Returns:
        A ``(best_passage, score)`` tuple, where ``score`` is the cosine
        similarity between the query and the best passage as a float.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    # Fail early with a clear message instead of an opaque tensor error
    # from argmax() on an empty score vector.
    if len(corpus) == 0:
        raise ValueError("corpus must contain at least one passage to search")

    query_embedding = model.encode(query, convert_to_tensor=True)
    if corpus_embeddings is None:
        corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

    # Row 0 holds the similarities of the single query against every passage.
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    best_match_idx = scores.argmax().item()
    return corpus[best_match_idx], scores[best_match_idx].item()
# ---- Load PDF and Extract Text ---- | |
@st.cache_resource
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, split its text into paragraph chunks and load the model.

    Streamlit re-executes the script on every interaction, so the result is
    cached with ``st.cache_resource`` to avoid re-parsing the PDF and
    re-loading the model each time. The cached objects are shared between
    reruns; callers should treat the returned list as read-only.

    Args:
        pdf_path: Location of the PDF document to index.

    Returns:
        A ``(chunks, model)`` tuple: the list of non-empty text chunks and
        the ``SentenceTransformer`` used to embed them. No embeddings are
        computed here; they are produced at search time.
    """
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    # Split on paragraph breaks and drop empty/whitespace-only pieces so the
    # search never ranks a blank chunk.
    chunks = [
        chunk.strip()
        for chunk in standardized_text.split("\n\n")
        if chunk.strip()
    ]
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model
# Document that is indexed when the app starts.
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)

# ---- User Input Section ----
st.sidebar.header("Ask a Question")
query = st.sidebar.text_area("Type your question here:")
submitted = st.sidebar.button("Submit")

if submitted and not query.strip():
    # Nothing but whitespace was entered: ask for a real question.
    st.sidebar.error("Please enter a question.")
elif submitted:
    with st.spinner("Searching for the best answer..."):
        answer, score = semantic_search(query, chunks, embedding_model)
    # Echo the question, then show the best passage and its similarity.
    st.write("### Your Question:")
    st.write(query)
    st.write("### Best Match:")
    st.write(answer)
    st.write(f"**Relevance Score:** {score:.2f}")
# ---- Info Section ---- | |
# Collapsible panel with a short description of the app.
about_panel = st.expander("ℹ️ - About this app")
about_panel.write(
    """
This chatbot allows users to ask questions about the Gender Strategy document.
It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
- The document is pre-loaded and processed into searchable chunks.
- The model ranks the relevance of the results based on cosine similarity.
For feedback or improvements, please contact the developer.
"""
)