Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import pipeline
|
3 |
+
from sentence_transformers import SentenceTransformer, util
|
4 |
+
import pdfplumber
|
5 |
+
|
6 |
+
# ---- App Setup ----
|
7 |
+
# Configure the browser tab title and page layout, then render the heading.
st.set_page_config(
    page_title='Gender Strategy Chatbot',
    layout='wide',
    initial_sidebar_state='expanded',
)
st.title("Chatbot to talk to the GIZ Gender Strategy")
|
9 |
+
|
10 |
+
# ---- Helper Functions ----
|
11 |
+
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string with one page's text per segment. Pages that yield
        no text contribute an empty segment instead of raising.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text()`` may give None (reported for older pdfplumber
        # releases -- TODO confirm for the pinned version) or "" for pages
        # without a text layer; ``or ""`` keeps the join from failing.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Joining with a newline keeps the last word of one page from fusing with
    # the first word of the next; later whitespace cleanup collapses it.
    return "\n".join(page_texts)
|
18 |
+
|
19 |
+
def preprocess_text(document_text):
    """Collapse each run of whitespace into one space and trim both ends."""
    # str.split() with no argument splits on any whitespace run and drops
    # empty leading/trailing pieces, so re-joining normalizes spacing.
    words = document_text.split()
    return ' '.join(words)
|
24 |
+
|
25 |
+
def chunk_text(document_text, chunk_size=500):
    """Split text into chunks of at most ``chunk_size`` characters.

    Each chunk ends at the last sentence end (".") found within the first
    ``chunk_size`` characters of the remaining text. If that window has no
    period, the split falls back to the last space, and finally to a hard
    cut at ``chunk_size``, so the loop always makes progress.

    Args:
        document_text: The text to split.
        chunk_size: Maximum chunk length in characters; must be positive.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    remaining = document_text
    while len(remaining) > chunk_size:
        window = remaining[:chunk_size]
        # rfind returns -1 when nothing is found, so "+ 1" yields 0 and
        # signals "no split point" for the fallbacks below.
        cut = window.rfind(".") + 1
        if cut == 0:
            cut = window.rfind(" ") + 1  # no sentence end: break between words
        if cut == 0:
            cut = chunk_size  # no space either: hard cut
        piece = remaining[:cut].rstrip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].strip()
    if remaining:
        chunks.append(remaining)
    return chunks
|
37 |
+
|
38 |
+
def semantic_search(query, corpus, model):
    """Return the corpus entry most similar to the query, with its score.

    Args:
        query: The question text to embed.
        corpus: Indexable collection of text passages to search.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.

    Returns:
        A ``(best_passage, score)`` tuple, where ``score`` is the cosine
        similarity of the best passage converted via ``.item()``.
    """
    # Note: the whole corpus is re-encoded on every call.
    query_vec = model.encode(query, convert_to_tensor=True)
    corpus_vecs = model.encode(corpus, convert_to_tensor=True)

    # Row 0 holds the similarities of the single query against each passage.
    similarities = util.pytorch_cos_sim(query_vec, corpus_vecs)[0]
    top_idx = similarities.argmax().item()
    top_score = similarities[top_idx].item()
    return corpus[top_idx], top_score
|
46 |
+
|
47 |
+
# ---- Load PDF and Extract Text ----
|
48 |
+
# cache_resource (rather than cache_data) because the return value contains
# a SentenceTransformer model: cache_data serializes and copies its result on
# every call, while cache_resource hands back the same cached objects.
@st.cache_resource
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load the PDF, split its cleaned text into chunks, and load the model.

    Despite the name, no embeddings are computed here; the function returns
    the text chunks and the sentence-embedding model used to encode them.

    Args:
        pdf_path: Path of the PDF document to load.

    Returns:
        A ``(chunks, model)`` tuple: the list of text chunks and the
        ``all-MiniLM-L6-v2`` SentenceTransformer instance. Both are shared
        cached objects, so callers should treat them as read-only.
    """
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    chunks = chunk_text(standardized_text)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model
|
56 |
+
|
57 |
+
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)

# ---- User Input Section ----
st.sidebar.header("Ask a Question")
query = st.sidebar.text_area("Type your question here:")

if st.sidebar.button("Submit"):
    if query.strip():
        # A non-blank question was entered: look up the closest passage.
        with st.spinner("Searching for the best answer..."):
            answer, score = semantic_search(query, chunks, embedding_model)
        st.write("### Your Question:")
        st.write(query)
        st.write("### Best Match:")
        st.write(answer)
        st.write(f"**Relevance Score:** {score:.2f}")
    else:
        # Blank or whitespace-only input: prompt the user in the sidebar.
        st.sidebar.error("Please enter a question.")
|
75 |
+
|
76 |
+
# ---- Info Section ----
|
77 |
+
# Collapsible "about" panel describing what the app does.
st.expander("ℹ️ - About this app").write(
    """
This chatbot allows users to ask questions about the Gender Strategy document.
It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

- The document is pre-loaded and processed into searchable chunks.
- The model ranks the relevance of the results based on cosine similarity.

For feedback or improvements, please contact the developer.
"""
)
|
89 |
+
|
90 |
+
|
91 |
+
# To start the app locally, run from the directory containing this file: streamlit run app.py [ARGUMENTS]
|