Update app.py
Browse files
app.py
CHANGED
@@ -5,7 +5,7 @@ import pdfplumber
|
|
5 |
|
6 |
# ---- App Setup ----
|
7 |
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
|
8 |
-
st.title("Chatbot
|
9 |
|
10 |
# ---- Helper Functions ----
|
11 |
def extract_text_from_pdf(pdf_path):
|
@@ -17,24 +17,10 @@ def extract_text_from_pdf(pdf_path):
|
|
17 |
return text
|
18 |
|
19 |
def preprocess_text(document_text):
|
20 |
-
"""
|
21 |
-
|
22 |
-
standardized_text = ' '.join(document_text.split())
|
23 |
return standardized_text
|
24 |
|
25 |
-
def chunk_text(document_text, chunk_size=500):
    """Split text into chunks of at most ``chunk_size`` characters.

    Each chunk ends at the last period inside the current window so that
    sentences stay intact. When the window contains no period, the split
    falls back to the last space, and finally to a hard cut at
    ``chunk_size``, so every iteration consumes input and the loop always
    terminates.

    Args:
        document_text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. Text no longer than
        ``chunk_size`` is returned as a single chunk; empty text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    remaining = document_text
    while len(remaining) > chunk_size:
        window = remaining[:chunk_size]
        # Preferred cut: just after the last sentence end in the window.
        cut = window.rfind(".") + 1
        if cut > 0:
            piece = remaining[:cut]
        else:
            # No period in the window: rfind returned -1, which previously
            # produced an empty chunk and an endless loop. Break at the last
            # space instead, or hard-cut when there is none.
            cut = window.rfind(" ")
            if cut <= 0:
                cut = chunk_size
            piece = remaining[:cut].rstrip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].strip()  # Remaining text
    if remaining:
        chunks.append(remaining)
    return chunks
|
37 |
-
|
38 |
def semantic_search(query, corpus, model):
|
39 |
"""Performs semantic search to find the most relevant text in the corpus."""
|
40 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
@@ -47,10 +33,10 @@ def semantic_search(query, corpus, model):
|
|
47 |
# ---- Load PDF and Extract Text ----
|
48 |
@st.cache_data
|
49 |
def load_pdf_and_prepare_embeddings(pdf_path):
|
50 |
-
"""Loads a PDF, extracts text,
|
51 |
document_text = extract_text_from_pdf(pdf_path)
|
52 |
standardized_text = preprocess_text(document_text)
|
53 |
-
chunks =
|
54 |
model = SentenceTransformer('all-MiniLM-L6-v2')
|
55 |
return chunks, model
|
56 |
|
@@ -86,6 +72,3 @@ with st.expander("ℹ️ - About this app"):
|
|
86 |
For feedback or improvements, please contact the developer.
|
87 |
"""
|
88 |
)
|
89 |
-
|
90 |
-
|
91 |
-
# to start the app locally: streamlit run c:/Users/nowok_rob/Documents/Code_local/genderstrat_2025/GIZGenderstratlocal/app.py [ARGUMENTS]
|
|
|
5 |
|
6 |
# ---- App Setup ----
|
7 |
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
|
8 |
+
st.title("Chatbot for Gender Strategy Document")
|
9 |
|
10 |
# ---- Helper Functions ----
|
11 |
def extract_text_from_pdf(pdf_path):
|
|
|
17 |
return text
|
18 |
|
19 |
def preprocess_text(document_text):
    """Standardize paragraph breaks to ensure consistent splitting.

    Paragraphs are taken to be runs of text separated by one or more blank
    lines. Inside each paragraph, line breaks and repeated whitespace are
    collapsed to single spaces. The paragraphs are then re-joined with
    exactly one blank line (a double newline) between them, so splitting the
    result on a double newline yields one entry per paragraph.

    Args:
        document_text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The normalized text; an empty string when the input holds no text.
    """
    import re  # local import keeps this function self-contained

    # Replacing every single space with a paragraph break would turn each
    # word into its own "paragraph"; split on blank lines instead.
    paragraphs = re.split(r"\n\s*\n", document_text)
    normalized = (" ".join(paragraph.split()) for paragraph in paragraphs)
    standardized_text = "\n\n".join(p for p in normalized if p)
    return standardized_text
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def semantic_search(query, corpus, model):
|
25 |
"""Performs semantic search to find the most relevant text in the corpus."""
|
26 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
|
|
33 |
# ---- Load PDF and Extract Text ----
|
34 |
@st.cache_data
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, standardize its text, and split it into paragraph chunks.

    Despite the name, no embeddings are computed here: the function returns
    the text chunks together with the sentence-embedding model, and the
    caller is expected to encode them.

    Args:
        pdf_path: Path of the PDF, passed through to ``extract_text_from_pdf``.

    Returns:
        A ``(chunks, model)`` tuple: the non-empty paragraph strings in
        document order and the loaded ``SentenceTransformer`` model.
    """
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    # Split into paragraphs and drop blank ones, so empty strings never
    # reach the search corpus.
    chunks = [
        paragraph.strip()
        for paragraph in standardized_text.split("\n\n")
        if paragraph.strip()
    ]
    # NOTE(review): st.cache_data serializes the return value; Streamlit's
    # caching guide recommends st.cache_resource for ML models. Consider
    # loading the model in a separate cached function - confirm before changing.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model
|
42 |
|
|
|
72 |
For feedback or improvements, please contact the developer.
|
73 |
"""
|
74 |
)
|
|
|
|
|
|