NiborKowon committed on
Commit
148cd4f
·
verified ·
1 Parent(s): 9cc5f66

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+ from sentence_transformers import SentenceTransformer, util
4
+ import pdfplumber
5
+
6
# ---- App Setup ----
# Page configuration is issued first, before any other Streamlit element.
st.set_page_config(
    page_title="Gender Strategy Chatbot",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title("Chatbot to talk to the GIZ Gender Strategy")
9
+
10
+ # ---- Helper Functions ----
11
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages joined with newlines. A page that yields no
        text contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text()`` can come back as ``None`` for a page without a
        # text layer (reported for some pdfplumber versions); ``or ""`` keeps
        # such a page from raising ``TypeError`` during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_texts)
18
+
19
def preprocess_text(document_text):
    """Collapse every run of whitespace in the text into a single space.

    Splitting on whitespace also drops leading and trailing whitespace, so
    the result never starts or ends with a space.
    """
    words = document_text.split()
    return " ".join(words)
24
+
25
def chunk_text(document_text, chunk_size=500):
    """Split text into chunks of at most ``chunk_size`` characters.

    Each chunk ends at the last sentence end (``.``) found within the next
    ``chunk_size`` characters. When that window contains no period, the split
    falls back to the last whitespace character, and finally to a hard cut at
    ``chunk_size``, so the loop always makes progress.

    Args:
        document_text: Text to split.
        chunk_size: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of chunks in document order. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    remaining = document_text
    while len(remaining) > chunk_size:
        window = remaining[:chunk_size]
        cut = window.rfind(".") + 1  # 0 when the window holds no period
        if cut == 0:
            # No sentence end available: break at the last whitespace instead.
            cut = max(window.rfind(sep) for sep in (" ", "\n", "\t")) + 1
        if cut == 0:
            # One unbroken run of characters: cut hard at the size limit.
            cut = chunk_size
        piece = remaining[:cut].rstrip()
        if piece:
            chunks.append(piece)
        remaining = remaining[cut:].strip()
    if remaining:
        chunks.append(remaining)
    return chunks
37
+
38
def semantic_search(query, corpus, model):
    """Return the corpus passage most similar to the query, with its score.

    Args:
        query: Question text to look up.
        corpus: Non-empty sequence of text passages to search.
        model: Object whose ``encode(..., convert_to_tensor=True)`` returns
            embeddings (the app passes a ``SentenceTransformer``).

    Returns:
        A ``(passage, score)`` tuple: the passage with the highest cosine
        similarity to the query, and that similarity as a Python float.

    Raises:
        ValueError: If ``corpus`` is empty, since there is nothing to rank.
    """
    if len(corpus) == 0:
        # Fail early with a clear message instead of an obscure tensor error.
        raise ValueError("corpus must contain at least one passage to search")

    query_embedding = model.encode(query, convert_to_tensor=True)
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

    # Row 0 holds the similarity of the single query against every passage.
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    best_match_idx = scores.argmax().item()
    return corpus[best_match_idx], scores[best_match_idx].item()
46
+
47
+ # ---- Load PDF and Extract Text ----
48
@st.cache_resource
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, split its text into chunks, and load the embedding model.

    The result is cached with ``st.cache_resource`` because it contains the
    ``SentenceTransformer`` model: a resource cache hands back the same
    object on every rerun instead of serializing and copying it. Callers
    must therefore treat the returned chunk list as read-only.

    Args:
        pdf_path: Location of the PDF to load.

    Returns:
        A ``(chunks, model)`` tuple: the list of text chunks and the loaded
        ``all-MiniLM-L6-v2`` model. No embeddings are computed here; they
        are produced at query time by ``semantic_search``.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(preprocess_text(raw_text))
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model
56
+
57
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)

# ---- User Input Section ----
st.sidebar.header("Ask a Question")
query = st.sidebar.text_area("Type your question here:")

# Nothing happens until the user presses the button; a blank question is
# rejected in the sidebar before any search is run.
if st.sidebar.button("Submit"):
    if not query.strip():
        st.sidebar.error("Please enter a question.")
    else:
        with st.spinner("Searching for the best answer..."):
            answer, score = semantic_search(query, chunks, embedding_model)
            st.write("### Your Question:")
            st.write(query)
            st.write("### Best Match:")
            st.write(answer)
            st.write(f"**Relevance Score:** {score:.2f}")
75
+
76
# ---- Info Section ----
# Collapsible panel describing what the app does and which model it uses.
with st.expander("ℹ️ - About this app"):
    about_text = """
        This chatbot allows users to ask questions about the Gender Strategy document.
        It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

        - The document is pre-loaded and processed into searchable chunks.
        - The model ranks the relevance of the results based on cosine similarity.

        For feedback or improvements, please contact the developer.
        """
    st.write(about_text)
89
+
90
+
91
+ # to start the app locally: streamlit run c:/Users/nowok_rob/Documents/Code_local/genderstrat_2025/GIZGenderstratlocal/app.py [ARGUMENTS]