NiborKowon committed on
Commit
1fd1916
·
verified ·
1 Parent(s): 6784b35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -22
app.py CHANGED
@@ -5,7 +5,7 @@ import pdfplumber
5
 
6
  # ---- App Setup ----
7
  st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
8
- st.title("Chatbot to talk to the GIZ Gender Strategy")
9
 
10
  # ---- Helper Functions ----
11
  def extract_text_from_pdf(pdf_path):
@@ -17,24 +17,10 @@ def extract_text_from_pdf(pdf_path):
17
  return text
18
 
19
  def preprocess_text(document_text):
20
- """Cleans up the text by removing excess whitespaces."""
21
- # Standardize spaces
22
- standardized_text = ' '.join(document_text.split())
23
  return standardized_text
24
 
25
- def chunk_text(document_text, chunk_size=500):
26
- """Splits the text into manageable chunks."""
27
- chunks = []
28
- while len(document_text) > chunk_size:
29
- chunk = document_text[:chunk_size]
30
- last_period = chunk.rfind(".")
31
- chunk = document_text[:last_period + 1] # Split at last sentence end
32
- chunks.append(chunk)
33
- document_text = document_text[last_period + 1:].strip() # Remaining text
34
- if document_text:
35
- chunks.append(document_text)
36
- return chunks
37
-
38
  def semantic_search(query, corpus, model):
39
  """Performs semantic search to find the most relevant text in the corpus."""
40
  query_embedding = model.encode(query, convert_to_tensor=True)
@@ -47,10 +33,10 @@ def semantic_search(query, corpus, model):
47
  # ---- Load PDF and Extract Text ----
48
  @st.cache_data
49
  def load_pdf_and_prepare_embeddings(pdf_path):
50
- """Loads a PDF, extracts text, preprocesses, and creates chunks with embeddings."""
51
  document_text = extract_text_from_pdf(pdf_path)
52
  standardized_text = preprocess_text(document_text)
53
- chunks = chunk_text(standardized_text)
54
  model = SentenceTransformer('all-MiniLM-L6-v2')
55
  return chunks, model
56
 
@@ -86,6 +72,3 @@ with st.expander("ℹ️ - About this app"):
86
  For feedback or improvements, please contact the developer.
87
  """
88
  )
89
-
90
-
91
- # to start the app locally: streamlit run c:/Users/nowok_rob/Documents/Code_local/genderstrat_2025/GIZGenderstratlocal/app.py [ARGUMENTS]
 
5
 
6
  # ---- App Setup ----
7
  st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
8
+ st.title("Chatbot for Gender Strategy Document")
9
 
10
  # ---- Helper Functions ----
11
  def extract_text_from_pdf(pdf_path):
 
17
  return text
18
 
19
  def preprocess_text(document_text):
20
+ """Standardizes paragraph breaks to ensure consistent splitting."""
21
+ standardized_text = document_text.replace("\n", " ").replace(" ", "\n\n")
 
22
  return standardized_text
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def semantic_search(query, corpus, model):
25
  """Performs semantic search to find the most relevant text in the corpus."""
26
  query_embedding = model.encode(query, convert_to_tensor=True)
 
33
  # ---- Load PDF and Extract Text ----
34
  @st.cache_data
35
  def load_pdf_and_prepare_embeddings(pdf_path):
36
+ """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
37
  document_text = extract_text_from_pdf(pdf_path)
38
  standardized_text = preprocess_text(document_text)
39
+ chunks = standardized_text.split("\n\n") # Splitting text into chunks by paragraphs
40
  model = SentenceTransformer('all-MiniLM-L6-v2')
41
  return chunks, model
42
 
 
72
  For feedback or improvements, please contact the developer.
73
  """
74
  )