Spaces:

GIZ
/

gender-strategy-chatbot-giz

Running

App Files Community

NiborKowon committed on Dec 23, 2024

Commit

1fd1916

verified ·

1 Parent(s): 6784b35

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -22

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import pdfplumber
 # ---- App Setup ----
 st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
-st.title("Chatbot to talk to the GIZ Gender Strategy")
 # ---- Helper Functions ----
 def extract_text_from_pdf(pdf_path):
@@ -17,24 +17,10 @@ def extract_text_from_pdf(pdf_path):
     return text
 def preprocess_text(document_text):
-    """Cleans up the text by removing excess whitespaces."""
-    # Standardize spaces
-    standardized_text = ' '.join(document_text.split())
     return standardized_text
-def chunk_text(document_text, chunk_size=500):
-    """Splits the text into manageable chunks."""
-    chunks = []
-    while len(document_text) > chunk_size:
-        chunk = document_text[:chunk_size]
-        last_period = chunk.rfind(".")
-        chunk = document_text[:last_period + 1]  # Split at last sentence end
-        chunks.append(chunk)
-        document_text = document_text[last_period + 1:].strip()  # Remaining text
-    if document_text:
-        chunks.append(document_text)
-    return chunks
 def semantic_search(query, corpus, model):
     """Performs semantic search to find the most relevant text in the corpus."""
     query_embedding = model.encode(query, convert_to_tensor=True)
@@ -47,10 +33,10 @@ def semantic_search(query, corpus, model):
 # ---- Load PDF and Extract Text ----
 @st.cache_data
 def load_pdf_and_prepare_embeddings(pdf_path):
-    """Loads a PDF, extracts text, preprocesses, and creates chunks with embeddings."""
     document_text = extract_text_from_pdf(pdf_path)
     standardized_text = preprocess_text(document_text)
-    chunks = chunk_text(standardized_text)
     model = SentenceTransformer('all-MiniLM-L6-v2')
     return chunks, model
@@ -86,6 +72,3 @@ with st.expander("ℹ️ - About this app"):
         For feedback or improvements, please contact the developer.
         """
     )
-# to start the app locally: streamlit run c:/Users/nowok_rob/Documents/Code_local/genderstrat_2025/GIZGenderstratlocal/app.py [ARGUMENTS]

 # ---- App Setup ----
 st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
+st.title("Chatbot for Gender Strategy Document")
 # ---- Helper Functions ----
 def extract_text_from_pdf(pdf_path):
     return text
 def preprocess_text(document_text):
+    """Standardizes paragraph breaks to ensure consistent splitting."""
+    standardized_text = document_text.replace("\n", " ").replace("  ", "\n\n")
     return standardized_text
 def semantic_search(query, corpus, model):
     """Performs semantic search to find the most relevant text in the corpus."""
     query_embedding = model.encode(query, convert_to_tensor=True)
 # ---- Load PDF and Extract Text ----
 @st.cache_data
 def load_pdf_and_prepare_embeddings(pdf_path):
+    """Loads a PDF, extracts text, standardizes formatting, splits into chunks, and prepares embeddings."""
     document_text = extract_text_from_pdf(pdf_path)
     standardized_text = preprocess_text(document_text)
+    chunks = standardized_text.split("\n\n")  # Splitting text into chunks by paragraphs
     model = SentenceTransformer('all-MiniLM-L6-v2')
     return chunks, model
         For feedback or improvements, please contact the developer.
         """
     )