Spaces:

GIZ
/

gender-strategy-chatbot-giz

Sleeping

App Files Files Community

NiborKowon committed on Dec 23, 2024

Commit

148cd4f

verified ·

1 Parent(s): 9cc5f66

Create app.py

Browse files

Files changed (1) hide show

app.py +91 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import streamlit as st
+from transformers import pipeline
+from sentence_transformers import SentenceTransformer, util
+import pdfplumber
+# ---- App Setup ----
+st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
+st.title("Chatbot to talk to the GIZ Gender Strategy")
+# ---- Helper Functions ----
+def extract_text_from_pdf(pdf_path):
+    """Extracts text from a PDF file."""
+    text = ""
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
+            text += page.extract_text()
+    return text
+def preprocess_text(document_text):
+    """Cleans up the text by removing excess whitespaces."""
+    # Standardize spaces
+    standardized_text = ' '.join(document_text.split())
+    return standardized_text
+def chunk_text(document_text, chunk_size=500):
+    """Splits the text into manageable chunks."""
+    chunks = []
+    while len(document_text) > chunk_size:
+        chunk = document_text[:chunk_size]
+        last_period = chunk.rfind(".")
+        chunk = document_text[:last_period + 1]  # Split at last sentence end
+        chunks.append(chunk)
+        document_text = document_text[last_period + 1:].strip()  # Remaining text
+    if document_text:
+        chunks.append(document_text)
+    return chunks
+def semantic_search(query, corpus, model):
+    """Performs semantic search to find the most relevant text in the corpus."""
+    query_embedding = model.encode(query, convert_to_tensor=True)
+    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
+    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
+    best_match_idx = scores.argmax().item()
+    return corpus[best_match_idx], scores[best_match_idx].item()
+# ---- Load PDF and Extract Text ----
+@st.cache_data
+def load_pdf_and_prepare_embeddings(pdf_path):
+    """Loads a PDF, extracts text, preprocesses, and creates chunks with embeddings."""
+    document_text = extract_text_from_pdf(pdf_path)
+    standardized_text = preprocess_text(document_text)
+    chunks = chunk_text(standardized_text)
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    return chunks, model
+pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
+chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)
+# ---- User Input Section ----
+st.sidebar.header("Ask a Question")
+query = st.sidebar.text_area("Type your question here:")
+if st.sidebar.button("Submit"):
+    if query.strip() == "":
+        st.sidebar.error("Please enter a question.")
+    else:
+        with st.spinner("Searching for the best answer..."):
+            answer, score = semantic_search(query, chunks, embedding_model)
+            st.write("### Your Question:")
+            st.write(query)
+            st.write("### Best Match:")
+            st.write(answer)
+            st.write(f"**Relevance Score:** {score:.2f}")
+# ---- Info Section ----
+with st.expander("ℹ️ - About this app"):
+    st.write(
+        """
+        This chatbot allows users to ask questions about the Gender Strategy document.
+        It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.
+        - The document is pre-loaded and processed into searchable chunks.
+        - The model ranks the relevance of the results based on cosine similarity.
+        For feedback or improvements, please contact the developer.
+        """
+    )
+# To start the app locally, run `streamlit run app.py [ARGUMENTS]` from the folder that contains app.py (e.g. c:/Users/nowok_rob/Documents/Code_local/genderstrat_2025/GIZGenderstratlocal).