Spaces:

GIZ
/

gender-strategy-chatbot-giz

Sleeping

App Files Files Community

gender-strategy-chatbot-giz / app.py

NiborKowon

Update app.py

1fd1916 verified 3 months ago

raw

history blame

2.98 kB

	import streamlit as st
	from transformers import pipeline
	from sentence_transformers import SentenceTransformer, util
	import pdfplumber

	# ---- App Setup ----
	# Page-level configuration, followed by the main heading of the app.
	st.set_page_config(
	    page_title='Gender Strategy Chatbot',
	    layout='wide',
	    initial_sidebar_state='expanded',
	)
	st.title("Chatbot for Gender Strategy Document")

	# ---- Helper Functions ----
	def extract_text_from_pdf(pdf_path):
	    """Extract the text of every page of a PDF into a single string.

	    Args:
	        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

	    Returns:
	        The text of all pages that produced any text, joined by a blank
	        line. Pages without extractable text are skipped, so the result
	        is an empty string when no page yields text.
	    """
	    with pdfplumber.open(pdf_path) as pdf:
	        # Guard against a page yielding None instead of a string
	        # (presumably image-only pages on some pdfplumber versions);
	        # concatenating None would raise a TypeError.
	        page_texts = [page.extract_text() or "" for page in pdf.pages]
	    # Join with a blank line so the last word of one page is not fused
	    # with the first word of the next page.
	    return "\n\n".join(page_text for page_text in page_texts if page_text)

	def preprocess_text(document_text):
	    """Normalize paragraph breaks so the text can be split on blank lines.

	    Runs of blank (or whitespace-only) lines are treated as paragraph
	    boundaries. Inside a paragraph, line wraps and repeated whitespace are
	    collapsed to single spaces. Text that contains no blank lines is
	    returned as one paragraph.

	    Args:
	        document_text: Raw text, e.g. as extracted from a PDF.

	    Returns:
	        The non-empty paragraphs joined by exactly one blank line, or an
	        empty string when the input holds no visible text.
	    """
	    import re  # local import: only this helper needs it

	    # Replacing every single space with a paragraph break would make each
	    # word its own chunk; split on blank lines instead.
	    raw_paragraphs = re.split(r"\n\s*\n", document_text)
	    paragraphs = (" ".join(part.split()) for part in raw_paragraphs)
	    standardized_text = "\n\n".join(p for p in paragraphs if p)
	    return standardized_text

	def semantic_search(query, corpus, model):
	    """Return the corpus entry most similar to the query, with its score.

	    Args:
	        query: The question text to look up.
	        corpus: Indexable sequence of candidate texts.
	        model: Object whose ``encode(..., convert_to_tensor=True)`` turns
	            text into embedding tensors.

	    Returns:
	        A ``(best_text, score)`` pair: the entry of ``corpus`` with the
	        highest cosine similarity to ``query`` and that similarity as a
	        plain Python number.
	    """
	    query_vector = model.encode(query, convert_to_tensor=True)
	    corpus_vectors = model.encode(corpus, convert_to_tensor=True)

	    # Row 0 holds the similarities of the single query against each entry.
	    similarity_row = util.pytorch_cos_sim(query_vector, corpus_vectors)[0]
	    top_index = similarity_row.argmax().item()
	    top_score = similarity_row[top_index].item()
	    return corpus[top_index], top_score

	# ---- Load PDF and Extract Text ----
	@st.cache_resource
	def load_pdf_and_prepare_embeddings(pdf_path):
	    """Load a PDF, split its text into chunks, and load the embedding model.

	    No embeddings are computed here; the function only returns the text
	    chunks together with the model that can embed them.

	    The result is cached with ``st.cache_resource`` because it contains a
	    model object, which is a shared resource rather than plain data. The
	    cached objects are returned as-is, so callers should treat ``chunks``
	    as read-only.

	    Args:
	        pdf_path: Location of the PDF to load.

	    Returns:
	        A ``(chunks, model)`` pair: the list of non-empty text chunks and
	        the ``all-MiniLM-L6-v2`` ``SentenceTransformer`` instance.
	    """
	    document_text = extract_text_from_pdf(pdf_path)
	    standardized_text = preprocess_text(document_text)
	    # Split on paragraph breaks and drop empty pieces so blank strings
	    # never enter the search corpus.
	    pieces = (piece.strip() for piece in standardized_text.split("\n\n"))
	    chunks = [piece for piece in pieces if piece]
	    model = SentenceTransformer('all-MiniLM-L6-v2')
	    return chunks, model

	# Relative path of the strategy document that the app searches.
	pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
	# One call yields both the text chunks and the embedding model used for search.
	prepared = load_pdf_and_prepare_embeddings(pdf_path)
	chunks, embedding_model = prepared

	# ---- User Input Section ----
	st.sidebar.header("Ask a Question")
	query = st.sidebar.text_area("Type your question here:")
	submit_clicked = st.sidebar.button("Submit")

	# Nothing happens until the button is pressed; a blank question is
	# rejected in the sidebar, anything else is searched and shown.
	if submit_clicked and not query.strip():
	    st.sidebar.error("Please enter a question.")
	elif submit_clicked:
	    with st.spinner("Searching for the best answer..."):
	        answer, score = semantic_search(query, chunks, embedding_model)
	        st.write("### Your Question:")
	        st.write(query)
	        st.write("### Best Match:")
	        st.write(answer)
	        st.write(f"Relevance Score: {score:.2f}")

	# ---- Info Section ----
	# Explanatory text shown inside the collapsible "about" panel.
	about_text = """
	This chatbot allows users to ask questions about the Gender Strategy document.
	It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

	- The document is pre-loaded and processed into searchable chunks.
	- The model ranks the relevance of the results based on cosine similarity.

	For feedback or improvements, please contact the developer.
	"""
	with st.expander("ℹ️ - About this app"):
	    st.write(about_text)