|
import streamlit as st |
|
from transformers import pipeline |
|
from sentence_transformers import SentenceTransformer, util |
|
import pdfplumber |
|
|
|
|
|
# Page-level configuration followed by the main heading of the app.
st.set_page_config(
    page_title='Gender Strategy Chatbot',
    layout='wide',
    initial_sidebar_state='expanded',
)
st.title("Chatbot for Gender Strategy Document")
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Pages are joined with a newline so that the last word of one page does
    not fuse with the first word of the next. Pages for which
    ``extract_text()`` yields nothing (an empty string, or ``None`` as some
    pdfplumber versions return for pages without text) are skipped instead
    of raising ``TypeError`` during concatenation.

    Args:
        pdf_path: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        The concatenated page texts; an empty string if no page has text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Filter out None/"" so the join never sees a non-string value.
    return "\n".join(text for text in page_texts if text)
|
|
|
def preprocess_text(document_text):
    """Standardize paragraph breaks so the text can be split on ``"\\n\\n"``.

    Lines separated by a single line break are treated as one wrapped
    paragraph and joined with single spaces. One or more blank (or
    whitespace-only) lines mark a paragraph boundary and are emitted as
    exactly one ``"\\n\\n"``. Runs of whitespace inside a line are collapsed.

    The previous implementation replaced *every* space with a paragraph
    break, which turned each word into its own paragraph.

    Args:
        document_text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The normalized text; an empty string if the input has no words.
    """
    paragraphs = []
    current_words = []
    for line in document_text.splitlines():
        words = line.split()
        if words:
            current_words.extend(words)
        elif current_words:
            # Blank line: close the paragraph collected so far.
            paragraphs.append(" ".join(current_words))
            current_words = []
    if current_words:
        paragraphs.append(" ".join(current_words))
    return "\n\n".join(paragraphs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data
def _load_preview_texts(path):
    """Return ``(raw_text, preprocessed_text)`` for the PDF at *path*.

    Cached because Streamlit re-executes the whole script on every widget
    interaction; without the cache the PDF would be re-parsed on each rerun.
    """
    raw_text = extract_text_from_pdf(path)
    return raw_text, preprocess_text(raw_text)


# Debug preview: show the first 1000 characters of the document before and
# after preprocess_text so the effect of the normalization can be inspected.
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
original_text, processed_text = _load_preview_texts(pdf_path)

st.title("Test der preprocess_text-Funktion mit der Gender Strategy")

st.write("### Originaler Text (Aus dem PDF):")
st.write(original_text[:1000])

st.write("### Verarbeiteter Text (Nach preprocess_text):")
st.write(processed_text[:1000])

st.write("**Hinweis**: Der vollständige Text kann sehr lang sein, daher zeigen wir nur einen Ausschnitt an.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def semantic_search(query, corpus, model, corpus_embeddings=None):
    """Find the corpus entry most similar to *query* by cosine similarity.

    Args:
        query: The question text to search for.
        corpus: Non-empty sequence of text chunks to search in.
        model: Object with an ``encode(..., convert_to_tensor=True)`` method
            (a ``SentenceTransformer`` in this app).
        corpus_embeddings: Optional pre-computed embeddings for *corpus*.
            When omitted, the corpus is encoded on every call; callers that
            search the same corpus repeatedly can pass the embeddings in to
            skip that work.

    Returns:
        A ``(best_chunk, score)`` tuple: the best-matching corpus entry and
        its cosine-similarity score as a float.

    Raises:
        ValueError: If *corpus* is empty, since there is nothing to rank.
    """
    if not corpus:
        raise ValueError("corpus must contain at least one text chunk")

    query_embedding = model.encode(query, convert_to_tensor=True)
    if corpus_embeddings is None:
        corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

    # Row 0 holds the similarities of the single query against every chunk.
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    best_match_idx = scores.argmax().item()
    return corpus[best_match_idx], scores[best_match_idx].item()
|
|
|
|
|
# st.cache_resource (rather than st.cache_data) because the return value
# contains the embedding model: cache_resource keeps a single shared instance
# instead of serializing and copying the cached value on every rerun.
@st.cache_resource
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, split its text into chunks, and create the embedding model.

    The text is extracted, normalized with ``preprocess_text`` and split on
    blank lines. Empty chunks are dropped so they can never be returned as a
    search result. The embeddings themselves are not computed here.

    Args:
        pdf_path: Path of the PDF document to load.

    Returns:
        A ``(chunks, model)`` tuple: the list of non-empty text chunks and
        the ``SentenceTransformer`` model. Both are shared across reruns, so
        callers should treat them as read-only.
    """
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    stripped_chunks = (chunk.strip() for chunk in standardized_text.split("\n\n"))
    chunks = [chunk for chunk in stripped_chunks if chunk]
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model
|
|
|
# Build the searchable chunks and the embedding model for the document.
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)

# Sidebar: question input and submit button.
st.sidebar.header("Ask a Question")
query = st.sidebar.text_area("Type your question here:")
submitted = st.sidebar.button("Submit")

if submitted and not query.strip():
    # A blank or whitespace-only question is rejected in the sidebar.
    st.sidebar.error("Please enter a question.")
elif submitted:
    with st.spinner("Searching for the best answer..."):
        answer, score = semantic_search(query, chunks, embedding_model)
        # Echo the question, then show the best-matching chunk and its score.
        for heading, content in (
            ("### Your Question:", query),
            ("### Best Match:", answer),
        ):
            st.write(heading)
            st.write(content)
        st.write(f"**Relevance Score:** {score:.2f}")
|
|
|
|
|
# Static help text shown in the collapsible "About" section.
about_text = """
This chatbot allows users to ask questions about the Gender Strategy document.
It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

- The document is pre-loaded and processed into searchable chunks.
- The model ranks the relevance of the results based on cosine similarity.

For feedback or improvements, please contact the developer.
"""

with st.expander("ℹ️ - About this app"):
    st.write(about_text)
|
|