"""Streamlit chatbot for the GIZ Gender Strategy document.

Loads a PDF, splits it into paragraph chunks, and answers user questions
by semantic similarity search over the chunks (all-MiniLM-L6-v2).
"""

import re

import streamlit as st
from transformers import pipeline  # NOTE(review): unused in this file; kept because other parts of the app may rely on it
from sentence_transformers import SentenceTransformer, util
import pdfplumber

# ---- App Setup ----
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
st.title("Chatbot for Gender Strategy Document")


# ---- Helper Functions ----
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF document.

    Returns:
        The concatenated text of all pages. Pages without a text layer
        contribute nothing.
    """
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; the original
            # `text += page.extract_text()` raised TypeError on such pages.
            text += page.extract_text() or ""
    return text


def preprocess_text(document_text):
    """Standardizes paragraph breaks to ensure consistent splitting.

    Blank lines are treated as paragraph boundaries; line wraps inside a
    paragraph are joined with single spaces. The result uses exactly one
    blank line between paragraphs, matching the separator used later when
    the text is split into search chunks.

    BUG FIXED: the original implementation chained
    ``replace("\\n", " ").replace(" ", "\\n\\n")``, which turned *every
    space* into a paragraph break — every single word became its own
    one-word chunk, making the semantic search useless.
    """
    paragraphs = re.split(r"\n\s*\n", document_text)
    normalized = (" ".join(p.split()) for p in paragraphs)
    return "\n\n".join(p for p in normalized if p)


# ---- Testing with the original PDF document ----
# Path to the PDF document (make sure the path is correct)
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"

# Extract the text from the PDF
original_text = extract_text_from_pdf(pdf_path)

# Preprocess the text
processed_text = preprocess_text(original_text)

# Streamlit interface for displaying the result
st.title("Test der preprocess_text-Funktion mit der Gender Strategy")
st.write("### Originaler Text (Aus dem PDF):")
# Show only the first 1000 characters of the original text so we do not
# render the whole document at once
st.write(original_text[:1000])

st.write("### Verarbeiteter Text (Nach preprocess_text):")
# Show only the first 1000 characters of the processed text
st.write(processed_text[:1000])

st.write("**Hinweis**: Der vollständige Text kann sehr lang sein, daher zeigen wir nur einen Ausschnitt an.")


def semantic_search(query, corpus, model):
    """Performs semantic search to find the most relevant text in the corpus.

    Args:
        query: The user's question as a plain string.
        corpus: List of text chunks to search.
        model: SentenceTransformer used to embed both query and corpus.

    Returns:
        Tuple of (best matching chunk, cosine-similarity score as float).

    NOTE(review): the corpus is re-embedded on every call; for large
    documents the corpus embeddings should be computed once and cached.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    best_match_idx = scores.argmax().item()
    return corpus[best_match_idx], scores[best_match_idx].item()


# ---- Load PDF and Extract Text ----
# cache_resource (not cache_data) is Streamlit's documented choice for
# unserializable global resources such as ML models: cache_data pickles
# the return value, which is wasteful/fragile for a SentenceTransformer.
@st.cache_resource
def load_pdf_and_prepare_embeddings(pdf_path):
    """Loads a PDF, extracts text, standardizes formatting, splits into
    chunks, and prepares the embedding model.

    Returns:
        Tuple of (list of non-empty paragraph chunks, SentenceTransformer).
    """
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    # Split into paragraph chunks; drop whitespace-only chunks so they
    # can never win the similarity ranking.
    chunks = [c for c in standardized_text.split("\n\n") if c.strip()]
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model


pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)

# ---- User Input Section ----
st.sidebar.header("Ask a Question")
query = st.sidebar.text_area("Type your question here:")

if st.sidebar.button("Submit"):
    if not query.strip():
        st.sidebar.error("Please enter a question.")
    else:
        with st.spinner("Searching for the best answer..."):
            answer, score = semantic_search(query, chunks, embedding_model)
        st.write("### Your Question:")
        st.write(query)
        st.write("### Best Match:")
        st.write(answer)
        st.write(f"**Relevance Score:** {score:.2f}")

# ---- Info Section ----
with st.expander("ℹ️ - About this app"):
    st.write(
        """
        This chatbot allows users to ask questions about the Gender Strategy document.
        It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

        - The document is pre-loaded and processed into searchable chunks.
        - The model ranks the relevance of the results based on cosine similarity.

        For feedback or improvements, please contact the developer.
        """
    )