File size: 4,159 Bytes
148cd4f
 
 
 
 
 
 
1fd1916
148cd4f
 
 
 
 
 
 
 
 
 
 
1fd1916
 
148cd4f
 
05deae0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148cd4f
 
 
 
 
 
 
 
 
 
 
 
1fd1916
148cd4f
 
1fd1916
148cd4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import streamlit as st
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import pdfplumber

# ---- App Setup ----
st.set_page_config(page_title='Gender Strategy Chatbot', layout='wide', initial_sidebar_state='expanded')
st.title("Chatbot for Gender Strategy Document")

# ---- Helper Functions ----
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order. No separator is
        inserted between pages. Pages that yield no text contribute nothing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without extractable text
        # (e.g. a scanned image page); coerce to "" so one such page does not
        # abort the whole extraction with a TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)

def preprocess_text(document_text):
    """Normalize line breaks so the text can be split on blank lines.

    Two passes are applied in order:
    1. every newline becomes a single space;
    2. every (non-overlapping, left-to-right) pair of spaces becomes a
       paragraph break ("\\n\\n").
    """
    # Pass 1: flatten all line breaks into spaces.
    flattened = " ".join(document_text.split("\n"))
    # Pass 2: each run of two spaces marks a paragraph boundary.
    paragraphs = flattened.split("  ")
    return "\n\n".join(paragraphs)






# ---- Test with the original PDF document ----
# Debug section: renders a preview of the raw and the preprocessed PDF text.
# NOTE(review): this calls extract_text_from_pdf directly at module level,
# separately from the cached loader further down in this file, so the PDF is
# read here as well — consider removing or caching this section.

# Path to the PDF document (make sure the path is correct)
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"

# Extract the text from the PDF
original_text = extract_text_from_pdf(pdf_path)

# Preprocess the text
processed_text = preprocess_text(original_text)

# Streamlit interface for displaying the result
st.title("Test der preprocess_text-Funktion mit der Gender Strategy")

st.write("### Originaler Text (Aus dem PDF):")
# Show the first 1000 characters of the original text so that not too much text is loaded at once
st.write(original_text[:1000])  # Show only part of the text to keep rendering fast

st.write("### Verarbeiteter Text (Nach preprocess_text):")
# Show the first 1000 characters of the processed text
st.write(processed_text[:1000])  # Show only part of the text to keep rendering fast

# Optional: if the text is too long, further sections of it could be displayed
st.write("**Hinweis**: Der vollständige Text kann sehr lang sein, daher zeigen wir nur einen Ausschnitt an.")







def semantic_search(query, corpus, model):
    """Return the corpus entry most similar to the query, with its score.

    Args:
        query: The question text to search for.
        corpus: Indexable collection of text passages; it is encoded in full
            on every call.
        model: Object providing ``encode(..., convert_to_tensor=True)``.

    Returns:
        A ``(passage, score)`` tuple: the entry of ``corpus`` with the highest
        cosine similarity to ``query`` and that similarity as a Python number.
    """
    query_vec = model.encode(query, convert_to_tensor=True)
    corpus_vecs = model.encode(corpus, convert_to_tensor=True)

    # The similarity matrix has one row per query; only one query is passed.
    similarity_row = util.pytorch_cos_sim(query_vec, corpus_vecs)[0]

    top_index = similarity_row.argmax().item()
    top_score = similarity_row[top_index].item()
    return corpus[top_index], top_score

# ---- Load PDF and Extract Text ----
@st.cache_resource
def load_pdf_and_prepare_embeddings(pdf_path):
    """Load a PDF, split its text into paragraph chunks, and load the embedding model.

    Despite the name, no embeddings are computed here; only the text chunks
    and the model used to embed them later are prepared.

    The result is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: the returned tuple contains a ``SentenceTransformer``
    model, and ``st.cache_data`` would serialize and copy it on every call,
    whereas ``st.cache_resource`` keeps one shared instance. Because the cached
    objects are shared, callers must not mutate the returned list.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        A ``(chunks, model)`` tuple: the list of text chunks obtained by
        splitting the preprocessed text on blank lines, and the loaded
        ``all-MiniLM-L6-v2`` sentence-embedding model.
    """
    document_text = extract_text_from_pdf(pdf_path)
    standardized_text = preprocess_text(document_text)
    chunks = standardized_text.split("\n\n")  # One chunk per paragraph
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return chunks, model

# Load the document chunks and the embedding model through the cached loader.
pdf_path = "giz-2019-en-gender-strategy-web-version-with-bookmarks.pdf"
chunks, embedding_model = load_pdf_and_prepare_embeddings(pdf_path)

# ---- User Input Section ----
st.sidebar.header("Ask a Question")
query = st.sidebar.text_area("Type your question here:")

submitted = st.sidebar.button("Submit")
if submitted and not query.strip():
    # A blank or whitespace-only question is rejected in the sidebar.
    st.sidebar.error("Please enter a question.")
elif submitted:
    with st.spinner("Searching for the best answer..."):
        answer, score = semantic_search(query, chunks, embedding_model)
        # Render the question and the best-matching passage, each under its heading.
        for heading, body in (
            ("### Your Question:", query),
            ("### Best Match:", answer),
        ):
            st.write(heading)
            st.write(body)
        st.write(f"**Relevance Score:** {score:.2f}")

# ---- Info Section ----
# Static description of the app, shown inside a collapsible expander.
ABOUT_TEXT = """
        This chatbot allows users to ask questions about the Gender Strategy document. 
        It uses a semantic search model (`all-MiniLM-L6-v2`) to find the most relevant passages from the document.

        - The document is pre-loaded and processed into searchable chunks.
        - The model ranks the relevance of the results based on cosine similarity.

        For feedback or improvements, please contact the developer.
        """

with st.expander("ℹ️ - About this app"):
    st.write(ABOUT_TEXT)