File size: 2,825 Bytes
dc248e5
8388817
7dd1af0
 
 
 
dc248e5
7dd1af0
 
dc248e5
 
 
 
 
 
7dd1af0
dc248e5
 
 
 
 
 
 
 
 
 
 
 
 
 
7dd1af0
dc248e5
 
 
 
 
 
 
7dd1af0
dc248e5
 
 
 
 
 
 
7dd1af0
dc248e5
 
 
7dd1af0
dc248e5
 
7dd1af0
dc248e5
7dd1af0
 
dc248e5
 
 
 
 
 
7dd1af0
 
 
e2a2ca8
7dd1af0
 
 
dc248e5
7dd1af0
dc248e5
 
 
 
7dd1af0
 
 
 
 
dc248e5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Import necessary libraries

import streamlit as st
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to fetch data
def fetch_data():
    """Download the FAQ documents and return them as a flat DataFrame.

    The JSON payload is read as a list of course entries, each carrying a
    ``course`` name and a ``documents`` list. Every document is copied and
    tagged with the name of the course it belongs to.

    Returns:
        pandas.DataFrame: One row per document with the columns ``course``,
        ``section``, ``question`` and ``text``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # A timeout keeps the app from hanging forever on a stalled connection.
    docs_response = requests.get(docs_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    documents = []
    for course in documents_raw:
        course_name = course['course']
        for doc in course['documents']:
            # Build a new dict rather than mutating the parsed payload.
            documents.append({**doc, 'course': course_name})

    return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

# TextSearch class
class TextSearch:
    """TF-IDF text search over a collection of records.

    One ``TfidfVectorizer`` is fitted per text field. A query is scored
    against every field with cosine similarity, and the per-field scores are
    combined as a boost-weighted sum.
    """

    def __init__(self, text_fields):
        """Create an unfitted index.

        Args:
            text_fields: Names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index ``records`` by fitting one vectorizer per text field.

        Args:
            records: Data accepted by ``pandas.DataFrame`` (for example a
                list of dicts) that contains every field in ``text_fields``.
            vectorizer_params: Optional keyword arguments forwarded to each
                ``TfidfVectorizer``. ``None`` means no extra arguments.
        """
        # None instead of a shared mutable ``{}`` default.
        params = {} if vectorizer_params is None else vectorizer_params
        self.df = pd.DataFrame(records)
        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            X = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv
            self.matrices[f] = X

    def search(self, query, filters=None, boost=None, num_results=5):
        """Return the best-matching records for ``query``.

        Args:
            query: Free-text query string.
            filters: Optional mapping of field name to required value. A
                scalar value must equal the field; a list, tuple, set or
                NumPy array matches when the field equals any of its items.
            boost: Optional mapping of text field name to score multiplier.
                Fields that are not listed use a multiplier of 1.0.
            num_results: Maximum number of records to return.

        Returns:
            list[dict]: Up to ``num_results`` records that satisfy every
            filter, ordered by descending score. Records rejected by a
            filter are never returned, even when fewer than ``num_results``
            records remain.
        """
        filters = {} if filters is None else filters
        boost = {} if boost is None else boost

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Track filtered rows in a separate mask. Multiplying the score by
        # the mask would only zero it, and zero-score rows can still be
        # ranked into the result list.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            column = self.df[field]
            if isinstance(value, (list, tuple, set, frozenset, np.ndarray)):
                # Collections mean "any of these values", not element-wise ==.
                matches = column.isin(list(value))
            else:
                matches = column == value
            keep &= matches.to_numpy(dtype=bool)

        candidates = np.flatnonzero(keep)
        # A stable sort keeps equally scored rows in their original order.
        order = np.argsort(-score[candidates], kind='stable')[:num_results]
        idx = candidates[order]
        return self.df.iloc[idx].to_dict(orient='records')

# Main Streamlit application: a single page with a query box, an optional
# course filter and a list of the best-matching FAQ entries.
st.title("FAQ Search Engine for DataTalks")

# Load data
# NOTE(review): this runs at module level, so the download and the indexing
# below happen on every execution of the script; consider caching them if
# the script is re-executed often — confirm against the Streamlit version used.
df = fetch_data()

# Initialize TextSearch over the three free-text columns.
text_search = TextSearch(text_fields=['section', 'question', 'text'])
text_search.fit(df.to_dict(orient='records'), vectorizer_params={'stop_words': 'english', 'min_df': 3})

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s): Pls, select just one", options=df['course'].unique())

# Search button
if st.button("Search"):
    if not query.strip():
        # An empty query scores every record 0, so the "results" would be
        # arbitrary rows; ask for input instead.
        st.warning("Please enter a query.")
    elif len(courses) > 1:
        # The course filter is an equality match on a single value, as the
        # widget label requests; a list of courses cannot be compared that way.
        st.warning("Please select only one course.")
    else:
        filters = {'course': courses[0]} if courses else {}
        # Matches in the question field count three times as much.
        results = text_search.search(query, filters=filters, boost={'question': 3.0})

        if not results:
            st.info("No results found.")

        for i, result in enumerate(results, start=1):
            st.write(f"### Result {i}")
            st.write(f"**Course**: {result['course']}")
            st.write(f"**Section**: {result['section']}")
            st.write(f"**Question**: {result['question']}")
            st.write(f"**Text**: {result['text']}")
            st.write("")
            st.markdown("---")