# Import necessary libraries

import streamlit as st
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the FAQ documents and flatten them into a DataFrame
# (cached so the JSON is only downloaded once per session; requires Streamlit >= 1.18)
@st.cache_data
def fetch_data():
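    """Download the workshop FAQ documents (JSON) and flatten them into one row
    per document with columns course/section/question/text."""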
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    docs_response = requests.get(docs_url)
    documents_raw = docs_response.json()
    documents = []

    for course in documents_raw:
        course_name = course['course']
        for doc in course['documents']:
            doc['course'] = course_name
            documents.append(doc)
    
    return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

# TextSearch class
class TextSearch:
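    """Multi-field TF-IDF index: one vectorizer per text field, cosine-similarity
    scoring with per-field boosts and exact-match filters."""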
    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
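        """Fit a separate TfidfVectorizer for each text field over the given records."""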
        self.df = pd.DataFrame(records)
        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv
            self.matrices[f] = X

    def search(self, query, filters={}, boost={}):
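        """Score records by boosted cosine similarity to the query across all text fields,
        zero out records that fail the filters, and return the top 5 as dicts."""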
        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s
        
        for field, value in filters.items():
            # value may be a single value or a list (e.g. several selected courses)
            if isinstance(value, list):
                mask = self.df[field].isin(value).values
            else:
                mask = (self.df[field] == value).values
            score = score * mask
        
        idx = np.argsort(-score)[:5]
        return self.df.iloc[idx].to_dict(orient='records')
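
# Illustrative usage of TextSearch outside Streamlit (records, course name and query are placeholders):
#   index = TextSearch(text_fields=['section', 'question', 'text'])
#   index.fit(records)  # records: list of dicts containing the text fields plus 'course'
#   hits = index.search('how do I join the course?', filters={'course': 'some-course'}, boost={'question': 3.0})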

# Main Streamlit application
st.title("FAQ Search Engine for DataTalks")

# Load data
df = fetch_data()

# Initialize TextSearch
text_search = TextSearch(text_fields=['section', 'question', 'text'])
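# stop_words='english' removes common English words; min_df=3 ignores terms that appear in fewer than 3 documents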
text_search.fit(df.to_dict(orient='records'), vectorizer_params={'stop_words': 'english', 'min_df': 3})

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s):", options=df['course'].unique())

# Search button
if st.button("Search"):
    filters = {}
    if courses:
        filters['course'] = courses[0] if len(courses) == 1 else courses
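    # Boost: matches in the 'question' field count 3x as much as 'section' or 'text'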
    results = text_search.search(query, filters=filters, boost={'question': 3.0})
    
    # Display results
    for i, result in enumerate(results):
        st.write(f"### Result {i+1}")
        st.write(f"**Course**: {result['course']}")
        st.write(f"**Section**: {result['section']}")
        st.write(f"**Question**: {result['question']}")
        st.write(f"**Text**: {result['text']}")
        st.write("")  
        st.markdown("---")