# Import necessary libraries import streamlit as st import pandas as pd import numpy as np import requests from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity # Function to fetch data def fetch_data(): docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json' docs_response = requests.get(docs_url) documents_raw = docs_response.json() documents = [] for course in documents_raw: course_name = course['course'] for doc in course['documents']: doc['course'] = course_name documents.append(doc) return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text']) # TextSearch class class TextSearch: def __init__(self, text_fields): self.text_fields = text_fields self.matrices = {} self.vectorizers = {} def fit(self, records, vectorizer_params={}): self.df = pd.DataFrame(records) for f in self.text_fields: cv = TfidfVectorizer(**vectorizer_params) X = cv.fit_transform(self.df[f]) self.vectorizers[f] = cv self.matrices[f] = X def search(self, query, filters={}, boost={}): score = np.zeros(len(self.df)) for f in self.text_fields: b = boost.get(f, 1.0) q = self.vectorizers[f].transform([query]) s = cosine_similarity(self.matrices[f], q).flatten() score = score + b * s for field, value in filters.items(): mask = (self.df[field] == value).values score = score * mask idx = np.argsort(-score)[:5] return self.df.iloc[idx].to_dict(orient='records') # Main Streamlit application st.title("FAQ Search Engine for DataTalks") # Load data df = fetch_data() # Initialize TextSearch text_search = TextSearch(text_fields=['section', 'question', 'text']) text_search.fit(df.to_dict(orient='records'), vectorizer_params={'stop_words': 'english', 'min_df': 3}) # Input fields for query and filters query = st.text_input("Enter your query:") courses = st.multiselect("Select course(s): Pls, select just one", options=df['course'].unique()) # Search button if st.button("Search"): filters = {} if courses: filters['course'] = courses[0] if len(courses) == 1 else courses results = text_search.search(query, filters=filters, boost={'question': 3.0}) for i, result in enumerate(results): st.write(f"### Result {i+1}") st.write(f"**Course**: {result['course']}") st.write(f"**Section**: {result['section']}") st.write(f"**Question**: {result['question']}") st.write(f"**Text**: {result['text']}") st.write("") st.markdown("---")