# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Function to fetch and flatten the FAQ documents
# Cached so the JSON is downloaded only once, not on every Streamlit rerun
@st.cache_data
def fetch_data():
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    docs_response = requests.get(docs_url)
    documents_raw = docs_response.json()

    documents = []
    for course in documents_raw:
        course_name = course['course']
        for doc in course['documents']:
            doc['course'] = course_name
            documents.append(doc)

    return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
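
# Note: documents.json, as parsed above, is a list of course entries of the form
#   {"course": "<course name>", "documents": [{"section": ..., "question": ..., "text": ...}, ...]}
# fetch_data() flattens this into one DataFrame row per document, tagged with its course.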
# TextSearch: TF-IDF index over several text fields, with boosting and filtering
class TextSearch:
    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        vectorizer_params = vectorizer_params or {}
        self.df = pd.DataFrame(records)
        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv
            self.matrices[f] = X

    def search(self, query, filters=None, boost=None):
        filters = filters or {}
        boost = boost or {}

        # Combine per-field cosine similarities, weighted by the boost factors
        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Zero out rows that don't match the filters; a list/tuple value
        # matches any of its elements, a scalar must match exactly
        for field, value in filters.items():
            if isinstance(value, (list, tuple, set)):
                mask = self.df[field].isin(list(value)).values
            else:
                mask = (self.df[field] == value).values
            score = score * mask

        # Return the top 5 matches
        idx = np.argsort(-score)[:5]
        return self.df.iloc[idx].to_dict(orient='records')
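
# Illustrative usage (commented out so it doesn't run inside the Streamlit app).
# A minimal sketch of exercising TextSearch on its own; the toy records below are
# made up for illustration and are not part of the real FAQ dataset.
#
# records = [
#     {'course': 'example-course', 'section': 'General',
#      'question': 'How do I enroll?', 'text': 'Register via the course page.'},
#     {'course': 'example-course', 'section': 'General',
#      'question': 'When does it start?', 'text': 'The course starts in January.'},
# ]
# ts = TextSearch(text_fields=['section', 'question', 'text'])
# ts.fit(records)  # default TfidfVectorizer settings (no min_df) for tiny data
# ts.search('how to enroll', boost={'question': 3.0})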
# Main Streamlit application
st.title("FAQ Search Engine for DataTalks")

# Load data
df = fetch_data()

# Initialize TextSearch over the three text fields
text_search = TextSearch(text_fields=['section', 'question', 'text'])
text_search.fit(df.to_dict(orient='records'), vectorizer_params={'stop_words': 'english', 'min_df': 3})

# Input fields for the query and the optional course filter
query = st.text_input("Enter your query:")
courses = st.multiselect("Filter by course(s) (optional):", options=df['course'].unique())
# Search button
if st.button("Search"):
    if not query.strip():
        st.warning("Please enter a query first.")
    else:
        filters = {'course': courses} if courses else {}
        results = text_search.search(query, filters=filters, boost={'question': 3.0})
        for i, result in enumerate(results):
            st.write(f"### Result {i + 1}")
            st.write(f"**Course**: {result['course']}")
            st.write(f"**Section**: {result['section']}")
            st.write(f"**Question**: {result['question']}")
            st.write(f"**Text**: {result['text']}")
            st.markdown("---")
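
# To try the app locally, it is typically launched with Streamlit's CLI,
# e.g. (assuming this file is saved as app.py):
#
#   streamlit run app.py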