Spaces:
Sleeping
Sleeping
File size: 3,080 Bytes
dc248e5 8388817 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 7dd1af0 dc248e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Function to fetch data
def fetch_data(timeout=30):
    """Download the course FAQ documents and flatten them into a DataFrame.

    The remote JSON is read as a list of course entries, each carrying a
    ``course`` name and a list of ``documents``. Every document becomes one
    row, tagged with the name of the course it belongs to.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with the columns ``course``, ``section``, ``question``
        and ``text``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    docs_response = requests.get(docs_url, timeout=timeout)
    # Surface HTTP failures directly instead of as a confusing JSON decode error.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    documents = []
    for course in documents_raw:
        course_name = course['course']
        # Build new dicts rather than mutating the parsed JSON in place.
        documents.extend(
            {**doc, 'course': course_name} for doc in course['documents']
        )
    return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
# TextSearch class
class TextSearch:
    """TF-IDF search over several text fields of a collection of records.

    Each text field gets its own ``TfidfVectorizer``. A query is compared with
    every field using cosine similarity, and the per-field similarities are
    combined into one score as a boosted (weighted) sum.
    """

    def __init__(self, text_fields):
        """Create an unfitted index.

        Args:
            text_fields: Names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Build one TF-IDF matrix per text field from *records*.

        Args:
            records: Data accepted by ``pd.DataFrame`` (for example a list of
                dicts) containing every field named in ``text_fields``.
            vectorizer_params: Optional keyword arguments forwarded to every
                ``TfidfVectorizer``.
        """
        # ``None`` replaces a mutable ``{}`` default shared between calls.
        params = {} if vectorizer_params is None else vectorizer_params
        self.df = pd.DataFrame(records)
        for field in self.text_fields:
            vectorizer = TfidfVectorizer(**params)
            self.matrices[field] = vectorizer.fit_transform(self.df[field])
            self.vectorizers[field] = vectorizer

    def search(self, query, filters=None, boost=None, num_results=5):
        """Return the best-scoring records for *query*.

        Args:
            query: Free-text query string.
            filters: Optional mapping ``field -> value``. A scalar value keeps
                rows whose field equals it; a list-like value (list, tuple,
                set, ...) keeps rows whose field equals any of its members.
            boost: Optional mapping ``field -> weight`` applied to that
                field's similarity; fields not listed use a weight of 1.0.
            num_results: Maximum number of records to return.

        Returns:
            A list of record dicts ordered by descending score. Records that
            do not satisfy every filter are never returned.
        """
        filters = {} if filters is None else filters
        boost = {} if boost is None else boost

        score = np.zeros(len(self.df))
        for field in self.text_fields:
            weight = boost.get(field, 1.0)
            query_vec = self.vectorizers[field].transform([query])
            similarity = cosine_similarity(self.matrices[field], query_vec).flatten()
            score = score + weight * similarity

        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            column = self.df[field]
            if pd.api.types.is_list_like(value):
                # ``column == [a, b]`` would attempt an element-wise comparison
                # and fail on the length mismatch; membership is what is meant.
                matches = column.isin(list(value))
            else:
                matches = column == value
            keep &= np.asarray(matches, dtype=bool)

        # Rank only the rows that pass the filters. Zeroing their score is not
        # enough: zero-score rows can still land in the top results.
        candidates = np.flatnonzero(keep)
        order = np.argsort(-score[candidates], kind='stable')[:num_results]
        return self.df.iloc[candidates[order]].to_dict(orient='records')
# Main Streamlit application
@st.cache_resource
def load_search_index():
    """Fetch the FAQ documents and build the TF-IDF index, cached across reruns.

    Streamlit re-executes this script on every widget interaction; caching
    keeps the download and the vectorizer fitting from being repeated each
    time.

    Returns:
        A tuple ``(df, text_search)``: the documents DataFrame and a
        ``TextSearch`` fitted on its ``section``, ``question`` and ``text``
        fields.
    """
    df = fetch_data()
    text_search = TextSearch(text_fields=['section', 'question', 'text'])
    text_search.fit(
        df.to_dict(orient='records'),
        vectorizer_params={'stop_words': 'english', 'min_df': 3},
    )
    return df, text_search


st.title("FAQ Search Engine for DataTalks")

# Load data and the fitted search index (cached after the first run)
df, text_search = load_search_index()

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s):", options=df['course'].unique())

# Search button
if st.button("Search"):
    if not query.strip():
        # An empty query scores every document 0, so the ranking is meaningless.
        st.warning("Please enter a query.")
    else:
        filters = {}
        if courses:
            # A single selection is passed as a scalar, several as a list.
            filters['course'] = courses[0] if len(courses) == 1 else courses
        results = text_search.search(query, filters=filters, boost={'question': 3.0})

        if not results:
            st.info("No results found.")

        # Display results
        for i, result in enumerate(results, start=1):
            st.write(f"### Result {i}")
            st.write(f"**Course**: {result['course']}")
            st.write(f"**Section**: {result['section']}")
            st.write(f"**Question**: {result['question']}")
            st.write(f"**Text**: {result['text']}")
            st.write("")
            st.markdown("---")