# Install necessary requirements
# pip install -r requirements.txt
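# A plausible requirements.txt for this app -- package names inferred from the
# imports below; no versions are pinned in the source:
#   streamlit
#   pandas
#   numpy
#   requests
#   torch
#   tqdm
#   transformers
#   scikit-learn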
# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
import requests
import torch
from tqdm.auto import tqdm
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
# CourseFAQBot class
class CourseFAQBot:
    def __init__(self, model_name="bert-base-uncased", docs_url=None, batch_size=8):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # Set the model to evaluation mode (no training here)
        self.batch_size = batch_size
        self.df = self._download_and_process_documents(docs_url)
        self.document_embeddings = self.compute_embeddings(self.df['text'].tolist())

    def _download_and_process_documents(self, docs_url):
        """
        Download and process the document data.
        """
        docs_response = requests.get(docs_url)
        documents_raw = docs_response.json()

        documents = []
        for course in documents_raw:
            course_name = course['course']
            for doc in course['documents']:
                doc['course'] = course_name
                documents.append(doc)

        # Create the DataFrame
        return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
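
    # Expected shape of documents.json, inferred from the parsing above
    # (field values are placeholders, not real data):
    #   [{"course": "...",
    #     "documents": [{"section": "...", "question": "...", "text": "..."}, ...]},
    #    ...]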

    def make_batches(self, seq, n):
        """
        Split a sequence into batches of size n.
        """
        result = []
        for i in range(0, len(seq), n):
            batch = seq[i:i+n]
            result.append(batch)
        return result
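
    # Example: make_batches([1, 2, 3, 4, 5], 2) returns [[1, 2], [3, 4], [5]];
    # the last batch may be shorter than n.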

    def compute_embeddings(self, texts):
        """
        Compute embeddings for a list of texts using a pre-trained transformer model.
        """
        text_batches = self.make_batches(texts, self.batch_size)

        all_embeddings = []
        for batch in tqdm(text_batches, desc="Computing embeddings"):
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                outputs = self.model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)

        final_embeddings = np.vstack(all_embeddings)
        return final_embeddings
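
    # Mean pooling over last_hidden_state yields one vector per input text;
    # for bert-base-uncased the hidden size is 768, so compute_embeddings
    # returns an array of shape (len(texts), 768).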

    def query(self, query_text, top_n=10):
        """
        Perform a query to find the most relevant documents.
        """
        query_embedding = self.compute_embeddings([query_text])
        similarities = cosine_similarity(query_embedding, self.document_embeddings).flatten()
        # argsort is ascending, so take the last top_n indices and reverse
        # them to rank the most similar documents first
        top_n_indices = similarities.argsort()[-top_n:][::-1]
        top_n_documents = self.df.iloc[top_n_indices]
        return top_n_documents
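
# Example usage outside Streamlit (the query text is hypothetical):
#   bot = CourseFAQBot(docs_url=docs_url)
#   top = bot.query("How do I submit homework?", top_n=3)  # 3-row DataFrame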

# Streamlit application
st.title("FAQ Search Engine for DataTalks")

# Initialize CourseFAQBot
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
faq_bot = CourseFAQBot(docs_url=docs_url)
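
# Streamlit re-runs the whole script on every interaction, so the document
# embeddings above are recomputed on each rerun. A minimal caching sketch
# (an assumption, not part of the source) using st.cache_resource:
#
#   @st.cache_resource
#   def load_bot(url):
#       return CourseFAQBot(docs_url=url)
#
#   faq_bot = load_bot(docs_url)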

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s):", options=faq_bot.df['course'].unique())

# Search button
if st.button("Search"):
    results = faq_bot.query(query)

    # Filter results by selected courses, if any were chosen (note: this
    # filters after top-N retrieval, so fewer than top_n rows may remain)
    if courses:
        results = results[results['course'].isin(courses)]

    # Display results with spacing in between
    for i, result in enumerate(results.to_dict(orient='records')):
        st.write(f"### Result {i+1}")
        st.write(f"**Course**: {result['course']}")
        st.write(f"**Section**: {result['section']}")
        st.write(f"**Question**: {result['question']}")
        st.write(f"**Text**: {result['text']}")
        st.write("")  # Adds a blank line between results
        st.markdown("---")
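
# To launch the app (the filename is an assumption):
#   streamlit run app.py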