"""Streamlit FAQ search engine backed by BERT sentence embeddings.

Install the necessary requirements first:

    pip install -r requirements.txt

(adjust the file name if the repository spells it differently), then start
the app with ``streamlit run <this file>``.
"""

# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
import requests
import torch
from tqdm.auto import tqdm
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity


class CourseFAQBot:
    """Semantic search over course FAQ documents.

    On construction the bot downloads a JSON document collection, flattens it
    into a DataFrame (``self.df``) and embeds every document's ``text`` with a
    BERT model (``self.document_embeddings``). ``query`` then ranks documents
    by cosine similarity to the embedded query text.
    """

    # Seconds to wait for the document download before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, model_name: str = "bert-base-uncased", docs_url=None,
                 batch_size: int = 8):
        """Load the model, download the documents and embed them.

        Args:
            model_name: Hugging Face identifier of the BERT checkpoint.
            docs_url: URL of the JSON document collection (required).
            batch_size: Number of texts embedded per forward pass.

        Raises:
            ValueError: If ``docs_url`` is missing or ``batch_size`` < 1.
            requests.RequestException: If the download fails.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # Inference only: disables dropout.
        self.batch_size = batch_size
        self.df = self._download_and_process_documents(docs_url)
        self.document_embeddings = self.compute_embeddings(self.df['text'].tolist())

    def _download_and_process_documents(self, docs_url) -> pd.DataFrame:
        """Download the document collection and flatten it into a DataFrame.

        The JSON payload is read as a list of course objects, each with a
        ``course`` name and a ``documents`` list; every document is tagged
        with its course name.

        Returns:
            DataFrame with columns ``course``, ``section``, ``question``, ``text``.

        Raises:
            ValueError: If ``docs_url`` is empty or ``None``.
            requests.RequestException: On network errors, timeouts or a
                non-2xx HTTP status.
        """
        if not docs_url:
            raise ValueError("docs_url is required to download the documents")

        # A timeout keeps the app from hanging forever on a stalled connection;
        # raise_for_status surfaces HTTP errors instead of a confusing JSON error.
        docs_response = requests.get(docs_url, timeout=self.REQUEST_TIMEOUT)
        docs_response.raise_for_status()
        documents_raw = docs_response.json()

        documents = [
            {**doc, 'course': course['course']}
            for course in documents_raw
            for doc in course['documents']
        ]

        # Create the DataFrame
        return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

    def make_batches(self, seq, n):
        """Split a sequence into consecutive batches of at most ``n`` items.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"batch size must be a positive integer, got {n}")
        return [seq[i:i + n] for i in range(0, len(seq), n)]

    def compute_embeddings(self, texts) -> np.ndarray:
        """Embed a list of texts with the pre-trained transformer model.

        Each embedding is the mean of the last hidden states over the *real*
        tokens of the text. Padding positions are excluded through the
        attention mask, so a text's embedding does not depend on which other
        texts happen to share its batch.

        Args:
            texts: List of strings to embed.

        Returns:
            Array of shape ``(len(texts), hidden_size)``.
        """
        if len(texts) == 0:
            # np.vstack rejects an empty list; return a well-shaped empty array.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        text_batches = self.make_batches(texts, self.batch_size)
        all_embeddings = []

        for batch in tqdm(text_batches, desc="Computing embeddings"):
            encoded_input = self.tokenizer(
                batch, padding=True, truncation=True, return_tensors='pt'
            )

            with torch.no_grad():
                outputs = self.model(**encoded_input)
                hidden_states = outputs.last_hidden_state

                # Masked mean pooling: zero out padding positions and divide
                # by the number of real tokens per text.
                mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
                token_sum = (hidden_states * mask).sum(dim=1)
                token_count = mask.sum(dim=1).clamp(min=1e-9)
                batch_embeddings = token_sum / token_count

            all_embeddings.append(batch_embeddings.cpu().numpy())

        return np.vstack(all_embeddings)

    def query(self, query_text, top_n: int = 10, courses=None) -> pd.DataFrame:
        """Return the documents most similar to ``query_text``.

        Args:
            query_text: Free-text query.
            top_n: Maximum number of documents to return.
            courses: Optional collection of course names. When given and
                non-empty, only documents of those courses are ranked, so the
                result still holds up to ``top_n`` rows after filtering.

        Returns:
            Rows of ``self.df`` ordered from most to least similar.
        """
        if top_n <= 0 or len(self.df) == 0:
            return self.df.iloc[0:0]

        query_embedding = self.compute_embeddings([query_text])
        similarities = cosine_similarity(query_embedding, self.document_embeddings).flatten()

        if courses is not None and len(courses) > 0:
            in_selected_course = self.df['course'].isin(list(courses)).to_numpy()
            candidate_indices = np.flatnonzero(in_selected_course)
        else:
            candidate_indices = np.arange(len(similarities))

        # Rank only the candidates, best match first, and keep the top N.
        ranking = similarities[candidate_indices].argsort()[::-1][:top_n]
        return self.df.iloc[candidate_indices[ranking]]


@st.cache_resource
def _load_faq_bot(url):
    """Build the bot once per server process.

    Streamlit re-executes the script on every interaction; caching avoids
    reloading the model and re-embedding all documents on each rerun.
    """
    return CourseFAQBot(docs_url=url)


# Streamlit application
st.title("FAQ Search Engine for DataTalks")

# Initialize CourseFAQBot
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
faq_bot = _load_faq_bot(docs_url)

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s):", options=faq_bot.df['course'].unique())

# Search button
if st.button("Search"):
    if not query.strip():
        st.warning("Please enter a query before searching.")
    else:
        # Filter by the selected courses (if any) before ranking.
        results = faq_bot.query(query, courses=courses)

        if results.empty:
            st.info("No matching documents found.")

        # Display results with space in between
        for i, result in enumerate(results.to_dict(orient='records')):
            st.write(f"### Result {i+1}")
            st.write(f"**Course**: {result['course']}")
            st.write(f"**Section**: {result['section']}")
            st.write(f"**Question**: {result['question']}")
            st.write(f"**Text**: {result['text']}")
            st.write("")  # Adds a blank space between results
            st.markdown("---")