# course_faq_bot / app.py
# (Hugging Face Space by onisj — last update commit 8388817)
# Install necessary requirements (run in a shell, NOT inside this Python file):
#   pip install -r requirements.txt
# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
import requests
import torch
from tqdm.auto import tqdm
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
# CourseFAQBot class
class CourseFAQBot:
    """Semantic FAQ search over course documents.

    Downloads a JSON corpus of course FAQ documents, embeds every document's
    text with mean-pooled BERT hidden states, and answers queries by cosine
    similarity against those embeddings.
    """

    def __init__(self, model_name="bert-base-uncased", docs_url=None, batch_size=8):
        """Load the tokenizer/model, fetch the documents, and precompute embeddings.

        Args:
            model_name: Hugging Face model id for the BERT encoder.
            docs_url: URL of the JSON documents file (required).
            batch_size: Number of texts embedded per forward pass.

        Raises:
            ValueError: If ``docs_url`` is not provided.
        """
        if docs_url is None:
            # Fail fast with a clear message instead of a cryptic
            # requests "Invalid URL 'None'" error later on.
            raise ValueError("docs_url is required to load the FAQ documents")
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # inference only: disables dropout etc.
        self.batch_size = batch_size
        self.df = self._download_and_process_documents(docs_url)
        self.document_embeddings = self.compute_embeddings(self.df['text'].tolist())

    def _download_and_process_documents(self, docs_url):
        """Download the JSON corpus and flatten it into a DataFrame.

        Each course entry contains a list of documents; the course name is
        copied onto every document so results can be filtered by course.

        Returns:
            pandas.DataFrame with columns ['course', 'section', 'question', 'text'].
        """
        docs_response = requests.get(docs_url)
        # Surface HTTP errors (404/500) instead of trying to json-decode an error page.
        docs_response.raise_for_status()
        documents_raw = docs_response.json()

        documents = []
        for course in documents_raw:
            course_name = course['course']
            for doc in course['documents']:
                doc['course'] = course_name
                documents.append(doc)

        return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

    def make_batches(self, seq, n):
        """Split *seq* into consecutive chunks of at most *n* items."""
        return [seq[i:i + n] for i in range(0, len(seq), n)]

    def compute_embeddings(self, texts):
        """Compute one embedding per text via mean pooling of BERT hidden states.

        Args:
            texts: List of strings to embed.

        Returns:
            numpy.ndarray of shape (len(texts), hidden_size).
        """
        all_embeddings = []
        for batch in tqdm(self.make_batches(texts, self.batch_size),
                          desc="Computing embeddings"):
            encoded_input = self.tokenizer(batch, padding=True, truncation=True,
                                           return_tensors='pt')
            with torch.no_grad():  # no gradients needed at inference time
                outputs = self.model(**encoded_input)
            # Mean over the token dimension -> one vector per text.
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(batch_embeddings.cpu().numpy())
        return np.vstack(all_embeddings)

    def query(self, query_text, top_n=10):
        """Return the ``top_n`` documents most similar to ``query_text``.

        Args:
            query_text: Free-text query string.
            top_n: Maximum number of results to return.

        Returns:
            pandas.DataFrame of the best-matching rows, most similar first.
        """
        query_embedding = self.compute_embeddings([query_text])
        similarities = cosine_similarity(query_embedding, self.document_embeddings).flatten()
        # Indices of the top_n highest similarities, in descending order.
        top_n_indices = similarities.argsort()[-top_n:][::-1]
        return self.df.iloc[top_n_indices]
# Streamlit application
# Streamlit application
st.title("FAQ Search Engine for DataTalks")


@st.cache_resource
def _load_faq_bot(url):
    """Build the bot once per server process.

    Without caching, Streamlit re-runs this script on every widget
    interaction, re-downloading the model and recomputing every document
    embedding each time. ``st.cache_resource`` keeps one shared instance.
    """
    return CourseFAQBot(docs_url=url)


# Initialize CourseFAQBot (cached across reruns)
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
faq_bot = _load_faq_bot(docs_url)

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s):", options=faq_bot.df['course'].unique())

# Search button
if st.button("Search"):
    if not query.strip():
        # Don't run an embedding pass on an empty query.
        st.warning("Please enter a query before searching.")
    else:
        results = faq_bot.query(query)
        # Filter results by selected courses if any
        if courses:
            results = results[results['course'].isin(courses)]
        # Display results with space in between
        for i, result in enumerate(results.to_dict(orient='records')):
            st.write(f"### Result {i+1}")
            st.write(f"**Course**: {result['course']}")
            st.write(f"**Section**: {result['section']}")
            st.write(f"**Question**: {result['question']}")
            st.write(f"**Text**: {result['text']}")
            st.write("")  # Adds a blank space between results
            st.markdown("---")