# Install the necessary requirements first:
#   pip install -r requirements.txt
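# (requirements.txt is not included with this snippet; judging from the
# imports below it would presumably list streamlit, pandas, numpy, requests,
# torch, tqdm, transformers, and scikit-learn.)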

# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
import requests
import torch
from tqdm.auto import tqdm
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# CourseFAQBot: downloads the FAQ documents, embeds them with BERT, and
# serves cosine-similarity search over the embeddings
class CourseFAQBot:
    def __init__(self, model_name="bert-base-uncased", docs_url=None, batch_size=8):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.eval()  # Evaluation mode: disables dropout during inference
        self.batch_size = batch_size
        self.df = self._download_and_process_documents(docs_url)
        self.document_embeddings = self.compute_embeddings(self.df['text'].tolist())

    def _download_and_process_documents(self, docs_url):
        """
        Download and process the document data.
        """
        docs_response = requests.get(docs_url)
        docs_response.raise_for_status()  # Fail fast on a bad HTTP response
        documents_raw = docs_response.json()
        
        documents = []
        for course in documents_raw:
            course_name = course['course']
            for doc in course['documents']:
                doc['course'] = course_name
                documents.append(doc)
        
        # Create the DataFrame
        return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

    def make_batches(self, seq, n):
        """
        Split a sequence into batches of size n.
        """
        result = []
        for i in range(0, len(seq), n):
            batch = seq[i:i+n]
            result.append(batch)
        return result

    def compute_embeddings(self, texts):
        """
        Compute embeddings for a list of texts using a pre-trained transformer model.
        """
        text_batches = self.make_batches(texts, self.batch_size)
        all_embeddings = []
        
        for batch in tqdm(text_batches, desc="Computing embeddings"):
            encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                outputs = self.model(**encoded_input)
                hidden_states = outputs.last_hidden_state
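                # NOTE: this is a simple mean over all token positions, padding
                # included; a masked mean weighted by attention_mask would be
                # slightly more faithful for variable-length batches.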
                batch_embeddings = hidden_states.mean(dim=1)
                batch_embeddings_np = batch_embeddings.cpu().numpy()
                all_embeddings.append(batch_embeddings_np)
        
        final_embeddings = np.vstack(all_embeddings)
        return final_embeddings

    def query(self, query_text, top_n=10):
        """
        Perform a query to find the most relevant documents.
        """
        query_embedding = self.compute_embeddings([query_text])
        similarities = cosine_similarity(query_embedding, self.document_embeddings).flatten()
        top_n_indices = similarities.argsort()[-top_n:][::-1]
        top_n_documents = self.df.iloc[top_n_indices]
        return top_n_documents
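
# A minimal sketch of using CourseFAQBot directly, outside Streamlit; the
# variable names and the example query are illustrative, not from the source:
#
#   bot = CourseFAQBot(docs_url='https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json')
#   hits = bot.query("How do I join the course after it has started?", top_n=5)
#   print(hits[['course', 'question']])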

# Streamlit application
st.title("FAQ Search Engine for DataTalks")

# Initialize CourseFAQBot, cached so the document embeddings are not
# recomputed on every Streamlit rerun (each widget interaction reruns the script)
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

@st.cache_resource
def load_faq_bot(url):
    return CourseFAQBot(docs_url=url)

faq_bot = load_faq_bot(docs_url)

# Input fields for query and filters
query = st.text_input("Enter your query:")
courses = st.multiselect("Select course(s):", options=faq_bot.df['course'].unique())

# Search button (runs only when a non-empty query has been entered)
if st.button("Search") and query.strip():
    results = faq_bot.query(query)
    
    # Filter results by selected courses if any
    if courses:
        results = results[results['course'].isin(courses)]
    
    # Display results with space in between
    for i, result in enumerate(results.to_dict(orient='records')):
        st.write(f"### Result {i+1}")
        st.write(f"**Course**: {result['course']}")
        st.write(f"**Section**: {result['section']}")
        st.write(f"**Question**: {result['question']}")
        st.write(f"**Text**: {result['text']}")
        st.write("")  # Adds a blank space between results
        st.markdown("---")
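
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py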