onisj committed on
Commit dc248e5 · verified · 1 Parent(s): 6aedfe0

app redone

Files changed (1)
  1. app.py +62 -79
app.py CHANGED
@@ -1,107 +1,90 @@
- # Install neccessary requirements
- pip install requirments.txt

- # Import neccessary libraries
  import streamlit as st
  import pandas as pd
  import numpy as np
  import requests
- import torch
- from tqdm.auto import tqdm
- from transformers import BertModel, BertTokenizer
  from sklearn.metrics.pairwise import cosine_similarity

- # CourseFAQBot class
- class CourseFAQBot:
-     def __init__(self, model_name="bert-base-uncased", docs_url=None, batch_size=8):
-         self.tokenizer = BertTokenizer.from_pretrained(model_name)
-         self.model = BertModel.from_pretrained(model_name)
-         self.model.eval()  # Set the model to evaluation mode if not training
-         self.batch_size = batch_size
-         self.df = self._download_and_process_documents(docs_url)
-         self.document_embeddings = self.compute_embeddings(self.df['text'].tolist())
-
-     def _download_and_process_documents(self, docs_url):
-         """
-         Download and process the document data.
-         """
-         docs_response = requests.get(docs_url)
-         documents_raw = docs_response.json()
-
-         documents = []
-         for course in documents_raw:
-             course_name = course['course']
-             for doc in course['documents']:
-                 doc['course'] = course_name
-                 documents.append(doc)
-
-         # Create the DataFrame
-         return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
-
-     def make_batches(self, seq, n):
-         """
-         Split a sequence into batches of size n.
-         """
-         result = []
-         for i in range(0, len(seq), n):
-             batch = seq[i:i+n]
-             result.append(batch)
-         return result
-
-     def compute_embeddings(self, texts):
-         """
-         Compute embeddings for a list of texts using a pre-trained transformer model.
-         """
-         text_batches = self.make_batches(texts, self.batch_size)
-         all_embeddings = []
-
-         for batch in tqdm(text_batches, desc="Computing embeddings"):
-             encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
-             with torch.no_grad():
-                 outputs = self.model(**encoded_input)
-             hidden_states = outputs.last_hidden_state
-             batch_embeddings = hidden_states.mean(dim=1)
-             batch_embeddings_np = batch_embeddings.cpu().numpy()
-             all_embeddings.append(batch_embeddings_np)
-
-         final_embeddings = np.vstack(all_embeddings)
-         return final_embeddings
-
-     def query(self, query_text, top_n=10):
-         """
-         Perform a query to find the most relevant documents.
-         """
-         query_embedding = self.compute_embeddings([query_text])
-         similarities = cosine_similarity(query_embedding, self.document_embeddings).flatten()
-         top_n_indices = similarities.argsort()[-top_n:][::-1]
-         top_n_documents = self.df.iloc[top_n_indices]
-         return top_n_documents
-
- # Streamlit application
  st.title("FAQ Search Engine for DataTalks")

- # Initialize CourseFAQBot
- docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
- faq_bot = CourseFAQBot(docs_url=docs_url)

  # Input fields for query and filters
  query = st.text_input("Enter your query:")
- courses = st.multiselect("Select course(s):", options=faq_bot.df['course'].unique())

  # Search button
  if st.button("Search"):
-     results = faq_bot.query(query)
-
-     # Filter results by selected courses if any
      if courses:
-         results = results[results['course'].isin(courses)]
-
-     # Display results with space in between
-     for i, result in enumerate(results.to_dict(orient='records')):
          st.write(f"### Result {i+1}")
          st.write(f"**Course**: {result['course']}")
          st.write(f"**Section**: {result['section']}")
          st.write(f"**Question**: {result['question']}")
          st.write(f"**Text**: {result['text']}")
-         st.write("")  # Adds a blank space between results
-         st.markdown("---")
 
+ # Import necessary libraries
  import streamlit as st
  import pandas as pd
  import numpy as np
  import requests
+ from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity

+ # Function to fetch data
+ def fetch_data():
+     docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
+     docs_response = requests.get(docs_url)
+     documents_raw = docs_response.json()
+     documents = []
+
+     for course in documents_raw:
+         course_name = course['course']
+         for doc in course['documents']:
+             doc['course'] = course_name
+             documents.append(doc)
+
+     return pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
+
+ # TextSearch class
+ class TextSearch:
+     def __init__(self, text_fields):
+         self.text_fields = text_fields
+         self.matrices = {}
+         self.vectorizers = {}
+
+     def fit(self, records, vectorizer_params=None):
+         # Fit one TF-IDF vectorizer per text field and keep its matrix
+         self.df = pd.DataFrame(records)
+         for f in self.text_fields:
+             cv = TfidfVectorizer(**(vectorizer_params or {}))
+             X = cv.fit_transform(self.df[f])
+             self.vectorizers[f] = cv
+             self.matrices[f] = X
+
+     def search(self, query, filters=None, boost=None):
+         # Score every document as a boosted sum of per-field cosine similarities
+         boost = boost or {}
+         score = np.zeros(len(self.df))
+         for f in self.text_fields:
+             b = boost.get(f, 1.0)
+             q = self.vectorizers[f].transform([query])
+             s = cosine_similarity(self.matrices[f], q).flatten()
+             score = score + b * s
+
+         # Zero out non-matching documents; a list value (several selected
+         # courses) matches any of its entries
+         for field, value in (filters or {}).items():
+             if isinstance(value, (list, tuple, set)):
+                 mask = self.df[field].isin(value).values
+             else:
+                 mask = (self.df[field] == value).values
+             score = score * mask
+
+         idx = np.argsort(-score)[:5]
+         return self.df.iloc[idx].to_dict(orient='records')
+
+ # Main Streamlit application
  st.title("FAQ Search Engine for DataTalks")

+ # Load data
+ df = fetch_data()
+
+ # Initialize TextSearch
+ text_search = TextSearch(text_fields=['section', 'question', 'text'])
+ text_search.fit(df.to_dict(orient='records'), vectorizer_params={'stop_words': 'english', 'min_df': 3})

  # Input fields for query and filters
  query = st.text_input("Enter your query:")
+ courses = st.multiselect("Select course(s):", options=df['course'].unique())

  # Search button
  if st.button("Search"):
+     filters = {}
      if courses:
+         filters['course'] = courses[0] if len(courses) == 1 else courses
+     results = text_search.search(query, filters=filters, boost={'question': 3.0})
+
+     # Display results
+     for i, result in enumerate(results):
          st.write(f"### Result {i+1}")
          st.write(f"**Course**: {result['course']}")
          st.write(f"**Section**: {result['section']}")
          st.write(f"**Question**: {result['question']}")
          st.write(f"**Text**: {result['text']}")
+         st.write("")
+         st.markdown("---")
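
Review note: to sanity-check the new retrieval logic outside Streamlit, below is a minimal standalone sketch of the same scoring that TextSearch.fit and TextSearch.search perform: one TF-IDF vectorizer per text field, boosted cosine similarities summed, then a course filter applied. The two records and the query are invented for illustration; the field names and the boost={'question': 3.0} weighting mirror app.py, while min_df is left at its default because the toy corpus has only two documents.

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two invented FAQ records; the real app loads documents.json instead.
df = pd.DataFrame([
    {'course': 'ml-zoomcamp', 'section': 'setup',
     'question': 'How do I install scikit-learn?',
     'text': 'Run pip install scikit-learn in your environment.'},
    {'course': 'data-engineering-zoomcamp', 'section': 'setup',
     'question': 'How do I install Docker?',
     'text': 'Use the official Docker installer for your OS.'},
])

query = 'install docker'
boost = {'question': 3.0}
score = np.zeros(len(df))

# One vectorizer per text field, as TextSearch.fit does
for field in ['section', 'question', 'text']:
    cv = TfidfVectorizer()
    X = cv.fit_transform(df[field])
    q = cv.transform([query])
    score += boost.get(field, 1.0) * cosine_similarity(X, q).flatten()

# Keep only one course, as the multiselect filter does
score *= (df['course'] == 'data-engineering-zoomcamp').values

print(df.iloc[np.argsort(-score)[:5]][['course', 'question']])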