frankjosh committed on
Commit 856c3dc · verified · 1 Parent(s): 73463ed

Update app.py

Files changed (1)
  1. app.py +45 -15
app.py CHANGED
@@ -8,7 +8,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 from transformers import AutoTokenizer, AutoModel
 import torch
 from torch.utils.data import DataLoader, Dataset
-from datasets import load_dataset # For loading dataset
+from datasets import load_dataset
 from datetime import datetime
 from typing import List, Dict, Any
 from functools import partial
@@ -24,20 +24,36 @@ if 'feedback' not in st.session_state:
     st.session_state.feedback = {}
 
 # Define subset size and batch size for optimization
-SUBSET_SIZE = 500 # Smaller subset for faster precomputation
+SUBSET_SIZE = 500 # Subset for faster precomputation
 BATCH_SIZE = 8 # Smaller batch size to reduce memory overhead
 
-# Caching key resources: Model, Tokenizer, and Precomputed Embeddings
 @st.cache_resource
-def load_model_and_tokenizer():
+def load_model_and_tokenizer_with_progress():
     """
-    Load the pre-trained model and tokenizer using Hugging Face Transformers.
-    Cached to ensure it loads only once.
+    Load the pre-trained model and tokenizer using Hugging Face Transformers
+    with a progress bar for better user experience.
     """
-    model_name = "Salesforce/codet5-small"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name).to(device)
-    model.eval()
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+
+    try:
+        progress_bar.progress(10)
+        status_text.text("Loading tokenizer...")
+        model_name = "Salesforce/codet5-small"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        progress_bar.progress(50)
+        status_text.text("Loading model...")
+        model = AutoModel.from_pretrained(model_name).to(device)
+        model.eval()
+
+        progress_bar.progress(100)
+        status_text.text("Model loaded successfully!")
+
+    finally:
+        progress_bar.empty()
+        status_text.empty()
+
     return tokenizer, model
 
 @st.cache_resource
@@ -56,7 +72,6 @@ def load_data():
     data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
     return data
 
-
 @st.cache_resource
 def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
     """
@@ -113,10 +128,13 @@ def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
     )
 
     embeddings = []
-    for batch in dataloader:
+    progress_bar = st.progress(0) # Progress bar for embedding computation
+    for i, batch in enumerate(dataloader):
         batch_embeddings = generate_embeddings_batch(_model, batch, device)
         embeddings.extend(batch_embeddings)
+        progress_bar.progress((i + 1) / len(dataloader))
 
+    progress_bar.empty()
     data['embedding'] = embeddings
     return data
 
@@ -136,8 +154,18 @@ def find_similar_repos(query_embedding: np.ndarray, data: pd.DataFrame, top_n=5):
     """
     Compute cosine similarity and return the top N most similar repositories.
     """
-    similarities = cosine_similarity([query_embedding], np.stack(data['embedding'].values))[0]
+    # Reshape query_embedding to 2D
+    query_embedding = query_embedding.reshape(1, -1)
+
+    # Convert data['embedding'] to a 2D array
+    embeddings = np.vstack(data['embedding'].values)
+
+    # Compute cosine similarity
+    similarities = cosine_similarity(query_embedding, embeddings)[0]
+
+    # Add similarity scores to the DataFrame
     data['similarity'] = similarities
+
     return data.nlargest(top_n, 'similarity')
 
 def display_recommendations(recommendations: pd.DataFrame):
@@ -154,8 +182,10 @@ def display_recommendations(recommendations: pd.DataFrame):
 st.title("Repository Recommender System 🚀")
 st.caption("Find repositories based on your project description.")
 
-# Load resources
-tokenizer, model = load_model_and_tokenizer()
+# Load resources with progress bar
+tokenizer, model = load_model_and_tokenizer_with_progress()
+
+# Load data and precompute embeddings
 data = load_data()
 data = precompute_embeddings(data, tokenizer, model)
 
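For reference, the reworked similarity helper from this commit can be exercised on its own; the sketch below reuses its body and feeds it a toy DataFrame (the repository names and 4-dimensional embeddings are made up for illustration, not taken from the real dataset).

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_repos(query_embedding: np.ndarray, data: pd.DataFrame, top_n=5):
    # Reshape the 1-D query vector to (1, n_features); cosine_similarity expects 2-D input
    query_embedding = query_embedding.reshape(1, -1)
    # Stack the per-row embedding vectors into an (n_repos, n_features) matrix
    embeddings = np.vstack(data['embedding'].values)
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    data['similarity'] = similarities
    return data.nlargest(top_n, 'similarity')

# Toy usage: three fake repositories with 4-dimensional embeddings
rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'repo': ['repo-a', 'repo-b', 'repo-c'],
    'embedding': [rng.random(4) for _ in range(3)],
})
print(find_similar_repos(rng.random(4), toy, top_n=2)[['repo', 'similarity']])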