Muhammad541 committed
Commit a047faf · verified · 1 Parent(s): e94d8bf

Update app.py

Files changed (1)
  1. app.py +36 -32
app.py CHANGED
@@ -179,38 +179,42 @@ def load_precomputed_resources():
 def precompute_resources():
     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
     logger.info("Precomputing resources offline")
-    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
-    all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
-    tfidf_vectorizer.fit(all_texts)
-
-    skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
-    question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-    answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-
-    faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
-    faiss_index.add(answer_embeddings)
-
-    # Precompute course similarities
-    course_skills = courses_df['skills'].fillna("").tolist()
-    course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-    skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-    course_similarity = util.pytorch_cos_sim(skill_embeddings.clone().detach(), course_embeddings.clone().detach()).numpy()
-
-    # Precompute job similarities
-    job_skills = jobs_df['required_skills'].fillna("").tolist()
-    job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-    job_similarity = util.pytorch_cos_sim(skill_embeddings.clone().detach(), job_embeddings.clone().detach()).numpy()
-
-    # Save precomputed resources
-    with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
-    with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
-    with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
-    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
-    with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(answer_embeddings, f)
-    with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
-    with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
-    universal_model.save(UNIVERSAL_MODEL_PATH)
-    logger.info(f"Precomputed resources saved to {chosen_model_dir}")
+    try:
+        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+        all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
+        tfidf_vectorizer.fit(all_texts)
+
+        skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
+        question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
+        answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
+
+        faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
+        faiss_index.add(answer_embeddings)
+
+        # Precompute course similarities
+        course_skills = courses_df['skills'].fillna("").tolist()
+        course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+        skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+        course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
+
+        # Precompute job similarities
+        job_skills = jobs_df['required_skills'].fillna("").tolist()
+        job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
+        job_similarity = util.pytorch_cos_sim(skill_embeddings, job_embeddings).cpu().numpy()
+
+        # Save precomputed resources
+        with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
+        with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
+        with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
+        faiss.write_index(faiss_index, FAISS_INDEX_PATH)
+        with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(answer_embeddings, f)
+        with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
+        with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
+        universal_model.save(UNIVERSAL_MODEL_PATH)
+        logger.info(f"Precomputed resources saved to {chosen_model_dir}")
+    except Exception as e:
+        logger.error(f"Error during precomputation: {e}")
+        raise
 
 # Evaluation with precomputed data
 def evaluate_response(args):
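
One behavioral point in this hunk: util.pytorch_cos_sim operates on PyTorch tensors, so the new lines keep the encoded skill/course/job embeddings as tensors and move only the cosine-similarity result to the CPU before converting it to NumPy. The removed lines converted the operands to NumPy first and then called .clone().detach() on them, which are tensor-only methods. A minimal sketch of the new pattern, using random tensors in place of the real universal_model outputs (the dimensions are illustrative assumptions):

import torch
from sentence_transformers import util

device = "cuda" if torch.cuda.is_available() else "cpu"
skill_embeddings = torch.randn(10, 384, device=device)    # stand-in for encoded skills
course_embeddings = torch.randn(50, 384, device=device)   # stand-in for encoded course skills

# Keep both operands as tensors; move only the result off the device.
course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
print(course_similarity.shape)  # (10, 50): one row per skill, one column per course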
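
For context on the persisted index: faiss.IndexFlatL2 stores the raw answer embeddings and performs exact L2 search, so the file written by faiss.write_index can be reloaded and queried directly. A sketch of how such an index is typically queried (the actual retrieval code in app.py is outside this hunk; names and sizes here are illustrative):

import numpy as np
import faiss

# Toy embeddings standing in for the real answer embeddings (FAISS expects float32).
answer_embeddings = np.random.rand(100, 384).astype("float32")
index = faiss.IndexFlatL2(answer_embeddings.shape[1])
index.add(answer_embeddings)

query = np.random.rand(1, 384).astype("float32")   # stand-in for an encoded user answer
distances, indices = index.search(query, 5)         # 5 nearest stored answers by L2 distance
print(indices[0], distances[0])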
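
The save block writes everything that the load_precomputed_resources() named in the hunk header would presumably read back. That loader is not part of this diff, so the following is only a hypothetical counterpart under the assumption that it mirrors the save paths (constants such as TFIDF_PATH and UNIVERSAL_MODEL_PATH are defined elsewhere in app.py):

import pickle
import faiss
from sentence_transformers import SentenceTransformer

def load_precomputed_resources_sketch():
    # Hypothetical mirror of the save block above; the real loader may differ.
    with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
    with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
    with open(QUESTION_ANSWER_PATH, 'rb') as f: question_to_answer = pickle.load(f)
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)
    with open(ANSWER_EMBEDDINGS_PATH, 'rb') as f: answer_embeddings = pickle.load(f)
    with open(COURSE_SIMILARITY_PATH, 'rb') as f: course_similarity = pickle.load(f)
    with open(JOB_SIMILARITY_PATH, 'rb') as f: job_similarity = pickle.load(f)
    universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
    return (tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index,
            answer_embeddings, course_similarity, job_similarity, universal_model)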