Muhammad541 committed on
Commit
2a0b0fa
·
verified ·
1 Parent(s): 87aca35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -105
app.py CHANGED
@@ -10,6 +10,8 @@ import scipy.special
10
  from sklearn.feature_extraction.text import TfidfVectorizer
11
  from flask import Flask, request, jsonify
12
  import logging
 
 
13
 
14
  # Set up logging
15
  logging.basicConfig(level=logging.INFO)
@@ -18,6 +20,14 @@ logger = logging.getLogger(__name__)
18
  # Disable tokenizers parallelism to avoid fork-related deadlocks
19
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
20
 
 
 
 
 
 
 
 
 
21
  # Paths for saving artifacts
22
  MODEL_DIR = "./saved_models"
23
  FALLBACK_MODEL_DIR = "/tmp/saved_models"
@@ -58,24 +68,21 @@ def load_dataset(file_path, required_columns=[], additional_columns=['popularity
58
  missing_required = [col for col in required_columns if col not in df.columns]
59
  missing_additional = [col for col in additional_columns if col not in df.columns]
60
 
61
- # Handle missing required columns
62
  if missing_required:
63
  logger.warning(f"Required columns {missing_required} missing in {file_path}. Adding empty values.")
64
  for col in missing_required:
65
  df[col] = ""
66
 
67
- # Handle missing additional columns (popularity, completion_rate, etc.)
68
  if missing_additional:
69
  logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
70
  for col in missing_additional:
71
  if col == 'popularity':
72
- df[col] = 0.8 # Default value for popularity
73
  elif col == 'completion_rate':
74
- df[col] = 0.7 # Default value for completion_rate
75
  else:
76
- df[col] = 0.0 # Default for other additional columns
77
 
78
- # Ensure 'level' column has valid values (not empty)
79
  if 'level' in df.columns:
80
  df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
81
  else:
@@ -104,24 +111,6 @@ questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Qu
104
  'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
105
  })
106
 
107
- courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], ['popularity', 'completion_rate'], {
108
- 'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
109
- 'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
110
- 'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
111
- 'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
112
- 'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
113
- 'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
114
- })
115
-
116
- jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], [], {
117
- 'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
118
- 'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
119
- 'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
120
- 'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
121
- 'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
122
- 'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
123
- })
124
-
125
  # Validate questions_df
126
  if questions_df is None or questions_df.empty:
127
  logger.error("questions_df is empty or could not be loaded. Exiting.")
@@ -175,7 +164,7 @@ def load_precomputed_resources():
175
  else:
176
  precompute_resources()
177
 
178
- # Precompute Resources Offline (to be run separately)
179
  def precompute_resources():
180
  global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
181
  logger.info("Precomputing resources offline")
@@ -191,18 +180,6 @@ def precompute_resources():
191
  faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
192
  faiss_index.add(answer_embeddings)
193
 
194
- # Precompute course similarities
195
- course_skills = courses_df['skills'].fillna("").tolist()
196
- course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
197
- skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
198
- course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
199
-
200
- # Precompute job similarities
201
- job_skills = jobs_df['required_skills'].fillna("").tolist()
202
- job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
203
- job_similarity = util.pytorch_cos_sim(skill_embeddings, job_embeddings).cpu().numpy()
204
-
205
- # Save precomputed resources
206
  with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
207
  with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
208
  with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
@@ -243,65 +220,50 @@ def evaluate_response(args):
243
  logger.error(f"Evaluation error for {skill}: {e}")
244
  return skill, 0.0, False
245
 
246
- # Course recommendation with precomputed similarity
247
- def recommend_courses(skills_to_improve, user_level, upgrade=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  try:
249
- if not skills_to_improve or courses_df.empty:
250
- logger.info("No skills to improve or courses_df is empty.")
251
- return []
252
-
253
- skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
254
- if not skill_indices:
255
- logger.info("No matching skill indices found.")
256
  return []
257
 
258
- similarities = course_similarity[skill_indices]
259
- # Use default arrays to avoid KeyError
260
- popularity = courses_df['popularity'].values if 'popularity' in courses_df else np.full(len(courses_df), 0.8)
261
- completion_rate = courses_df['completion_rate'].values if 'completion_rate' in courses_df else np.full(len(courses_df), 0.7)
262
- total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
263
-
264
  target_level = 'Advanced' if upgrade else user_level
265
- idx = np.argsort(-total_scores)[:5]
266
- candidates = courses_df.iloc[idx]
267
-
268
- # Filter by level, but fallback to all courses if none match
269
- filtered_candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
270
- if filtered_candidates.empty:
271
- logger.warning(f"No courses found for level {target_level}. Returning top courses regardless of level.")
272
- filtered_candidates = candidates
273
-
274
- return filtered_candidates[['course_title', 'Organization']].values.tolist()[:3]
275
  except Exception as e:
276
  logger.error(f"Course recommendation error: {e}")
277
  return []
278
 
279
- # Job recommendation with precomputed similarity
280
- def recommend_jobs(user_skills, user_level):
281
  try:
282
- if jobs_df.empty:
283
- return []
284
-
285
- skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
286
- if not skill_indices:
287
  return []
288
 
289
- similarities = job_similarity[skill_indices]
290
- total_scores = 0.5 * np.max(similarities, axis=0)
291
-
292
- if 'level' not in jobs_df.columns:
293
- jobs_df['level'] = 'Intermediate'
294
- level_col = jobs_df['level'].astype(str)
295
- level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
296
- user_level_num = level_map.get(user_level, 1)
297
- level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
298
-
299
- location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
300
- total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
301
- top_job_indices = np.argsort(-total_job_scores)[:5]
302
-
303
- return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
304
- jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
305
  except Exception as e:
306
  logger.error(f"Job recommendation error: {e}")
307
  return []
@@ -313,13 +275,29 @@ app = Flask(__name__)
313
  def health_check():
314
  return jsonify({"status": "active", "model_dir": chosen_model_dir})
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  @app.route('/assess', methods=['POST'])
317
  def assess_skills():
318
  try:
319
  data = request.get_json()
320
- if not data or 'skills' not in data or 'answers' not in data:
321
  return jsonify({"error": "Missing required fields"}), 400
322
 
 
323
  user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
324
  answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
325
  user_level = data.get('user_level', 'Intermediate').strip()
@@ -327,23 +305,12 @@ def assess_skills():
327
  if len(answers) != len(user_skills):
328
  return jsonify({"error": "Answers count must match skills count"}), 400
329
 
330
- load_precomputed_resources() # Load precomputed resources before processing
331
-
332
- user_questions = []
333
- for skill in user_skills:
334
- skill_questions = questions_df[questions_df['Skill'] == skill]
335
- if not skill_questions.empty:
336
- user_questions.append(skill_questions.sample(1).iloc[0])
337
- else:
338
- user_questions.append({
339
- 'Skill': skill,
340
- 'Question': f"What are the best practices for using {skill} in a production environment?",
341
- 'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
342
- })
343
- user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
344
 
345
  user_responses = []
346
- for idx, row in user_questions.iterrows():
347
  answer = answers[idx]
348
  if not answer or answer.lower() == 'skip':
349
  user_responses.append((row['Skill'], None, None))
@@ -366,12 +333,19 @@ def assess_skills():
366
  ai_flags[skill] = is_ai
367
  scores_list.append(score)
368
 
 
 
 
 
 
 
 
369
  mean_score = np.mean(scores_list) if scores_list else 50
370
  dynamic_threshold = max(40, mean_score)
371
  weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
372
 
373
- courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
374
- jobs = recommend_jobs(user_skills, user_level)
375
 
376
  return jsonify({
377
  "assessment_results": {
@@ -388,8 +362,8 @@ def assess_skills():
388
  "weak_skills": weak_skills,
389
  "skipped_questions": skipped_questions
390
  },
391
- "recommended_courses": courses[:3],
392
- "recommended_jobs": jobs[:5]
393
  })
394
  except Exception as e:
395
  logger.error(f"Assessment error: {e}")
 
10
  from sklearn.feature_extraction.text import TfidfVectorizer
11
  from flask import Flask, request, jsonify
12
  import logging
13
+ from pymongo import MongoClient
14
+ import requests
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO)
 
20
  # Disable tokenizers parallelism to avoid fork-related deadlocks
21
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
22
 
23
+ # MongoDB connection
24
+ MONGO_URI = "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority"
25
+ client = MongoClient(MONGO_URI)
26
+ db = client.get_database("test") # Adjust the database name as needed
27
+ users_collection = db["users"]
28
+ courses_collection = db["courses"]
29
+ jobs_collection = db["jobs"]
30
+
31
  # Paths for saving artifacts
32
  MODEL_DIR = "./saved_models"
33
  FALLBACK_MODEL_DIR = "/tmp/saved_models"
 
68
  missing_required = [col for col in required_columns if col not in df.columns]
69
  missing_additional = [col for col in additional_columns if col not in df.columns]
70
 
 
71
  if missing_required:
72
  logger.warning(f"Required columns {missing_required} missing in {file_path}. Adding empty values.")
73
  for col in missing_required:
74
  df[col] = ""
75
 
 
76
  if missing_additional:
77
  logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
78
  for col in missing_additional:
79
  if col == 'popularity':
80
+ df[col] = 0.8
81
  elif col == 'completion_rate':
82
+ df[col] = 0.7
83
  else:
84
+ df[col] = 0.0
85
 
 
86
  if 'level' in df.columns:
87
  df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
88
  else:
 
111
  'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
112
  })
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  # Validate questions_df
115
  if questions_df is None or questions_df.empty:
116
  logger.error("questions_df is empty or could not be loaded. Exiting.")
 
164
  else:
165
  precompute_resources()
166
 
167
+ # Precompute Resources Offline
168
  def precompute_resources():
169
  global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
170
  logger.info("Precomputing resources offline")
 
180
  faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
181
  faiss_index.add(answer_embeddings)
182
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
184
  with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
185
  with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
 
220
  logger.error(f"Evaluation error for {skill}: {e}")
221
  return skill, 0.0, False
222
 
223
+ # Fetch questions for given skills
224
+ def get_questions_for_skills(skills):
225
+ user_questions = []
226
+ for skill in skills:
227
+ skill_questions = questions_df[questions_df['Skill'] == skill]
228
+ if not skill_questions.empty:
229
+ user_questions.append(skill_questions.sample(1).iloc[0].to_dict())
230
+ else:
231
+ user_questions.append({
232
+ 'Skill': skill,
233
+ 'Question': f"What are the best practices for using {skill} in a production environment?",
234
+ 'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
235
+ })
236
+ return user_questions
237
+
238
+ # Recommend courses from MongoDB
239
+ def recommend_courses_from_mongo(skills_to_improve, user_level, upgrade=False):
240
  try:
241
+ if not skills_to_improve:
 
 
 
 
 
 
242
  return []
243
 
 
 
 
 
 
 
244
  target_level = 'Advanced' if upgrade else user_level
245
+ query = {
246
+ "skills": {"$in": skills_to_improve},
247
+ "category": {"$regex": target_level, "$options": "i"}
248
+ }
249
+ courses = courses_collection.find(query).limit(3)
250
+ return [{"title": course["title"], "provider": course.get("provider", "Unknown")} for course in courses]
 
 
 
 
251
  except Exception as e:
252
  logger.error(f"Course recommendation error: {e}")
253
  return []
254
 
255
+ # Recommend jobs from MongoDB
256
+ def recommend_jobs_from_mongo(user_skills, user_level):
257
  try:
258
+ if not user_skills:
 
 
 
 
259
  return []
260
 
261
+ query = {
262
+ "skills": {"$in": user_skills},
263
+ "status": "active"
264
+ }
265
+ jobs = jobs_collection.find(query).limit(5)
266
+ return [{"jobTitle": job["jobTitle"], "companyName": job["companyName"], "location": job.get("location", "Remote")} for job in jobs]
 
 
 
 
 
 
 
 
 
 
267
  except Exception as e:
268
  logger.error(f"Job recommendation error: {e}")
269
  return []
 
275
  def health_check():
276
  return jsonify({"status": "active", "model_dir": chosen_model_dir})
277
 
278
+ @app.route('/get_questions', methods=['POST'])
279
+ def get_questions():
280
+ try:
281
+ data = request.get_json()
282
+ if not data or 'skills' not in data:
283
+ return jsonify({"error": "Missing skills field"}), 400
284
+
285
+ user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
286
+ load_precomputed_resources()
287
+ questions = get_questions_for_skills(user_skills)
288
+ return jsonify({"questions": questions})
289
+ except Exception as e:
290
+ logger.error(f"Get questions error: {e}")
291
+ return jsonify({"error": "Internal server error"}), 500
292
+
293
  @app.route('/assess', methods=['POST'])
294
  def assess_skills():
295
  try:
296
  data = request.get_json()
297
+ if not data or 'skills' not in data or 'answers' not in data or 'userId' not in data:
298
  return jsonify({"error": "Missing required fields"}), 400
299
 
300
+ user_id = data['userId']
301
  user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
302
  answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
303
  user_level = data.get('user_level', 'Intermediate').strip()
 
305
  if len(answers) != len(user_skills):
306
  return jsonify({"error": "Answers count must match skills count"}), 400
307
 
308
+ load_precomputed_resources()
309
+ user_questions = get_questions_for_skills(user_skills)
310
+ user_questions_df = pd.DataFrame(user_questions).reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  user_responses = []
313
+ for idx, row in user_questions_df.iterrows():
314
  answer = answers[idx]
315
  if not answer or answer.lower() == 'skip':
316
  user_responses.append((row['Skill'], None, None))
 
333
  ai_flags[skill] = is_ai
334
  scores_list.append(score)
335
 
336
+ # Update user profile with scores
337
+ skill_scores = [{"skill": skill, "score": score} for skill, score, _ in results if score > 0]
338
+ users_collection.update_one(
339
+ {"_id": user_id},
340
+ {"$set": {"skillScores": skill_scores}}
341
+ )
342
+
343
  mean_score = np.mean(scores_list) if scores_list else 50
344
  dynamic_threshold = max(40, mean_score)
345
  weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
346
 
347
+ courses = recommend_courses_from_mongo(weak_skills or user_skills, user_level, upgrade=not weak_skills)
348
+ jobs = recommend_jobs_from_mongo(user_skills, user_level)
349
 
350
  return jsonify({
351
  "assessment_results": {
 
362
  "weak_skills": weak_skills,
363
  "skipped_questions": skipped_questions
364
  },
365
+ "recommended_courses": courses,
366
+ "recommended_jobs": jobs
367
  })
368
  except Exception as e:
369
  logger.error(f"Assessment error: {e}")