Muhammad541 committed on
Commit
2f417d6
·
verified ·
1 Parent(s): ceba453

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -40
app.py CHANGED
@@ -22,10 +22,10 @@ logger = logging.getLogger(__name__)
22
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
 
24
  # Paths for saving artifacts
25
- MODEL_DIR = "./saved_models"
26
- FALLBACK_MODEL_DIR = "/tmp/saved_models"
27
 
28
- # Directory handling with improved error handling
29
  try:
30
  os.makedirs(MODEL_DIR, exist_ok=True)
31
  logger.info(f"Using model directory: {MODEL_DIR}")
@@ -44,7 +44,7 @@ QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
44
  FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
45
 
46
  # Improved dataset loading with fallback
47
- def load_dataset(file_path, required_columns=[]):
48
  try:
49
  df = pd.read_csv(file_path)
50
  for col in required_columns:
@@ -54,17 +54,20 @@ def load_dataset(file_path, required_columns=[]):
54
  return df
55
  except Exception as e:
56
  logger.error(f"Error loading {file_path}: {e}")
 
 
 
57
  return None
58
 
59
  # Load datasets with fallbacks
60
- questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"]) or pd.DataFrame({
61
  'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
62
  'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
63
  'Intermediate Python question', 'Basic Kubernetes question'],
64
  'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
65
  })
66
 
67
- courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"]) or pd.DataFrame({
68
  'skills': ['Docker', 'Jenkins', 'Azure', 'Cybersecurity'],
69
  'course_title': ['Docker Mastery', 'Jenkins CI/CD', 'Azure Fundamentals', 'Cybersecurity Basics'],
70
  'Organization': ['Udemy', 'Coursera', 'Microsoft', 'edX'],
@@ -73,7 +76,7 @@ courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "
73
  'completion_rate': [0.7, 0.65, 0.8, 0.6]
74
  })
75
 
76
- jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"]) or pd.DataFrame({
77
  'job_title': ['DevOps Engineer', 'Cloud Architect'],
78
  'company_name': ['Tech Corp', 'Cloud Inc'],
79
  'location': ['Remote', 'Silicon Valley'],
@@ -81,19 +84,32 @@ jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company
81
  'job_description': ['DevOps role description', 'Cloud architecture position']
82
  })
83
 
84
- # Model loading with validation
85
- def load_model(model_class, path, default_name):
86
- try:
87
- return model_class.from_pretrained(path)
88
- except Exception as e:
89
- logger.warning(f"Failed to load model from {path}: {e}. Using default {default_name}.")
90
- return model_class.from_pretrained(default_name)
91
-
92
- universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH) if os.path.exists(UNIVERSAL_MODEL_PATH) else SentenceTransformer("all-MiniLM-L6-v2")
93
- detector_model = load_model(AutoModelForSequenceClassification, DETECTOR_MODEL_PATH, "roberta-base-openai-detector")
94
- detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH) if os.path.exists(DETECTOR_MODEL_PATH) else AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- # Enhanced resource initialization
97
  def initialize_resources(user_skills):
98
  global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
99
 
@@ -247,37 +263,76 @@ def assess_skills():
247
  initialize_resources(user_skills)
248
 
249
  # Get relevant questions
250
- user_questions = questions_df[questions_df['Skill'].str.lower().isin([s.lower() for s in user_skills])]
251
- if user_questions.empty:
252
- user_questions = questions_df.sample(len(user_skills))
253
-
254
- user_questions = user_questions.sample(len(user_skills)).reset_index(drop=True)
255
- responses = list(zip(user_questions['Skill'], answers, user_questions['Question']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- # Parallel processing with error handling
258
  with Pool(processes=min(cpu_count(), 4)) as pool:
259
- results = pool.map(evaluate_response, responses)
 
260
 
261
- # Process results
262
- assessment = []
263
- scores = []
 
264
  for skill, score, is_ai in results:
265
- assessment.append(f"{skill}: {score}% ({'AI' if is_ai else 'Human'})")
266
- scores.append(score)
267
-
268
- mean_score = np.mean(scores) if scores else 0
269
- weak_skills = [skill for skill, score, _ in results if score < max(60, mean_score)]
 
 
 
 
 
 
270
 
271
  # Generate recommendations
272
  courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
273
  jobs = recommend_jobs(user_skills, user_level)
274
 
275
  return jsonify({
276
- "assessment": assessment,
277
- "mean_score": round(mean_score, 1),
278
- "weak_skills": weak_skills,
279
- "courses": courses[:3], # Top 3 courses
280
- "jobs": jobs[:5] # Top 5 jobs
 
 
 
 
 
 
 
 
 
 
 
281
  })
282
  except Exception as e:
283
  logger.error(f"Assessment error: {e}")
 
22
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
 
24
  # Paths for saving artifacts
25
+ MODEL_DIR = "./saved_models" # Primary location in /app/saved_models
26
+ FALLBACK_MODEL_DIR = "/tmp/saved_models" # Fallback if ./saved_models fails
27
 
28
+ # Try to use the primary directory, fall back to /tmp if needed
29
  try:
30
  os.makedirs(MODEL_DIR, exist_ok=True)
31
  logger.info(f"Using model directory: {MODEL_DIR}")
 
44
  FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
45
 
46
  # Improved dataset loading with fallback
47
+ def load_dataset(file_path, required_columns=[], fallback_data=None):
48
  try:
49
  df = pd.read_csv(file_path)
50
  for col in required_columns:
 
54
  return df
55
  except Exception as e:
56
  logger.error(f"Error loading {file_path}: {e}")
57
+ if fallback_data is not None:
58
+ logger.info(f"Using fallback data for {file_path}")
59
+ return pd.DataFrame(fallback_data)
60
  return None
61
 
62
  # Load datasets with fallbacks
63
+ questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], {
64
  'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
65
  'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
66
  'Intermediate Python question', 'Basic Kubernetes question'],
67
  'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
68
  })
69
 
70
+ courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], {
71
  'skills': ['Docker', 'Jenkins', 'Azure', 'Cybersecurity'],
72
  'course_title': ['Docker Mastery', 'Jenkins CI/CD', 'Azure Fundamentals', 'Cybersecurity Basics'],
73
  'Organization': ['Udemy', 'Coursera', 'Microsoft', 'edX'],
 
76
  'completion_rate': [0.7, 0.65, 0.8, 0.6]
77
  })
78
 
79
+ jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], {
80
  'job_title': ['DevOps Engineer', 'Cloud Architect'],
81
  'company_name': ['Tech Corp', 'Cloud Inc'],
82
  'location': ['Remote', 'Silicon Valley'],
 
84
  'job_description': ['DevOps role description', 'Cloud architecture position']
85
  })
86
 
87
+ # Validate questions_df
88
+ if questions_df is None or questions_df.empty:
89
+ logger.error("questions_df is empty or could not be loaded. Exiting.")
90
+ exit(1)
91
+ if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
92
+ logger.error("questions_df is missing required columns. Exiting.")
93
+ exit(1)
94
+ logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {questions_df['Skill'].unique().tolist()}")
95
+
96
+ # Load or Initialize Models
97
+ if os.path.exists(UNIVERSAL_MODEL_PATH):
98
+ universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
99
+ else:
100
+ universal_model = SentenceTransformer("all-MiniLM-L6-v2")
101
+
102
+ if os.path.exists(DETECTOR_MODEL_PATH):
103
+ detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
104
+ detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
105
+ else:
106
+ detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
107
+ detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
108
+
109
+ # Precompute Resources with Validation
110
+ def resources_valid(saved_skills, current_skills):
111
+ return set(saved_skills) == set(current_skills)
112
 
 
113
  def initialize_resources(user_skills):
114
  global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
115
 
 
263
  initialize_resources(user_skills)
264
 
265
  # Get relevant questions
266
+ filtered_questions = questions_df[questions_df['Skill'].str.lower().isin([skill.lower() for skill in user_skills])]
267
+ if filtered_questions.empty:
268
+ return jsonify({"error": "No matching questions found for the user's skills."}), 500
269
+
270
+ user_questions = []
271
+ for skill in user_skills:
272
+ skill_questions = filtered_questions[filtered_questions['Skill'].str.lower() == skill.lower()]
273
+ if not skill_questions.empty:
274
+ user_questions.append(skill_questions.sample(1).iloc[0])
275
+ else:
276
+ user_questions.append({
277
+ 'Skill': skill,
278
+ 'Question': f"What are the best practices for using {skill} in a production environment?",
279
+ 'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
280
+ })
281
+ user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
282
+
283
+ if len(user_questions) != len(user_skills):
284
+ return jsonify({"error": f"Internal error: Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)})."}), 500
285
+
286
+ user_responses = []
287
+ for idx, row in user_questions.iterrows():
288
+ answer = answers[idx]
289
+ if not answer or answer.lower() == 'skip':
290
+ user_responses.append((row['Skill'], None, row['Question']))
291
+ else:
292
+ user_responses.append((row['Skill'], answer, row['Question']))
293
 
 
294
  with Pool(processes=min(cpu_count(), 4)) as pool:
295
+ eval_args = [(skill, user_code, question) for skill, user_code, question in user_responses if user_code]
296
+ results = pool.map(evaluate_response, eval_args)
297
 
298
+ user_scores = {}
299
+ ai_flags = {}
300
+ scores_list = []
301
+ skipped_questions = [f"{skill} ({question})" for skill, user_code, question in user_responses if user_code is None]
302
  for skill, score, is_ai in results:
303
+ if skill in user_scores:
304
+ user_scores[skill] = max(user_scores[skill], score)
305
+ ai_flags[skill] = ai_flags[skill] or is_ai
306
+ else:
307
+ user_scores[skill] = score
308
+ ai_flags[skill] = is_ai
309
+ scores_list.append(score)
310
+
311
+ mean_score = np.mean(scores_list) if scores_list else 50
312
+ dynamic_threshold = max(40, mean_score)
313
+ weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
314
 
315
  # Generate recommendations
316
  courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
317
  jobs = recommend_jobs(user_skills, user_level)
318
 
319
  return jsonify({
320
+ "assessment_results": {
321
+ "skills": [
322
+ {
323
+ "skill": skill,
324
+ "progress": f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}",
325
+ "score": f"{score:.2f} %",
326
+ "origin": "AI-Generated" if is_ai else "Human-Written"
327
+ } for skill, score, is_ai in results
328
+ ],
329
+ "mean_score": mean_score,
330
+ "dynamic_threshold": dynamic_threshold,
331
+ "weak_skills": weak_skills,
332
+ "skipped_questions": skipped_questions
333
+ },
334
+ "recommended_courses": courses[:3],
335
+ "recommended_jobs": jobs[:5]
336
  })
337
  except Exception as e:
338
  logger.error(f"Assessment error: {e}")