Update app.py
app.py CHANGED
@@ -10,6 +10,8 @@ import scipy.special
 from sklearn.feature_extraction.text import TfidfVectorizer
 from flask import Flask, request, jsonify
 import logging
+from pymongo import MongoClient
+import requests
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -18,6 +20,14 @@ logger = logging.getLogger(__name__)
 # Disable tokenizers parallelism to avoid fork-related deadlocks
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
+# MongoDB connection
+MONGO_URI = "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority"
+client = MongoClient(MONGO_URI)
+db = client.get_database("test")  # Adjust the database name as needed
+users_collection = db["users"]
+courses_collection = db["courses"]
+jobs_collection = db["jobs"]
+
 # Paths for saving artifacts
 MODEL_DIR = "./saved_models"
 FALLBACK_MODEL_DIR = "/tmp/saved_models"
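Note on the connection block above: the URI embeds Atlas credentials directly in app.py. A minimal alternative sketch, assuming a MONGO_URI secret/environment variable is configured for the Space (the variable name is an assumption):

```python
import os
from pymongo import MongoClient

# Hypothetical: pull the connection string from the environment instead of
# committing credentials; MONGO_URI is an assumed variable name.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("MONGO_URI environment variable is not set")

client = MongoClient(MONGO_URI)
db = client.get_database("test")
```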
@@ -58,24 +68,21 @@ def load_dataset(file_path, required_columns=[], additional_columns=['popularity
     missing_required = [col for col in required_columns if col not in df.columns]
     missing_additional = [col for col in additional_columns if col not in df.columns]
 
-    # Handle missing required columns
     if missing_required:
         logger.warning(f"Required columns {missing_required} missing in {file_path}. Adding empty values.")
         for col in missing_required:
             df[col] = ""
 
-    # Handle missing additional columns (popularity, completion_rate, etc.)
     if missing_additional:
         logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
         for col in missing_additional:
             if col == 'popularity':
-            df[col] = 0.8
+                df[col] = 0.8
             elif col == 'completion_rate':
-            df[col] = 0.7
+                df[col] = 0.7
             else:
-            df[col] = 0.0
+                df[col] = 0.0
 
-    # Ensure 'level' column has valid values (not empty)
     if 'level' in df.columns:
         df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
     else:
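For context, the defaulting logic in this hunk behaves like the following standalone sketch (toy DataFrame; not the actual load_dataset body):

```python
import pandas as pd

# Toy frame missing both optional columns; the loop mirrors the hunk above.
df = pd.DataFrame({"skills": ["Linux", "Git"]})
for col in ["popularity", "completion_rate"]:
    if col not in df.columns:
        df[col] = 0.8 if col == "popularity" else 0.7

print(df)
#   skills  popularity  completion_rate
# 0  Linux         0.8              0.7
# 1    Git         0.8              0.7
```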
@@ -104,24 +111,6 @@ questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Qu
     'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
 })
 
-courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], ['popularity', 'completion_rate'], {
-    'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
-    'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
-    'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
-    'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
-    'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
-})
-
-jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], [], {
-    'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
-    'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
-    'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
-    'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
-    'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
-    'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
-})
-
 # Validate questions_df
 if questions_df is None or questions_df.empty:
     logger.error("questions_df is empty or could not be loaded. Exiting.")
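With the CSV-backed courses_df and jobs_df gone, course and job data presumably has to exist in the courses and jobs collections before the endpoints can return anything. A one-off seeding script could look roughly like this (a sketch; the column-to-field renames are assumptions based on the field names the new queries read):

```python
import pandas as pd
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
db = client.get_database("test")

# The new course query reads title/provider/category, so the CSV columns
# are renamed on the way in (this mapping is an assumption).
courses = pd.read_csv("coursera_course_dataset_v2_no_null.csv").rename(
    columns={"course_title": "title", "Organization": "provider", "level": "category"}
)
db["courses"].insert_many(courses.to_dict("records"))

jobs = pd.read_csv("Updated_Job_Posting_Dataset.csv").rename(
    columns={"job_title": "jobTitle", "company_name": "companyName", "required_skills": "skills"}
)
# $in matches array elements, so split the comma-separated skills string
# into a list before inserting.
jobs["skills"] = jobs["skills"].str.split(", ")
jobs["status"] = "active"  # the job query filters on status == "active"
db["jobs"].insert_many(jobs.to_dict("records"))
```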
@@ -175,7 +164,7 @@ def load_precomputed_resources():
     else:
         precompute_resources()
 
-# Precompute Resources Offline
+# Precompute Resources Offline
 def precompute_resources():
     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
     logger.info("Precomputing resources offline")
@@ -191,18 +180,6 @@ def precompute_resources():
     faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
     faiss_index.add(answer_embeddings)
 
-    # Precompute course similarities
-    course_skills = courses_df['skills'].fillna("").tolist()
-    course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-    skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-    course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
-
-    # Precompute job similarities
-    job_skills = jobs_df['required_skills'].fillna("").tolist()
-    job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-    job_similarity = util.pytorch_cos_sim(skill_embeddings, job_embeddings).cpu().numpy()
-
-    # Save precomputed resources
     with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
     with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
     with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
@@ -243,65 +220,50 @@ def evaluate_response(args):
         logger.error(f"Evaluation error for {skill}: {e}")
         return skill, 0.0, False
 
-# Recommend courses based on skills to improve
-def recommend_courses(skills_to_improve, user_level, upgrade=False):
+# Fetch questions for given skills
+def get_questions_for_skills(skills):
+    user_questions = []
+    for skill in skills:
+        skill_questions = questions_df[questions_df['Skill'] == skill]
+        if not skill_questions.empty:
+            user_questions.append(skill_questions.sample(1).iloc[0].to_dict())
+        else:
+            user_questions.append({
+                'Skill': skill,
+                'Question': f"What are the best practices for using {skill} in a production environment?",
+                'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
+            })
+    return user_questions
+
+# Recommend courses from MongoDB
+def recommend_courses_from_mongo(skills_to_improve, user_level, upgrade=False):
     try:
-        if not skills_to_improve or courses_df.empty:
-            logger.info("No skills to improve or courses_df is empty.")
-            return []
-
-        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
-        if not skill_indices:
-            logger.info("No matching skill indices found.")
+        if not skills_to_improve:
             return []
 
-        similarities = course_similarity[skill_indices]
-        # Use default arrays to avoid KeyError
-        popularity = courses_df['popularity'].values if 'popularity' in courses_df else np.full(len(courses_df), 0.8)
-        completion_rate = courses_df['completion_rate'].values if 'completion_rate' in courses_df else np.full(len(courses_df), 0.7)
-        total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
-
         target_level = 'Advanced' if upgrade else user_level
-        candidates = courses_df.copy()
-        candidates['total_score'] = total_scores
-        candidates = candidates.sort_values('total_score', ascending=False)
-
-        filtered_candidates = candidates[candidates['level'] == target_level]
-        if filtered_candidates.empty:
-            logger.warning(f"No courses found for level {target_level}. Returning top courses regardless of level.")
-            filtered_candidates = candidates
-
-        return filtered_candidates[['course_title', 'Organization']].values.tolist()[:3]
+        query = {
+            "skills": {"$in": skills_to_improve},
+            "category": {"$regex": target_level, "$options": "i"}
+        }
+        courses = courses_collection.find(query).limit(3)
+        return [{"title": course["title"], "provider": course.get("provider", "Unknown")} for course in courses]
     except Exception as e:
         logger.error(f"Course recommendation error: {e}")
         return []
 
-# Recommend jobs based on user skills
-def recommend_jobs(user_skills, user_level):
+# Recommend jobs from MongoDB
+def recommend_jobs_from_mongo(user_skills, user_level):
     try:
-        if not user_skills or jobs_df.empty:
-            return []
-
-        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
-        if not skill_indices:
+        if not user_skills:
             return []
 
-        similarities = job_similarity[skill_indices]
-        total_scores = 0.7 * np.max(similarities, axis=0)
-
-        level_col = jobs_df.get('level', pd.Series(['Intermediate'] * len(jobs_df)))
-        level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
-        user_level_num = level_map.get(user_level, 1)
-        level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
-
-        location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
-        total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
-        top_job_indices = np.argsort(-total_job_scores)[:5]
-
-        return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
-                 jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
+        query = {
+            "skills": {"$in": user_skills},
+            "status": "active"
+        }
+        jobs = jobs_collection.find(query).limit(5)
+        return [{"jobTitle": job["jobTitle"], "companyName": job["companyName"], "location": job.get("location", "Remote")} for job in jobs]
     except Exception as e:
         logger.error(f"Job recommendation error: {e}")
         return []
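Both recommenders now hit MongoDB on every request with $in filters. $in matches exact array elements, so skills is assumed to be stored as an array of skill strings (against a plain string field it would only match the whole string). If the collections grow, indexes matching the query shapes are the usual fix (a sketch using the collections defined at the top of app.py; note the case-insensitive $regex on category cannot use an index prefix efficiently):

```python
# Compound indexes matching the two query shapes above.
courses_collection.create_index([("skills", 1), ("category", 1)])
jobs_collection.create_index([("skills", 1), ("status", 1)])
```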
@@ -313,13 +275,29 @@ app = Flask(__name__)
 def health_check():
     return jsonify({"status": "active", "model_dir": chosen_model_dir})
 
+@app.route('/get_questions', methods=['POST'])
+def get_questions():
+    try:
+        data = request.get_json()
+        if not data or 'skills' not in data:
+            return jsonify({"error": "Missing skills field"}), 400
+
+        user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
+        load_precomputed_resources()
+        questions = get_questions_for_skills(user_skills)
+        return jsonify({"questions": questions})
+    except Exception as e:
+        logger.error(f"Get questions error: {e}")
+        return jsonify({"error": "Internal server error"}), 500
+
 @app.route('/assess', methods=['POST'])
 def assess_skills():
     try:
         data = request.get_json()
-        if not data or 'skills' not in data or 'answers' not in data:
+        if not data or 'skills' not in data or 'answers' not in data or 'userId' not in data:
             return jsonify({"error": "Missing required fields"}), 400
 
+        user_id = data['userId']
         user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
         answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
         user_level = data.get('user_level', 'Intermediate').strip()
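A quick smoke test for the new endpoint (a sketch with the requests library this commit imports; host and port are placeholders, 7860 being the usual Spaces default):

```python
import requests

# Ask the service to generate one question per skill.
resp = requests.post(
    "http://localhost:7860/get_questions",  # placeholder URL
    json={"skills": ["Python", "Kubernetes"]},
)
resp.raise_for_status()
for q in resp.json()["questions"]:
    print(q["Skill"], "->", q["Question"])
```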
@@ -327,23 +305,12 @@ def assess_skills():
         if len(answers) != len(user_skills):
             return jsonify({"error": "Answers count must match skills count"}), 400
 
-        load_precomputed_resources()
-
-        user_questions = []
-        for skill in user_skills:
-            skill_questions = questions_df[questions_df['Skill'] == skill]
-            if not skill_questions.empty:
-                user_questions.append(skill_questions.sample(1).iloc[0])
-            else:
-                user_questions.append({
-                    'Skill': skill,
-                    'Question': f"What are the best practices for using {skill} in a production environment?",
-                    'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
-                })
-        user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
+        load_precomputed_resources()
+        user_questions = get_questions_for_skills(user_skills)
+        user_questions_df = pd.DataFrame(user_questions).reset_index(drop=True)
 
         user_responses = []
-        for idx, row in user_questions.iterrows():
+        for idx, row in user_questions_df.iterrows():
             answer = answers[idx]
             if not answer or answer.lower() == 'skip':
                 user_responses.append((row['Skill'], None, None))
@@ -366,12 +333,19 @@ def assess_skills():
             ai_flags[skill] = is_ai
             scores_list.append(score)
 
+        # Update user profile with scores
+        skill_scores = [{"skill": skill, "score": score} for skill, score, _ in results if score > 0]
+        users_collection.update_one(
+            {"_id": user_id},
+            {"$set": {"skillScores": skill_scores}}
+        )
+
         mean_score = np.mean(scores_list) if scores_list else 50
         dynamic_threshold = max(40, mean_score)
         weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
 
-        courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
-        jobs = recommend_jobs(user_skills, user_level)
+        courses = recommend_courses_from_mongo(weak_skills or user_skills, user_level, upgrade=not weak_skills)
+        jobs = recommend_jobs_from_mongo(user_skills, user_level)
 
         return jsonify({
             "assessment_results": {
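One caveat on the update_one above: userId arrives as a JSON string, and a string _id will not match documents whose _id is an ObjectId (the default when Mongo generates keys). A hedged variant that converts when the string is a valid ObjectId:

```python
from bson import ObjectId
from bson.errors import InvalidId

# Use an ObjectId filter when possible, falling back to the raw string
# for collections keyed by plain strings.
try:
    filter_id = ObjectId(user_id)
except (InvalidId, TypeError):
    filter_id = user_id

users_collection.update_one(
    {"_id": filter_id},
    {"$set": {"skillScores": skill_scores}},
)
```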
@@ -388,8 +362,8 @@ def assess_skills():
                 "weak_skills": weak_skills,
                 "skipped_questions": skipped_questions
             },
-            "recommended_courses": courses
-            "recommended_jobs": jobs
+            "recommended_courses": courses,
+            "recommended_jobs": jobs
         })
     except Exception as e:
         logger.error(f"Assessment error: {e}")
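End to end, the reworked /assess now requires userId alongside skills and answers (a sketch; URL and IDs are placeholders):

```python
import requests

payload = {
    "userId": "USER_ID_HERE",  # placeholder; should match a document in `users`
    "skills": ["Python", "Git"],
    "answers": ["Python uses reference counting plus a cycle collector.", "skip"],
    "user_level": "Intermediate",
}
resp = requests.post("http://localhost:7860/assess", json=payload)  # placeholder URL
print(resp.status_code, resp.json())
```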