Update app.py
app.py (CHANGED)
@@ -22,10 +22,10 @@ logger = logging.getLogger(__name__)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 # Paths for saving artifacts
-MODEL_DIR = "./saved_models"
-FALLBACK_MODEL_DIR = "/tmp/saved_models"
+MODEL_DIR = "./saved_models"  # Primary location in /app/saved_models
+FALLBACK_MODEL_DIR = "/tmp/saved_models"  # Fallback if ./saved_models fails
 
-#
+# Try to use the primary directory, fall back to /tmp if needed
 try:
     os.makedirs(MODEL_DIR, exist_ok=True)
     logger.info(f"Using model directory: {MODEL_DIR}")
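Note: the hunk ends inside the `try` block, so the `except` side of the fallback is not shown. A minimal sketch of the full pattern, assuming the handler simply switches to `FALLBACK_MODEL_DIR` and records the result in `chosen_model_dir` (that variable appears in the next hunk's context; the `except` body here is an assumption, not part of this diff):

```python
import logging
import os

logger = logging.getLogger(__name__)

MODEL_DIR = "./saved_models"              # primary location
FALLBACK_MODEL_DIR = "/tmp/saved_models"  # fallback if the primary is unwritable

try:
    os.makedirs(MODEL_DIR, exist_ok=True)
    chosen_model_dir = MODEL_DIR
    logger.info(f"Using model directory: {MODEL_DIR}")
except OSError as e:
    # Spaces containers may mount the app directory read-only; /tmp is writable
    os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
    chosen_model_dir = FALLBACK_MODEL_DIR
    logger.warning(f"Falling back to {FALLBACK_MODEL_DIR}: {e}")
```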
@@ -44,7 +44,7 @@ QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
 FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
 
 # Improved dataset loading with fallback
-def load_dataset(file_path, required_columns=[]):
+def load_dataset(file_path, required_columns=[], fallback_data=None):
     try:
         df = pd.read_csv(file_path)
         for col in required_columns:
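One review note on the new signature: `required_columns=[]` is a mutable default argument. It is harmless here because the list is only iterated, never mutated, but `None` is the more defensive idiom:

```python
def load_dataset(file_path, required_columns=None, fallback_data=None):
    # A None default avoids sharing one list object across all calls
    required_columns = required_columns or []
    ...
```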
@@ -54,17 +54,20 @@ def load_dataset(file_path, required_columns=[]):
         return df
     except Exception as e:
         logger.error(f"Error loading {file_path}: {e}")
+        if fallback_data is not None:
+            logger.info(f"Using fallback data for {file_path}")
+            return pd.DataFrame(fallback_data)
         return None
 
 # Load datasets with fallbacks
-questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"]
+questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], {
     'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
     'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
                  'Intermediate Python question', 'Basic Kubernetes question'],
     'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
 })
 
-courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"]
+courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], {
     'skills': ['Docker', 'Jenkins', 'Azure', 'Cybersecurity'],
     'course_title': ['Docker Mastery', 'Jenkins CI/CD', 'Azure Fundamentals', 'Cybersecurity Basics'],
     'Organization': ['Udemy', 'Coursera', 'Microsoft', 'edX'],
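The fallback path is easy to exercise in isolation. A self-contained sketch of the same `load_dataset` pattern; the CSV name below is deliberately nonexistent so the `except` branch fires:

```python
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_dataset(file_path, required_columns=[], fallback_data=None):
    try:
        df = pd.read_csv(file_path)
        for col in required_columns:
            if col not in df.columns:
                raise ValueError(f"{file_path} is missing column '{col}'")
        return df
    except Exception as e:
        logger.error(f"Error loading {file_path}: {e}")
        if fallback_data is not None:
            logger.info(f"Using fallback data for {file_path}")
            return pd.DataFrame(fallback_data)
        return None

# "no_such_file.csv" does not exist, so the hard-coded rows are returned instead
df = load_dataset("no_such_file.csv", ["Skill"],
                  {"Skill": ["Linux"], "Question": ["q"], "Answer": ["a"]})
print(df)
```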
@@ -73,7 +76,7 @@ courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "
     'completion_rate': [0.7, 0.65, 0.8, 0.6]
 })
 
-jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"]
+jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], {
     'job_title': ['DevOps Engineer', 'Cloud Architect'],
     'company_name': ['Tech Corp', 'Cloud Inc'],
     'location': ['Remote', 'Silicon Valley'],
@@ -81,19 +84,32 @@ jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company
     'job_description': ['DevOps role description', 'Cloud architecture position']
 })
 
-#
-
-
-
-
-
-
-
-
-
-
+# Validate questions_df
+if questions_df is None or questions_df.empty:
+    logger.error("questions_df is empty or could not be loaded. Exiting.")
+    exit(1)
+if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
+    logger.error("questions_df is missing required columns. Exiting.")
+    exit(1)
+logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {questions_df['Skill'].unique().tolist()}")
+
+# Load or Initialize Models
+if os.path.exists(UNIVERSAL_MODEL_PATH):
+    universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
+else:
+    universal_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+if os.path.exists(DETECTOR_MODEL_PATH):
+    detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
+    detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
+else:
+    detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
+    detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
+
+# Precompute Resources with Validation
+def resources_valid(saved_skills, current_skills):
+    return set(saved_skills) == set(current_skills)
 
-# Enhanced resource initialization
 def initialize_resources(user_skills):
     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings
 
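One gap worth flagging in the model-loading block: on a cold start the models are downloaded from the Hub but, at least within this hunk, never written back to `UNIVERSAL_MODEL_PATH` / `DETECTOR_MODEL_PATH`, so the cache never warms unless saving happens elsewhere in app.py. A sketch of the round trip for the sentence-transformer, assuming an illustrative path value (the real one is defined earlier in the file); the `save` call is a suggested addition, not part of this diff:

```python
import os
from sentence_transformers import SentenceTransformer

UNIVERSAL_MODEL_PATH = "/tmp/saved_models/universal_model"  # illustrative value

if os.path.exists(UNIVERSAL_MODEL_PATH):
    # Warm start: load the locally cached copy, no network needed
    universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH)
else:
    # Cold start: download from the Hub, then persist for the next restart
    universal_model = SentenceTransformer("all-MiniLM-L6-v2")
    universal_model.save(UNIVERSAL_MODEL_PATH)
```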
@@ -247,37 +263,76 @@ def assess_skills():
         initialize_resources(user_skills)
 
         # Get relevant questions
-
-        if
-
-
-        user_questions =
-
+        filtered_questions = questions_df[questions_df['Skill'].str.lower().isin([skill.lower() for skill in user_skills])]
+        if filtered_questions.empty:
+            return jsonify({"error": "No matching questions found for the user's skills."}), 500
+
+        user_questions = []
+        for skill in user_skills:
+            skill_questions = filtered_questions[filtered_questions['Skill'].str.lower() == skill.lower()]
+            if not skill_questions.empty:
+                user_questions.append(skill_questions.sample(1).iloc[0])
+            else:
+                user_questions.append({
+                    'Skill': skill,
+                    'Question': f"What are the best practices for using {skill} in a production environment?",
+                    'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
+                })
+        user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
+
+        if len(user_questions) != len(user_skills):
+            return jsonify({"error": f"Internal error: Number of selected questions ({len(user_questions)}) does not match number of skills ({len(user_skills)})."}), 500
+
+        user_responses = []
+        for idx, row in user_questions.iterrows():
+            answer = answers[idx]
+            if not answer or answer.lower() == 'skip':
+                user_responses.append((row['Skill'], None, row['Question']))
+            else:
+                user_responses.append((row['Skill'], answer, row['Question']))
 
-        # Parallel processing with error handling
         with Pool(processes=min(cpu_count(), 4)) as pool:
-
+            eval_args = [(skill, user_code, question) for skill, user_code, question in user_responses if user_code]
+            results = pool.map(evaluate_response, eval_args)
 
-
-
-
+        user_scores = {}
+        ai_flags = {}
+        scores_list = []
+        skipped_questions = [f"{skill} ({question})" for skill, user_code, question in user_responses if user_code is None]
         for skill, score, is_ai in results:
-
-
-
-
-
+            if skill in user_scores:
+                user_scores[skill] = max(user_scores[skill], score)
+                ai_flags[skill] = ai_flags[skill] or is_ai
+            else:
+                user_scores[skill] = score
+                ai_flags[skill] = is_ai
+            scores_list.append(score)
+
+        mean_score = np.mean(scores_list) if scores_list else 50
+        dynamic_threshold = max(40, mean_score)
+        weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
 
         # Generate recommendations
         courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
         jobs = recommend_jobs(user_skills, user_level)
 
         return jsonify({
-            "
-
-
-
-
+            "assessment_results": {
+                "skills": [
+                    {
+                        "skill": skill,
+                        "progress": f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}",
+                        "score": f"{score:.2f} %",
+                        "origin": "AI-Generated" if is_ai else "Human-Written"
+                    } for skill, score, is_ai in results
+                ],
+                "mean_score": mean_score,
+                "dynamic_threshold": dynamic_threshold,
+                "weak_skills": weak_skills,
+                "skipped_questions": skipped_questions
+            },
+            "recommended_courses": courses[:3],
+            "recommended_jobs": jobs[:5]
         })
     except Exception as e:
         logger.error(f"Assessment error: {e}")
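Two behaviors in this hunk are easy to miss: skipped answers are filtered out before `pool.map`, so `results` only covers answered questions, and a skill that appears more than once keeps its best score. A self-contained sketch of the aggregation with a stubbed `evaluate_response` (the stub's scoring is illustrative, not the app's real scorer):

```python
import numpy as np
from multiprocessing import Pool, cpu_count

def evaluate_response(args):
    # Stub: the real app scores the answer and runs an AI-text detector
    skill, user_code, question = args
    return skill, min(100.0, 10.0 * len(user_code.split())), False

if __name__ == "__main__":
    user_responses = [
        ("Linux", "Use systemd units and check journalctl for failures.", "Advanced Linux question"),
        ("Git", None, "Advanced Git question"),  # skipped: no answer given
    ]
    # Skipped answers never reach the pool
    eval_args = [(s, c, q) for s, c, q in user_responses if c]
    skipped_questions = [f"{s} ({q})" for s, c, q in user_responses if c is None]

    with Pool(processes=min(cpu_count(), 4)) as pool:
        results = pool.map(evaluate_response, eval_args)

    user_scores, ai_flags, scores_list = {}, {}, []
    for skill, score, is_ai in results:
        if skill in user_scores:
            user_scores[skill] = max(user_scores[skill], score)  # best attempt wins
            ai_flags[skill] = ai_flags[skill] or is_ai
        else:
            user_scores[skill] = score
            ai_flags[skill] = is_ai
        scores_list.append(score)

    mean_score = np.mean(scores_list) if scores_list else 50
    dynamic_threshold = max(40, mean_score)  # flag nothing below a floor of 40
    weak_skills = [s for s, v in user_scores.items() if v < dynamic_threshold]
    print(user_scores, skipped_questions, weak_skills)
```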
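For reference, the `progress` field in the response builds a ten-segment text bar from the percentage score:

```python
score = 72.5
filled = int(score // 10)               # 7 of 10 segments
progress = f"{'■' * filled}{'-' * (10 - filled)}"
print(progress, f"{score:.2f} %")       # ■■■■■■■--- 72.50 %
```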