Muhammad541 committed
Commit a43664e · verified · 1 Parent(s): bbcb202

Update app.py

Files changed (1):
  1. app.py +166 -390
app.py CHANGED
@@ -1,399 +1,175 @@
- import os
  import pandas as pd
- import torch
- from sentence_transformers import SentenceTransformer, util
- import faiss
  import numpy as np
  import pickle
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- import scipy.special
- from sklearn.feature_extraction.text import TfidfVectorizer
- from flask import Flask, request, jsonify
- import logging
-
- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Disable tokenizers parallelism to avoid fork-related deadlocks
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- # Paths for saving artifacts
- MODEL_DIR = "./saved_models"
- FALLBACK_MODEL_DIR = "/tmp/saved_models"
-
- try:
-     os.makedirs(MODEL_DIR, exist_ok=True)
-     logger.info(f"Using model directory: {MODEL_DIR}")
-     chosen_model_dir = MODEL_DIR
- except Exception as e:
-     logger.warning(f"Failed to create {MODEL_DIR}: {e}. Using fallback directory.")
-     os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
-     chosen_model_dir = FALLBACK_MODEL_DIR
-
- # Update paths
- UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
- DETECTOR_MODEL_PATH = os.path.join(chosen_model_dir, "detector_model")
- TFIDF_PATH = os.path.join(chosen_model_dir, "tfidf_vectorizer.pkl")
- SKILL_TFIDF_PATH = os.path.join(chosen_model_dir, "skill_tfidf.pkl")
- QUESTION_ANSWER_PATH = os.path.join(chosen_model_dir, "question_to_answer.pkl")
- FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
- ANSWER_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "answer_embeddings.pkl")
- COURSE_SIMILARITY_PATH = os.path.join(chosen_model_dir, "course_similarity.pkl")
- JOB_SIMILARITY_PATH = os.path.join(chosen_model_dir, "job_similarity.pkl")
-
- # Global variables for precomputed data
- tfidf_vectorizer = None
- skill_tfidf = None
- question_to_answer = None
- faiss_index = None
- answer_embeddings = None
- course_similarity = None
- job_similarity = None
-
- # Improved dataset loading with fallback
- def load_dataset(file_path, required_columns=[], additional_columns=['popularity', 'completion_rate'], fallback_data=None):
-     try:
-         df = pd.read_csv(file_path)
-         missing_required = [col for col in required_columns if col not in df.columns]
-         missing_additional = [col for col in additional_columns if col not in df.columns]
-
-         # Handle missing required columns
-         if missing_required:
-             logger.warning(f"Required columns {missing_required} missing in {file_path}. Adding empty values.")
-             for col in missing_required:
-                 df[col] = ""
-
-         # Handle missing additional columns (popularity, completion_rate, etc.)
-         if missing_additional:
-             logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
-             for col in missing_additional:
-                 if col == 'popularity':
-                     df[col] = 0.8  # Default value for popularity
-                 elif col == 'completion_rate':
-                     df[col] = 0.7  # Default value for completion_rate
-                 else:
-                     df[col] = 0.0  # Default for other additional columns
-
-         # Ensure 'level' column has valid values (not empty)
-         if 'level' in df.columns:
-             df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
-         else:
-             logger.warning(f"'level' column missing in {file_path}. Adding default 'Intermediate'.")
-             df['level'] = 'Intermediate'
-
-         return df
-     except ValueError as ve:
-         logger.error(f"ValueError loading {file_path}: {ve}. Using fallback data.")
-         if fallback_data is not None:
-             logger.info(f"Using fallback data for {file_path}")
-             return pd.DataFrame(fallback_data)
-         return None
-     except Exception as e:
-         logger.error(f"Error loading {file_path}: {e}. Using fallback data.")
-         if fallback_data is not None:
-             logger.info(f"Using fallback data for {file_path}")
-             return pd.DataFrame(fallback_data)
-         return None
-
- # Load datasets with fallbacks
- questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], [], {
-     'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-     'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
-                  'Intermediate Python question', 'Basic Kubernetes question'],
-     'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
- })
-
- courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], ['popularity', 'completion_rate'], {
-     'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-     'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
-     'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
-     'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
-     'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
-     'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
- })
-
- jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], [], {
-     'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
-     'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
-     'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
-     'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
-     'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
-     'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
- })
-
- # Validate questions_df
- if questions_df is None or questions_df.empty:
-     logger.error("questions_df is empty or could not be loaded. Exiting.")
-     exit(1)
- if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
-     logger.error("questions_df is missing required columns. Exiting.")
-     exit(1)
- logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")
-
- # Load or Initialize Models with Fallback
- def load_universal_model():
-     default_model = "all-MiniLM-L6-v2"
-     try:
-         if os.path.exists(UNIVERSAL_MODEL_PATH):
-             logger.info(f"Loading universal model from {UNIVERSAL_MODEL_PATH}")
-             return SentenceTransformer(UNIVERSAL_MODEL_PATH)
-         else:
-             logger.info(f"Loading universal model: {default_model}")
-             model = SentenceTransformer(default_model)
-             model.save(UNIVERSAL_MODEL_PATH)
-             return model
-     except Exception as e:
-         logger.error(f"Failed to load universal model {default_model}: {e}. Exiting.")
-         exit(1)
-
- universal_model = load_universal_model()
-
- if os.path.exists(DETECTOR_MODEL_PATH):
-     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
-     detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
- else:
-     detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
-     detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
-
- # Load Precomputed Resources
- def load_precomputed_resources():
-     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
-     if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]):
-         try:
-             with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
-             with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
-             with open(QUESTION_ANSWER_PATH, 'rb') as f: question_to_answer = pickle.load(f)
-             faiss_index = faiss.read_index(FAISS_INDEX_PATH)
-             with open(ANSWER_EMBEDDINGS_PATH, 'rb') as f: answer_embeddings = pickle.load(f)
-             with open(COURSE_SIMILARITY_PATH, 'rb') as f: course_similarity = pickle.load(f)
-             with open(JOB_SIMILARITY_PATH, 'rb') as f: job_similarity = pickle.load(f)
-             logger.info("Loaded precomputed resources successfully")
-         except Exception as e:
-             logger.error(f"Error loading precomputed resources: {e}")
-             precompute_resources()
-     else:
-         precompute_resources()
-
- # Precompute Resources Offline (to be run separately)
- def precompute_resources():
-     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
-     logger.info("Precomputing resources offline")
-     try:
-         tfidf_vectorizer = TfidfVectorizer(stop_words='english')
-         all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
-         tfidf_vectorizer.fit(all_texts)
-
-         skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
-         question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-         answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-
-         faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
-         faiss_index.add(answer_embeddings)
-
-         # Precompute course similarities
-         course_skills = courses_df['skills'].fillna("").tolist()
-         course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-         skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-         course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
-
-         # Precompute job similarities
-         job_skills = jobs_df['required_skills'].fillna("").tolist()
-         job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-         job_similarity = util.pytorch_cos_sim(skill_embeddings, job_embeddings).cpu().numpy()
-
-         # Save precomputed resources
-         with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
-         with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
-         with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
-         faiss.write_index(faiss_index, FAISS_INDEX_PATH)
-         with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(answer_embeddings, f)
-         with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
-         with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
-         universal_model.save(UNIVERSAL_MODEL_PATH)
-         logger.info(f"Precomputed resources saved to {chosen_model_dir}")
-     except Exception as e:
-         logger.error(f"Error during precomputation: {e}")
-         raise
-
- # Evaluation with precomputed data
- def evaluate_response(args):
-     try:
-         skill, user_answer, question_idx = args
-         if not user_answer:
-             return skill, 0.0, False
-
-         inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
-         with torch.no_grad():
-             logits = detector_model(**inputs).logits
-         probs = scipy.special.softmax(logits, axis=1).tolist()[0]
-         is_ai = probs[1] > 0.5
-
-         user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
-         expected_embedding = torch.tensor(answer_embeddings[question_idx])
-         score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
-
-         user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
-         skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
-         relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
-         score *= max(0.5, min(1.0, relevance))
-
-         return skill, round(max(0, score), 2), is_ai
-     except Exception as e:
-         logger.error(f"Evaluation error for {skill}: {e}")
-         return skill, 0.0, False
-
- # Course recommendation with precomputed similarity
  def recommend_courses(skills_to_improve, user_level, upgrade=False):
-     try:
-         if not skills_to_improve or courses_df.empty:
-             logger.info("No skills to improve or courses_df is empty.")
-             return []
-
-         skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
-         if not skill_indices:
-             logger.info("No matching skill indices found.")
-             return []
-
-         similarities = course_similarity[skill_indices]
-         # Use default arrays to avoid KeyError
-         popularity = courses_df['popularity'].values if 'popularity' in courses_df else np.full(len(courses_df), 0.8)
-         completion_rate = courses_df['completion_rate'].values if 'completion_rate' in courses_df else np.full(len(courses_df), 0.7)
-         total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
-
-         target_level = 'Advanced' if upgrade else user_level
-         idx = np.argsort(-total_scores)[:5]
-         candidates = courses_df.iloc[idx]
-
-         # Filter by level, but fallback to all courses if none match
-         filtered_candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
-         if filtered_candidates.empty:
-             logger.warning(f"No courses found for level {target_level}. Returning top courses regardless of level.")
-             filtered_candidates = candidates
-
-         return filtered_candidates[['course_title', 'Organization']].values.tolist()[:3]
-     except Exception as e:
-         logger.error(f"Course recommendation error: {e}")
          return []
-
- # Job recommendation with precomputed similarity
  def recommend_jobs(user_skills, user_level):
-     try:
-         if jobs_df.empty:
-             return []
-
-         skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
-         if not skill_indices:
-             return []
-
-         similarities = job_similarity[skill_indices]
-         total_scores = 0.5 * np.max(similarities, axis=0)
-
-         if 'level' not in jobs_df.columns:
-             jobs_df['level'] = 'Intermediate'
-         level_col = jobs_df['level'].astype(str)
-         level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
-         user_level_num = level_map.get(user_level, 1)
-         level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
-
-         location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
-         total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
-         top_job_indices = np.argsort(-total_job_scores)[:5]
-
-         return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
-                  jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
-     except Exception as e:
-         logger.error(f"Job recommendation error: {e}")
          return []
-
- # Flask application setup
- app = Flask(__name__)
-
- @app.route('/')
- def health_check():
-     return jsonify({"status": "active", "model_dir": chosen_model_dir})
-
- @app.route('/assess', methods=['POST'])
- def assess_skills():
-     try:
-         data = request.get_json()
-         if not data or 'skills' not in data or 'answers' not in data:
-             return jsonify({"error": "Missing required fields"}), 400
-
-         user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
-         answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
-         user_level = data.get('user_level', 'Intermediate').strip()
-
-         if len(answers) != len(user_skills):
-             return jsonify({"error": "Answers count must match skills count"}), 400
-
-         load_precomputed_resources()  # Load precomputed resources before processing
-
-         user_questions = []
-         for skill in user_skills:
-             skill_questions = questions_df[questions_df['Skill'] == skill]
-             if not skill_questions.empty:
-                 user_questions.append(skill_questions.sample(1).iloc[0])
-             else:
-                 user_questions.append({
-                     'Skill': skill,
-                     'Question': f"What are the best practices for using {skill} in a production environment?",
-                     'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
-                 })
-         user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
-
-         user_responses = []
          for idx, row in user_questions.iterrows():
-             answer = answers[idx]
-             if not answer or answer.lower() == 'skip':
-                 user_responses.append((row['Skill'], None, None))
-             else:
-                 question_idx = questions_df.index[questions_df['Question'] == row['Question']][0]
-                 user_responses.append((row['Skill'], answer, question_idx))
-
-         results = [evaluate_response(response) for response in user_responses]
-
-         user_scores = {}
-         ai_flags = {}
-         scores_list = []
-         skipped_questions = [f"{skill} ({question})" for skill, user_code, _ in user_responses if not user_code]
-         for skill, score, is_ai in results:
-             if skill in user_scores:
-                 user_scores[skill] = max(user_scores[skill], score)
-                 ai_flags[skill] = ai_flags[skill] or is_ai
-             else:
-                 user_scores[skill] = score
-                 ai_flags[skill] = is_ai
-             scores_list.append(score)
-
-         mean_score = np.mean(scores_list) if scores_list else 50
          dynamic_threshold = max(40, mean_score)
-         weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
-
-         courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
-         jobs = recommend_jobs(user_skills, user_level)
-
-         return jsonify({
-             "assessment_results": {
-                 "skills": [
-                     {
-                         "skill": skill,
-                         "progress": f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}",
-                         "score": f"{score:.2f} %",
-                         "origin": "AI-Generated" if is_ai else "Human-Written"
-                     } for skill, score, is_ai in results
-                 ],
-                 "mean_score": mean_score,
-                 "dynamic_threshold": dynamic_threshold,
-                 "weak_skills": weak_skills,
-                 "skipped_questions": skipped_questions
-             },
-             "recommended_courses": courses[:3],
-             "recommended_jobs": jobs[:5]
-         })
-     except Exception as e:
-         logger.error(f"Assessment error: {e}")
-         return jsonify({"error": "Internal server error"}), 500
-
- if __name__ == '__main__':
-     app.run(host='0.0.0.0', port=7860, threaded=True)
+ import streamlit as st
  import pandas as pd
  import numpy as np
+ import pymongo
+ from sentence_transformers import SentenceTransformer
+ import faiss
  import pickle
+ import os
+ from dotenv import load_dotenv
+
+ # Streamlit requires set_page_config to be the first st.* command, so it
+ # belongs here at the top of the script rather than at the bottom
+ st.set_page_config(layout="wide")
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI", "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority")
+
+ # Connect to MongoDB
+ client = pymongo.MongoClient(MONGO_URI)
+ db = client['test']
+ users_collection = db['users']
+ jobs_collection = db['jobs']
+ courses_collection = db['courses']
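+
+ # Hedged sketch (an assumption, not in the original file): rather than
+ # shipping a connection string with embedded credentials as the os.getenv
+ # default above, a deployment could require the secret and fail fast
+ # without it, so credentials never live in source control:
+ #
+ #     if "MONGO_URI" not in os.environ:
+ #         raise RuntimeError("Set MONGO_URI in the environment or in a .env file")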
+
+ # Load datasets
+ @st.cache_data
+ def load_datasets():
+     questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv")
+     courses_df = pd.read_csv("coursera_course_dataset_v2_no_null.csv")
+     jobs_df = pd.read_csv("Updated_Job_Posting_Dataset.csv")
+     return questions_df, courses_df, jobs_df
+
+ questions_df, courses_df, jobs_df = load_datasets()
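+ # Note: st.cache_data memoizes the return value, so the three CSVs are read
+ # once and later script reruns reuse the cached DataFrames.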
+
+ # Load precomputed resources
+ @st.cache_resource
+ def load_resources():
+     universal_model = SentenceTransformer("all-MiniLM-L6-v2")
+     with open("tfidf_vectorizer.pkl", "rb") as f: tfidf_vectorizer = pickle.load(f)
+     with open("skill_tfidf.pkl", "rb") as f: skill_tfidf = pickle.load(f)
+     with open("question_to_answer.pkl", "rb") as f: question_to_answer = pickle.load(f)
+     faiss_index = faiss.read_index("faiss_index.index")
+     with open("answer_embeddings.pkl", "rb") as f: answer_embeddings = pickle.load(f)
+     with open("course_similarity.pkl", "rb") as f: course_similarity = pickle.load(f)
+     with open("job_similarity.pkl", "rb") as f: job_similarity = pickle.load(f)
+     return universal_model, tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
+
+ universal_model, tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity = load_resources()
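+
+ # Hedged sketch (assumption — faiss_index is loaded above but never queried
+ # in this file): the index was built over the reference answer embeddings in
+ # questions_df order, so the closest stored answer to any text could be
+ # fetched like this (helper name is illustrative):
+ def nearest_answer(text, k=1):
+     query = universal_model.encode([text]).astype(np.float32)  # FAISS expects float32
+     distances, indices = faiss_index.search(query, k)          # L2 distances + row ids
+     return questions_df['Answer'].iloc[indices[0][0]], distances[0][0]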
+
+ # Evaluate response
+ def evaluate_response(skill, user_answer, question_idx):
+     if not user_answer or user_answer.lower() == "skip":
+         return skill, 0.0
+     user_embedding = universal_model.encode([user_answer])[0]
+     expected_embedding = answer_embeddings[question_idx]
+     score = np.dot(user_embedding, expected_embedding) / (np.linalg.norm(user_embedding) * np.linalg.norm(expected_embedding) + 1e-10) * 100
+     user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
+     skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
+     relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
+     return skill, max(0, score * max(0.5, min(1.0, relevance)))
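+ # Worked example with illustrative numbers: an embedding cosine similarity
+ # of 0.82 gives a base score of 82.0; a TF-IDF relevance of 0.3 is clamped
+ # to the 0.5 floor, so the final score is 82.0 * 0.5 = 41.0. The relevance
+ # factor only ever scales the score down, by at most half.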
+
+ # Recommend courses
  def recommend_courses(skills_to_improve, user_level, upgrade=False):
+     skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
+     if not skill_indices:
          return []
+     similarities = course_similarity[skill_indices]
+     popularity = courses_df['popularity'].fillna(0.8).values
+     completion_rate = courses_df['completion_rate'].fillna(0.7).values
+     total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
+     target_level = 'Advanced' if upgrade else user_level
+     idx = np.argsort(-total_scores)[:5]
+     candidates = courses_df.iloc[idx]
+     filtered = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
+     return filtered[['course_title', 'Organization']].values.tolist()[:3] if not filtered.empty else candidates[['course_title', 'Organization']].values.tolist()[:3]
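+ # Usage sketch (illustrative): recommend_courses(["Python"], "Intermediate")
+ # yields up to three [course_title, Organization] pairs, ranked by
+ # 0.6 * skill-course similarity + 0.2 * popularity + 0.2 * completion_rate,
+ # falling back to the unfiltered top five when nothing matches the level.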
+
+ # Recommend jobs
  def recommend_jobs(user_skills, user_level):
+     if jobs_df.empty:
          return []
+     skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
+     if not skill_indices:
+         return []
+     similarities = job_similarity[skill_indices]
+     total_scores = 0.5 * np.max(similarities, axis=0)
+     level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
+     user_level_num = level_map.get(user_level, 1)
+     level_scores = jobs_df['level'].apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2).fillna(0.5)
+     location_pref = jobs_df['location'].apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7).fillna(0.7)
+     total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
+     top_job_indices = np.argsort(-total_job_scores)[:5]
+     return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'], jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
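+ # Note: the weights sum to 0.8 (0.5 similarity + 0.2 level fit + 0.1
+ # location preference), so total_job_scores is a relative ranking score,
+ # not a normalized probability; only the ordering of the top five matters.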
+
+ # Streamlit UI
+ st.title("Skill Assessment and Recommendations")
+
+ # Simulate user signup and skill extraction
+ if 'user_skills' not in st.session_state:
+     st.session_state.user_skills = []
+     st.session_state.user_level = "Intermediate"
+
+ with st.form("signup_form"):
+     name = st.text_input("Name")
+     email = st.text_input("Email")
+     skills_input = st.text_area("Enter your skills (comma-separated)")
+     submit = st.form_submit_button("Sign Up")
+     if submit and name and email and skills_input:
+         st.session_state.user_skills = [s.strip() for s in skills_input.split(",") if s.strip()]
+         user_data = {
+             "name": name,
+             "email": email,
+             "skills": st.session_state.user_skills,
+             "createdAt": pd.Timestamp.now(),
+             "lastLogin": pd.Timestamp.now()
+         }
+         users_collection.insert_one(user_data)
+         st.success("User registered successfully!")
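+         # Hedged addition (assumption): persist the signup email in session
+         # state, since the assessment handler further down looks the user up
+         # by email on a later script rerun.
+         st.session_state.user_email = email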
+
+ # Skill Assessment
+ if st.session_state.user_skills:
+     st.write("### Skill Assessment")
+     user_questions = []
+     for skill in st.session_state.user_skills:
+         skill_questions = questions_df[questions_df['Skill'] == skill]
+         if not skill_questions.empty:
+             user_questions.append(skill_questions.sample(1).iloc[0])
+     user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
+
+     answers = {}
+     with st.form("assessment_form"):
          for idx, row in user_questions.iterrows():
+             answers[row['Question']] = st.text_area(f"Question for {row['Skill']}: {row['Question']}", key=f"q_{idx}")
+         submit_assessment = st.form_submit_button("Submit Assessment")
+
+     if submit_assessment:
+         scores = {}
+         for idx, row in user_questions.iterrows():
+             question_idx = questions_df.index[questions_df['Question'] == row['Question']][0]
+             skill, score = evaluate_response(row['Skill'], answers.get(row['Question'], ""), question_idx)
+             scores[skill] = max(scores.get(skill, 0), score)
+
+         mean_score = np.mean(list(scores.values())) if scores else 50
          dynamic_threshold = max(40, mean_score)
+         weak_skills = [skill for skill, score in scores.items() if score < dynamic_threshold]
+
+         st.session_state.scores = scores
+         st.session_state.weak_skills = weak_skills
+         st.session_state.mean_score = mean_score
+
+         # Update user scores in MongoDB (read the email from session state,
+         # which survives reruns, falling back to the form field)
+         user = users_collection.find_one({"email": st.session_state.get("user_email", email)})
+         if user:
+             users_collection.update_one(
+                 {"_id": user["_id"]},
+                 {"$set": {"skills_scores": scores}}
+             )
+
+ if 'scores' in st.session_state:
+     st.write("### Assessment Results")
+     for skill, score in st.session_state.scores.items():
+         st.write(f"{skill}: {score:.2f}%")
+     st.write(f"Mean Score: {st.session_state.mean_score:.2f}%")
+     st.write(f"Weak Skills: {', '.join(st.session_state.weak_skills)}")
+
+     # Recommendations
+     st.write("### Recommended Courses")
+     courses = recommend_courses(st.session_state.weak_skills or st.session_state.user_skills, st.session_state.user_level)
+     for course in courses:
+         st.write(f"- {course[0]} by {course[1]}")
+
+     st.write("### Recommended Jobs")
+     jobs = recommend_jobs(st.session_state.user_skills, st.session_state.user_level)
+     for job in jobs:
+         st.write(f"- {job[0]} at {job[1]} ({job[2]})")
+
+ # Run with: streamlit run app.py (st.set_page_config lives at the top of
+ # the file, since Streamlit requires it to be the first command)