Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,399 +1,175 @@
-import …
 import pandas as pd
-import torch
-from sentence_transformers import SentenceTransformer, util
-import faiss
 import numpy as np
 import pickle
-
-import …
-… (removed lines 10-60 not shown in this view) …
-        # Handle missing required columns
-        if missing_required:
-            logger.warning(f"Required columns {missing_required} missing in {file_path}. Adding empty values.")
-            for col in missing_required:
-                df[col] = ""
-
-        # Handle missing additional columns (popularity, completion_rate, etc.)
-        if missing_additional:
-            logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
-            for col in missing_additional:
-                if col == 'popularity':
-                    df[col] = 0.8  # Default value for popularity
-                elif col == 'completion_rate':
-                    df[col] = 0.7  # Default value for completion_rate
-                else:
-                    df[col] = 0.0  # Default for other additional columns
-
-        # Ensure 'level' column has valid values (not empty)
-        if 'level' in df.columns:
-            df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
-        else:
-            logger.warning(f"'level' column missing in {file_path}. Adding default 'Intermediate'.")
-            df['level'] = 'Intermediate'
-
-        return df
-    except ValueError as ve:
-        logger.error(f"ValueError loading {file_path}: {ve}. Using fallback data.")
-        if fallback_data is not None:
-            logger.info(f"Using fallback data for {file_path}")
-            return pd.DataFrame(fallback_data)
-        return None
-    except Exception as e:
-        logger.error(f"Error loading {file_path}: {e}. Using fallback data.")
-        if fallback_data is not None:
-            logger.info(f"Using fallback data for {file_path}")
-            return pd.DataFrame(fallback_data)
-        return None
-
-# Load datasets with fallbacks
-questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], [], {
-    'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
-                 'Intermediate Python question', 'Basic Kubernetes question'],
-    'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer']
-})
-
-courses_df = load_dataset("coursera_course_dataset_v2_no_null.csv", ["skills", "course_title", "Organization", "level"], ['popularity', 'completion_rate'], {
-    'skills': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
-    'course_title': ['Linux Admin', 'Git Mastery', 'Node.js Advanced', 'Python for Data', 'Kubernetes Basics'],
-    'Organization': ['Coursera', 'Udemy', 'Pluralsight', 'edX', 'Linux Foundation'],
-    'level': ['Intermediate', 'Intermediate', 'Advanced', 'Advanced', 'Intermediate'],
-    'popularity': [0.85, 0.9, 0.8, 0.95, 0.9],
-    'completion_rate': [0.65, 0.7, 0.6, 0.8, 0.75]
-})
-
-jobs_df = load_dataset("Updated_Job_Posting_Dataset.csv", ["job_title", "company_name", "location", "required_skills", "job_description"], [], {
-    'job_title': ['DevOps Engineer', 'Cloud Architect', 'Software Engineer', 'Data Scientist', 'Security Analyst'],
-    'company_name': ['Tech Corp', 'Cloud Inc', 'Tech Solutions', 'Data Co', 'SecuriTech'],
-    'location': ['Remote', 'Islamabad', 'Karachi', 'Remote', 'Islamabad'],
-    'required_skills': ['Linux, Kubernetes', 'AWS, Kubernetes', 'Python, Node.js', 'Python, SQL', 'Cybersecurity, Linux'],
-    'job_description': ['DevOps role description', 'Cloud architecture position', 'Software engineering role', 'Data science position', 'Security analyst role'],
-    'level': ['Intermediate', 'Advanced', 'Intermediate', 'Intermediate', 'Intermediate']
-})
-
-# Validate questions_df
-if questions_df is None or questions_df.empty:
-    logger.error("questions_df is empty or could not be loaded. Exiting.")
-    exit(1)
-if not all(col in questions_df.columns for col in ["Skill", "Question", "Answer"]):
-    logger.error("questions_df is missing required columns. Exiting.")
-    exit(1)
-logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")
-
-# Load or Initialize Models with Fallback
-def load_universal_model():
-    default_model = "all-MiniLM-L6-v2"
-    try:
-        if os.path.exists(UNIVERSAL_MODEL_PATH):
-            logger.info(f"Loading universal model from {UNIVERSAL_MODEL_PATH}")
-            return SentenceTransformer(UNIVERSAL_MODEL_PATH)
-        else:
-            logger.info(f"Loading universal model: {default_model}")
-            model = SentenceTransformer(default_model)
-            model.save(UNIVERSAL_MODEL_PATH)
-            return model
-    except Exception as e:
-        logger.error(f"Failed to load universal model {default_model}: {e}. Exiting.")
-        exit(1)
-
-universal_model = load_universal_model()
-
-if os.path.exists(DETECTOR_MODEL_PATH):
-    detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
-    detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
-else:
-    detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
-    detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
-
-# Load Precomputed Resources
-def load_precomputed_resources():
-    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
-    if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]):
-        try:
-            with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
-            with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
-            with open(QUESTION_ANSWER_PATH, 'rb') as f: question_to_answer = pickle.load(f)
-            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
-            with open(ANSWER_EMBEDDINGS_PATH, 'rb') as f: answer_embeddings = pickle.load(f)
-            with open(COURSE_SIMILARITY_PATH, 'rb') as f: course_similarity = pickle.load(f)
-            with open(JOB_SIMILARITY_PATH, 'rb') as f: job_similarity = pickle.load(f)
-            logger.info("Loaded precomputed resources successfully")
-        except Exception as e:
-            logger.error(f"Error loading precomputed resources: {e}")
-            precompute_resources()
-    else:
-        precompute_resources()
-
-# Precompute Resources Offline (to be run separately)
-def precompute_resources():
-    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
-    logger.info("Precomputing resources offline")
-    try:
-        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
-        all_texts = questions_df['Answer'].tolist() + questions_df['Question'].tolist()
-        tfidf_vectorizer.fit(all_texts)
-
-        skill_tfidf = {skill.lower(): tfidf_vectorizer.transform([skill]).toarray()[0] for skill in questions_df['Skill'].unique()}
-        question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))
-        answer_embeddings = universal_model.encode(questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu").cpu().numpy()
-
-        faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
-        faiss_index.add(answer_embeddings)
-
-        # Precompute course similarities
-        course_skills = courses_df['skills'].fillna("").tolist()
-        course_embeddings = universal_model.encode(course_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-        skill_embeddings = universal_model.encode(questions_df['Skill'].unique().tolist(), batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-        course_similarity = util.pytorch_cos_sim(skill_embeddings, course_embeddings).cpu().numpy()
-
-        # Precompute job similarities
-        job_skills = jobs_df['required_skills'].fillna("").tolist()
-        job_embeddings = universal_model.encode(job_skills, batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")
-        job_similarity = util.pytorch_cos_sim(skill_embeddings, job_embeddings).cpu().numpy()
-
-        # Save precomputed resources
-        with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
-        with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
-        with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
-        faiss.write_index(faiss_index, FAISS_INDEX_PATH)
-        with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f: pickle.dump(answer_embeddings, f)
-        with open(COURSE_SIMILARITY_PATH, 'wb') as f: pickle.dump(course_similarity, f)
-        with open(JOB_SIMILARITY_PATH, 'wb') as f: pickle.dump(job_similarity, f)
-        universal_model.save(UNIVERSAL_MODEL_PATH)
-        logger.info(f"Precomputed resources saved to {chosen_model_dir}")
-    except Exception as e:
-        logger.error(f"Error during precomputation: {e}")
-        raise
-
-# Evaluation with precomputed data
-def evaluate_response(args):
-    try:
-        skill, user_answer, question_idx = args
-        if not user_answer:
-            return skill, 0.0, False
-
-        inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
-        with torch.no_grad():
-            logits = detector_model(**inputs).logits
-        probs = scipy.special.softmax(logits, axis=1).tolist()[0]
-        is_ai = probs[1] > 0.5
-
-        user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
-        expected_embedding = torch.tensor(answer_embeddings[question_idx])
-        score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
-
-        user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
-        skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
-        relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
-        score *= max(0.5, min(1.0, relevance))
-
-        return skill, round(max(0, score), 2), is_ai
-    except Exception as e:
-        logger.error(f"Evaluation error for {skill}: {e}")
-        return skill, 0.0, False
-
-# Course recommendation with precomputed similarity
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
-    …
-    …
-            logger.info("No skills to improve or courses_df is empty.")
-            return []
-
-        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
-        if not skill_indices:
-            logger.info("No matching skill indices found.")
-            return []
-
-        similarities = course_similarity[skill_indices]
-        # Use default arrays to avoid KeyError
-        popularity = courses_df['popularity'].values if 'popularity' in courses_df else np.full(len(courses_df), 0.8)
-        completion_rate = courses_df['completion_rate'].values if 'completion_rate' in courses_df else np.full(len(courses_df), 0.7)
-        total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
-
-        target_level = 'Advanced' if upgrade else user_level
-        idx = np.argsort(-total_scores)[:5]
-        candidates = courses_df.iloc[idx]
-
-        # Filter by level, but fallback to all courses if none match
-        filtered_candidates = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
-        if filtered_candidates.empty:
-            logger.warning(f"No courses found for level {target_level}. Returning top courses regardless of level.")
-            filtered_candidates = candidates
-
-        return filtered_candidates[['course_title', 'Organization']].values.tolist()[:3]
-    except Exception as e:
-        logger.error(f"Course recommendation error: {e}")
         return []
-
-
 def recommend_jobs(user_skills, user_level):
-    …
-        if jobs_df.empty:
-            return []
-
-        skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
-        if not skill_indices:
-            return []
-
-        similarities = job_similarity[skill_indices]
-        total_scores = 0.5 * np.max(similarities, axis=0)
-
-        if 'level' not in jobs_df.columns:
-            jobs_df['level'] = 'Intermediate'
-        level_col = jobs_df['level'].astype(str)
-        level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
-        user_level_num = level_map.get(user_level, 1)
-        level_scores = level_col.apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2)
-
-        location_pref = jobs_df.get('location', pd.Series(['Remote'] * len(jobs_df))).apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7)
-        total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
-        top_job_indices = np.argsort(-total_job_scores)[:5]
-
-        return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'],
-                 jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
-    except Exception as e:
-        logger.error(f"Job recommendation error: {e}")
         return []
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
         for idx, row in user_questions.iterrows():
-… (removed lines 347-357 not shown in this view) …
-        scores_list = []
-        skipped_questions = [f"{skill} ({question})" for skill, user_code, _ in user_responses if not user_code]
-        for skill, score, is_ai in results:
-            if skill in user_scores:
-                user_scores[skill] = max(user_scores[skill], score)
-                ai_flags[skill] = ai_flags[skill] or is_ai
-            else:
-                user_scores[skill] = score
-                ai_flags[skill] = is_ai
-            scores_list.append(score)
-
-        mean_score = np.mean(scores_list) if scores_list else 50
         dynamic_threshold = max(40, mean_score)
-        weak_skills = [skill for skill, score in …
-… (removed lines 372-399 not shown in this view) …
+import streamlit as st
 import pandas as pd
 import numpy as np
+import pymongo
+from sentence_transformers import SentenceTransformer
+import faiss
 import pickle
+import os
+from dotenv import load_dotenv
+
+# st.set_page_config must be the first Streamlit command in the script, so it
+# runs here rather than at the bottom of the file (where it raised at runtime).
+st.set_page_config(layout="wide")
+
+# Load environment variables
+load_dotenv()
+MONGO_URI = os.getenv("MONGO_URI", "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority")
+
|
15 |
+
# Connect to MongoDB
|
16 |
+
client = pymongo.MongoClient(MONGO_URI)
|
17 |
+
db = client['test']
|
18 |
+
users_collection = db['users']
|
19 |
+
jobs_collection = db['jobs']
|
20 |
+
courses_collection = db['courses']
|
21 |
+
|
22 |
+
# Load datasets
|
23 |
+
@st.cache_data
|
24 |
+
def load_datasets():
|
25 |
+
questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv")
|
26 |
+
courses_df = pd.read_csv("coursera_course_dataset_v2_no_null.csv")
|
27 |
+
jobs_df = pd.read_csv("Updated_Job_Posting_Dataset.csv")
|
28 |
+
return questions_df, courses_df, jobs_df
|
29 |
+
|
30 |
+
questions_df, courses_df, jobs_df = load_datasets()
|
31 |
+
|
32 |
+
# Load precomputed resources
|
33 |
+
@st.cache_resource
|
34 |
+
def load_resources():
|
35 |
+
universal_model = SentenceTransformer("all-MiniLM-L6-v2")
|
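+    # These pickles and the FAISS index are produced offline (the removed
+    # precompute_resources above shows how); if any file is missing from the
+    # repo, this function raises at startup and the Space reports a runtime
+    # error. Note the loaded faiss_index is not queried anywhere below; it is
+    # kept for parity with the old app.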
+    with open("tfidf_vectorizer.pkl", "rb") as f: tfidf_vectorizer = pickle.load(f)
+    with open("skill_tfidf.pkl", "rb") as f: skill_tfidf = pickle.load(f)
+    with open("question_to_answer.pkl", "rb") as f: question_to_answer = pickle.load(f)
+    faiss_index = faiss.read_index("faiss_index.index")
+    with open("answer_embeddings.pkl", "rb") as f: answer_embeddings = pickle.load(f)
+    with open("course_similarity.pkl", "rb") as f: course_similarity = pickle.load(f)
+    with open("job_similarity.pkl", "rb") as f: job_similarity = pickle.load(f)
+    return universal_model, tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
+
+universal_model, tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity = load_resources()
+
+# Evaluate response
+def evaluate_response(skill, user_answer, question_idx):
+    if not user_answer or user_answer.lower() == "skip":
+        return skill, 0.0
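+    # Score: cosine similarity between the user's answer and the stored
+    # reference answer, scaled to 0-100, then damped by TF-IDF relevance to the
+    # skill (clamped to [0.5, 1.0]) so fluent but off-topic answers score lower.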
+    user_embedding = universal_model.encode([user_answer])[0]
+    expected_embedding = answer_embeddings[question_idx]
+    score = np.dot(user_embedding, expected_embedding) / (np.linalg.norm(user_embedding) * np.linalg.norm(expected_embedding) + 1e-10) * 100
+    user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
+    skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
+    relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
+    return skill, max(0, score * max(0.5, min(1.0, relevance)))
+
+# Recommend courses
 def recommend_courses(skills_to_improve, user_level, upgrade=False):
+    skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in skills_to_improve if skill in questions_df['Skill'].unique()]
+    if not skill_indices:
         return []
+    similarities = course_similarity[skill_indices]
+    # The raw CSV may lack these columns entirely (the old loader added them
+    # with defaults), so guard before fillna to avoid a KeyError at runtime.
+    popularity = courses_df['popularity'].fillna(0.8).values if 'popularity' in courses_df else np.full(len(courses_df), 0.8)
+    completion_rate = courses_df['completion_rate'].fillna(0.7).values if 'completion_rate' in courses_df else np.full(len(courses_df), 0.7)
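+    # Blend: 60% best skill-to-course similarity, 20% popularity, 20% completion rate.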
+    total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * popularity + 0.2 * completion_rate
+    target_level = 'Advanced' if upgrade else user_level
+    idx = np.argsort(-total_scores)[:5]
+    candidates = courses_df.iloc[idx]
+    filtered = candidates[candidates['level'].str.contains(target_level, case=False, na=False)]
+    return filtered[['course_title', 'Organization']].values.tolist()[:3] if not filtered.empty else candidates[['course_title', 'Organization']].values.tolist()[:3]
+
+# Recommend jobs
 def recommend_jobs(user_skills, user_level):
+    if jobs_df.empty:
         return []
+    skill_indices = [list(questions_df['Skill'].unique()).index(skill) for skill in user_skills if skill in questions_df['Skill'].unique()]
+    if not skill_indices:
+        return []
+    similarities = job_similarity[skill_indices]
+    total_scores = 0.5 * np.max(similarities, axis=0)
+    level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
+    user_level_num = level_map.get(user_level, 1)
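+    # Level match decays linearly with distance on the Beginner/Intermediate/
+    # Advanced scale: same level -> 1.0, adjacent -> 0.5, two apart -> 0.0.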
+    # The jobs CSV may lack a 'level' column (the old code defaulted it), so guard first.
+    if 'level' not in jobs_df.columns:
+        jobs_df['level'] = 'Intermediate'
+    level_scores = jobs_df['level'].apply(lambda x: 1 - abs(level_map.get(x, 1) - user_level_num)/2).fillna(0.5)
+    location_pref = jobs_df['location'].apply(lambda x: 1.0 if x in ['Islamabad', 'Karachi'] else 0.7).fillna(0.7)
+    total_job_scores = total_scores + 0.2 * level_scores + 0.1 * location_pref
+    top_job_indices = np.argsort(-total_job_scores)[:5]
+    return [(jobs_df.iloc[i]['job_title'], jobs_df.iloc[i]['company_name'], jobs_df.iloc[i].get('location', 'Remote')) for i in top_job_indices]
+
+# Streamlit UI
+st.title("Skill Assessment and Recommendations")
+
+# Simulate user signup and skill extraction
+if 'user_skills' not in st.session_state:
+    st.session_state.user_skills = []
+    st.session_state.user_level = "Intermediate"
+
+with st.form("signup_form"):
+    name = st.text_input("Name")
+    email = st.text_input("Email")
+    skills_input = st.text_area("Enter your skills (comma-separated)")
+    submit = st.form_submit_button("Sign Up")
+    if submit and name and email and skills_input:
+        st.session_state.user_skills = [s.strip() for s in skills_input.split(",") if s.strip()]
+        user_data = {
+            "name": name,
+            "email": email,
+            "skills": st.session_state.user_skills,
+            "createdAt": pd.Timestamp.now(),
+            "lastLogin": pd.Timestamp.now()
+        }
+        users_collection.insert_one(user_data)
+        st.success("User registered successfully!")
+
+# Skill Assessment
+if st.session_state.user_skills:
+    st.write("### Skill Assessment")
+    user_questions = []
+    for skill in st.session_state.user_skills:
+        skill_questions = questions_df[questions_df['Skill'] == skill]
+        if not skill_questions.empty:
+            user_questions.append(skill_questions.sample(1).iloc[0])
+    user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
+
+    answers = {}
+    with st.form("assessment_form"):
        for idx, row in user_questions.iterrows():
+            answers[row['Question']] = st.text_area(f"Question for {row['Skill']}: {row['Question']}", key=f"q_{idx}")
+        submit_assessment = st.form_submit_button("Submit Assessment")
+
+    if submit_assessment:
+        scores = {}
+        for idx, row in user_questions.iterrows():
+            question_idx = questions_df.index[questions_df['Question'] == row['Question']][0]
+            skill, score = evaluate_response(row['Skill'], answers.get(row['Question'], ""), question_idx)
+            scores[skill] = max(scores.get(skill, 0), score)
+
+        mean_score = np.mean(list(scores.values())) if scores else 50
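+        # Flag skills scoring below a dynamic threshold: the mean score, floored at 40.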
         dynamic_threshold = max(40, mean_score)
+        weak_skills = [skill for skill, score in scores.items() if score < dynamic_threshold]
+
+        st.session_state.scores = scores
+        st.session_state.weak_skills = weak_skills
+        st.session_state.mean_score = mean_score
+
+        # Update user scores in MongoDB
+        user = users_collection.find_one({"email": email})
+        if user:
+            users_collection.update_one(
+                {"_id": user["_id"]},
+                {"$set": {"skills_scores": scores}}
+            )
+
+if 'scores' in st.session_state:
+    st.write("### Assessment Results")
+    for skill, score in st.session_state.scores.items():
+        st.write(f"{skill}: {score:.2f}%")
+    st.write(f"Mean Score: {st.session_state.mean_score:.2f}%")
+    st.write(f"Weak Skills: {', '.join(st.session_state.weak_skills)}")
+
+    # Recommendations
+    st.write("### Recommended Courses")
+    courses = recommend_courses(st.session_state.weak_skills or st.session_state.user_skills, st.session_state.user_level)
+    for course in courses:
+        st.write(f"- {course[0]} by {course[1]}")
+
+    st.write("### Recommended Jobs")
+    jobs = recommend_jobs(st.session_state.user_skills, st.session_state.user_level)
+    for job in jobs:
+        st.write(f"- {job[0]} at {job[1]} ({job[2]})")
+
+# Run the app
+# (st.set_page_config is deliberately not called here: under `streamlit run`
+# the script executes as __main__, so a call at this point would run after the
+# st.* calls above and raise a StreamlitAPIException. It is made at the top of
+# the file instead.)
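
The new app.py assumes tfidf_vectorizer.pkl, skill_tfidf.pkl, question_to_answer.pkl, faiss_index.index, answer_embeddings.pkl, course_similarity.pkl, and job_similarity.pkl already exist in the repo, yet this commit deletes the precompute_resources() code that produced them. A minimal offline sketch for regenerating them, adapted from the removed function: file names are taken from load_resources() above, and computing cosine similarity via normalized embeddings is an assumption standing in for the old util.pytorch_cos_sim call.

# precompute.py -- run locally, then commit the generated files to the Space
import pickle
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv")
courses_df = pd.read_csv("coursera_course_dataset_v2_no_null.csv")
jobs_df = pd.read_csv("Updated_Job_Posting_Dataset.csv")
model = SentenceTransformer("all-MiniLM-L6-v2")

# TF-IDF over answers + questions, one vector per skill, question -> answer map
tfidf = TfidfVectorizer(stop_words="english")
tfidf.fit(questions_df["Answer"].tolist() + questions_df["Question"].tolist())
skill_tfidf = {s.lower(): tfidf.transform([s]).toarray()[0]
               for s in questions_df["Skill"].unique()}
question_to_answer = dict(zip(questions_df["Question"], questions_df["Answer"]))

# Reference-answer embeddings plus an L2 FAISS index over them
answer_embeddings = np.asarray(model.encode(questions_df["Answer"].tolist(), batch_size=128), dtype="float32")
index = faiss.IndexFlatL2(answer_embeddings.shape[1])
index.add(answer_embeddings)

# Skill-vs-course and skill-vs-job cosine-similarity matrices
skills = model.encode(questions_df["Skill"].unique().tolist(), normalize_embeddings=True)
courses = model.encode(courses_df["skills"].fillna("").tolist(), normalize_embeddings=True)
jobs = model.encode(jobs_df["required_skills"].fillna("").tolist(), normalize_embeddings=True)
course_similarity = skills @ courses.T
job_similarity = skills @ jobs.T

# Persist everything under the names load_resources() expects
for name, obj in [("tfidf_vectorizer.pkl", tfidf), ("skill_tfidf.pkl", skill_tfidf),
                  ("question_to_answer.pkl", question_to_answer),
                  ("answer_embeddings.pkl", answer_embeddings),
                  ("course_similarity.pkl", course_similarity),
                  ("job_similarity.pkl", job_similarity)]:
    with open(name, "wb") as f:
        pickle.dump(obj, f)
faiss.write_index(index, "faiss_index.index")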