import os
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
import faiss
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy.special
from flask import Flask, request, jsonify
import logging
from pymongo import MongoClient
import pandas as pd
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Paths for saving artifacts
MODEL_DIR = "./saved_models"
FALLBACK_MODEL_DIR = "/tmp/saved_models"
try:
    os.makedirs(MODEL_DIR, exist_ok=True)
    logger.info(f"Using model directory: {MODEL_DIR}")
    chosen_model_dir = MODEL_DIR
except Exception as e:
    logger.warning(f"Failed to create {MODEL_DIR}: {e}. Using fallback directory.")
    os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
    chosen_model_dir = FALLBACK_MODEL_DIR
# Update paths
UNIVERSAL_MODEL_PATH = os.path.join(chosen_model_dir, "universal_model")
DETECTOR_MODEL_PATH = os.path.join(chosen_model_dir, "detector_model")
FAISS_INDEX_PATH = os.path.join(chosen_model_dir, "faiss_index.index")
ANSWER_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "answer_embeddings.pkl")
COURSE_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "course_embeddings.pkl")
JOB_EMBEDDINGS_PATH = os.path.join(chosen_model_dir, "job_embeddings.pkl")
# MongoDB connection (use the same URI as your Express app)
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://localhost:27017/DMS")  # Override via env var in production
client = MongoClient(MONGO_URI)
db = client.get_database()  # Default database from the URI path ("DMS")
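
# Hedged sketch (an addition, not in the original flow): fail fast if MongoDB
# is unreachable instead of surfacing the error later inside load_mongodb_data().
try:
    client.admin.command('ping')
    logger.info("MongoDB connection verified")
except Exception as e:
    logger.warning(f"MongoDB ping failed: {e}")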
# Load models | |
universal_model = SentenceTransformer(UNIVERSAL_MODEL_PATH) if os.path.exists(UNIVERSAL_MODEL_PATH) else SentenceTransformer("all-MiniLM-L6-v2") | |
detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH) if os.path.exists(DETECTOR_MODEL_PATH) else AutoTokenizer.from_pretrained("roberta-base-openai-detector") | |
detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH) if os.path.exists(DETECTOR_MODEL_PATH) else AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector") | |
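
# Hedged sketch (assumption: warm restarts are wanted): persist freshly
# downloaded models into chosen_model_dir so the next startup loads from disk.
if not os.path.exists(UNIVERSAL_MODEL_PATH):
    universal_model.save(UNIVERSAL_MODEL_PATH)
if not os.path.exists(DETECTOR_MODEL_PATH):
    detector_tokenizer.save_pretrained(DETECTOR_MODEL_PATH)
    detector_model.save_pretrained(DETECTOR_MODEL_PATH)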
# Global variables
faiss_index = None
answer_embeddings = None
course_embeddings = None
job_embeddings = None
questions_df = None  # Shared question bank; populated in load_mongodb_data()
# Load data from MongoDB
def load_mongodb_data():
    global answer_embeddings, course_embeddings, job_embeddings, faiss_index, questions_df
    try:
        # Load questions from Generated_Skill-Based_Questions.csv (for now, keep as fallback; later, move to MongoDB)
        questions_df = pd.read_csv("Generated_Skill-Based_Questions.csv")  # Replace with a MongoDB query if stored
        courses = list(db.courses.find())  # Fetch all courses
        jobs = list(db.jobs.find())  # Fetch all jobs
        # Precompute embeddings on GPU when available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        answer_embeddings = universal_model.encode(
            questions_df['Answer'].tolist(), batch_size=128, convert_to_tensor=True, device=device
        ).cpu().numpy()
        course_skills = [course['skills'] for course in courses]  # Adjust based on your Course schema; must be one string per course
        course_embeddings = universal_model.encode(
            course_skills, batch_size=128, convert_to_tensor=True, device=device
        ).cpu().numpy()
        job_skills = [job['skills'] for job in jobs]  # Adjust based on your Job schema; must be one string per job
        job_embeddings = universal_model.encode(
            job_skills, batch_size=128, convert_to_tensor=True, device=device
        ).cpu().numpy()
        # Build FAISS index over the expected-answer embeddings
        faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
        faiss_index.add(answer_embeddings)
        # Save precomputed data
        with open(ANSWER_EMBEDDINGS_PATH, 'wb') as f:
            pickle.dump(answer_embeddings, f)
        with open(COURSE_EMBEDDINGS_PATH, 'wb') as f:
            pickle.dump(course_embeddings, f)
        with open(JOB_EMBEDDINGS_PATH, 'wb') as f:
            pickle.dump(job_embeddings, f)
        faiss.write_index(faiss_index, FAISS_INDEX_PATH)
        logger.info("Loaded and precomputed MongoDB data successfully")
    except Exception as e:
        logger.error(f"Error loading MongoDB data: {e}")
        raise
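
# Hedged sketch: load_cached_data is a hypothetical helper, not in the original
# code. It restores the pickled embeddings and FAISS index written above so a
# restart can skip re-encoding; fall back to load_mongodb_data() when it
# returns False. Note it does not restore questions_df.
def load_cached_data():
    global answer_embeddings, course_embeddings, job_embeddings, faiss_index
    paths = [ANSWER_EMBEDDINGS_PATH, COURSE_EMBEDDINGS_PATH, JOB_EMBEDDINGS_PATH, FAISS_INDEX_PATH]
    if not all(os.path.exists(p) for p in paths):
        return False
    with open(ANSWER_EMBEDDINGS_PATH, 'rb') as f:
        answer_embeddings = pickle.load(f)
    with open(COURSE_EMBEDDINGS_PATH, 'rb') as f:
        course_embeddings = pickle.load(f)
    with open(JOB_EMBEDDINGS_PATH, 'rb') as f:
        job_embeddings = pickle.load(f)
    faiss_index = faiss.read_index(FAISS_INDEX_PATH)
    return True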
# Evaluate a single response: AI-origin detection plus semantic similarity scoring
def evaluate_response(args):
    skill, user_answer, question_idx = args
    if not user_answer:
        return skill, 0.0, False
    inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = detector_model(**inputs).logits
    probs = scipy.special.softmax(logits.numpy(), axis=1)[0]  # Convert to numpy before softmax
    is_ai = probs[1] > 0.5
    device = "cuda" if torch.cuda.is_available() else "cpu"
    user_embedding = universal_model.encode(
        [user_answer], batch_size=128, convert_to_tensor=True, device=device
    )[0].cpu()  # Move to CPU so it matches the stored answer embedding's device
    expected_embedding = torch.tensor(answer_embeddings[question_idx])
    score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
    return skill, round(max(0.0, score), 2), bool(is_ai)
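
# Example (hedged, illustrative values only):
#   evaluate_response(("Python", "Use venvs and type hints.", 12))
# might return ("Python", 73.41, False), i.e. (skill, 0-100 similarity, AI-written flag).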
# Recommend courses from MongoDB
def recommend_courses(skills_to_improve, user_level, upgrade=False):
    if not skills_to_improve or course_embeddings is None or questions_df is None:
        return []
    unique_skills = questions_df['Skill'].unique()
    skill_indices = [i for i, skill in enumerate(unique_skills) if skill in skills_to_improve]
    if not skill_indices:
        return []
    similarities = util.pytorch_cos_sim(
        torch.tensor(universal_model.encode(unique_skills[skill_indices].tolist(), batch_size=128)),
        torch.tensor(course_embeddings)
    ).cpu().numpy()
    courses = list(db.courses.find())
    popularity = [course.get('popularity', 0.8) for course in courses]
    completion_rate = [course.get('completion_rate', 0.7) for course in courses]
    # Weighted blend: semantic match dominates; popularity and completion rate break ties
    total_scores = 0.6 * np.max(similarities, axis=0) + 0.2 * np.array(popularity) + 0.2 * np.array(completion_rate)
    target_level = 'Advanced' if upgrade else user_level
    idx = np.argsort(-total_scores)[:5]
    candidates = [courses[i] for i in idx]
    filtered_candidates = [c for c in candidates if target_level.lower() in c.get('level', 'Intermediate').lower()]
    return filtered_candidates[:3] if filtered_candidates else candidates[:3]
# Recommend jobs from MongoDB
def recommend_jobs(user_skills, user_level):
    if job_embeddings is None or questions_df is None:
        return []
    unique_skills = questions_df['Skill'].unique()
    skill_indices = [i for i, skill in enumerate(unique_skills) if skill in user_skills]
    if not skill_indices:
        return []
    similarities = util.pytorch_cos_sim(
        torch.tensor(universal_model.encode(unique_skills[skill_indices].tolist(), batch_size=128)),
        torch.tensor(job_embeddings)
    ).cpu().numpy()
    jobs = list(db.jobs.find())
    level_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
    user_level_num = level_map.get(user_level, 1)
    # Penalize jobs whose required level is far from the user's level
    level_scores = [1 - abs(level_map.get(job.get('level', 'Intermediate'), 1) - user_level_num) / 2 for job in jobs]
    location_pref = [1.0 if job.get('location', 'Remote') in ['Islamabad', 'Karachi'] else 0.7 for job in jobs]
    total_job_scores = 0.5 * np.max(similarities, axis=0) + 0.2 * np.array(level_scores) + 0.1 * np.array(location_pref)
    top_job_indices = np.argsort(-total_job_scores)[:5]
    return [(jobs[i].get('jobTitle', 'Unknown'), jobs[i].get('companyName', 'Unknown'), jobs[i].get('location', 'Remote')) for i in top_job_indices]
# Flask app setup
app = Flask(__name__)

@app.route('/health')  # Route path assumed; the original snippet omitted the decorator
def health_check():
    return jsonify({"status": "active", "model_dir": chosen_model_dir})
@app.route('/assess', methods=['POST'])  # Route path assumed; the original snippet omitted the decorator
def assess_skills():
    try:
        data = request.get_json()
        if not data or 'skills' not in data or 'answers' not in data:
            return jsonify({"error": "Missing required fields"}), 400
        user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
        answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
        user_level = data.get('user_level', 'Intermediate').strip()
        if len(answers) != len(user_skills):
            return jsonify({"error": "Answers count must match skills count"}), 400
        load_mongodb_data()  # Load and precompute MongoDB data; also populates the global questions_df
        # Pick one question per skill (for now, from the CSV question bank; move to MongoDB later)
        user_questions = []
        for skill in user_skills:
            skill_questions = questions_df[questions_df['Skill'] == skill]
            if not skill_questions.empty:
                user_questions.append(skill_questions.sample(1).iloc[0])
            else:
                user_questions.append({
                    'Skill': skill,
                    'Question': f"What are the best practices for using {skill} in a production environment?",
                    'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
                })
        user_questions = pd.DataFrame(user_questions).reset_index(drop=True)
        user_responses = []
        for idx, row in user_questions.iterrows():
            answer = answers[idx]
            if not answer or answer.lower() == 'skip':
                user_responses.append((row['Skill'], None, None))
            else:
                matches = questions_df.index[questions_df['Question'] == row['Question']]
                if len(matches):
                    user_responses.append((row['Skill'], answer, matches[0]))
                else:
                    # Fallback question has no precomputed answer embedding; treat as skipped
                    user_responses.append((row['Skill'], None, None))
        results = [evaluate_response(response) for response in user_responses]
        user_scores = {}
        ai_flags = {}
        scores_list = []
        skipped_questions = [
            f"{row['Skill']} ({row['Question']})"
            for (_, answer, _), (_, row) in zip(user_responses, user_questions.iterrows())
            if not answer
        ]
        for skill, score, is_ai in results:
            if skill in user_scores:
                user_scores[skill] = max(user_scores[skill], score)
                ai_flags[skill] = ai_flags[skill] or is_ai
            else:
                user_scores[skill] = score
                ai_flags[skill] = is_ai
            scores_list.append(score)
        mean_score = float(np.mean(scores_list)) if scores_list else 50.0  # Cast for JSON serialization
        dynamic_threshold = max(40.0, mean_score)
        weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
        courses = recommend_courses(weak_skills or user_skills, user_level, upgrade=not weak_skills)
        jobs = recommend_jobs(user_skills, user_level)
        return jsonify({
            "assessment_results": {
                "skills": [
                    {
                        "skill": skill,
                        "progress": f"{'■' * int(score // 10)}{'-' * (10 - int(score // 10))}",
                        "score": f"{score:.2f} %",
                        "origin": "AI-Generated" if is_ai else "Human-Written"
                    } for skill, score, is_ai in results
                ],
                "mean_score": mean_score,
                "dynamic_threshold": dynamic_threshold,
                "weak_skills": weak_skills,
                "skipped_questions": skipped_questions
            },
            "recommended_courses": [{"course_title": c.get('title', 'Unknown'), "organization": c.get('organization', 'Unknown')} for c in courses],
            "recommended_jobs": jobs[:5]
        })
    except Exception as e:
        logger.error(f"Assessment error: {e}")
        return jsonify({"error": "Internal server error"}), 500
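
# Hedged usage sketch (assumes the /assess route declared above; adjust host/port):
#
#   import requests
#   payload = {
#       "skills": ["Python", "SQL"],
#       "answers": ["Use virtual environments and type hints.", "skip"],
#       "user_level": "Intermediate",
#   }
#   print(requests.post("http://localhost:7860/assess", json=payload).json())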
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, threaded=True)