from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict, List
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
import os
import re

# Set cache directory explicitly (optional, as the Dockerfile ENV should handle this)
os.environ["HF_HOME"] = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"

app = FastAPI()

# Load datasets (path matches the Dockerfile)
DATA_DIR = "/app/data/"  # Changed from "data/" to "/app/data/"
job_df = pd.read_csv(os.path.join(DATA_DIR, "Updated_Job_Posting_Dataset.csv"), encoding="latin1")
course_df = pd.read_csv(os.path.join(DATA_DIR, "coursera_course_dataset_v2_no_null.csv"))
coding_df = pd.read_csv(os.path.join(DATA_DIR, "Software Questions.csv"), encoding="latin1")

# Preprocess datasets
coding_df.rename(columns={'Question': 'question', 'Answer': 'solutions'}, inplace=True)
job_df.rename(columns={'company_name': 'company', 'required_skills': 'skills'}, inplace=True)
course_df.rename(columns={'Title': 'course_title', 'Skills': 'skills'}, inplace=True)
coding_df.dropna(subset=['question', 'solutions'], inplace=True)
job_df["job_description"] = job_df["job_description"].fillna("")

# Load BERT model and TF-IDF vectorizer
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
vectorizer = TfidfVectorizer()


# Pydantic model for the request body
class UserInput(BaseModel):
    name: str
    skills: List[str]  # Required list of skills
    answers: Optional[Dict[str, Dict[str, str]]] = None  # Optional {skill: {question: answer}}


# Evaluate a coding answer: TF-IDF cosine similarity against the reference
# solution, with a small penalty for runs longer than two minutes
def evaluate_coding_with_time(user_code, correct_code, start_time):
    end_time = time.time()
    execution_time = end_time - start_time
    vectorized = vectorizer.fit_transform([user_code, correct_code])
    similarity = cosine_similarity(vectorized)[0][1] * 100
    if execution_time > 120:
        similarity -= (execution_time - 120) * 0.1
    return round(max(similarity, 0), 2)


# Pick up to num_questions random challenges per skill whose question text
# mentions the skill
def get_coding_challenges(skills, num_questions=5):
    skill_challenges = {}
    for skill in skills:
        # regex=False so skills such as "C++" are matched literally rather
        # than parsed as regex patterns
        relevant = coding_df[coding_df["question"].str.contains(skill, case=False, na=False, regex=False)]
        if not relevant.empty:
            sampled = relevant.sample(min(num_questions, len(relevant)))
            skill_challenges[skill] = sampled[["question", "solutions"]].to_dict(orient="records")
        else:
            skill_challenges[skill] = []
    return skill_challenges


# Assign proficiency level from the average score
def get_proficiency_level(score):
    if score >= 80:
        return "Expert"
    elif score >= 50:
        return "Intermediate"
    else:
        return "Beginner"


# Recommend courses covering the user's weak skills
def recommend_courses(weak_skills):
    if not weak_skills:
        return []
    # Escape each skill so the joined pattern stays a valid regex
    pattern = '|'.join(re.escape(skill) for skill in weak_skills)
    courses = course_df[course_df['skills'].str.contains(pattern, case=False, na=False)]
    return courses[['course_title', 'Organization']].head(5).to_dict(orient="records")


# Recommend jobs by BERT similarity between the user's skills and job descriptions
def recommend_jobs(skills):
    if not skills:
        return []
    # Encode job descriptions once and cache the embeddings on the DataFrame;
    # re-encoding every posting on each request would be needlessly slow
    if "job_embeddings" not in job_df.columns:
        job_df["job_embeddings"] = job_df["job_description"].apply(lambda x: bert_model.encode(str(x)))
    user_embedding = bert_model.encode(" ".join(skills))
    # Cast to a plain float so the score is JSON-serializable in the response
    job_df["BERT_Similarity"] = job_df["job_embeddings"].apply(
        lambda x: float(cosine_similarity([x], [user_embedding])[0][0])
    )
    top_jobs = job_df.sort_values(by="BERT_Similarity", ascending=False).head(5)
    return top_jobs[["job_title", "company", "location", "BERT_Similarity"]].to_dict(orient="records")


@app.get("/")
def read_root():
    return {"message": "Skill Assessment API"}


@app.post("/assess")
def assess_skills(user_input: UserInput):
    # Extract user data from the request
    user_name = user_input.name
    user_skills = user_input.skills
    if not user_skills:
        raise HTTPException(status_code=400, detail="Skills list cannot be empty")

    # Fetch coding challenges based on the provided skills
    challenges = get_coding_challenges(user_skills)

    # Evaluate skills
    user_scores = {}
    for skill, challenge_list in challenges.items():
        if not challenge_list:
            user_scores[skill] = 0
            continue
        total_score = 0
        num_questions = len(challenge_list)
        if user_input.answers and skill in user_input.answers:
            # Use the provided answers
            for challenge in challenge_list:
                question = challenge["question"]
                if question in user_input.answers[skill]:
                    start_time = time.time() - 10  # Simulate execution time
                    user_code = user_input.answers[skill][question]
                    correct_code = challenge["solutions"]
                    total_score += evaluate_coding_with_time(user_code, correct_code, start_time)
                # Unanswered questions contribute 0 to the total
        else:
            # No answers provided; assign a default score (50% per question)
            total_score = 50 * num_questions
        user_scores[skill] = round(total_score / num_questions, 2)

    # Proficiency levels and weak skills
    proficiency_levels = {skill: get_proficiency_level(score) for skill, score in user_scores.items()}
    weak_skills = [skill for skill, level in proficiency_levels.items() if level in ["Beginner", "Intermediate"]]

    # Recommendations
    courses = recommend_courses(weak_skills)
    jobs = recommend_jobs(user_skills)

    return {
        "name": user_name,
        "skills": user_skills,
        "scores": user_scores,
        "proficiency_levels": proficiency_levels,
        "recommended_courses": courses,
        "recommended_jobs": jobs,
    }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
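
# A minimal client sketch (illustrative only, not part of the service): posts
# one user to /assess. The name, skill names, question text, and answer code
# below are hypothetical placeholders; for a submitted answer to be scored,
# its key under "answers" must exactly match a "question" string drawn from
# "Software Questions.csv" (challenges are sampled at random per request, so
# unmatched questions simply contribute 0).
#
#   import requests
#
#   payload = {
#       "name": "Alice",
#       "skills": ["Python", "SQL"],
#       "answers": {"Python": {"<question text>": "def solve(): ..."}},
#   }
#   resp = requests.post("http://localhost:7860/assess", json=payload)
#   resp.raise_for_status()
#   print(resp.json()["proficiency_levels"])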