"""Skill-assessment API.

Serves coding challenges by skill category, scores submitted answers via
TF-IDF cosine similarity against reference solutions, and recommends
courses (for weak skills) and jobs (via sentence-embedding similarity).
"""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict, List
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import time
import os

# Point HuggingFace caches at a writable location (containerized deployment).
# NOTE(review): these must be set before the model library reads them —
# presumably effective here because the model is instantiated below; confirm.
os.environ["HF_HOME"] = "/app/cache"
os.environ["TRANSFORMERS_CACHE"] = "/app/cache"

app = FastAPI()

# ---------------------------------------------------------------------------
# Load datasets
# ---------------------------------------------------------------------------
DATA_DIR = "/app/data/"
job_df = pd.read_csv(os.path.join(DATA_DIR, "Updated_Job_Posting_Dataset.csv"), encoding="latin1")
course_df = pd.read_csv(os.path.join(DATA_DIR, "coursera_course_dataset_v2_no_null.csv"))
coding_df = pd.read_csv(os.path.join(DATA_DIR, "Software Questions.csv"), encoding="latin1")

# ---------------------------------------------------------------------------
# Preprocess datasets: normalize column names, drop incomplete rows
# ---------------------------------------------------------------------------
coding_df = coding_df.rename(columns={
    'Question': 'question',
    'Answer': 'solutions',
    'Category': 'category',
    'Difficulty': 'difficulty'
})
coding_df.dropna(subset=['question', 'solutions', 'category', 'difficulty'], inplace=True)
job_df.rename(columns={'company_name': 'company', 'required_skills': 'skills'}, inplace=True)
course_df.rename(columns={'Title': 'course_title', 'Skills': 'skills'}, inplace=True)
job_df["job_description"] = job_df["job_description"].fillna("")

# Sentence-embedding model used for job matching.
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Lazily-built cache of per-job-description embeddings (see recommend_jobs).
_job_embeddings = None


# ---------------------------------------------------------------------------
# Request models
# ---------------------------------------------------------------------------
class ChallengeRequest(BaseModel):
    skills: List[str]
    difficulty: Optional[str] = None


class AssessmentRequest(BaseModel):
    name: str
    skills: List[str]
    # Mapping: skill -> {question text -> user's answer code}
    answers: Optional[Dict[str, Dict[str, str]]] = None


def get_coding_challenges(categories: List[str], num_questions=5, difficulty: Optional[str] = None):
    """Sample up to ``num_questions`` challenges per category.

    Returns a dict mapping each category to a list of records with keys
    ``question``, ``solutions`` and ``difficulty`` (empty list when no
    challenge matches).
    """
    skill_challenges = {}
    for category in categories:
        # FIX: escape the category before the regex match — raw input such as
        # "C++" previously raised re.error inside str.contains.
        relevant = coding_df[coding_df["category"].str.contains(re.escape(category), case=False, na=False)]
        if difficulty:
            relevant = relevant[relevant["difficulty"].str.lower() == difficulty.lower()]
        if not relevant.empty:
            skill_challenges[category] = relevant.sample(min(num_questions, len(relevant)))[
                ["question", "solutions", "difficulty"]
            ].to_dict(orient="records")
        else:
            skill_challenges[category] = []
    return skill_challenges


def evaluate_coding_with_time(user_code, correct_code, start_time):
    """Score an answer in [0, 100]: TF-IDF cosine similarity to the reference
    solution, minus 0.1 point per second spent beyond 120 seconds.

    ``start_time`` is a ``time.time()`` timestamp taken when the user began.
    """
    execution_time = time.time() - start_time
    # FIX: use a fresh vectorizer per call. The previous shared module-level
    # vectorizer was re-fitted on every request (not safe under concurrent
    # requests). Also guard the "empty vocabulary" ValueError that
    # TfidfVectorizer raises for empty / stop-word-only input.
    try:
        vectorized = TfidfVectorizer().fit_transform([user_code, correct_code])
    except ValueError:
        return 0.0
    similarity = cosine_similarity(vectorized)[0][1] * 100
    if execution_time > 120:
        similarity -= (execution_time - 120) * 0.1
    return round(max(similarity, 0), 2)


def get_proficiency_level(score):
    """Map a numeric score to a proficiency label."""
    if score >= 80:
        return "Expert"
    elif score >= 50:
        return "Intermediate"
    else:
        return "Beginner"


def recommend_courses(weak_skills):
    """Return up to 5 courses whose skills mention any of ``weak_skills``."""
    if not weak_skills:
        return []
    # FIX: escape each skill — '|'.join on raw input broke on regex
    # metacharacters (e.g. "C++").
    pattern = '|'.join(re.escape(skill) for skill in weak_skills)
    courses = course_df[course_df['skills'].str.contains(pattern, case=False, na=False)]
    return courses[['course_title', 'Organization']].head(5).to_dict(orient="records")


def recommend_jobs(skills):
    """Return the 5 jobs whose descriptions are most similar (by sentence
    embedding cosine similarity) to the joined skill list."""
    global _job_embeddings
    if not skills:
        return []
    # PERF FIX: the original re-encoded EVERY job description with the BERT
    # model on EVERY request. Embeddings depend only on the static dataset,
    # so compute them once and cache. Output is unchanged.
    if _job_embeddings is None:
        _job_embeddings = [bert_model.encode(str(desc)) for desc in job_df["job_description"]]
        job_df["job_embeddings"] = _job_embeddings  # keep original side effect
    user_embedding = bert_model.encode(" ".join(skills))
    job_df["BERT_Similarity"] = [
        cosine_similarity([emb], [user_embedding])[0][0] for emb in _job_embeddings
    ]
    top_jobs = job_df.sort_values(by="BERT_Similarity", ascending=False).head(5)
    return top_jobs[["job_title", "company", "location", "BERT_Similarity"]].to_dict(orient="records")


@app.get("/")
def read_root():
    return {"message": "Skill Assessment API"}


@app.post("/challenges")
def get_user_challenges(request: ChallengeRequest):
    """Return sampled challenges for the requested skills, hiding solutions."""
    skills = request.skills
    difficulty = request.difficulty
    if not skills:
        raise HTTPException(status_code=400, detail="Skills list cannot be empty")
    challenges = get_coding_challenges(skills, difficulty=difficulty)
    # Return only questions and difficulty (exclude solutions for the user).
    return {
        "challenges": {
            category: [
                {"question": challenge["question"], "difficulty": challenge["difficulty"]}
                for challenge in challenge_list
            ]
            for category, challenge_list in challenges.items()
        }
    }


@app.post("/assess")
def assess_skills(user_input: AssessmentRequest):
    """Score the user's answers per skill, then attach proficiency labels and
    course/job recommendations."""
    user_name = user_input.name
    user_skills = user_input.skills
    if not user_skills:
        raise HTTPException(status_code=400, detail="Skills list cannot be empty")

    challenges = get_coding_challenges(user_skills)
    user_scores = {}
    for skill, challenge_list in challenges.items():
        if not challenge_list:
            user_scores[skill] = 0
            continue
        total_score = 0
        num_questions = len(challenge_list)
        if user_input.answers and skill in user_input.answers:
            for challenge in challenge_list:
                question = challenge["question"]
                # Unanswered questions contribute 0 to the total.
                if question in user_input.answers[skill]:
                    start_time = time.time() - 10  # Simulate execution time
                    user_code = user_input.answers[skill][question]
                    correct_code = challenge["solutions"]
                    total_score += evaluate_coding_with_time(user_code, correct_code, start_time)
        else:
            total_score = 50 * num_questions  # Default score for unattempted skills
        user_scores[skill] = round(total_score / num_questions, 2)

    proficiency_levels = {skill: get_proficiency_level(score) for skill, score in user_scores.items()}
    weak_skills = [skill for skill, level in proficiency_levels.items()
                   if level in ["Beginner", "Intermediate"]]
    courses = recommend_courses(weak_skills)
    jobs = recommend_jobs(user_skills)

    return {
        "name": user_name,
        "skills": user_skills,
        "scores": user_scores,
        "proficiency_levels": proficiency_levels,
        "recommended_courses": courses,
        "recommended_jobs": jobs
    }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)