# Spaces: Running
import os
import re

import numpy as np
from docx import Document
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model used for semantic similarity. It is constructed
# once at import time and shared by every call that needs an embedding.
model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_text_from_file(file_path):
    """Return the plain text of a PDF or DOCX file with whitespace collapsed.

    Args:
        file_path: Path to the document, as ``str``, ``bytes`` or any
            ``os.PathLike``. The format is chosen from the file extension,
            compared case-insensitively (``.pdf`` or ``.docx``).

    Returns:
        The extracted text as a single string in which every run of
        whitespace is replaced by one space and the ends are stripped.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    # Normalise to ``str`` so path objects work and the extension test below
    # is a plain string comparison.
    path = os.fsdecode(file_path)
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        with open(path, 'rb') as f:
            reader = PdfReader(f)
            # A page without extractable text may give a falsy result;
            # substitute "" so that str.join never sees a non-string.
            text = " ".join(
                page.extract_text() or "" for page in reader.pages
            )
    elif lowered.endswith('.docx'):
        doc = Document(path)
        text = " ".join(para.text for para in doc.paragraphs)
    else:
        raise ValueError("Unsupported file format")

    # Clean text: collapse newlines, tabs and repeated spaces.
    return re.sub(r'\s+', ' ', text).strip()
def evaluate_cv(cv_path, job_role, *, similarity_threshold=0.4):
    """Screen a CV file against the stored requirements for a job role.

    Three checks are applied: total years of experience mentioned in the
    text, presence of at least one required education keyword, and the
    cosine similarity between the CV embedding and the embedding of the
    role's required-skills description.

    Args:
        cv_path: Path to the CV; passed to ``extract_text_from_file``.
        job_role: Role name; passed to ``load_job_requirements``.
        similarity_threshold: Minimum cosine similarity for the skills check
            to pass. Defaults to ``0.4``.

    Returns:
        A dict with keys:
            ``"is_qualified"`` - ``True`` when no check failed.
            ``"rejection_reasons"`` - list of human-readable failure messages.
            ``"cv_summary"`` - dict with ``"text"``, ``"experience"``,
            ``"skills_similarity"`` (float) and ``"education"`` (bool).

    Raises:
        ValueError: Propagated from ``extract_text_from_file`` for an
            unsupported file format.
    """
    # Load job requirements (you would have these stored for each role).
    job_requirements = load_job_requirements(job_role)

    # Extract text from CV; lower-case it once for the keyword checks.
    cv_text = extract_text_from_file(cv_path)
    cv_text_lower = cv_text.lower()

    # Create embeddings.
    # NOTE(review): the encoder presumably truncates inputs longer than its
    # maximum sequence length, so only the start of a long CV may influence
    # the score - confirm against the model configuration.
    cv_embedding = model.encode(cv_text)
    requirements_embedding = model.encode(job_requirements["required_skills"])

    # cosine_similarity expects 2-D inputs, hence the reshape to one row.
    similarity = float(
        cosine_similarity(
            cv_embedding.reshape(1, -1),
            requirements_embedding.reshape(1, -1),
        )[0][0]
    )

    rejection_reasons = []

    # Experience: sum every "<N> year(s)" mention in the text. The optional
    # "+" lets phrasings such as "5+ years" count as well.
    experience_pattern = r"(\d+)\+?\s+years?"
    total_experience = sum(
        int(match) for match in re.findall(experience_pattern, cv_text_lower)
    )
    if total_experience < job_requirements["min_experience"]:
        rejection_reasons.append(
            f"Requires {job_requirements['min_experience']} years experience, found {total_experience}"
        )

    # Education: any one keyword appearing as a substring is sufficient.
    education_keywords = job_requirements["required_education"]
    has_education = any(
        keyword.lower() in cv_text_lower for keyword in education_keywords
    )
    if not has_education:
        rejection_reasons.append(
            f"Required education not found: {', '.join(education_keywords)}"
        )

    # Skills: semantic similarity must reach the threshold.
    if similarity < similarity_threshold:
        rejection_reasons.append(
            "CV content doesn't sufficiently match the required skills"
        )

    # Prepare CV summary for interview.
    cv_summary = {
        "text": cv_text,
        "experience": total_experience,
        "skills_similarity": similarity,
        "education": has_education,
    }

    return {
        # Qualified exactly when no check recorded a rejection reason.
        "is_qualified": not rejection_reasons,
        "rejection_reasons": rejection_reasons,
        "cv_summary": cv_summary,
    }
def load_job_requirements(job_role):
    """Return the screening requirements for ``job_role``.

    The role is matched exactly first, then ignoring case and surrounding
    whitespace. Any role that still does not match falls back to the
    "Software Engineer" requirements.

    Args:
        job_role: Role name, e.g. ``"Software Engineer"`` or
            ``"Data Scientist"``.

    Returns:
        A dict with keys ``"min_experience"`` (int, years),
        ``"required_education"`` (list of keyword strings) and
        ``"required_skills"`` (free-text skills description). A fresh dict is
        built on every call.
    """
    # In a real app, these would be stored in a database or files.
    requirements = {
        "Software Engineer": {
            "min_experience": 2,
            "required_education": ["Bachelor in Computer Science", "BSc CS", "Engineering"],
            "required_skills": """
                programming, algorithms, data structures, software development,
                testing, debugging, version control, agile methodologies
            """,
        },
        "Data Scientist": {
            "min_experience": 3,
            "required_education": ["Master", "PhD", "Statistics", "Data Science"],
            "required_skills": """
                machine learning, statistics, python, R, data analysis,
                data visualization, SQL, predictive modeling
            """,
        },
    }

    if job_role in requirements:
        return requirements[job_role]

    # Tolerate differences in case and surrounding whitespace so that e.g.
    # "data scientist" is not silently scored against the default role.
    if isinstance(job_role, str):
        wanted = job_role.strip().casefold()
        for name, spec in requirements.items():
            if name.casefold() == wanted:
                return spec

    return requirements["Software Engineer"]