interview_agent / utils /cv_processor.py
Jekyll2000's picture
Create utils/cv_processor.py
dd8e37c verified
raw
history blame
2.09 kB
from PyPDF2 import PdfReader
from docx import Document
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np
import os
class CVProcessor:
    """Screen a CV file against the requirements of a job role.

    The CV text is embedded with a SentenceTransformer model and compared
    (cosine similarity) against the role's required-skills string. Years of
    experience are estimated from "<N> year(s)" mentions found in the text.
    """

    # A CV must score strictly above this cosine similarity to qualify.
    SIMILARITY_THRESHOLD = 0.4
    # Maximum number of CV characters echoed back in the summary.
    SUMMARY_MAX_CHARS = 2000
    # Applied to lower-cased text; captures the number in "<N> year(s)".
    _EXPERIENCE_PATTERN = re.compile(r"(\d+)\s+years?")

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the embedding model and the built-in job requirements.

        Args:
            model_name: Name passed to ``SentenceTransformer``. Defaults to
                the model the class has always used.
        """
        self.model = SentenceTransformer(model_name)
        self.job_reqs = self._load_job_requirements()

    def extract_text(self, file_path):
        """Return the text of a ``.pdf`` or ``.docx`` file as one string.

        Pages (PDF) or paragraphs (DOCX) are joined with single spaces.

        Args:
            file_path: Path to the CV, as a string or ``os.PathLike``. The
                extension is matched case-insensitively.

        Returns:
            The extracted text.

        Raises:
            ValueError: If the file is neither ``.pdf`` nor ``.docx``.
        """
        path = os.fspath(file_path)
        lowered = path.lower()
        if lowered.endswith('.pdf'):
            reader = PdfReader(path)
            # A page without a text layer may yield None/empty output; treat
            # it as empty text so the join cannot fail.
            return " ".join(page.extract_text() or "" for page in reader.pages)
        if lowered.endswith('.docx'):
            doc = Document(path)
            return " ".join(para.text for para in doc.paragraphs)
        # Previously this fell through and returned None, which only surfaced
        # later as an unrelated error inside evaluate().
        raise ValueError(
            f"Unsupported CV file type: {path!r} (expected .pdf or .docx)"
        )

    def evaluate(self, cv_path, job_role):
        """Decide whether the CV at ``cv_path`` qualifies for ``job_role``.

        Args:
            cv_path: Path to a ``.pdf`` or ``.docx`` CV.
            job_role: A key of ``self.job_reqs``.

        Returns:
            A dict with ``"is_qualified"`` (bool) and ``"cv_summary"``, which
            holds the (possibly truncated) CV text, the estimated years of
            experience, and the skills similarity as a float.

        Raises:
            KeyError: If ``job_role`` is not a known role.
            ValueError: If the CV file type is unsupported.
        """
        # Validate the cheap argument first so a typo in the role name does
        # not cost a full document parse.
        try:
            reqs = self.job_reqs[job_role]
        except KeyError:
            known_roles = ", ".join(sorted(self.job_reqs))
            raise KeyError(
                f"Unknown job role {job_role!r}; expected one of: {known_roles}"
            ) from None

        cv_text = self.extract_text(cv_path)

        # Semantic similarity between the whole CV and the skills string.
        cv_embed = self.model.encode(cv_text)
        req_embed = self.model.encode(reqs["required_skills"])
        similarity = cosine_similarity([cv_embed], [req_embed])[0][0]

        total_exp = self._estimate_experience(cv_text)

        # bool(): the comparison can produce a numpy bool, which is not
        # JSON-serialisable; callers get a plain Python bool instead.
        is_qualified = bool(
            similarity > self.SIMILARITY_THRESHOLD
            and total_exp >= reqs["min_experience"]
        )

        if len(cv_text) > self.SUMMARY_MAX_CHARS:
            summary_text = cv_text[:self.SUMMARY_MAX_CHARS] + "..."
        else:
            summary_text = cv_text

        return {
            "is_qualified": is_qualified,
            "cv_summary": {
                "text": summary_text,
                "experience": total_exp,
                "skills_match": float(similarity),
            },
        }

    @staticmethod
    def _estimate_experience(text):
        """Sum every "<N> year(s)" figure mentioned in ``text`` (0 if none)."""
        matches = CVProcessor._EXPERIENCE_PATTERN.findall(text.lower())
        return sum(int(years) for years in matches)

    def _load_job_requirements(self):
        """Return the built-in requirements table, keyed by role name.

        Each entry has ``"min_experience"`` (years) and ``"required_skills"``
        (a comma-separated string that is embedded as a whole).
        """
        return {
            "Software Engineer": {
                "min_experience": 2,
                "required_skills": "programming, algorithms, software development, testing, debugging"
            },
            "Data Scientist": {
                "min_experience": 3,
                "required_skills": "machine learning, statistics, python, data analysis, SQL"
            }
        }