Spaces:

Jekyll2000
/

interview_agent

Running

App Files Files Community

interview_agent / utils / cv_processor.py

Jekyll2000

Create utils/cv_processor.py

15f9017 verified 13 days ago

raw

history blame

3.86 kB

	from PyPDF2 import PdfReader
	from docx import Document
	import re
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np
	import os

	# Sentence-embedding model used by evaluate_cv to compare the CV text with
	# a role's required-skills text. It is created at module level, so the
	# constructor below runs once when this module is imported and the same
	# instance is reused by every evaluate_cv call.
	model = SentenceTransformer('all-MiniLM-L6-v2')

	def extract_text_from_file(file_path):
	    """Return the plain text of a CV stored as a PDF or DOCX file.

	    The format is chosen from the file extension, compared
	    case-insensitively so that names such as ``CV.PDF`` are accepted.

	    Args:
	        file_path: Path to the file, as a ``str`` or an ``os.PathLike``.

	    Returns:
	        The document text with every run of whitespace collapsed to a
	        single space and leading/trailing whitespace removed.

	    Raises:
	        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
	    """
	    # Normalise path-like objects to a plain path so the suffix test and
	    # both readers receive the same value.
	    path = os.fspath(file_path)
	    suffix_key = path.lower()

	    if suffix_key.endswith('.pdf'):
	        with open(path, 'rb') as f:
	            reader = PdfReader(f)
	            # Defensive: treat a page that yields no text (None) as empty
	            # so the join below can never fail on a non-string.
	            text = " ".join(
	                (page.extract_text() or "") for page in reader.pages
	            )
	    elif suffix_key.endswith('.docx'):
	        doc = Document(path)
	        text = " ".join(para.text for para in doc.paragraphs)
	    else:
	        raise ValueError("Unsupported file format")

	    # Collapse newlines, tabs and repeated spaces left over from extraction.
	    return re.sub(r'\s+', ' ', text).strip()

	def evaluate_cv(cv_path, job_role, *, similarity_threshold=0.4):
	    """Screen a CV against the stored requirements for a job role.

	    Three independent checks are run: stated years of experience, required
	    education keywords, and embedding similarity between the CV text and
	    the role's required skills. Every failed check adds one entry to the
	    rejection reasons; the CV qualifies only when no check fails.

	    Args:
	        cv_path: Path to the CV file, passed to ``extract_text_from_file``.
	        job_role: Role name, passed to ``load_job_requirements``.
	        similarity_threshold: Minimum cosine similarity required between
	            the CV embedding and the required-skills embedding.
	            Keyword-only; defaults to 0.4.

	    Returns:
	        A dict with the keys ``"is_qualified"`` (bool),
	        ``"rejection_reasons"`` (list of str) and ``"cv_summary"`` (dict
	        with ``"text"``, ``"experience"``, ``"skills_similarity"`` and
	        ``"education"``).

	    Raises:
	        ValueError: Propagated from ``extract_text_from_file`` when the
	            CV file format is unsupported.
	    """
	    job_requirements = load_job_requirements(job_role)
	    cv_text = extract_text_from_file(cv_path)
	    # Lower-case once; both keyword checks below are case-insensitive.
	    cv_text_lower = cv_text.lower()

	    # Embed the CV and the required-skills text, then compare the two.
	    cv_embedding = model.encode(cv_text)
	    requirements_embedding = model.encode(job_requirements["required_skills"])
	    similarity = cosine_similarity(
	        cv_embedding.reshape(1, -1),
	        requirements_embedding.reshape(1, -1),
	    )[0][0]

	    rejection_reasons = []

	    # Experience heuristic: add up every "<N> year(s)" mention in the CV.
	    # NOTE(review): overlapping periods are counted more than once, and
	    # forms such as "5+ years" or "5years" are not matched by this pattern.
	    total_experience = sum(
	        int(years) for years in re.findall(r"(\d+)\s+years?", cv_text_lower)
	    )
	    min_experience = job_requirements["min_experience"]
	    if total_experience < min_experience:
	        rejection_reasons.append(
	            f"Requires {min_experience} years experience, found {total_experience}"
	        )

	    # Education: at least one required keyword must appear in the CV text.
	    education_keywords = job_requirements["required_education"]
	    has_education = any(
	        keyword.lower() in cv_text_lower for keyword in education_keywords
	    )
	    if not has_education:
	        rejection_reasons.append(
	            f"Required education not found: {', '.join(education_keywords)}"
	        )

	    # Skills: the CV must be semantically close enough to the skills text.
	    if similarity < similarity_threshold:
	        rejection_reasons.append(
	            "CV content doesn't sufficiently match the required skills"
	        )

	    return {
	        # Qualified exactly when no check above recorded a reason.
	        "is_qualified": not rejection_reasons,
	        "rejection_reasons": rejection_reasons,
	        # Summary handed on to the interview stage.
	        "cv_summary": {
	            "text": cv_text,
	            "experience": total_experience,
	            # Convert the NumPy scalar to a built-in float.
	            "skills_similarity": float(similarity),
	            "education": has_education,
	        },
	    }

	def load_job_requirements(job_role):
	    """Look up the screening requirements for ``job_role``.

	    Every entry is a dict with the keys ``"min_experience"``,
	    ``"required_education"`` and ``"required_skills"``. A role that is not
	    in the table receives the "Software Engineer" entry instead.
	    """
	    # In a real app, these would be stored in a database or files
	    fallback_role = "Software Engineer"
	    role_table = {
	        "Software Engineer": {
	            "min_experience": 2,
	            "required_education": ["Bachelor in Computer Science", "BSc CS", "Engineering"],
	            "required_skills": """
	programming, algorithms, data structures, software development,
	testing, debugging, version control, agile methodologies
	"""
	        },
	        "Data Scientist": {
	            "min_experience": 3,
	            "required_education": ["Master", "PhD", "Statistics", "Data Science"],
	            "required_skills": """
	machine learning, statistics, python, R, data analysis,
	data visualization, SQL, predictive modeling
	"""
	        },
	    }

	    if job_role in role_table:
	        return role_table[job_role]
	    # Unknown roles are screened against the default role's requirements.
	    return role_table[fallback_role]