|
|
|
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import PyPDF2
import seaborn as sns
import streamlit as st
import torch
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
# Streamlit re-executes this script on every interaction, so only contact the
# NLTK download server when the Punkt tokenizer data is not already installed.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
|
|
|
|
|
# Anchored match for a one- or two-digit number with an optional fractional
# part of one or two digits (e.g. "9", "9.5", "12.34").
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')

# E-mail address pattern, kept as an uncompiled string. The top-level-domain
# class is [A-Za-z]: a "|" inside a character class is a literal pipe, not
# alternation, so it must not appear there.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Anchored match for a string of exactly ten digits.
float_digit_regex = re.compile(r'^\d{10}$')

# Ten digits followed by any one character (digits in group 1), or any one
# character followed by ten digits (digits in group 2).
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the Phi-4-mini tokenizer and model.

    Decorated with ``st.cache_resource`` so Streamlit reruns reuse the loaded
    objects instead of loading the checkpoint again.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    model_name = "microsoft/Phi-4-mini-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Load with the language-modeling head: extract_entities() calls
    # model.generate(), which the bare AutoModel transformer does not support.
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer, model


tokenizer, model = load_model()


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, concatenated in page order.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (this script passes Streamlit uploads).

    Returns:
        str: The page texts joined with no separator. A page whose extraction
        yields nothing contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def get_embeddings(text):
    """Embed *text* as the mean of the model's final hidden states.

    Args:
        text: The string to embed; it is truncated to 4096 tokens.

    Returns:
        numpy.ndarray: The token-averaged hidden state with singleton
        dimensions squeezed out, or a zero vector of the model's hidden size
        when the tokenizer produces no tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    if inputs["input_ids"].shape[-1] == 0:
        # Nothing to encode (e.g. a PDF with no extractable text): averaging
        # over zero tokens would be undefined, so return a zero vector instead.
        return torch.zeros(model.config.hidden_size).numpy()

    # base_model is the transformer body without the LM head; using it yields
    # last_hidden_state directly and avoids computing vocabulary logits.
    encoder = model.base_model
    with torch.no_grad():
        outputs = encoder(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    # numpy has no bfloat16, so upcast in case the checkpoint was loaded in
    # reduced precision.
    return embeddings.float().numpy()
|
|
|
|
|
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text; embedded with ``get_embeddings``.
        text2: Second text; embedded with ``get_embeddings``.

    Returns:
        The single entry of the 1x1 similarity matrix computed by
        ``cosine_similarity``.
    """
    first_vector = get_embeddings(text1)
    second_vector = get_embeddings(text2)
    # cosine_similarity works on 2-D inputs, so wrap each vector as one row.
    score_matrix = cosine_similarity([first_vector], [second_vector])
    return score_matrix[0][0]
|
|
|
|
|
def extract_entities(text):
    """Ask the language model to list skills, education and experience in *text*.

    Only the first 3000 characters of *text* are placed in the prompt.

    Args:
        text: Source text (a resume or a job description).

    Returns:
        str: The model's generated continuation, decoded with special tokens
        removed. The prompt itself is not included. The output is free-form
        text that is requested to be JSON; it is not validated here.
    """
    prompt = f"""Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"""
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
    prompt_length = inputs["input_ids"].shape[1]

    # NOTE(review): generate() requires `model` to have been loaded with a
    # language-modeling head (e.g. AutoModelForCausalLM) - confirm against
    # load_model.
    # max_new_tokens bounds only the generated part. max_length would also
    # count the prompt, which by itself can exceed 500 tokens.
    outputs = model.generate(**inputs, max_new_tokens=500)

    # The returned sequence starts with the prompt tokens; decode only what
    # the model added so callers receive just the answer.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)
|
|
|
|
|
def _parse_skills(entities_text):
    """Pull the skill names out of model-generated entity text.

    Searches *entities_text* for the first ``"skills": [...]`` array and
    splits its contents on commas.

    Returns:
        A set of lower-cased skill names with surrounding whitespace and
        quotes removed, or ``None`` when no ``"skills"`` array is present.
    """
    arrays = re.findall(r'"skills": \[(.*?)\]', entities_text, re.DOTALL)
    if not arrays:
        return None
    candidates = (item.strip().strip("\"'").strip().lower() for item in arrays[0].split(','))
    # Drop blanks so an empty array ("skills": []) does not count as a skill.
    return {skill for skill in candidates if skill}


# ---- Page header ----
st.markdown("# Resume Matching Tool ππ")
st.markdown("An application to match resumes with job descriptions using Phi-4-mini")

# ---- Sidebar uploads ----
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # ---- Text extraction ----
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {uploaded.name: extract_text_from_pdf(uploaded) for uploaded in resumes_files}

    # ---- Embeddings ----
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # ---- Overall similarity table ----
    # One call scores every resume against the job description; column 0 is
    # the only column because there is a single job embedding.
    scores = cosine_similarity(list(resume_embeddings.values()), [job_embedding])[:, 0] * 100
    results = [
        {
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details",
        }
        for name, similarity in zip(resume_embeddings, scores)
    ]
    st.dataframe(pd.DataFrame(results))

    # ---- Per-resume analysis ----
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))

    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        st.write("### Extracted Entities")
        entities = extract_entities(resume_text)
        st.code(entities, language="json")

        st.write("### Skills Matching")
        job_entities = extract_entities(job_description_text)

        resume_skills = _parse_skills(entities)
        job_skills = _parse_skills(job_entities)

        if resume_skills is not None and job_skills is not None:
            # Sorted so the display order is stable between reruns.
            matched_skills = sorted(resume_skills & job_skills)
            st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
        else:
            st.info("Could not find a skills list in the model output for this comparison.")

    # ---- Skill-vs-resume heatmap ----
    st.write("### Similarity Heatmap")
    skills_input = st.text_input("Enter skills for heatmap (comma-separated):")
    # "".split(",") returns [""], which is truthy; filter blanks so an empty
    # text box (or a stray comma) does not trigger an embedding of "".
    skills_keywords = [skill.strip() for skill in skills_input.split(',') if skill.strip()]

    if skills_keywords:
        skill_vectors = [get_embeddings(skill) for skill in skills_keywords]
        # Rows follow skills_keywords, columns follow resume_embeddings.
        heatmap_data = cosine_similarity(skill_vectors, list(resume_embeddings.values()))

        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(
            pd.DataFrame(
                heatmap_data,
                columns=list(resume_embeddings),
                index=skills_keywords,
            ),
            annot=True,
            cmap="YlGnBu",
            ax=ax,
        )
        st.pyplot(fig)
        # Close the figure explicitly; otherwise every rerun leaves one open.
        plt.close(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")