# Import necessary libraries
import streamlit as st
import nltk
from nltk.tokenize import word_tokenize
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Download necessary NLTK data
nltk.download('punkt')
# Define regular expressions for pattern matching
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
float_digit_regex = re.compile(r'^\d{10}$')  # matches a bare 10-digit phone number
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')  # captures a 10-digit phone number embedded in longer text
# Load the Phi-4-mini model and tokenizer, cached across Streamlit reruns
@st.cache_resource
def load_model():
    model_name = "microsoft/Phi-4-mini-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # AutoModelForCausalLM (rather than AutoModel) is needed so model.generate() works in extract_entities()
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer, model
tokenizer, model = load_model()
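# Note: padding=True is a no-op when a single string is tokenized, as in get_embeddings()
# below. If texts are ever batched, a pad token must be set, and some causal-LM tokenizers
# do not define one; a common workaround (sketch) is:
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token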
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may yield an empty string (or None in older PyPDF2 versions)
        # for image-only pages, so fall back to ""
        text += page.extract_text() or ""
    return text
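# Illustrative usage (assumes a local file "resume.pdf"; any binary file-like object,
# including Streamlit's UploadedFile, works):
#     with open("resume.pdf", "rb") as f:
#         print(extract_text_from_pdf(f)[:200])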
# Function to generate embeddings by mean-pooling the model's last hidden layer
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    with torch.no_grad():
        # output_hidden_states=True is required because AutoModelForCausalLM returns
        # logits by default, not hidden states
        outputs = model(**inputs, output_hidden_states=True)
    # Average token embeddings across the sequence dimension to get one vector per text
    embeddings = outputs.hidden_states[-1].mean(dim=1).squeeze()
    return embeddings.numpy()
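# The returned vector's length is the model's hidden size (model.config.hidden_size).
# Mean pooling over tokens is a simple sketch of a sentence embedding; a dedicated
# embedding model would typically produce sharper similarity scores.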
# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    emb1 = get_embeddings(text1)
    emb2 = get_embeddings(text2)
    return cosine_similarity([emb1], [emb2])[0][0]
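# Illustrative usage (the exact value depends on the model):
#     calculate_similarity("Python backend developer", "software engineer")
# returns a cosine score, where values closer to 1.0 indicate more similar texts.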
# Function to extract entities by prompting the model for JSON output
def extract_entities(text):
    prompt = f"""Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"""
    inputs = tokenizer(prompt, return_tensors="pt")
    # max_new_tokens (not max_length) so the budget applies to generated tokens only;
    # max_length counts the prompt too, and a 3000-character prompt alone exceeds 500 tokens
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
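# Note: decoding the full output sequence includes the prompt as well as the generated
# JSON, because causal LMs return input tokens plus new tokens. The regex parsing below
# searches the whole string, so it still finds the "skills" array, but slicing off the
# prompt tokens first (outputs[0][inputs["input_ids"].shape[1]:]) would be cleaner.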
# Streamlit Frontend
st.markdown("# Resume Matching Tool ππ")
st.markdown("An application to match resumes with job descriptions using Phi-3")
# File Upload
resumes_files = st.sidebar.file_uploader("Upload Resume PDFs", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
if resumes_files and job_descriptions_file:
    # Process documents
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}

    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # Calculate similarities
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details"
        })

    # Show results
    st.dataframe(pd.DataFrame(results))
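    # Optional sorting sketch (assumption: highest match first is wanted); works because
    # the scores are formatted with a fixed "%" suffix:
    #     results.sort(key=lambda r: float(r["Similarity Score"].rstrip('%')), reverse=True)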
    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))
    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        # Entity extraction using the model
        st.write("### Extracted Entities")
        entities = extract_entities(resume_text)
        st.code(entities, language="json")

        # Skills matching
        st.write("### Skills Matching")
        job_entities = extract_entities(job_description_text)

        # Simple text-based matching on the "skills" arrays pulled out of each reply
        resume_skills = re.findall(r'"skills": \[(.*?)\]', entities, re.DOTALL)
        job_skills = re.findall(r'"skills": \[(.*?)\]', job_entities, re.DOTALL)
        if resume_skills and job_skills:
            resume_skills_list = [s.strip().lower() for s in resume_skills[0].split(',')]
            job_skills_list = [s.strip().lower() for s in job_skills[0].split(',')]
            matched_skills = list(set(resume_skills_list) & set(job_skills_list))
            st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
    # Visualization
    st.write("### Similarity Heatmap")
    skills_input = st.text_input("Enter skills for heatmap (comma-separated):")
    # Filter out empty entries; "".split(',') yields [''], which would otherwise pass the check below
    skills_keywords = [s.strip() for s in skills_input.split(',') if s.strip()]
    if skills_keywords:
        heatmap_data = []
        for skill in skills_keywords:
            skill_emb = get_embeddings(skill)
            row = []
            for name, emb in resume_embeddings.items():
                row.append(cosine_similarity([emb], [skill_emb])[0][0])
            heatmap_data.append(row)
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(pd.DataFrame(heatmap_data,
                                 columns=list(resumes_texts.keys()),
                                 index=skills_keywords),
                    annot=True, cmap="YlGnBu", ax=ax)
        # Pass the figure explicitly; st.pyplot() with the global pyplot state is deprecated
        st.pyplot(fig)
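        # The heatmap has one row per entered skill and one column per uploaded resume;
        # each cell is the cosine similarity between that skill's embedding and that
        # resume's embedding.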
else:
    st.warning("Please upload both resumes and a job description to proceed.")