Spaces:
Sleeping
Sleeping
File size: 5,413 Bytes
fde7f4e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# Import necessary libraries
import streamlit as st
import nltk
from nltk.tokenize import word_tokenize
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Download necessary NLTK data: request the 'punkt' resource each time this
# module is executed.
# NOTE(review): presumably required by the `word_tokenize` import above, for
# which no call is visible in this file — confirm before keeping or removing.
nltk.download('punkt')
# Regular expressions for pattern matching.

# Whole-string number with 1-2 integer digits and an optional fraction of
# 1-2 digits, e.g. "8" or "9.25".
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')

# E-mail address, kept as an uncompiled pattern string. The top-level-domain
# class is [A-Za-z]: a "|" inside a character class is a literal pipe, not an
# alternation, so it is deliberately left out.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Whole string made of exactly ten digits.
float_digit_regex = re.compile(r'^\d{10}$')

# Ten digits followed by any one character, or any one character followed by
# ten digits; the digits land in group 1 or group 2 respectively.
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
# Load Phi-4 model and tokenizer
@st.cache_resource
def load_model():
    """Load the tokenizer and model used for embeddings.

    Both objects are created with ``from_pretrained`` for the same
    checkpoint name. The function is wrapped in ``st.cache_resource``.

    Returns:
        tuple: ``(tokenizer, model)`` in that order.
    """
    checkpoint = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_model = AutoModel.from_pretrained(checkpoint)
    return loaded_tokenizer, loaded_model


# Module-level handles read by get_embeddings().
tokenizer, model = load_model()
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the callers in this
            file pass Streamlit uploaded-file objects.

    Returns:
        str: The page texts joined with no separator ("" for a PDF with no
        pages or no extractable text).
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # A page without a text layer may yield nothing; treat that as "" so a
    # single such page cannot break the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to generate embeddings using Phi-4
def get_embeddings(text):
    """Embed *text* by averaging the model's last hidden states.

    The text is tokenized with padding and truncation (at most 512 tokens),
    passed through the module-level ``model`` without gradient tracking, and
    the last hidden state is averaged over dimension 1.

    Args:
        text: Input handed to the module-level ``tokenizer``.

    Returns:
        numpy.ndarray: The averaged, squeezed hidden state.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()
# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text, embedded with ``get_embeddings``.
        text2: Second text, embedded with ``get_embeddings``.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    first_vector = get_embeddings(text1)
    second_vector = get_embeddings(text2)
    similarity_matrix = cosine_similarity([first_vector], [second_vector])
    return similarity_matrix[0][0]
# Function to tokenize text using SpaCy
def tokenize_text(text, nlp_model):
    """Return ``(lower-cased text, label)`` pairs for the entities in *text*.

    Args:
        text: The text to analyse.
        nlp_model: Callable invoked as
            ``nlp_model(text, disable=["tagger", "parser"])``; its result
            must expose an ``ents`` iterable whose items have ``text`` and
            ``label_`` attributes.

    Returns:
        list[tuple]: One pair per entity, in the order ``ents`` yields them.
    """
    parsed = nlp_model(text, disable=["tagger", "parser"])
    pairs = []
    for entity in parsed.ents:
        pairs.append((entity.text.lower(), entity.label_))
    return pairs
# Function to extract CGPA
def extract_cgpa(resume_text):
    """Extract the first CGPA/GPA figure mentioned in a resume.

    Two layouts are recognised, case-insensitively:

    * label first, e.g. ``"CGPA: 8.5"``, ``"GPA 3.9"``, ``"GPA-3.7"``
    * value first, e.g. ``"8.5 CGPA"``, ``"3.9 GPA"``

    Args:
        resume_text: Plain text of the resume.

    Returns:
        float | None: The matched number, or ``None`` when the text is empty
        or contains no recognisable CGPA/GPA mention.
    """
    if not resume_text:
        return None

    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match is None:
        return None

    # Only one capture group participates, depending on which layout matched:
    # group 1 for label-first, group 2 for value-first. The non-participating
    # group is None, so fall through to whichever one actually captured.
    return float(match.group(1) or match.group(2))
# Streamlit Frontend
# Top-level script: renders the page, and once both uploads are present,
# scores every resume against the job description and shows per-resume details.
st.markdown("# Resume Matching Tool ππ")
st.markdown("An application to match resumes with a job description using Phi-4")

# File Upload
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # Load SpaCy model
    nlp = spacy.load("en_Resume_Matching_Keywords")

    # Process documents (resumes are keyed by uploaded file name)
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}

    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # Calculate similarities (cosine similarity expressed as a percentage)
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details",
        })

    # Show results
    st.dataframe(pd.DataFrame(results))

    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))

    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        # Entity extraction
        doc = nlp(resume_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Display entities
        st.write("### Extracted Entities")
        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
        st.dataframe(entity_df)

        # Skills matching: entities labelled "SKILLS" present in both documents
        st.write("### Skills Matching")
        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
        matched_skills = list(set(skills) & set(job_skills))
        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

    # Visualization
    st.write("### Similarity Heatmap")
    raw_skills = st.text_input("Enter skills for heatmap (comma-separated):")
    # "".split(",") yields [""], which is truthy, so blank entries must be
    # dropped explicitly; otherwise an empty box would embed an empty string
    # and draw a one-row heatmap with a blank label.
    skills_keywords = [skill.strip() for skill in raw_skills.split(',') if skill.strip()]

    if skills_keywords:
        # One row per skill, one column per resume.
        heatmap_data = []
        for skill in skills_keywords:
            skill_emb = get_embeddings(skill)
            heatmap_data.append([
                cosine_similarity([emb], [skill_emb])[0][0]
                for emb in resume_embeddings.values()
            ])

        # Draw on an explicit Figure and hand that Figure to Streamlit rather
        # than the pyplot module, then close it so figures do not pile up
        # across script reruns.
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(
            pd.DataFrame(
                heatmap_data,
                columns=list(resumes_texts.keys()),
                index=skills_keywords,
            ),
            annot=True,
            cmap="YlGnBu",
            ax=ax,
        )
        st.pyplot(fig)
        plt.close(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")