# Spaces: Sleeping  (web-page header residue from the capture; not part of the program)
# Import necessary libraries | |
import streamlit as st | |
import nltk | |
from nltk.tokenize import word_tokenize | |
import PyPDF2 | |
import pandas as pd | |
import re | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import spacy | |
from transformers import AutoTokenizer, AutoModel | |
import torch | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Download necessary NLTK data.
# Runs at import time, i.e. on every execution of this script; fetches the
# 'punkt' resource through NLTK's downloader.
nltk.download('punkt')
# Define regular expressions for pattern matching

# A whole string that is a 1-2 digit number with an optional 1-2 digit
# fractional part (e.g. "9", "9.5", "10.00").
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')

# E-mail address pattern, kept as a plain string (not compiled) so it can be
# passed to any `re` function.  The TLD class is `[A-Za-z]`; the previous
# `[A-Z|a-z]` also accepted a literal '|' character.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# A whole string consisting of exactly ten digits.
# NOTE(review): the name says "float" but the pattern looks like a phone
# number check - confirm intent before renaming.
float_digit_regex = re.compile(r'^\d{10}$')

# Ten consecutive digits that are followed by, or preceded by, exactly one
# arbitrary character (group 1 or group 2 holds the digits respectively).
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
# Load Phi-4 model and tokenizer
@st.cache_resource
def load_model():
    """Load the tokenizer and model used to embed text.

    Streamlit re-executes the whole script on every widget interaction, so
    the loader is wrapped in ``st.cache_resource``: the (expensive)
    ``from_pretrained`` calls run once per server process and later reruns
    reuse the same objects.

    Returns:
        tuple: ``(tokenizer, model)`` as returned by
        ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
    """
    # NOTE(review): some Hub checkpoints only load with
    # trust_remote_code=True - confirm whether this one does before
    # enabling it, since that executes code shipped with the checkpoint.
    model_name = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model
# Module-level tokenizer/model pair shared by the embedding helpers below.
tokenizer, model = load_model()
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (the callers in this
            file pass Streamlit uploaded-file objects).

    Returns:
        str: Page texts joined in page order with no separator.  Pages that
        yield no text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` guards against a page whose extraction yields nothing, which
    # would otherwise break string concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to generate embeddings using Phi-4
def get_embeddings(text):
    """Embed *text* with the module-level ``tokenizer`` / ``model`` pair.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged
    over dimension 1 (the sequence axis in Hugging Face model outputs).

    Args:
        text: Input string to embed.

    Returns:
        numpy.ndarray: The mean-pooled embedding with size-1 dimensions
        squeezed away.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    # Tensor.numpy() only works for CPU tensors in a NumPy-supported dtype;
    # move to CPU and upcast (e.g. from bfloat16/float16) before converting.
    return embeddings.detach().cpu().float().numpy()
# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single entry of the 1x1 matrix produced by
        ``cosine_similarity`` for the two embeddings.
    """
    emb1 = get_embeddings(text1)
    emb2 = get_embeddings(text2)
    # cosine_similarity works on 2-D inputs, hence the one-row wrappers.
    return cosine_similarity([emb1], [emb2])[0][0]
# Function to tokenize text using SpaCy
def tokenize_text(text, nlp_model):
    """Extract ``(lower-cased text, label)`` pairs for the entities in *text*.

    Despite the name, the result is built from ``doc.ents`` (entity spans),
    not from individual tokens.

    Args:
        text: Text to analyse.
        nlp_model: Callable pipeline invoked as
            ``nlp_model(text, disable=["tagger", "parser"])``; its result
            must expose an ``ents`` iterable whose items have ``text`` and
            ``label_`` attributes.

    Returns:
        list[tuple[str, str]]: One ``(entity_text_lowercased, label)`` pair
        per entity, in the order the pipeline reports them.
    """
    doc = nlp_model(text, disable=["tagger", "parser"])
    return [(ent.text.lower(), ent.label_) for ent in doc.ents]
# Function to extract CGPA
# Two accepted layouts: "<label> [:] <number>" (number in group 1) and
# "<number> <label>" (number in group 2).  Compiled once at import time.
_CGPA_REGEX = re.compile(
    r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b',
    re.IGNORECASE,
)


def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in *resume_text*.

    Args:
        resume_text: Resume text to search; ``None`` or an empty string is
            treated as "nothing found".

    Returns:
        float | None: The matched number, or ``None`` when no CGPA/GPA
        mention is present.
    """
    if not resume_text:
        return None
    match = _CGPA_REGEX.search(resume_text)
    if match is None:
        return None
    # Exactly one of the two groups is populated, depending on which layout
    # matched.  (`match.group(1 or 2)` evaluated to `group(1)` every time,
    # so the "<number> <label>" layout ended up calling float(None).)
    return float(match.group(1) or match.group(2))
# Streamlit Frontend
# NOTE(review): the trailing "ππ" in the title looks like mis-encoded emoji;
# it is kept byte-for-byte here - confirm the intended glyphs.
st.markdown("# Resume Matching Tool ππ")
st.markdown("An application to match resumes with a job description using Phi-4")

# File Upload
# Sidebar uploaders: any number of resume PDFs, and a single job-description PDF.
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
# Main flow: runs only once both the resumes and the job description have
# been uploaded; otherwise a warning is shown.
if resumes_files and job_descriptions_file:
    # Load SpaCy model
    nlp = spacy.load("en_Resume_Matching_Keywords")

    # Process documents
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    # Keyed by uploaded file name, so two uploads sharing a name collapse
    # into one entry (the later one wins).
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}

    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # Calculate similarities (cosine similarity scaled to a percentage)
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details"
        })

    # Show results
    st.dataframe(pd.DataFrame(results))

    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))
    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        # Entity extraction
        doc = nlp(resume_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Display entities
        st.write("### Extracted Entities")
        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
        st.dataframe(entity_df)

        # Skills matching: entities labelled "SKILLS" present in both documents
        st.write("### Skills Matching")
        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
        # Sorted so the displayed order is stable between reruns (set
        # iteration order is not).
        matched_skills = sorted(set(skills) & set(job_skills))
        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

        # Visualization
        # NOTE(review): the original indentation was lost; this section is
        # assumed to belong to the selected-resume branch - confirm.
        st.write("### Similarity Heatmap")
        raw_keywords = st.text_input("Enter skills for heatmap (comma-separated):")
        # "".split(',') yields [''], which is truthy; strip and drop blank
        # entries so an empty input box does not trigger a heatmap for "".
        skills_keywords = [kw.strip() for kw in raw_keywords.split(',') if kw.strip()]
        if skills_keywords:
            # One row per skill keyword, one column per resume.
            heatmap_data = []
            for skill in skills_keywords:
                skill_emb = get_embeddings(skill)
                heatmap_data.append([
                    cosine_similarity([emb], [skill_emb])[0][0]
                    for emb in resume_embeddings.values()
                ])

            # Draw on an explicit figure and hand that figure to Streamlit
            # instead of passing the pyplot module.
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(pd.DataFrame(heatmap_data,
                                     columns=list(resume_embeddings),
                                     index=skills_keywords),
                        annot=True, cmap="YlGnBu", ax=ax)
            st.pyplot(fig)
            # Release the figure so reruns do not accumulate open figures.
            plt.close(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")