# app.py — "Create app.py" (commit fde7f4e, verified), uploaded by Akshayram1; 5.41 kB.
# Import necessary libraries
import streamlit as st
import nltk
from nltk.tokenize import word_tokenize
import PyPDF2
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
# Download necessary NLTK data: the 'punkt' resource is requested every time this module is run.
# NOTE(review): presumably required by the imported `word_tokenize`, which the visible code never
# calls — confirm before removing or moving this download.
nltk.download('punkt')
# Define regular expressions for pattern matching

# A whole string made of 1-2 digits, optionally followed by "." and 1-2 more digits (e.g. "9", "9.75").
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')

# An e-mail address anywhere in a string. The TLD class is [A-Za-z]; the previous
# [A-Z|a-z] also accepted a literal "|" because "|" is not alternation inside [...].
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# A whole string made of exactly ten digits.
float_digit_regex = re.compile(r'^\d{10}$')

# Ten consecutive digits followed by any one character, or any one character followed by ten digits.
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
# Load Phi-4 model and tokenizer
@st.cache_resource
def load_model():
    """Return the ``(tokenizer, model)`` pair for the configured checkpoint.

    Decorated with ``st.cache_resource``; both objects are loaded with the
    Hugging Face ``from_pretrained`` constructors, tokenizer first.
    """
    checkpoint = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )


# Module-level handles used by get_embeddings().
tokenizer, model = load_model()
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, concatenated in page order.

    *pdf_file* is handed directly to ``PyPDF2.PdfReader``; no separator is
    inserted between the text of consecutive pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    return "".join(page.extract_text() for page in reader.pages)
# Function to generate embeddings using Phi-4
def get_embeddings(text):
    """Embed *text* with the module-level ``tokenizer`` and ``model``.

    The input is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged over
    dim 1 and squeezed before being returned as a NumPy array.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # presumably dim 1 is the token axis of a (batch, tokens, hidden) tensor — TODO confirm
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()
# Function to calculate similarity between texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded with ``get_embeddings`` (``text1`` first); the single
    entry of the resulting 1x1 similarity matrix is returned.
    """
    first_vector = get_embeddings(text1)
    second_vector = get_embeddings(text2)
    similarity_matrix = cosine_similarity([first_vector], [second_vector])
    return similarity_matrix[0][0]
# Function to tokenize text using SpaCy
def tokenize_text(text, nlp_model):
    """Return ``(lower-cased entity text, entity label)`` pairs found in *text*.

    *nlp_model* is called with the "tagger" and "parser" components disabled;
    the pairs follow the order of ``doc.ents``.
    """
    parsed = nlp_model(text, disable=["tagger", "parser"])
    pairs = []
    for entity in parsed.ents:
        pairs.append((entity.text.lower(), entity.label_))
    return pairs
# Function to extract CGPA
def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in *resume_text*, or ``None``.

    Matching is case-insensitive and recognises two layouts:

    * label first, e.g. ``"CGPA: 8.5"``, ``"GPA 3.7"``, ``"GPA-3.7"``
    * value first, e.g. ``"8.5 CGPA"``

    Returns:
        The matched number as a ``float``, or ``None`` when neither layout
        occurs in the text.
    """
    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match is None:
        return None
    # Only one of the two capture groups participates in a match: group 1 for
    # the label-first layout, group 2 for the value-first layout. The previous
    # `match.group(1 or 2)` always read group 1 and crashed on the latter.
    value = match.group(1) or match.group(2)
    return float(value)
# Streamlit Frontend
# The title emoji was mojibake (UTF-8 bytes decoded with the wrong codec); restored to the intended glyph.
st.markdown("# Resume Matching Tool 📃📃")
st.markdown("An application to match resumes with a job description using Phi-4")


@st.cache_resource
def _load_spacy_model():
    """Load the resume-keyword spaCy pipeline once instead of on every Streamlit rerun."""
    return spacy.load("en_Resume_Matching_Keywords")


# File Upload
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # Load SpaCy model
    nlp = _load_spacy_model()

    # Process documents
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}

    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # Calculate similarities (cosine similarity expressed as a percentage)
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details",
        })

    # Show results
    st.dataframe(pd.DataFrame(results))

    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))
    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        # Entity extraction
        doc = nlp(resume_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Display entities
        st.write("### Extracted Entities")
        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
        st.dataframe(entity_df)

        # Skills matching: entities labelled "SKILLS" that occur in both documents
        st.write("### Skills Matching")
        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
        matched_skills = list(set(skills) & set(job_skills))
        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

    # Visualization
    st.write("### Similarity Heatmap")
    raw_skills = st.text_input("Enter skills for heatmap (comma-separated):")
    # "".split(",") is [""], which is truthy: strip each entry and drop blanks so
    # that an empty text box draws nothing and no empty string is ever embedded.
    skills_keywords = [skill.strip() for skill in raw_skills.split(',') if skill.strip()]
    if skills_keywords:
        # One row per skill, one column per resume.
        heatmap_data = []
        for skill in skills_keywords:
            skill_emb = get_embeddings(skill)
            heatmap_data.append([
                cosine_similarity([emb], [skill_emb])[0][0]
                for emb in resume_embeddings.values()
            ])

        # Draw on an explicit figure (rather than handing the pyplot module to
        # Streamlit) and close it afterwards so figures do not pile up across reruns.
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(
            pd.DataFrame(
                heatmap_data,
                columns=list(resume_embeddings.keys()),
                index=skills_keywords,
            ),
            annot=True,
            cmap="YlGnBu",
            ax=ax,
        )
        st.pyplot(fig)
        plt.close(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")