Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import necessary libraries
|
2 |
+
import streamlit as st
|
3 |
+
import nltk
|
4 |
+
from nltk.tokenize import word_tokenize
|
5 |
+
import PyPDF2
|
6 |
+
import pandas as pd
|
7 |
+
import re
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import seaborn as sns
|
10 |
+
import spacy
|
11 |
+
from transformers import AutoTokenizer, AutoModel
|
12 |
+
import torch
|
13 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
14 |
+
|
15 |
+
# Download necessary NLTK data
|
16 |
+
# Fetch the NLTK 'punkt' resource at import time.
# NOTE(review): presumably needed by the word_tokenize import above, which
# nothing visible in this file calls — confirm it is still required.
nltk.download('punkt')
|
17 |
+
|
18 |
+
# Define regular expressions for pattern matching
|
19 |
+
# Plain number with one or two integer digits and an optional one- or
# two-digit fraction, e.g. "9" or "8.75".
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')

# E-mail address. Inside a character class "|" is a literal character, so the
# top-level-domain class is written [A-Za-z] (not [A-Z|a-z]) to avoid
# accepting a pipe as part of the domain.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# Exactly ten digits and nothing else (despite the "float" in the name).
# NOTE(review): presumably a phone number check — confirm against callers.
float_digit_regex = re.compile(r'^\d{10}$')

# Ten digits followed by, or preceded by, one arbitrary character; the digits
# land in group 1 or group 2 respectively.
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
|
23 |
+
|
24 |
+
# Load Phi-4 model and tokenizer
|
25 |
+
@st.cache_resource
def load_model():
    """Load the tokenizer and model for the configured checkpoint.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be shared instead of being rebuilt on every call.

    Returns:
        tuple: ``(tokenizer, model)`` as produced by
        ``AutoTokenizer.from_pretrained`` and ``AutoModel.from_pretrained``.
    """
    checkpoint = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )


# Module-level handles used by get_embeddings().
tokenizer, model = load_model()
|
33 |
+
|
34 |
+
# Function to extract text from PDF
|
35 |
+
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the app passes the
            objects returned by Streamlit's file uploader.

    Returns:
        str: The page texts joined in page order with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # A page with no extractable text contributes an empty string, so a
    # None/empty result from extract_text() cannot break the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
|
41 |
+
|
42 |
+
# Function to generate embeddings using Phi-4
|
43 |
+
def get_embeddings(text):
    """Embed *text* with the module-level tokenizer and model.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the last hidden state is averaged
    over dimension 1 and squeezed.

    Args:
        text: Text to embed.

    Returns:
        numpy.ndarray: The pooled embedding.
        NOTE(review): presumably 1-D for a single string — confirm.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze()
    return pooled.numpy()
|
49 |
+
|
50 |
+
# Function to calculate similarity between texts
|
51 |
+
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    first_vec, second_vec = get_embeddings(text1), get_embeddings(text2)
    score_matrix = cosine_similarity([first_vec], [second_vec])
    return score_matrix[0][0]
|
55 |
+
|
56 |
+
# Function to tokenize text using SpaCy
|
57 |
+
def tokenize_text(text, nlp_model):
    """Run *nlp_model* over *text* and list its entities.

    The pipeline is invoked with the "tagger" and "parser" components
    disabled.

    Args:
        text: Text to analyse.
        nlp_model: Callable pipeline accepting ``(text, disable=[...])`` and
            returning an object with an ``ents`` iterable.

    Returns:
        list[tuple[str, str]]: ``(lower-cased entity text, entity label)``
        pairs in the order the entities appear in ``ents``.
    """
    parsed = nlp_model(text, disable=["tagger", "parser"])
    return [(entity.text.lower(), entity.label_) for entity in parsed.ents]
|
61 |
+
|
62 |
+
# Function to extract CGPA
|
63 |
+
def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in *resume_text*.

    Two layouts are recognised, case-insensitively: a label followed by the
    number ("CGPA: 8.5", "GPA-3.7") and a number followed by the label
    ("8.5 CGPA").

    Args:
        resume_text: Plain text of a resume; may be empty or None.

    Returns:
        float | None: The matched value, or None when nothing matches.
    """
    if not resume_text:
        return None

    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match is None:
        return None

    # Only one alternative of the pattern captures: group 1 for "label value",
    # group 2 for "value label". Pick whichever one is populated.
    return float(match.group(1) or match.group(2))
|
67 |
+
|
68 |
+
# Streamlit Frontend
|
69 |
+
st.markdown("# Resume Matching Tool ππ")
st.markdown("An application to match resumes with a job description using Phi-4")

# File Upload
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])

if resumes_files and job_descriptions_file:
    # Load SpaCy model
    nlp = spacy.load("en_Resume_Matching_Keywords")

    # Process documents
    job_description_text = extract_text_from_pdf(job_descriptions_file)
    resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}

    # Generate embeddings
    job_embedding = get_embeddings(job_description_text)
    resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}

    # Calculate similarities (cosine similarity expressed as a percentage)
    results = []
    for name, emb in resume_embeddings.items():
        similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
        results.append({
            "Resume": name,
            "Similarity Score": f"{similarity:.2f}%",
            "Details": "View Details",
        })

    # Show results
    st.dataframe(pd.DataFrame(results))

    # Detailed analysis
    st.subheader("Detailed Analysis")
    selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))

    if selected_resume:
        resume_text = resumes_texts[selected_resume]

        # Entity extraction
        doc = nlp(resume_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Display entities
        st.write("### Extracted Entities")
        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
        st.dataframe(entity_df)

        # Skills matching
        st.write("### Skills Matching")
        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
        # Sorted so the displayed order is stable; set iteration order is not.
        matched_skills = sorted(set(skills) & set(job_skills))
        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")

    # Visualization
    st.write("### Similarity Heatmap")
    raw_skills = st.text_input("Enter skills for heatmap (comma-separated):")
    # "".split(',') yields [''], which is truthy; drop blank entries so an
    # empty input box draws nothing instead of embedding an empty string.
    skills_keywords = [skill.strip() for skill in raw_skills.split(',') if skill.strip()]

    if skills_keywords:
        # One row per skill, one column per resume.
        heatmap_data = []
        for skill in skills_keywords:
            skill_emb = get_embeddings(skill)
            heatmap_data.append([
                cosine_similarity([emb], [skill_emb])[0][0]
                for emb in resume_embeddings.values()
            ])

        # Draw on an explicit figure rather than pyplot's global state, and
        # close it afterwards so figures do not pile up across reruns.
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(
            pd.DataFrame(
                heatmap_data,
                columns=list(resumes_texts.keys()),
                index=skills_keywords,
            ),
            annot=True,
            cmap="YlGnBu",
            ax=ax,
        )
        st.pyplot(fig)
        plt.close(fig)
else:
    st.warning("Please upload both resumes and job description to proceed.")
|