Update app.py
app.py CHANGED
@@ -7,11 +7,9 @@ import pandas as pd
 import re
 import matplotlib.pyplot as plt
 import seaborn as sns
-import spacy
 from transformers import AutoTokenizer, AutoModel
 import torch
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoModelForCausalLM  # Updated import
 
 # Download necessary NLTK data
 nltk.download('punkt')
@@ -22,16 +20,13 @@ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
 float_digit_regex = re.compile(r'^\d{10}$')
 email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
 
-# Load Phi-
-
-
+# Load Phi-3 model and tokenizer
 @st.cache_resource
 def load_model():
-    model_name = "microsoft/phi-
+    model_name = "microsoft/phi-3-mini-4k-instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    model =
+    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
     return tokenizer, model
-
 
 tokenizer, model = load_model()
 
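Note on this hunk: `AutoModel.from_pretrained` loads the bare Phi-3 decoder without its language-modeling head, yet a later hunk calls `model.generate(...)` on the same object, which base models generally cannot do. If both embeddings and generation are wanted from one checkpoint, a sketch along these lines may be safer. The model name is copied from the diff; `AutoModelForCausalLM` and `output_hidden_states` are standard transformers APIs, but exact behaviour with `trust_remote_code` checkpoints should be verified:

```python
# Sketch, not part of the commit: load once with the LM head attached so
# that .generate() works, and read hidden states for embeddings.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/phi-3-mini-4k-instruct"  # name as it appears in the diff
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model.eval()

def embed(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
    with torch.no_grad():
        out = model(**inputs, output_hidden_states=True)
    # hidden_states[-1] is the last decoder layer, shape (1, seq_len, dim)
    return out.hidden_states[-1].mean(dim=1).squeeze()
```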
@@ -43,9 +38,9 @@ def extract_text_from_pdf(pdf_file):
         text += pdf_reader.pages[page_num].extract_text()
     return text
 
-# Function to generate embeddings using Phi-
+# Function to generate embeddings using Phi-3
 def get_embeddings(text):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
     with torch.no_grad():
         outputs = model(**inputs)
     embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
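`get_embeddings` mean-pools `last_hidden_state` over every position. That is fine for a single unbatched string, but if texts are ever batched, padding tokens get averaged in. A mask-aware pooling helper (hypothetical, not in the commit) would look like:

```python
import torch

def masked_mean_pool(last_hidden_state, attention_mask):
    # Zero out padded positions, then divide by the count of real tokens
    # so padding does not dilute the average.
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
```

Downstream, the embeddings are handed to scikit-learn's `cosine_similarity`; converting them explicitly with `.numpy()` first avoids relying on implicit tensor-to-array conversion.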
@@ -57,30 +52,22 @@ def calculate_similarity(text1, text2):
     emb2 = get_embeddings(text2)
     return cosine_similarity([emb1], [emb2])[0][0]
 
-# Function to
-def
-
-
-
-
-# Function to extract CGPA
-def extract_cgpa(resume_text):
-    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
-    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
-    return float(match.group(1 or 2)) if match else None
+# Function to extract entities using Phi-3
+def extract_entities(text):
+    prompt = f"""Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"""
+    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
+    outputs = model.generate(**inputs, max_length=500)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # Streamlit Frontend
 st.markdown("# Resume Matching Tool ππ")
-st.markdown("An application to match resumes with
+st.markdown("An application to match resumes with job descriptions using Phi-3")
 
 # File Upload
 resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
 job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
 
 if resumes_files and job_descriptions_file:
-    # Load SpaCy model
-    nlp = spacy.load("en_Resume_Matching_Keywords")
-
     # Process documents
     job_description_text = extract_text_from_pdf(job_descriptions_file)
     resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}
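In the new `extract_entities`, `max_length=500` caps prompt plus completion together; since the prompt embeds up to 3,000 characters of text, it can exceed 500 tokens on its own, and the decoded output also echoes the prompt. A variant using `max_new_tokens` (the standard `generate` parameter for bounding only the completion) and slicing off the prompt, sketched under the assumption that the loaded model supports generation:

```python
def extract_entities_json(text):
    # Hypothetical variant of the commit's extract_entities, illustrative only.
    prompt = ("Extract entities from this text in JSON format with keys: "
              f"skills, education, experience. Text: {text[:3000]}")
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=500)
    # Drop the prompt tokens so only the model's completion is returned.
    completion = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(completion, skip_special_tokens=True)
```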
@@ -109,21 +96,24 @@ if resumes_files and job_descriptions_file:
     if selected_resume:
         resume_text = resumes_texts[selected_resume]
 
-        # Entity extraction
-        doc = nlp(resume_text)
-        entities = [(ent.text, ent.label_) for ent in doc.ents]
-
-        # Display entities
+        # Entity extraction using Phi-3
         st.write("### Extracted Entities")
-
-        st.
+        entities = extract_entities(resume_text)
+        st.code(entities, language="json")
 
         # Skills matching
         st.write("### Skills Matching")
-
-
-
-
+        job_entities = extract_entities(job_description_text)
+
+        # Simple text-based matching
+        resume_skills = re.findall(r'"skills": \[(.*?)\]', entities, re.DOTALL)
+        job_skills = re.findall(r'"skills": \[(.*?)\]', job_entities, re.DOTALL)
+
+        if resume_skills and job_skills:
+            resume_skills_list = [s.strip().lower() for s in resume_skills[0].split(',')]
+            job_skills_list = [s.strip().lower() for s in job_skills[0].split(',')]
+            matched_skills = list(set(resume_skills_list) & set(job_skills_list))
+            st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
 
     # Visualization
     st.write("### Similarity Heatmap")
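The skills comparison pulls `"skills": [...]` out of raw model output with a regex, which only matches if the model reproduces that exact spacing and quoting. Attempting a real JSON parse first, with the commit's regex as a fallback, is more forgiving (hypothetical helper, not in the commit):

```python
import json
import re

def parse_skills(llm_output):
    # Prefer parsing the first {...} block as JSON; fall back to the
    # commit's regex when the output is not valid JSON.
    block = re.search(r'\{.*\}', llm_output, re.DOTALL)
    if block:
        try:
            skills = json.loads(block.group(0)).get("skills", [])
            if isinstance(skills, list):
                return [str(s).strip().lower() for s in skills]
        except json.JSONDecodeError:
            pass
    found = re.findall(r'"skills":\s*\[(.*?)\]', llm_output, re.DOTALL)
    if found:
        return [s.strip(' "\'').lower() for s in found[0].split(',')]
    return []
```

Matching then reduces to `set(parse_skills(entities)) & set(parse_skills(job_entities))`.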