Akshayram1 committed (verified)
Commit 8098013 · 1 Parent(s): 4df9995

Update app.py

Files changed (1): app.py (+26 -36)
app.py CHANGED

@@ -7,11 +7,9 @@ import pandas as pd
 import re
 import matplotlib.pyplot as plt
 import seaborn as sns
-import spacy
 from transformers import AutoTokenizer, AutoModel
 import torch
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoModelForCausalLM  # Updated import
 
 # Download necessary NLTK data
 nltk.download('punkt')
@@ -22,16 +20,13 @@ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
 float_digit_regex = re.compile(r'^\d{10}$')
 email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
 
-# Load Phi-4 model and tokenizer
-
-
+# Load Phi-3 model and tokenizer
 @st.cache_resource
 def load_model():
-    model_name = "microsoft/phi-2"  # Verified working alternative
+    model_name = "microsoft/phi-3-mini-4k-instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
     return tokenizer, model
-
 
 tokenizer, model = load_model()
 
@@ -43,9 +38,9 @@ def extract_text_from_pdf(pdf_file):
         text += pdf_reader.pages[page_num].extract_text()
     return text
 
-# Function to generate embeddings using Phi-4
+# Function to generate embeddings using Phi-3
 def get_embeddings(text):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=4096)
     with torch.no_grad():
         outputs = model(**inputs)
         embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
@@ -57,30 +52,22 @@ def calculate_similarity(text1, text2):
     emb2 = get_embeddings(text2)
     return cosine_similarity([emb1], [emb2])[0][0]
 
-# Function to tokenize text using SpaCy
-def tokenize_text(text, nlp_model):
-    doc = nlp_model(text, disable=["tagger", "parser"])
-    tokens = [(token.text.lower(), token.label_) for token in doc.ents]
-    return tokens
-
-# Function to extract CGPA
-def extract_cgpa(resume_text):
-    cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
-    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
-    return float(match.group(1 or 2)) if match else None
+# Function to extract entities using Phi-3
+def extract_entities(text):
+    prompt = f"""Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"""
+    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)
+    outputs = model.generate(**inputs, max_length=500)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 # Streamlit Frontend
 st.markdown("# Resume Matching Tool 📃📃")
-st.markdown("An application to match resumes with a job description using Phi-4")
+st.markdown("An application to match resumes with job descriptions using Phi-3")
 
 # File Upload
 resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
 job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
 
 if resumes_files and job_descriptions_file:
-    # Load SpaCy model
-    nlp = spacy.load("en_Resume_Matching_Keywords")
-
     # Process documents
     job_description_text = extract_text_from_pdf(job_descriptions_file)
     resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}
@@ -109,21 +96,24 @@ if resumes_files and job_descriptions_file:
     if selected_resume:
        resume_text = resumes_texts[selected_resume]
 
-        # Entity extraction
-        doc = nlp(resume_text)
-        entities = [(ent.text, ent.label_) for ent in doc.ents]
-
-        # Display entities
+        # Entity extraction using Phi-3
         st.write("### Extracted Entities")
-        entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
-        st.dataframe(entity_df)
+        entities = extract_entities(resume_text)
+        st.code(entities, language="json")
 
         # Skills matching
         st.write("### Skills Matching")
-        skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
-        job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
-        matched_skills = list(set(skills) & set(job_skills))
-        st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
+        job_entities = extract_entities(job_description_text)
+
+        # Simple text-based matching
+        resume_skills = re.findall(r'"skills": \[(.*?)\]', entities, re.DOTALL)
+        job_skills = re.findall(r'"skills": \[(.*?)\]', job_entities, re.DOTALL)
+
+        if resume_skills and job_skills:
+            resume_skills_list = [s.strip().lower() for s in resume_skills[0].split(',')]
+            job_skills_list = [s.strip().lower() for s in job_skills[0].split(',')]
+            matched_skills = list(set(resume_skills_list) & set(job_skills_list))
+            st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
 
         # Visualization
         st.write("### Similarity Heatmap")
 
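One caveat in the new version: load_model now uses AutoModel, which returns the bare transformer without a language-modeling head, yet extract_entities calls model.generate(...); recent transformers releases refuse to generate from a head-less model. Since get_embeddings only needs hidden states, one way to serve both paths from a single checkpoint is to load the causal-LM class and request hidden states explicitly. A sketch under that assumption (not what the commit does):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/phi-3-mini-4k-instruct"  # same checkpoint as the commit
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The causal-LM class keeps .generate() usable; hidden states stay reachable via a flag
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Mean-pool the last layer, mirroring the app's embedding path
    return outputs.hidden_states[-1].mean(dim=1).squeeze()

def extract_entities(text):
    prompt = f"Extract entities from this text in JSON format with keys: skills, education, experience. Text: {text[:3000]}"
    inputs = tokenizer(prompt, return_tensors="pt")
    # max_new_tokens budgets only generated tokens; max_length=500 would also count
    # the (long) prompt and could stop generation immediately
    outputs = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)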
 
 
 
 
 