Akshayram1 committed on
Commit fde7f4e · verified · 1 Parent(s): c73f9c3

Create app.py

Files changed (1):
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
+ # Import necessary libraries
+ import streamlit as st
+ import nltk
+ from nltk.tokenize import word_tokenize
+ import PyPDF2
+ import pandas as pd
+ import re
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import spacy
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Download necessary NLTK data
+ nltk.download('punkt')
+
+ # Define regular expressions for pattern matching
+ float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
+ email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'  # '[A-Z|a-z]' would also match a literal '|'
+ float_digit_regex = re.compile(r'^\d{10}$')
+ email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
+
+ # Load Phi-4 model and tokenizer
+ @st.cache_resource
+ def load_model():
+     model_name = "microsoft/Phi-4-multimodal-instruct"  # Hypothetical model name
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModel.from_pretrained(model_name)
+     return tokenizer, model
+
+ tokenizer, model = load_model()
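+
+ # Note (assumption): if a checkpoint with this name exists on the Hub, loading it may
+ # require extra arguments (e.g. trust_remote_code=True) and several GB of memory.
+ # Any Hugging Face encoder exposing last_hidden_state would work with get_embeddings()
+ # below; nothing in the mean-pooling step depends on Phi-4 specifically.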
+
+ # Function to extract text from PDF
+ def extract_text_from_pdf(pdf_file):
+     pdf_reader = PyPDF2.PdfReader(pdf_file)
+     text = ""
+     for page_num in range(len(pdf_reader.pages)):
+         text += pdf_reader.pages[page_num].extract_text()
+     return text
+
+ # Function to generate embeddings using Phi-4
+ def get_embeddings(text):
+     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     with torch.no_grad():
+         outputs = model(**inputs)
+     embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
+     return embeddings.numpy()
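+
+ # The unweighted mean over last_hidden_state is a simple sentence-embedding heuristic;
+ # a common refinement (not done here) is to weight the average by the attention mask
+ # so that padding tokens are excluded from the pooled vector.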
+
+ # Function to calculate similarity between texts
+ def calculate_similarity(text1, text2):
+     emb1 = get_embeddings(text1)
+     emb2 = get_embeddings(text2)
+     return cosine_similarity([emb1], [emb2])[0][0]
+
+ # Function to extract named entities using SpaCy (returns (text, label) pairs, not raw tokens)
+ def tokenize_text(text, nlp_model):
+     doc = nlp_model(text, disable=["tagger", "parser"])
+     tokens = [(ent.text.lower(), ent.label_) for ent in doc.ents]
+     return tokens
+
+ # Function to extract CGPA
+ def extract_cgpa(resume_text):
+     cgpa_pattern = r'\b(?:CGPA|GPA|C\.G\.PA|Cumulative GPA)\s*:?[\s-]([0-9]+(?:\.[0-9]+)?)\b|\b([0-9]+(?:\.[0-9]+)?)\s(?:CGPA|GPA)\b'
+     match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
+     if not match:
+         return None
+     # group(1 or 2) always read group 1; check both alternation groups explicitly
+     return float(match.group(1) or match.group(2))
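+
+ # Example (given the pattern above): extract_cgpa("CGPA: 8.2") returns 8.2 via the first
+ # alternation group, and extract_cgpa("scored 8.2 CGPA") returns 8.2 via the second.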
+
+ # Streamlit Frontend
+ st.markdown("# Resume Matching Tool 📃📃")
+ st.markdown("An application to match resumes with a job description using Phi-4")
+
+ # File Upload
+ resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
+ job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
+
+ if resumes_files and job_descriptions_file:
+     # Load SpaCy model
+     nlp = spacy.load("en_Resume_Matching_Keywords")
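+
+     # Note (assumption): en_Resume_Matching_Keywords is a custom spaCy NER package that
+     # labels entities such as SKILLS. It is not one of spaCy's stock models and must be
+     # installed in the environment beforehand; spacy.load raises OSError if it is missing.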
+
+     # Process documents
+     job_description_text = extract_text_from_pdf(job_descriptions_file)
+     resumes_texts = {file.name: extract_text_from_pdf(file) for file in resumes_files}
+
+     # Generate embeddings
+     job_embedding = get_embeddings(job_description_text)
+     resume_embeddings = {name: get_embeddings(text) for name, text in resumes_texts.items()}
+
+     # Calculate similarities
+     results = []
+     for name, emb in resume_embeddings.items():
+         similarity = cosine_similarity([emb], [job_embedding])[0][0] * 100
+         results.append({
+             "Resume": name,
+             "Similarity Score": f"{similarity:.2f}%",
+             "Details": "View Details"
+         })
+
+     # Show results
+     st.dataframe(pd.DataFrame(results))
+
+     # Detailed analysis
+     st.subheader("Detailed Analysis")
+     selected_resume = st.selectbox("Select Resume", list(resumes_texts.keys()))
+
+     if selected_resume:
+         resume_text = resumes_texts[selected_resume]
+
+         # Entity extraction
+         doc = nlp(resume_text)
+         entities = [(ent.text, ent.label_) for ent in doc.ents]
+
+         # Display entities
+         st.write("### Extracted Entities")
+         entity_df = pd.DataFrame(entities, columns=["Text", "Label"])
+         st.dataframe(entity_df)
+
+         # Skills matching
+         st.write("### Skills Matching")
+         skills = [ent.text for ent in doc.ents if ent.label_ == "SKILLS"]
+         job_skills = [ent.text for ent in nlp(job_description_text).ents if ent.label_ == "SKILLS"]
+         matched_skills = list(set(skills) & set(job_skills))
+         st.write(f"**Matched Skills ({len(matched_skills)}):** {', '.join(matched_skills)}")
+
+         # Visualization
+         st.write("### Similarity Heatmap")
+         skills_input = st.text_input("Enter skills for heatmap (comma-separated):")
+         # .split(',') on an empty string yields [''], so filter out blanks before testing
+         skills_keywords = [s.strip() for s in skills_input.split(',') if s.strip()]
+
+         if skills_keywords:
+             heatmap_data = []
+             for skill in skills_keywords:
+                 skill_emb = get_embeddings(skill)
+                 row = []
+                 for name, emb in resume_embeddings.items():
+                     row.append(cosine_similarity([emb], [skill_emb])[0][0])
+                 heatmap_data.append(row)
+
+             plt.figure(figsize=(12, 8))
+             sns.heatmap(pd.DataFrame(heatmap_data,
+                                      columns=list(resumes_texts.keys()),
+                                      index=skills_keywords),
+                         annot=True, cmap="YlGnBu")
+             st.pyplot(plt)
+ else:
+     st.warning("Please upload both resumes and job description to proceed.")
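
Setup note: judging from the imports, running this app locally needs roughly the following (the exact package list is an assumption, and en_Resume_Matching_Keywords is a custom spaCy model that must be installed separately; it is not on PyPI):

  pip install streamlit nltk PyPDF2 pandas matplotlib seaborn spacy transformers torch scikit-learn
  streamlit run app.py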