gauri-sharan committed on
Commit
73c3334
·
verified ·
1 Parent(s): 4973551

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -146
app.py CHANGED
@@ -1,174 +1,199 @@
 
1
  import streamlit as st
2
  import spacy
 
3
  import pandas as pd
4
- import numpy as np
5
- from sklearn.pipeline import Pipeline
6
- from sklearn.compose import ColumnTransformer
7
- from sklearn.ensemble import GradientBoostingRegressor
8
- from sklearn.preprocessing import StandardScaler
9
  from sentence_transformers import SentenceTransformer
10
- from pyhealth.metrics import binary_metrics
11
- import mlflow
12
- import logging
13
- from system_monitor import SystemMonitor # Custom AIOPS module
14
- import torch
15
- from transformers import pipeline
16
 
17
class AdvancedResumeProcessor:
    """Extract entities from resumes and score them against a job description.

    Bundles a spaCy pipeline, a sentence-embedding model, a system monitor,
    an optional text-generation pipeline and an MLflow run.

    NOTE(review): ``main`` calls ``processor.extract_text(...)``, but no such
    method is defined in the visible source — confirm where it should live.
    """

    def __init__(self):
        self.nlp = spacy.load("en_core_web_trf")
        self.sentence_model = SentenceTransformer('all-mpnet-base-v2')
        self.system_monitor = SystemMonitor()
        self.logger = logging.getLogger('mlops')
        # The generator is only built when CUDA is available; otherwise
        # ``self.llm`` is None and callers have to check for that.
        self.llm = pipeline('text-generation', model='gpt2-xl') if torch.cuda.is_available() else None

        # MLOps setup
        mlflow.set_tracking_uri("http://localhost:5000")
        self.experiment = mlflow.start_run()

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of two 1-D vectors (0.0 if either is all zeros)."""
        a = np.asarray(a, dtype=float)
        b = np.asarray(b, dtype=float)
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def _extract_entities(self, text):
        """Return the SKILL / DEGREE / EXPERIENCE entity texts found in *text*.

        NOTE(review): these labels are presumably provided by a custom NER
        component; confirm the loaded pipeline actually emits them.
        """
        doc = self.nlp(text)
        return {
            'skills': [ent.text for ent in doc.ents if ent.label_ == 'SKILL'],
            'education': [ent.text for ent in doc.ents if ent.label_ == 'DEGREE'],
            'experience': [ent.text for ent in doc.ents if ent.label_ == 'EXPERIENCE']
        }

    def _generate_features(self, jd_entities, resume_text, jd_text=None):
        """Build the feature dict comparing one resume with the job description.

        Args:
            jd_entities: entity dict for the job description, as returned by
                ``_extract_entities``.
            resume_text: raw resume text.
            jd_text: raw job-description text to embed. When omitted, the JD
                entity strings are joined and embedded instead, so existing
                two-argument callers keep working.

        Returns:
            dict with ``semantic_similarity`` (float), ``skill_match`` (count
            of shared skills) and ``education_match`` (0 or 1).
        """
        resume_entities = self._extract_entities(resume_text)

        # Semantic similarity. The job-description side must not be the resume
        # itself, otherwise the similarity is trivially 1.0.
        if jd_text is None:
            jd_text = ' '.join(
                term
                for key in ('skills', 'education', 'experience')
                for term in jd_entities.get(key, ())
            )
        jd_embed, resume_embed = self.sentence_model.encode([jd_text, resume_text])
        semantic_sim = self._cosine(jd_embed, resume_embed)

        # Entity matching scores
        skill_match = len(set(jd_entities['skills']) & set(resume_entities['skills']))
        resume_degrees = set(resume_entities['education'])

        return {
            'semantic_similarity': semantic_sim,
            'skill_match': skill_match,
            'education_match': int(any(deg in resume_degrees for deg in jd_entities['education']))
        }

    def train_model(self, X, y):
        """Fit the regression pipeline on (X, y), log it to MLflow and return it."""
        # __init__ leaves a run active, so this one has to be opened as a
        # nested run.
        with mlflow.start_run(nested=True):
            # NOTE(review): SentenceTransformer is used here as a pipeline
            # step; confirm it satisfies scikit-learn's fit/transform
            # contract before relying on this path.
            preprocessor = ColumnTransformer([
                ('text', Pipeline([
                    ('embed', SentenceTransformer('all-mpnet-base-v2')),
                    ('scaler', StandardScaler())
                ]), 'resume_text')
            ])

            model = Pipeline([
                ('preproc', preprocessor),
                ('regressor', GradientBoostingRegressor())
            ])

            model.fit(X, y)
            mlflow.sklearn.log_model(model, "model")
            return model
74
 
75
class MLOpsDashboard:
    """In-memory store of metric histories, one list per metric name."""

    def __init__(self):
        self.metrics = {
            'model_performance': [],
            'system_health': [],
            'data_quality': []
        }

    def update_metrics(self, new_metrics):
        """Append each value in *new_metrics* to the history of its metric.

        Metric names that were not pre-seeded in ``__init__`` get a fresh
        history instead of raising ``KeyError``.
        """
        for name, value in new_metrics.items():
            self.metrics.setdefault(name, []).append(value)
 
86
 
def main():
    """Streamlit entry point: upload a job description and resumes, rank them.

    NOTE(review): ``processor.extract_text``, ``create_shap_plot`` and
    ``generate_report`` are called below but are not defined in the visible
    source — confirm they exist before shipping this page.
    """
    st.set_page_config(page_title="Enterprise Resume Ranker", layout="wide")
    st.title("🚀 Next-Gen Resume Ranking System with AIOPs/MLOps")

    # Streamlit re-executes this function on every interaction. Keeping the
    # processor and dashboard in session state avoids reloading the models
    # (and opening another MLflow run) each time, and keeps metric history.
    if 'processor' not in st.session_state:
        st.session_state.processor = AdvancedResumeProcessor()
    if 'dashboard' not in st.session_state:
        st.session_state.dashboard = MLOpsDashboard()
    processor = st.session_state.processor
    dashboard = st.session_state.dashboard

    with st.sidebar:
        st.header("AIOPs Dashboard")
        processor.system_monitor.display_metrics()
        st.metric("Current Load", f"{processor.system_monitor.cpu_usage}% CPU")

        st.header("MLOps Controls")
        retrain = st.button("Retrain Production Model")
        if retrain:
            with st.spinner("Retraining model..."):
                # Add retraining logic here
                st.success("Model updated in production!")

    main_col1, main_col2 = st.columns([3, 2])

    # Stays None until a ranking has been produced; the right-hand column
    # checks it instead of probing locals().
    df = None

    with main_col1:
        st.header("Upload Files")
        jd_file = st.file_uploader("Job Description (TXT/PDF)", type=["txt", "pdf"])
        resume_files = st.file_uploader("Resumes (PDF/TXT)",
                                        type=["pdf", "txt"],
                                        accept_multiple_files=True)

        if jd_file and resume_files:
            try:
                # Process job description
                jd_text = processor.extract_text(jd_file)
                jd_entities = processor._extract_entities(jd_text)

                # Process resumes and generate features
                results = []
                for file in resume_files:
                    resume_text = processor.extract_text(file)
                    features = processor._generate_features(jd_entities, resume_text)

                    # Generate LLM feedback (the generator may be absent)
                    llm_feedback = processor.llm(
                        f"Compare this resume to the job description: {jd_text[:1000]}... RESUME: {resume_text[:1000]}"
                    )[0]['generated_text'] if processor.llm else "LLM unavailable"

                    results.append({
                        "Filename": file.name,
                        **features,
                        "LLM Feedback": llm_feedback[:200] + "..."
                    })

                # Display results
                df = pd.DataFrame(results).sort_values("semantic_similarity", ascending=False)
                st.subheader("Ranking Results with Explainability")
                st.dataframe(
                    df,
                    column_config={
                        "semantic_similarity": "Semantic Match",
                        "skill_match": "Skill Matches",
                        "education_match": "Education Match"
                    },
                    use_container_width=True
                )

                # MLOps logging
                dashboard.update_metrics({
                    'model_performance': df['semantic_similarity'].mean(),
                    'data_quality': len(resume_files)
                })

            except Exception as e:
                # Top-level UI boundary: log with traceback, show the message.
                processor.logger.exception("Processing error: %s", e)
                st.error(f"System error: {str(e)}")

    with main_col2:
        st.header("Model Explainability")
        if df is not None:
            st.plotly_chart(create_shap_plot(df))  # Implement SHAP visualization
            st.download_button("Export Evaluation Report",
                               generate_report(df),
                               file_name="ranking_report.pdf")

        st.header("LLM Feedback Analysis")
        if df is not None:
            st.table(df[["Filename", "LLM Feedback"]].set_index("Filename"))


if __name__ == "__main__":
    main()
 
1
+ import os
2
  import streamlit as st
3
  import spacy
4
+ import PyPDF2
5
  import pandas as pd
6
+ import time
7
+ from datetime import datetime
8
+ import openai
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
  from sentence_transformers import SentenceTransformer
12
+ from collections import defaultdict
 
 
 
 
 
13
 
14
class ResumeProcessor:
    """Score resumes against a job description.

    Combines three signals per resume: TF-IDF cosine similarity, sentence
    embedding cosine similarity, and named-entity overlap.
    """

    # Entity labels compared between the job description and each resume.
    # NOTE(review): 'EDU' and 'SKILL' presumably need a custom NER component;
    # confirm that the loaded spaCy model emits them.
    ENTITY_LABELS = frozenset({'ORG', 'PERSON', 'GPE', 'EDU', 'SKILL'})

    def __init__(self):
        self.nlp = spacy.load("en_core_web_lg")
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    def extract_text_from_pdf(self, file):
        """Return the text of every page of the PDF *file*, space-joined."""
        reader = PyPDF2.PdfReader(file)
        # `or ''` keeps the join safe should a page yield no text.
        return ' '.join(page.extract_text() or '' for page in reader.pages)

    def preprocess_text(self, text):
        """Return *text* as lower-cased lemmas without stop words or punctuation."""
        doc = self.nlp(text)
        tokens = [token.lemma_.lower() for token in doc
                  if not token.is_stop and not token.is_punct]
        return ' '.join(tokens)

    def extract_entities(self, text):
        """Return ``{label: set of lower-cased entity texts}`` for tracked labels."""
        doc = self.nlp(text)
        entities = defaultdict(set)
        for ent in doc.ents:
            if ent.label_ in self.ENTITY_LABELS:
                entities[ent.label_].add(ent.text.lower())
        return entities

    @staticmethod
    def _entity_overlap_score(jd_entities, resume_entities):
        """Return the fraction (0.0-1.0) of JD entities also present in the resume.

        Dividing by the number of JD entities, rather than by the number of
        labels, keeps the score bounded by 1 so the combined score stays
        within the 0..1 range the UI expects.
        """
        total = sum(len(values) for values in jd_entities.values())
        if not total:
            return 0.0
        matched = sum(
            len(values & resume_entities.get(label, set()))
            for label, values in jd_entities.items()
        )
        return matched / total

    def calculate_similarity(self, jd_text, resumes):
        """Score every resume in *resumes* against *jd_text*.

        Args:
            jd_text: raw job-description text.
            resumes: list of raw resume texts.

        Returns:
            ``(combined_scores, tfidf_matrix, jd_entities)`` where
            ``combined_scores`` has one value per resume (the mean of the
            TF-IDF, embedding and entity scores), ``tfidf_matrix`` has the
            job description in row 0 followed by one row per resume, and
            ``jd_entities`` is the result of ``extract_entities(jd_text)``.
        """
        processed_jd = self.preprocess_text(jd_text)
        processed_resumes = [self.preprocess_text(resume) for resume in resumes]

        # Lexical similarity: the vectorizer is refit on every call so the
        # vocabulary matches the current batch.
        tfidf_matrix = self.vectorizer.fit_transform([processed_jd] + processed_resumes)
        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

        # Semantic similarity from sentence embeddings.
        jd_embedding = self.sentence_model.encode([processed_jd])
        resume_embeddings = self.sentence_model.encode(processed_resumes)
        semantic_scores = cosine_similarity(jd_embedding, resume_embeddings)[0]

        # Entity overlap uses the raw text, not the lemmatised one.
        jd_entities = self.extract_entities(jd_text)
        entity_scores = [
            self._entity_overlap_score(jd_entities, self.extract_entities(resume))
            for resume in resumes
        ]

        combined_scores = (tfidf_scores + semantic_scores + entity_scores) / 3
        return combined_scores, tfidf_matrix, jd_entities
59
 
60
def get_top_terms(vector, feature_names, top_n=10):
    """Return the names of the highest-weighted terms in a sparse row vector.

    Args:
        vector: sparse row exposing ``nnz``, ``indices`` and ``data``
            (e.g. one row of a TF-IDF matrix).
        feature_names: sequence mapping column index to term.
        top_n: maximum number of terms to return; zero or a negative value
            yields an empty list.

    Returns:
        Up to ``top_n`` terms, highest weight first; ties keep their stored
        order.
    """
    # A negative top_n would otherwise slice from the end and return
    # "all but the last |top_n|" terms.
    if top_n <= 0 or vector.nnz == 0:
        return []
    ranked = sorted(
        zip(vector.indices, vector.data),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [feature_names[idx] for idx, _ in ranked[:top_n]]
 
 
 
 
 
 
 
 
 
 
 
67
 
68
def generate_llm_feedback(jd, resume, max_chars=6000):
    """Ask the chat model for brief feedback on how well *resume* fits *jd*.

    Args:
        jd: job-description text.
        resume: resume text.
        max_chars: per-document character cap applied before building the
            prompt, so a long upload cannot blow past the model's context
            window; pass ``None`` to send the texts untruncated.

    Returns:
        The model's reply, or a string starting with
        ``"Error generating feedback: "`` if anything goes wrong. This
        function never raises, so one failed call cannot break the page.
    """
    try:
        if max_chars is not None:
            jd, resume = jd[:max_chars], resume[:max_chars]
        # NOTE(review): ChatCompletion.create looks like the legacy openai
        # SDK entry point — confirm against the pinned openai version.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{
                "role": "user",
                "content": f"Job Description:\n{jd}\n\nResume:\n{resume}\n\nProvide brief feedback on resume suitability."
            }]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating feedback: {str(e)}"
80
 
def _read_upload(processor, uploaded):
    """Return the text content of an uploaded PDF or plain-text file."""
    if uploaded.type == "application/pdf":
        return processor.extract_text_from_pdf(uploaded)
    # getvalue() returns the whole buffer regardless of the current read
    # position; undecodable bytes are replaced so one badly encoded file
    # does not abort the whole batch.
    return uploaded.getvalue().decode("utf-8", errors="replace")


def main():
    """Streamlit entry point: rank uploaded resumes against a job description."""
    st.set_page_config(page_title="Resume Ranker Pro", layout="wide")
    st.title("🚀 AI-Powered Resume Ranking System 2.0")

    if 'metrics' not in st.session_state:
        st.session_state.metrics = {
            'total_processed': 0,
            'avg_time': 0,
            'last_processed': None,
            'errors': []
        }
    metrics = st.session_state.metrics

    # Streamlit re-executes this function on every widget interaction; keep
    # the processor in session state so its models load once per session.
    if 'processor' not in st.session_state:
        st.session_state.processor = ResumeProcessor()
    processor = st.session_state.processor

    with st.sidebar:
        st.header("⚙️ Configuration")
        jd_file = st.file_uploader("Job Description (TXT)", type="txt")
        resume_files = st.file_uploader("Resumes (PDF/TXT)",
                                        type=["pdf", "txt"],
                                        accept_multiple_files=True)

        st.divider()
        st.header("📊 AIOPS Monitoring")
        st.metric("Total Processed", metrics['total_processed'])
        st.metric("Avg Processing Time", f"{metrics['avg_time']:.2f}s")
        st.metric("Last Processed", metrics['last_processed'] or "Never")

        st.divider()
        st.header("🔧 MLOps Settings")
        st.write("Model Version: 1.1.0")
        if st.button("Retrain Model (Mock)"):
            with st.spinner("Simulating retraining..."):
                time.sleep(2)
                st.success("Model updated to v1.1.1")

        st.divider()
        llm_enabled = st.checkbox("Enable LLM Feedback")

        # Get OpenAI key from environment variable
        openai_key = os.environ.get("OPENAI_API_KEY")

        # Only ask for a key when the environment does not provide one
        if not openai_key and llm_enabled:
            openai_key = st.text_input("OpenAI API Key", type="password")

        if llm_enabled:
            openai.api_key = openai_key

    if jd_file and resume_files:
        start_time = time.time()
        try:
            jd_text = _read_upload(processor, jd_file)
            resume_texts = [_read_upload(processor, uploaded) for uploaded in resume_files]

            scores, tfidf_matrix, jd_entities = processor.calculate_similarity(jd_text, resume_texts)
            feature_names = processor.vectorizer.get_feature_names_out()
            # Row 0 of the TF-IDF matrix is the job description; resume i is
            # row i + 1.
            jd_top_terms = get_top_terms(tfidf_matrix[0], feature_names)

            results = []
            for row_idx, (uploaded, score, text) in enumerate(
                zip(resume_files, scores, resume_texts), start=1
            ):
                resume_terms = get_top_terms(tfidf_matrix[row_idx], feature_names)
                common_terms = set(jd_top_terms) & set(resume_terms)
                resume_entities = processor.extract_entities(text)
                # A set avoids listing the same entity twice when it occurs
                # under more than one label.
                matched_entities = set()
                for label, jd_values in jd_entities.items():
                    matched_entities |= jd_values & resume_entities.get(label, set())

                results.append({
                    "Filename": uploaded.name,
                    "Score": score,
                    # Sorted so the display is stable across reruns.
                    "Top Terms": ", ".join(sorted(common_terms)),
                    "Matched Entities": ", ".join(sorted(matched_entities)),
                    "Resume Text": text
                })

            df = pd.DataFrame(results).sort_values("Score", ascending=False)

            st.subheader("📊 Ranking Results")
            st.dataframe(
                df[["Filename", "Score", "Top Terms", "Matched Entities"]],
                column_config={
                    "Score": st.column_config.ProgressColumn(
                        format="%.4f",
                        min_value=0,
                        max_value=1.0
                    )
                },
                use_container_width=True,
                hide_index=True
            )

            if llm_enabled and openai_key:
                st.subheader("🧠 LLM Feedback")
                for _, row in df.iterrows():
                    with st.expander(f"Feedback for {row['Filename']}"):
                        feedback = generate_llm_feedback(jd_text, row['Resume Text'])
                        st.write(feedback)

            # Running average of seconds per resume across all batches.
            processing_time = time.time() - start_time
            previous_total = metrics['total_processed']
            metrics['total_processed'] = previous_total + len(resume_files)
            metrics['avg_time'] = (
                metrics['avg_time'] * previous_total + processing_time
            ) / metrics['total_processed']
            metrics['last_processed'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        except Exception as e:
            # Top-level UI boundary: surface the failure and record it.
            st.error(f"Error processing files: {str(e)}")
            metrics['errors'].append(str(e))


if __name__ == "__main__":
    main()