kbberendsen committed on
Commit
f83431c
·
1 Parent(s): 2a1554b

add all files except models

.gitattributes CHANGED
@@ -31,5 +31,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.xz filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
- *tfevents* filter=lfs diff=lfs merge=lfs -text
35
- *.ipynb filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.xz filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *.ff1pkl filter=lfs diff=lfs merge=lfs -text
35
+ *.sqlite filter=lfs diff=lfs merge=lfs -text
36
+ *.ipynb filter=lfs diff=lfs merge=lfs -text
.vscode/settings.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "python.analysis.typeCheckingMode": "off",
3
+ "cSpell.words": [
4
+ "iloc",
5
+ "lgbm",
6
+ "NDCG",
7
+ "proba",
8
+ "XGBM"
9
+ ]
10
+ }
Dockerfile ADDED
@@ -0,0 +1,13 @@
1
+ FROM python:3.9
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
+
9
+ COPY . .
10
+
11
+ EXPOSE 7860
12
+
13
+ CMD ["shiny", "run", "dashboard/app.py", "--host", "0.0.0.0", "--port", "7860"]
classification.ipynb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28d178d125452857cecdbe5816e5e889a25b2c194a69a071a0f7965f1e298a46
3
+ size 105271651
dashboard/app.py ADDED
@@ -0,0 +1,373 @@
1
+ import os
2
+ from shiny import App, ui, render, reactive, module
3
+ from shiny import experimental as x
4
+ import shinyswatch
5
+ import pandas as pd
6
+ from modules import lead_ids, classification, ranking, support_texts
7
+
8
+ # Define lead options
9
+ leads_8 = lead_ids.leads_8
10
+ leads_123 = lead_ids.leads_123
11
+ leads_256 = lead_ids.leads_256
12
+
13
+ app_ui = ui.page_fluid(
14
+ shinyswatch.theme.minty(),
15
+ ui.panel_title("Lead Recommender System"),
16
+ ui.layout_sidebar(
17
+ ui.panel_sidebar(
18
+ ui.input_checkbox(
19
+ "explanation_select", "Show explanation", True
20
+ ),
21
+ ui.markdown("**Data**"),
22
+ ui.input_select(
23
+ "campaign_select", "Select campaign id:",
24
+ choices = ["8", "123", "256"],
25
+ selected = "8"
26
+ ),
27
+ ui.input_select(
28
+ "lead_select", "Select lead id:",
29
+ choices = leads_8
30
+ ),
31
+ ui.HTML("<br>"),
32
+ ui.markdown("**Classification**"),
33
+ ui.input_select(
34
+ "model_select", "Select classification model:",
35
+ choices = ["BERT", "XGB"],
36
+ selected = "BERT"
37
+ ),
38
+ ui.input_select(
39
+ "proba_cutoff_select", "Select minimum relevance probability (%):",
40
+ choices = [50, 60, 65, 70, 75, 80, 85, 90, 95],
41
+ selected = 80
42
+ ),
43
+ ui.HTML("<br>"),
44
+ ui.markdown("**Ranking**"),
45
+ ui.input_select(
46
+ "model_select_rank", "Select ranking model:",
47
+ choices = ["Light XGBM 1", "Light XGBM 2", "Light XGBM 3"],
48
+ selected = "Light XGBM 3"
49
+ ),
50
+ ui.input_select(
51
+ "rank_cutoff_select", "Select minimum prediction score (%):",
52
+ choices = [50, 60, 65, 70, 75, 80, 85, 90, 95],
53
+ selected = 80
54
+ ),
55
+ width=2.5
56
+ ),
57
+
58
+ ui.navset_tab(
59
+ ui.nav("Classification",
60
+ ui.panel_main(
61
+ ui.panel_conditional("input.explanation_select",
62
+ ui.row(
63
+ ui.column(
64
+ 12,
65
+ x.ui.card(
66
+ ui.markdown(support_texts.classification_intro_1),
67
+ ui.markdown(support_texts.classification_intro_2),
68
+ ui.markdown(support_texts.classification_intro_3),
69
+ ),
70
+ ),
71
+ ),
72
+ ),
73
+ ui.row(
74
+ ui.column(
75
+ 12,
76
+ x.ui.card(
77
+ ui.markdown("**Model classification performance of selected lead**"),
78
+ ui.output_data_frame("performance_metrics"),
79
+ ),
80
+ ),
81
+ ),
82
+ ui.row(
83
+ ui.column(
84
+ 12,
85
+ x.ui.card(
86
+ ui.markdown("**Top 3 relevant employees in selected lead**"),
87
+ ui.output_data_frame("predictions_123"),
88
+ ),
89
+ x.ui.card(
90
+ ui.markdown("**All employees in selected lead**"),
91
+ ui.output_data_frame("predictions"),
92
+ ),
93
+ ),
94
+ ),
95
+ ui.row(
96
+ ui.column(
97
+ 12,
98
+ x.ui.card(
99
+ ui.markdown("**Model descriptions and testing performance**"),
100
+ ui.markdown(support_texts.models_test_classification_1),
101
+ ui.output_data_frame("models_test_classification"),
102
+ ui.markdown(support_texts.models_test_classification_2)
103
+ ),
104
+ ),
105
+ ),
106
+ ),
107
+ ),
108
+
109
+
110
+ ui.nav("Ranking",
111
+ ui.panel_main(
112
+ ui.panel_conditional("input.explanation_select",
113
+ ui.row(
114
+ ui.column(
115
+ 12,
116
+ x.ui.card(
117
+ ui.markdown(support_texts.ranking_intro_1),
118
+ ui.markdown(support_texts.ranking_intro_2),
119
+ ui.markdown(support_texts.ranking_intro_3),
120
+ ),
121
+ ),
122
+ ),
123
+ ),
124
+ ui.row(
125
+ ui.column(
126
+ 12,
127
+ x.ui.card(
128
+ ui.markdown("**Model ranking performance of selected lead**"),
129
+ ui.output_data_frame("ndcg_score")
130
+ ),
131
+ x.ui.card(
132
+ ui.markdown("**Top 3 relevant employees in selected lead**"),
133
+ ui.output_data_frame("predictions_123_rank"),
134
+ ),
135
+ ),
136
+ ),
137
+ ui.row(
138
+ ui.column(
139
+ 12,
140
+ x.ui.card(
141
+ ui.markdown("**Ranking of all employees in selected lead**"),
142
+ ui.output_data_frame("predictions_rank")
143
+ ),
144
+ ),
145
+ ),
146
+ ui.row(
147
+ ui.column(
148
+ 12,
149
+ x.ui.card(
150
+ ui.markdown("**Model descriptions and testing performance**"),
151
+ ui.markdown("The table below shows the model performance on a larger testing dataset (does not contain the campaigns in this dashboard)"),
152
+ ui.output_data_frame("models_test_ranking"),
153
+ ui.markdown(support_texts.models_test_ranking_1)
154
+ ),
155
+ ),
156
+ ),
157
+ ),
158
+ ),
159
+
160
+ ui.nav("Comparison",
161
+ ui.panel_main(
162
+ ui.row(
163
+ ui.markdown("**Model performance comparison (campaign-level)**"),
164
+ ui.column(
165
+ 6,
166
+ x.ui.card(
167
+ ui.markdown("**Classification model performance**"),
168
+ ui.output_data_frame("performance_metrics_campaign"),
169
+ ui.markdown("<br><br>")
170
+ ),
171
+ ),
172
+ ui.column(
173
+ 6,
174
+ x.ui.card(
175
+ ui.markdown("**Ranking model performance**"),
176
+ ui.output_data_frame("campaign_rank")
177
+ ),
178
+ ),
179
+ ),
180
+ ui.row(
181
+ ui.column(
182
+ 12,
183
+ x.ui.card(
184
+ ui.markdown(support_texts.comparison_1)
185
+ ),
186
+ ),
187
+ ),
188
+ ui.row(
189
+ ui.column(
190
+ 12,
191
+ x.ui.card(
192
+ ui.markdown(support_texts.ranking_ndcg)
193
+ ),
194
+ ),
195
+ ),
196
+ ),
197
+ ),
198
+ ),
199
+ ),
200
+ )
201
+
202
+ def server(input, output, session):
203
+ # Updating lead id selection list
204
+ @reactive.Effect()
205
+ def _():
206
+ if input.campaign_select() == "8":
207
+ lead_options = leads_8
208
+ elif input.campaign_select() == "123":
209
+ lead_options = leads_123
210
+ elif input.campaign_select() == "256":
211
+ lead_options = leads_256
212
+
213
+ ui.update_select("lead_select",
214
+ label="Select lead id:",
215
+ choices=lead_options
216
+ )
217
+
218
+ ui.update_switch
219
+
220
+ # Get classification data single lead
221
+ @reactive.Calc
222
+ def get_lead_predictions():
223
+ try:
224
+ df, df_123, df_performance_metrics = classification.classify(CAMPAIGN_ID=int(input.campaign_select()),
225
+ LEAD_ID=int(input.lead_select()),
226
+ proba_cutoff=int(input.proba_cutoff_select()),
227
+ model_type=input.model_select())
228
+ return df, df_123, df_performance_metrics
229
+
230
+ except TypeError:
231
+ pass
232
+
233
+ except Exception:
234
+ ui.notification_show("Data is still loading or something went wrong", duration=3, type="error")
235
+
236
+ # Get classification data full campaign
237
+ @reactive.Calc
238
+ def get_campaign_predictions():
239
+ try:
240
+ df, df_123, df_performance_metrics = classification.classify(CAMPAIGN_ID=int(input.campaign_select()),
241
+ LEAD_ID=int(input.lead_select()),
242
+ proba_cutoff=int(input.proba_cutoff_select()),
243
+ model_type=input.model_select(),
244
+ full_campaign=True)
245
+ return df, df_123, df_performance_metrics
246
+
247
+ except TypeError:
248
+ pass
249
+
250
+ except Exception:
251
+ ui.notification_show("Data is still loading or something went wrong", duration=3, type="error")
252
+
253
+ # Get ranking data
254
+ @reactive.Calc
255
+ def get_lead_ranking():
256
+ try:
257
+ df, df_123, ndcg, df_campaign_rank = ranking.rank_single_lead(CAMPAIGN_ID=int(input.campaign_select()),
258
+ LEAD_ID=int(input.lead_select()),
259
+ rank_cutoff=int(input.rank_cutoff_select()),
260
+ ranker=input.model_select_rank())
261
+ return df, df_123, ndcg, df_campaign_rank
262
+
263
+ except TypeError:
264
+ pass
265
+
266
+ except Exception:
267
+ ui.notification_show("Data is still loading or something went wrong", duration=3, type="error")
268
+
269
+ @output
270
+ @render.data_frame
271
+ def predictions_123():
272
+ try:
273
+ df, df_123, df_performance_metrics = get_lead_predictions()
274
+ return df_123
275
+ except TypeError:
276
+ pass
277
+
278
+ @output
279
+ @render.data_frame
280
+ def predictions():
281
+ try:
282
+ df, df_123, df_performance_metrics = get_lead_predictions()
283
+ return df
284
+ except TypeError:
285
+ pass
286
+
287
+ @output
288
+ @render.data_frame
289
+ def performance_metrics():
290
+ try:
291
+ df, df_123, df_performance_metrics = get_lead_predictions()
292
+ return df_performance_metrics
293
+ except TypeError:
294
+ pass
295
+
296
+ @output
297
+ @render.data_frame
298
+ def performance_metrics_campaign():
299
+ try:
300
+ df, df_123, df_performance_metrics = get_campaign_predictions()
301
+ return df_performance_metrics
302
+ except TypeError:
303
+ pass
304
+
305
+ @output
306
+ @render.data_frame
307
+ def predictions_123_rank():
308
+ try:
309
+ df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
310
+ return df_123
311
+ except TypeError:
312
+ pass
313
+
314
+ @output
315
+ @render.data_frame
316
+ def predictions_rank():
317
+ try:
318
+ df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
319
+ return df
320
+ except TypeError:
321
+ pass
322
+
323
+ @output
324
+ @render.data_frame
325
+ def ndcg_score():
326
+ try:
327
+ df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
328
+ return ndcg
329
+ except TypeError:
330
+ pass
331
+
332
+ @output
333
+ @render.data_frame
334
+ def campaign_rank():
335
+ try:
336
+ df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
337
+ return df_campaign_rank
338
+ except TypeError:
339
+ pass
340
+
341
+ # Model performance tables (test set)
342
+ @output
343
+ @render.data_frame
344
+ def models_test_classification():
345
+ try:
346
+ models_test_classification_data = {'Model': ['BERT', 'XGB'],
347
+ 'F1 weighted': [0.84, 0.81],
348
+ 'F1': [0.35, 0.35],
349
+ 'Accuracy': [0.80, 0.76],
350
+ 'Recall': [0.65, 0.78],
351
+ 'Precision': [0.24, 0.22]}
352
+ df_models_test_classification = pd.DataFrame.from_dict(data=models_test_classification_data, orient='index', columns=models_test_classification_data['Model'])
353
+ df_models_test_classification = df_models_test_classification.iloc[1:]
354
+ df_models_test_classification.reset_index(inplace=True, names=['Metric'])
355
+ return df_models_test_classification
356
+ except TypeError:
357
+ pass
358
+
359
+ @output
360
+ @render.data_frame
361
+ def models_test_ranking():
362
+ try:
363
+ models_test_ranking_data = {'Model': ['Light XGBM 1', 'Light XGBM 2', 'Light XGBM 3'],
364
+ 'NDCG@k score': [0.652, 0.2, 0.948],
365
+ 'k': [3,3,3]}
366
+ df_models_test_ranking = pd.DataFrame.from_dict(data=models_test_ranking_data, orient='index', columns=models_test_ranking_data['Model'])
367
+ df_models_test_ranking = df_models_test_ranking.iloc[1:]
368
+ df_models_test_ranking.reset_index(inplace=True, names=['Metric'])
369
+ return df_models_test_ranking
370
+ except TypeError:
371
+ pass
372
+
373
+ app = App(app_ui, server)
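A side note on the hardcoded test-metric tables above: the dict → `DataFrame.from_dict(orient='index')` → `iloc[1:]` → `reset_index(names=['Metric'])` chain turns the metric-per-key dict into a metric-per-row table with the model names as columns. A minimal standalone sketch, using the same hardcoded classification values as `models_test_classification` (printed layout is approximate):

```python
import pandas as pd

data = {'Model': ['BERT', 'XGB'],
        'F1 weighted': [0.84, 0.81],
        'F1': [0.35, 0.35],
        'Accuracy': [0.80, 0.76],
        'Recall': [0.65, 0.78],
        'Precision': [0.24, 0.22]}

# orient='index' makes each dict key a row; the 'Model' values become the column names
df = pd.DataFrame.from_dict(data, orient='index', columns=data['Model'])
df = df.iloc[1:]                        # drop the redundant 'Model' row
df = df.reset_index(names=['Metric'])   # turn the metric index into a 'Metric' column

print(df)
#         Metric  BERT   XGB
# 0  F1 weighted  0.84  0.81
# 1           F1  0.35  0.35
# ...
```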
dashboard/modules/classification.py ADDED
@@ -0,0 +1,190 @@
1
+ # Importing required packages
2
+ import pickle
3
+ import pandas as pd
4
+ import re
5
+ import numpy as np
6
+ import torch.nn.functional as F
7
+ from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
8
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
9
+
10
+
11
+ # Loading data
12
+ parquet_file = 'data/data_dump_ai_assingment.parquet'
13
+ df = pd.read_parquet(parquet_file, engine='pyarrow')
14
+
15
+
16
+ # Setting 3 random campaigns aside as testing examples for final models
17
+ campaign_ids = [8, 123, 256]
18
+ df_final_testing = df[df['campaign_id'].isin(campaign_ids)].copy()
19
+
20
+
21
+ # Clean text
22
+ def clean_text(text):
23
+ # Use a regular expression to remove non-alphanumeric characters
24
+ cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
25
+
26
+ # Remove multiple consecutive spaces and leading/trailing spaces
27
+ cleaned_text = ' '.join(cleaned_text.split())
28
+
29
+ # Lower texts
30
+ cleaned_text = cleaned_text.lower()
31
+
32
+ return cleaned_text
33
+
34
+
35
+ def combine_text(df_single_lead):
36
+ # Changing column types
37
+ df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
38
+ df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
39
+ df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')
40
+
41
+ # Combine text columns
42
+ df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'
43
+
44
+ # Clean text
45
+ df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(lambda row: clean_text(row))
46
+
47
+ return df_single_lead
48
+
49
+
50
+ # Function to test model performance
51
+ def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
52
+ text_test = X_test.to_list()
53
+ labels_test = y_test.to_list()
54
+
55
+ # Split the test data into batches to prevent large memory allocation
56
+ batch_size = batch_size
57
+ num_samples = len(text_test)
58
+ num_batches = (num_samples + batch_size - 1) // batch_size # Calculate the number of batches
59
+
60
+ # Initialize an empty list to store predicted labels
61
+ predicted_labels_test = []
62
+
63
+ # Initialize an empty list to store predicted probabilities
64
+ predicted_proba_test = []
65
+
66
+ # Iterate over batches
67
+ for i in range(num_batches):
68
+ start_idx = i * batch_size
69
+ end_idx = min((i + 1) * batch_size, num_samples)
70
+
71
+ # Get a batch of text and labels
72
+ batch_text = text_test[start_idx:end_idx]
73
+ batch_labels = labels_test[start_idx:end_idx]
74
+
75
+ # Encode the batch
76
+ encoded_data = tokenizer(batch_text, padding=True, truncation=True, return_tensors='pt')
77
+
78
+ # Forward pass through the model
79
+ logits = model(encoded_data['input_ids'], attention_mask=encoded_data['attention_mask']).logits
80
+
81
+ # Get predicted labels for the batch
82
+ batch_predicted_labels = logits.argmax(dim=1).tolist()
83
+
84
+ # Append the batch predictions to the overall list
85
+ predicted_labels_test.extend(batch_predicted_labels)
86
+
87
+ # Apply softmax to logits to retrieve probabilities and put them in a cleaned list
88
+ softmax_proba = F.softmax(logits, dim=-1)
89
+ batch_predicted_proba= [tensor.tolist() for tensor in softmax_proba]
90
+
91
+ # Append the batch probabilities to the overall list
92
+ predicted_proba_test.extend(batch_predicted_proba)
93
+
94
+ return predicted_labels_test, predicted_proba_test
95
+
96
+
97
+ # Calculate performance metrics
98
+ def compute_metrics(predictions, true_labels):
99
+ f1_weighted = round(f1_score(true_labels, predictions, average='weighted'),3)
100
+ f1 = round(f1_score(true_labels, predictions),3)
101
+ accuracy = round(accuracy_score(true_labels, predictions),3)
102
+ recall = round(recall_score(true_labels, predictions, zero_division=np.nan),3)
103
+ precision = round(precision_score(true_labels, predictions, zero_division=np.nan),3)
104
+ performance_metrics = {
105
+ 'F1 weighted': f1_weighted,
106
+ 'F1': f1,
107
+ 'Accuracy': accuracy,
108
+ 'Recall': recall,
109
+ 'Precision': precision
110
+ }
111
+
112
+ return performance_metrics
113
+
114
+
115
+ # Loading XGB model
116
+ with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
117
+ xgb_model_tuned_2 = pickle.load(model_file)
118
+
119
+ # Loading XGB vectorizer
120
+ with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as model_file:
121
+ vectorizer = pickle.load(model_file)
122
+
123
+
124
+ # Loading BERT tokenizer
125
+ distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')
126
+
127
+ # Loading BERT model
128
+ distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
129
+ 'models/distil_bert_tuned_2', num_labels=2)
130
+
131
+
132
+ # Classify single lead data
133
+ def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):
134
+
135
+ if full_campaign == True:
136
+ # Select full campaign data
137
+ df = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID)]
138
+ else:
139
+ # Selecting single lead data
140
+ df = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID) & (df_final_testing['lead_id']==LEAD_ID)]
141
+
142
+ # True labels
143
+ true_labels = df['employee_is_selected'].tolist()
144
+
145
+ # Combining text columns
146
+ df = combine_text(df)
147
+
148
+ # Vectorize text with tfidf vectorizer
149
+ tfidf_matrix = vectorizer.transform(df['combined_text'])
150
+
151
+ # Selecting model
152
+ if model_type=='XGB':
153
+ model = xgb_model_tuned_2
154
+ # Predictions
155
+ predictions = model.predict(tfidf_matrix)
156
+ # Prediction probabilities of being 1 (selected)
157
+ predictions_proba_1 = model.predict_proba(tfidf_matrix)[:, 1].tolist()
158
+
159
+ elif model_type=='BERT':
160
+ predictions, predicted_test_proba = model_predict(model = distil_bert_model_tuned_2,
161
+ tokenizer = distil_bert_tokenizer_tuned_2,
162
+ X_test = df['combined_text'],
163
+ y_test = df['employee_is_selected'])
164
+ # Prediction probabilities of being 1 (selected)
165
+ predictions_proba_1 = [lists[1] for lists in predicted_test_proba]
166
+
167
+ # Alter predictions based on proba_cutoff value
168
+ cutoff_predictions = [1 if probability >= (proba_cutoff/100) else 0 for probability in predictions_proba_1]
169
+
170
+ # Use argsort to get the indices that would sort the list in descending order
171
+ sorted_indices = np.argsort(predictions_proba_1)[::-1]
172
+
173
+ # Create dataframe columns and ranking
174
+ df['cutoff_prediction'] = cutoff_predictions
175
+ df['prediction_proba_1'] = predictions_proba_1
176
+ df = df.sort_values(by='prediction_proba_1', ascending=False)
177
+ df['ranking'] = [i+1 for i in range(len(df['prediction_proba_1']))]
178
+ df['prediction_proba_1'] = df['prediction_proba_1'].round(3)
179
+
180
+ df = df[['ranking', 'prediction_proba_1', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction']].sort_values(by='prediction_proba_1', ascending=False)
181
+ df_123 = df[(df['ranking'].isin([1, 2, 3])) & (df['cutoff_prediction'] == 1)].sort_values(by='ranking')
182
+
183
+ performance_metrics = compute_metrics(cutoff_predictions, true_labels)
184
+ df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Score'])
185
+ df_performance_metrics.reset_index(inplace=True, names=['Metric'])
186
+
187
+ return df, df_123, df_performance_metrics
188
+
189
+
190
+
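For readers skimming `classify()`, the core of the cutoff and ranking logic comes down to a few list operations. A toy sketch with made-up probabilities (not taken from the data) shows how `proba_cutoff` and the 'ranking' column interact:

```python
# Toy probabilities for four employees in one lead (illustrative values only)
probs = [0.91, 0.42, 0.77, 0.30]
proba_cutoff = 80

# Binary predictions after applying the user-selected cutoff
cutoff_predictions = [1 if p >= proba_cutoff / 100 else 0 for p in probs]  # [1, 0, 0, 0]

# Rank employees by descending probability, mirroring the sort + 'ranking' column in classify()
order = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)    # [0, 2, 1, 3]
ranking = [order.index(i) + 1 for i in range(len(probs))]                  # [1, 3, 2, 4]

# The top-3 table keeps only employees with ranking <= 3 AND cutoff_prediction == 1,
# so with an 80% cutoff only the first employee would appear there.
```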
dashboard/modules/lead_ids.py ADDED
@@ -0,0 +1,21 @@
1
+ import pandas as pd
2
+
3
+ parquet_file = 'data/data_dump_ai_assingment.parquet'
4
+ df = pd.read_parquet(parquet_file, engine='pyarrow')
5
+
6
+ # Setting 3 random campaigns aside as testing examples for final models
7
+ campaign_ids = [8, 123, 256]
8
+ df_final_testing = df[df['campaign_id'].isin(campaign_ids)].copy()
9
+
10
+ def get_unique_lead_ids(df, campaign_id):
11
+ df_campaign = df[df['campaign_id'] == campaign_id].copy()
12
+ lead_ids = list(df_campaign['lead_id'].unique())
13
+ return lead_ids
14
+
15
+ leads_8 = get_unique_lead_ids(df_final_testing, 8)
16
+ leads_123 = get_unique_lead_ids(df_final_testing, 123)
17
+ leads_256 = get_unique_lead_ids(df_final_testing, 256)
18
+
19
+ leads_8 = [str(i) for i in leads_8]
20
+ leads_123 = [str(i) for i in leads_123]
21
+ leads_256 = [str(i) for i in leads_256]
dashboard/modules/ranking.py ADDED
@@ -0,0 +1,209 @@
1
+ # Importing required packages
2
+ import pickle
3
+ import pandas as pd
4
+ import re
5
+ import numpy as np
6
+ import lightgbm as lgbm
7
+ from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ndcg_score
8
+
9
+
10
+ # Loading data
11
+ parquet_file = 'data/data_dump_ai_assingment.parquet'
12
+ df = pd.read_parquet(parquet_file, engine='pyarrow')
13
+
14
+
15
+ # Setting 3 random campaigns aside as testing examples for final models
16
+ campaign_ids = [8, 123, 256]
17
+ df_final_testing = df[df['campaign_id'].isin(campaign_ids)].copy()
18
+
19
+
20
+ # Clean text
21
+ def clean_text(text):
22
+ # Use a regular expression to remove non-alphanumeric characters
23
+ cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
24
+
25
+ # Remove multiple consecutive spaces and leading/trailing spaces
26
+ cleaned_text = ' '.join(cleaned_text.split())
27
+
28
+ # Lower texts
29
+ cleaned_text = cleaned_text.lower()
30
+
31
+ return cleaned_text
32
+
33
+
34
+ def combine_text(df_single_lead):
35
+ # Changing column types
36
+ df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
37
+ df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
38
+ df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')
39
+
40
+ # Combine text columns
41
+ df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'
42
+
43
+ # Clean text
44
+ df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(lambda row: clean_text(row))
45
+
46
+ return df_single_lead
47
+
48
+
49
+ # Calculate performance metrics
50
+ def compute_metrics(predictions, true_labels):
51
+ f1_weighted = round(f1_score(true_labels, predictions, average='weighted'),3)
52
+ f1 = round(f1_score(true_labels, predictions),3)
53
+ accuracy = round(accuracy_score(true_labels, predictions),3)
54
+ recall = round(recall_score(true_labels, predictions, zero_division=np.nan),3)
55
+ precision = round(precision_score(true_labels, predictions, zero_division=np.nan),3)
56
+ performance_metrics = {
57
+ 'F1 weighted': f1_weighted,
58
+ 'F1': f1,
59
+ 'Accuracy': accuracy,
60
+ 'Recall': recall,
61
+ 'Precision': precision
62
+ }
63
+
64
+ return performance_metrics
65
+
66
+
67
+ # Loading LGBM models
68
+ with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
69
+ lgbm_model_1 = pickle.load(model_file)
70
+
71
+ with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
72
+ lgbm_model_2 = pickle.load(model_file)
73
+
74
+ with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
75
+ lgbm_model_3 = pickle.load(model_file)
76
+
77
+ # Loading LGBM vectorizer
78
+ with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
79
+ vectorizer = pickle.load(model_file)
80
+
81
+
82
+ # Rank whole campaign (per lead group) to obtain max and min scores used for scaling prediction scores
83
+ # Function to properly test a model on the test set by calculating score per group
84
+ def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
85
+ # Create empty lists to store predictions and true labels for each query group (lead id groups)
86
+ campaign_predictions = []
87
+ campaign_predictions_cutoff = []
88
+ campaign_true_labels = []
89
+ campaign_ndcg_scores = []
90
+
91
+ campaign_data = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID)]
92
+ query_group_ids = campaign_data['lead_id']
93
+
94
+ # Iterate over query groups (in this case lead ids)
95
+ lead_ids = np.unique(query_group_ids)
96
+ for lead_id in lead_ids:
97
+ # Filter the data for the specific lead_id
98
+ single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]
99
+
100
+ # Only predict ranking if lead contains more than 1 employee
101
+ if len(single_lead_data)>1:
102
+
103
+ single_lead_data = combine_text(single_lead_data)
104
+
105
+ # Preprocess the text features for the single lead
106
+ single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
107
+
108
+ # Predict single lead scores
109
+ single_lead_pred = ranker.predict(single_lead_tfidf)
110
+
111
+ # Store predictions
112
+ campaign_predictions.extend(single_lead_pred)
113
+ campaign_true_labels.extend(single_lead_data['employee_is_selected'])
114
+
115
+ # Store lead NDCG score
116
+ # k is 3 unless single lead data has less than 4 items
117
+ if len(single_lead_data) < 4:
118
+ k = len(single_lead_data)
119
+ else:
120
+ k = 3
121
+
122
+ ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k)
123
+ campaign_ndcg_scores.append(ndcg_lead)
124
+
125
+ else:
126
+ pass
127
+
128
+ # Get max and min value of campaign prediction scores
129
+ campaign_predictions_max = max(campaign_predictions)
130
+ campaign_predictions_min = min(campaign_predictions)
131
+
132
+ # Scale predicted score between 0 and 1 using the max and min predicted scores of the whole campaign
133
+ campaign_predictions_scaled = [(prediction - campaign_predictions_min) / (campaign_predictions_max - campaign_predictions_min) for prediction in campaign_predictions]
134
+
135
+ # Define binary predictions based on rank_cutoff value
136
+ cutoff_predictions = [1 if prediction >= (rank_cutoff/100) else 0 for prediction in campaign_predictions_scaled]
137
+
138
+ # Get performance metrics using binary cutoff_predictions
139
+ performance_metrics = compute_metrics(true_labels=campaign_true_labels, predictions=cutoff_predictions)
140
+ df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Value'])
141
+ df_performance_metrics.reset_index(inplace=True, names=['Metric'])
142
+
143
+ # Get average NDCG score
144
+ ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores),3)
145
+ df_campaign_ndcg = {'NDCG@k score': ndcg_avg, 'k': k}
146
+ df_campaign_ndcg = pd.DataFrame.from_dict(data=df_campaign_ndcg, orient='index', columns=['Value'])
147
+ df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])
148
+
149
+ # Merge performance metrics and average NDCG score
150
+ df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg], ignore_index=True)
151
+
152
+ return campaign_predictions_max, campaign_predictions_min, df_campaign_rank
153
+
154
+
155
+ # Rank single lead
156
+ def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
157
+ if ranker == "Light XGBM 1":
158
+ ranker = lgbm_model_1
159
+ elif ranker == "Light XGBM 2":
160
+ ranker = lgbm_model_2
161
+ elif ranker == "Light XGBM 3":
162
+ ranker = lgbm_model_3
163
+
164
+ # Selecting single lead data and combine text columns used for ranking
165
+ single_lead_data = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID) & (df_final_testing['lead_id']==LEAD_ID)]
166
+ single_lead_data = combine_text(single_lead_data)
167
+
168
+ # Preprocess the text features for the single lead
169
+ single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
170
+
171
+ # Predict single lead
172
+ single_lead_pred = ranker.predict(single_lead_tfidf)
173
+ single_lead_data['predicted_score'] = single_lead_pred
174
+
175
+ # Scale predicted score between 0 and 1 using the max and min predicted scores of the whole campaign
176
+ campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(CAMPAIGN_ID, ranker, rank_cutoff)
177
+ single_lead_data['scaled_predicted_score'] = (single_lead_data['predicted_score'] - campaign_min_value) / (campaign_max_value - campaign_min_value)
178
+
179
+ # Define binary predictions based on rank_cutoff value
180
+ cutoff_predictions = [1 if prediction >= (rank_cutoff/100) else 0 for prediction in single_lead_data['scaled_predicted_score']]
181
+ single_lead_data['cutoff_prediction'] = cutoff_predictions
182
+
183
+ # Rank employees and create output dataframe
184
+ ranked_list = [i+1 for i in range(len(single_lead_data['predicted_score']))]
185
+ single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
186
+ single_lead_data['ranking'] = ranked_list
187
+ single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
188
+ single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
189
+ single_lead_data = single_lead_data[['ranking', 'scaled_predicted_score', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction', 'predicted_score']]
190
+
191
+ # Top 3 dataframe
192
+ df_123 = single_lead_data[(single_lead_data['ranking'].isin([1, 2, 3])) & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')
193
+
194
+ # k is 3 unless single lead data has less than 4 items
195
+ if len(single_lead_data) < 4:
196
+ k = len(single_lead_data)
197
+ else:
198
+ k = 3
199
+
200
+ # Compute NDCG score
201
+ ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k), 3)
202
+
203
+ df_ndcg_data = {'NDCG@k score': ndcg, 'k': k}
204
+ df_ndcg = pd.DataFrame.from_dict(data=df_ndcg_data, orient='index', columns=['Value'])
205
+ df_ndcg.reset_index(inplace=True, names=['Metric'])
206
+
207
+ # Print data and overall ndcg score
208
+ #print(f'NDCG Score on Test Data: {ndcg:.4f}')
209
+ return single_lead_data, df_123, df_ndcg, df_campaign_rank
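To make the scaling and NDCG@k steps in `rank_campaign`/`rank_single_lead` concrete, here is a small standalone sketch with invented scores; the campaign-wide min/max scaling and the k-capping follow the same pattern as the module above:

```python
import numpy as np
from sklearn.metrics import ndcg_score

# Invented raw ranker scores for one lead, plus campaign-wide min/max (illustrative values)
raw_scores = np.array([2.1, -0.4, 1.3])
campaign_min, campaign_max = -1.0, 3.0

# Min-max scaling to [0, 1] using the campaign-wide range, as in rank_single_lead
scaled = (raw_scores - campaign_min) / (campaign_max - campaign_min)   # [0.775, 0.15, 0.575]

# Binary cutoff predictions from the user-selected minimum prediction score
rank_cutoff = 50
cutoff_predictions = (scaled >= rank_cutoff / 100).astype(int)         # [1, 0, 1]

# NDCG@k against the binary true labels, with k capped at the lead size
true_labels = [1, 0, 0]
k = min(3, len(true_labels))
print(round(ndcg_score(y_true=[true_labels], y_score=[raw_scores], k=k), 3))  # 1.0: the relevant employee is ranked first
```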
dashboard/modules/support_texts.py ADDED
@@ -0,0 +1,93 @@
1
+ classification_intro_1 = '''
2
+ **Model performance**<br>
3
+ The first table shows the model performance metrics of the current selection. In other words, how the BERT or XGB classification model performs on the current lead selection.
4
+ These performance metrics are calculated by comparing the 'employee_is_selected' column (true labels) to the 'cutoff_prediction' column (predicted labels). The model performance is also dependent
5
+ on the minimum relevance probability selected by the user in the sidebar. Note that the models were not trained on the campaigns and accompanying leads shown in this dashboard.
6
+ This means that these campaigns are 'new' for the models. Model descriptions can be found at the bottom of the page.
7
+ '''
8
+
9
+ classification_intro_2 = '''
10
+ **Top 3**<br>
11
+ The second table below shows the top 3 relevant employees. It's possible that the top 3 contains fewer than 3 employees, as the relevance probability needs to be above the selected minimum
12
+ in the sidebar. In the third table, all employees within the selected lead can be found, including the top 3.
13
+ '''
14
+
15
+ classification_intro_3 = '''
16
+ **Prediction/relevance probabilities**<br>
17
+ The predicted employee probability/relevance scores can be found in the 'prediction_proba_1' column of the tables. Again, how many of the employees show up in the first top 3 table
18
+ depends on the selected minimum probability percentage in the sidebar.
19
+ '''
20
+
21
+ models_test_classification_1 = '''
22
+ The model performance metrics below are derived from testing the models on a testing set (this set does not contain the campaigns in this dashboard). Note, these metrics are calculated with a standard probability
23
+ cutoff value of 0.5. A higher cutoff value like 0.8 tends to increase model performance for all models.
24
+ '''
25
+
26
+ models_test_classification_2 = '''
27
+ The classification models are based on two frameworks. The first is XGBoost, an optimized distributed gradient boosting library. Multiple models were created and only the best performing model is
28
+ added to this dashboard. The model variations differ in, for example, how they deal with the unbalanced training data and their hyperparameters. The XGB model added to this dashboard uses oversampling to deal
29
+ with the unbalanced data and grid search cross validation to optimize the hyperparameters.
30
+ <br><br>
31
+ The other classification model uses a BERT transformer to vectorize and classify the text variables used in training. In this approach, the PyTorch library is used to fine-tune the pre-trained BERT
32
+ transformer on the training data. Again, multiple models are created and the best performing model is added to this dashboard. This BERT model uses a data imbalance ratio and optimizes the F1 score.
33
+ For specifics regarding all models, see the source code.
34
+ <br><br>
35
+ Comparing the XGB and BERT models, BERT performs slightly better when looking at the weighted F1 score. However, the 'normal' F1 score (harmonic mean of precision and recall) is the same for both models.
36
+ That being said, XGB is significantly more efficient. This efficiency difference is not noticeable when classifying small datasets like single leads, but becomes evident when trying to classify larger datasets
37
+ like full campaigns on the 'comparison' tab of this dashboard.
38
+ '''
39
+
40
+
41
+ ranking_intro_1 = '''
42
+ **Model performance**<br>
43
+ The first table shows the model performance metrics of the current selection. For the ranking models, this is the NDCG@k score. Here, the k parameter is set to 3, as for this exercise,
44
+ correctly ranking the top 3 relevant employees is most important. The NDCG@k metric is calculated by comparing the 'employee_is_selected' column (true labels) to the 'predicted_score'
45
+ column (predicted labels). Note that the predicted scores are not binary like the true labels. Model descriptions can be found at the bottom of the page.
46
+ '''
47
+
48
+ ranking_intro_2 = '''
49
+ **Top 3**<br>
50
+ The second table shows the top 3 relevant employees. As the predicted_score column contains values with different ranges from lead to lead, these scores are scaled between 0 and 1 to make them
51
+ comparable across leads (scaling is explained below). These scaled scores are then used to determine a top 3 relevant employee ranking by checking if the scaled score is above the user selected minimum
52
+ prediction score in the sidebar. Note that ranking models like these prioritize the correct employee order above individual employee relevance scoring. Therefore, the top 3 and its dependence on
53
+ the selected minimum relevance probability is a bit less important compared to the total ranking order in the third table.
54
+ '''
55
+
56
+ ranking_intro_3 = '''
57
+ **Predicted relevance scores and score scaling**<br>
58
+ The predicted employee relevance scores can be found in the 'predicted_score' column of the tables. The range of predicted scores across the whole campaign is used to scale the predicted scores in single leads.
59
+ The scores are scaled between values of 0 and 1. This approach means that the score scaling is dependent on the campaign the lead falls into. The scaled relevance scores are stored in the 'scaled_predicted_score'
60
+ column of the tables. Again, how many of the employees show up in the first top 3 table depends on the selected minimum prediction score in the sidebar.
61
+ '''
62
+
63
+ models_test_ranking_1 = '''
64
+ The ranking models are based on LightGBM, which is a gradient boosting framework that uses tree-based learning algorithms. From this framework, models are created using the LGBMRanker. Here, the objective
65
+ parameter is set to 'lambdarank' and the boosting type parameter is set to 'Gradient Boosting Decision Tree'. The combination of these two parameters means that the models use a LambdaMART algorithm. The models
66
+ available in this dashboard are described below. More information about, for example, the specific training parameters can be found in the source code.
67
+ <br>
68
+ - **Light GBM 1:** First implementation of the lambdaMART algorithm. Hyperparameters are not tuned. Performs the worst of the three models.
69
+ - **Light GBM 2:** Hyperparameters are tuned by testing multiple hyperparameter values using trials. These trials try to optimize the NDCG@k score on a testing set. For this model, k is set to 10.
70
+ - **Light GBM 3:** Hyperparameters are tuned with different value ranges. Also, an unbalanced data flag is added. Finally, k is set to 3 to prioritize the top 3 ranking score. This model performs the best.
71
+ '''
72
+
73
+
74
+ comparison_1 = '''
75
+ **Classification vs Ranking**<br>
76
+ In order to make the classification and ranking models comparable, performance metrics like F1 and recall are calculated for both. To calculate these, the predicted relevance scores
77
+ are converted to a binary classification (0 for non-relevant and 1 for relevant) using the selected minimum relevance probabilities/scores. Therefore, the performance metrics are dependent
78
+ on the selected campaign, model and minimum relevance probabilities/scores. It should be noted that of all the performance metrics, the NDCG score is the best estimate of the ranking model performance.
79
+ More about this is explained below.<br><br>
80
+ Comparing the two techniques, the classification models seem to perform better in terms of individual item scoring. This means that the classification models are more accurate
81
+ in determining whether or not a single employee is relevant (0 or 1). This is evident when looking at the F1 score (harmonic mean of precision and recall), which is higher in the classification models
82
+ when compared to the F1 scores of the ranking models. However, this is to be expected as the learning-to-rank models are more focused on the relative ordering of the items (employees) instead of the most
83
+ accurate relevance score of each individual item. This means that metrics like F1, accuracy, precision and recall are less suitable for assessing the ranking performance of the model. The NDCG score is better
84
+ suited for this purpose.
85
+ '''
86
+
87
+ ranking_ndcg = '''
88
+ **Ranking (NDCG scores)**<br>
89
+ The ranking models are best evaluated by their NDCG@k scores. The k parameter is set to 3, as for this exercise, correctly ranking the top 3 relevant employees is most important. On this page, the average
90
+ NDCG scores of a full campaign can be found. This means that the individual lead NDCG scores are summed and divided by the total number of leads within the selected campaign. Some campaigns have a low average
91
+ campaign NDCG score, like campaign 256. This can be explained by the fact that this campaign has many leads with zero relevant employees in them. When this is the case, the NDCG score of such a lead is
92
+ likely 0. Of course, many of these 0 scores will lower the average NDCG score of the full campaign.
93
+ '''
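The campaign-average NDCG described in `ranking_ndcg` is plain arithmetic over the per-lead scores; a toy example (numbers invented, not taken from campaign 256) shows how a few zero-relevance leads drag the average down:

```python
# Per-lead NDCG@3 scores for a toy campaign; the two 0.0 leads had no relevant employees
lead_ndcg_scores = [1.0, 0.63, 0.0, 0.0]

campaign_avg = round(sum(lead_ndcg_scores) / len(lead_ndcg_scores), 3)
print(campaign_avg)  # 0.408 -- the zero-relevance leads pull the campaign average down
```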
data/OLD_data_dump_ai_assingment.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:499bdc1a2fc3a6a08cf917b48d49e43a0d85d59808a2f6b7140ea40ebddf86ee
3
+ size 23894789
data/data_dump_ai_assingment.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e25edcff8b8eeccd92cd456fa19609247644b5f2c4893f188b39111d6ff22847
3
+ size 18923746
ranking.ipynb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd1d85c31a928033f0e92cafd44c35507748cef8cc75589d5f6ef07d9a798890
3
+ size 9421959
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ lightgbm==4.1.0
2
+ numpy==1.25.1
3
+ pandas==2.0.3
4
+ scikit_learn==1.3.1
5
+ shiny==0.5.0
6
+ shinyswatch==0.2.4
7
+ torch==2.0.1
8
+ transformers==4.33.3