Commit f83431c (1 parent: 2a1554b)
add all files except models
Files changed:
- .gitattributes (+3, -2)
- .vscode/settings.json (+10, -0)
- Dockerfile (+13, -0)
- classification.ipynb (+3, -0)
- dashboard/app.py (+373, -0)
- dashboard/modules/classification.py (+190, -0)
- dashboard/modules/lead_ids.py (+21, -0)
- dashboard/modules/ranking.py (+209, -0)
- dashboard/modules/support_texts.py (+93, -0)
- data/OLD_data_dump_ai_assingment.parquet (+3, -0)
- data/data_dump_ai_assingment.parquet (+3, -0)
- ranking.ipynb (+3, -0)
- requirements.txt (+8, -0)
.gitattributes
CHANGED
@@ -31,5 +31,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
-
-*.
+*.ff1pkl filter=lfs diff=lfs merge=lfs -text
+*.sqlite filter=lfs diff=lfs merge=lfs -text
+*.ipynb filter=lfs diff=lfs merge=lfs -text
.vscode/settings.json
ADDED
@@ -0,0 +1,10 @@
{
    "python.analysis.typeCheckingMode": "off",
    "cSpell.words": [
        "iloc",
        "lgbm",
        "NDCG",
        "proba",
        "XGBM"
    ]
}
Dockerfile
ADDED
@@ -0,0 +1,13 @@
FROM python:3.9

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

EXPOSE 7860

CMD ["shiny", "run", "dashboard/app.py", "--host", "0.0.0.0", "--port", "7860"]
classification.ipynb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:28d178d125452857cecdbe5816e5e889a25b2c194a69a071a0f7965f1e298a46
size 105271651
dashboard/app.py
ADDED
@@ -0,0 +1,373 @@
import os
from shiny import App, ui, render, reactive, module
from shiny import experimental as x
import shinyswatch
import pandas as pd
from modules import lead_ids, classification, ranking, support_texts

# Define lead options
leads_8 = lead_ids.leads_8
leads_123 = lead_ids.leads_123
leads_256 = lead_ids.leads_256

app_ui = ui.page_fluid(
    shinyswatch.theme.minty(),
    ui.panel_title("Lead Recommender System"),
    ui.layout_sidebar(
        ui.panel_sidebar(
            ui.input_checkbox(
                "explanation_select", "Show explanation", True
            ),
            ui.markdown("**Data**"),
            ui.input_select(
                "campaign_select", "Select campaign id:",
                choices = ["8", "123", "256"],
                selected = "8"
            ),
            ui.input_select(
                "lead_select", "Select lead id:",
                choices = leads_8
            ),
            ui.HTML("<br>"),
            ui.markdown("**Classification**"),
            ui.input_select(
                "model_select", "Select classification model:",
                choices = ["BERT", "XGB"],
                selected = "BERT"
            ),
            ui.input_select(
                "proba_cutoff_select", "Select minimum relevance probability (%):",
                choices = [50, 60, 65, 70, 75, 80, 85, 90, 95],
                selected = 80
            ),
            ui.HTML("<br>"),
            ui.markdown("**Ranking**"),
            ui.input_select(
                "model_select_rank", "Select ranking model:",
                choices = ["Light XGBM 1", "Light XGBM 2", "Light XGBM 3"],
                selected = "Light XGBM 3"
            ),
            ui.input_select(
                "rank_cutoff_select", "Select minimum prediction score (%):",
                choices = [50, 60, 65, 70, 75, 80, 85, 90, 95],
                selected = 80
            ),
            width=2.5
        ),

        ui.navset_tab(
            ui.nav("Classification",
                ui.panel_main(
                    ui.panel_conditional("input.explanation_select",
                        ui.row(
                            ui.column(
                                12,
                                x.ui.card(
                                    ui.markdown(support_texts.classification_intro_1),
                                    ui.markdown(support_texts.classification_intro_2),
                                    ui.markdown(support_texts.classification_intro_3),
                                ),
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown("**Model classification performance of selected lead**"),
                                ui.output_data_frame("performance_metrics"),
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown("**Top 3 relevant employees in selected lead**"),
                                ui.output_data_frame("predictions_123"),
                            ),
                            x.ui.card(
                                ui.markdown("**All employees in selected lead**"),
                                ui.output_data_frame("predictions"),
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown("**Model descriptions and testing performance**"),
                                ui.markdown(support_texts.models_test_classification_1),
                                ui.output_data_frame("models_test_classification"),
                                ui.markdown(support_texts.models_test_classification_2)
                            ),
                        ),
                    ),
                ),
            ),

            ui.nav("Ranking",
                ui.panel_main(
                    ui.panel_conditional("input.explanation_select",
                        ui.row(
                            ui.column(
                                12,
                                x.ui.card(
                                    ui.markdown(support_texts.ranking_intro_1),
                                    ui.markdown(support_texts.ranking_intro_2),
                                    ui.markdown(support_texts.ranking_intro_3),
                                ),
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown("**Model ranking performance of selected lead**"),
                                ui.output_data_frame("ndcg_score")
                            ),
                            x.ui.card(
                                ui.markdown("**Top 3 relevant employees in selected lead**"),
                                ui.output_data_frame("predictions_123_rank"),
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown("**Ranking of all employees in selected lead**"),
                                ui.output_data_frame("predictions_rank")
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown("**Model descriptions and testing performance**"),
                                ui.markdown("The table below shows the model performance on a larger testing dataset (does not contain the campaigns in this dashboard)"),
                                ui.output_data_frame("models_test_ranking"),
                                ui.markdown(support_texts.models_test_ranking_1)
                            ),
                        ),
                    ),
                ),
            ),

            ui.nav("Comparison",
                ui.panel_main(
                    ui.row(
                        ui.markdown("**Model performance comparison (campaign-level)**"),
                        ui.column(
                            6,
                            x.ui.card(
                                ui.markdown("**Classification model performance**"),
                                ui.output_data_frame("performance_metrics_campaign"),
                                ui.markdown("<br><br>")
                            ),
                        ),
                        ui.column(
                            6,
                            x.ui.card(
                                ui.markdown("**Ranking model performance**"),
                                ui.output_data_frame("campaign_rank")
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown(support_texts.comparison_1)
                            ),
                        ),
                    ),
                    ui.row(
                        ui.column(
                            12,
                            x.ui.card(
                                ui.markdown(support_texts.ranking_ndcg)
                            ),
                        ),
                    ),
                ),
            ),
        ),
    ),
)

def server(input, output, session):
    # Updating lead id selection list
    @reactive.Effect()
    def _():
        if input.campaign_select() == "8":
            lead_options = leads_8
        elif input.campaign_select() == "123":
            lead_options = leads_123
        elif input.campaign_select() == "256":
            lead_options = leads_256

        ui.update_select("lead_select",
                         label="Select lead id:",
                         choices=lead_options
                         )

        ui.update_switch

    # Get classification data for a single lead
    @reactive.Calc
    def get_lead_predictions():
        try:
            df, df_123, df_performance_metrics = classification.classify(
                CAMPAIGN_ID=int(input.campaign_select()),
                LEAD_ID=int(input.lead_select()),
                proba_cutoff=int(input.proba_cutoff_select()),
                model_type=input.model_select())
            return df, df_123, df_performance_metrics

        except TypeError:
            pass

        except Exception:
            ui.notification_show("Data is still loading or something went wrong", duration=3, type="error")

    # Get classification data for the full campaign
    @reactive.Calc
    def get_campaign_predictions():
        try:
            df, df_123, df_performance_metrics = classification.classify(
                CAMPAIGN_ID=int(input.campaign_select()),
                LEAD_ID=int(input.lead_select()),
                proba_cutoff=int(input.proba_cutoff_select()),
                model_type=input.model_select(),
                full_campaign=True)
            return df, df_123, df_performance_metrics

        except TypeError:
            pass

        except Exception:
            ui.notification_show("Data is still loading or something went wrong", duration=3, type="error")

    # Get ranking data
    @reactive.Calc
    def get_lead_ranking():
        try:
            df, df_123, ndcg, df_campaign_rank = ranking.rank_single_lead(
                CAMPAIGN_ID=int(input.campaign_select()),
                LEAD_ID=int(input.lead_select()),
                rank_cutoff=int(input.rank_cutoff_select()),
                ranker=input.model_select_rank())
            return df, df_123, ndcg, df_campaign_rank

        except TypeError:
            pass

        except Exception:
            ui.notification_show("Data is still loading or something went wrong", duration=3, type="error")

    @output
    @render.data_frame
    def predictions_123():
        try:
            df, df_123, df_performance_metrics = get_lead_predictions()
            return df_123
        except TypeError:
            pass

    @output
    @render.data_frame
    def predictions():
        try:
            df, df_123, df_performance_metrics = get_lead_predictions()
            return df
        except TypeError:
            pass

    @output
    @render.data_frame
    def performance_metrics():
        try:
            df, df_123, df_performance_metrics = get_lead_predictions()
            return df_performance_metrics
        except TypeError:
            pass

    @output
    @render.data_frame
    def performance_metrics_campaign():
        try:
            df, df_123, df_performance_metrics = get_campaign_predictions()
            return df_performance_metrics
        except TypeError:
            pass

    @output
    @render.data_frame
    def predictions_123_rank():
        try:
            df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
            return df_123
        except TypeError:
            pass

    @output
    @render.data_frame
    def predictions_rank():
        try:
            df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
            return df
        except TypeError:
            pass

    @output
    @render.data_frame
    def ndcg_score():
        try:
            df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
            return ndcg
        except TypeError:
            pass

    @output
    @render.data_frame
    def campaign_rank():
        try:
            df, df_123, ndcg, df_campaign_rank = get_lead_ranking()
            return df_campaign_rank
        except TypeError:
            pass

    # Model performance tables (test set)
    @output
    @render.data_frame
    def models_test_classification():
        try:
            models_test_classification_data = {'Model': ['BERT', 'XGB'],
                                               'F1 weighted': [0.84, 0.81],
                                               'F1': [0.35, 0.35],
                                               'Accuracy': [0.80, 0.76],
                                               'Recall': [0.65, 0.78],
                                               'Precision': [0.24, 0.22]}
            df_models_test_classification = pd.DataFrame.from_dict(data=models_test_classification_data, orient='index', columns=models_test_classification_data['Model'])
            df_models_test_classification = df_models_test_classification.iloc[1:]
            df_models_test_classification.reset_index(inplace=True, names=['Metric'])
            return df_models_test_classification
        except TypeError:
            pass

    @output
    @render.data_frame
    def models_test_ranking():
        try:
            models_test_ranking_data = {'Model': ['Light XGBM 1', 'Light XGBM 2', 'Light XGBM 3'],
                                        'NDCG@k score': [0.652, 0.2, 0.948],
                                        'k': [3, 3, 3]}
            df_models_test_ranking = pd.DataFrame.from_dict(data=models_test_ranking_data, orient='index', columns=models_test_ranking_data['Model'])
            df_models_test_ranking = df_models_test_ranking.iloc[1:]
            df_models_test_ranking.reset_index(inplace=True, names=['Metric'])
            return df_models_test_ranking
        except TypeError:
            pass

app = App(app_ui, server)
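The server functions above all follow the same pattern: a cached `@reactive.Calc` feeds one or more `@render.data_frame` outputs. The following minimal sketch of that pattern is not part of the commit; the input id, data, and names are made up for illustration, using the same shiny-for-Python calls the app itself relies on.

```python
import pandas as pd
from shiny import App, reactive, render, ui

sketch_ui = ui.page_fluid(
    ui.input_select("n_select", "Number of rows:", choices=["3", "5", "10"], selected="5"),
    ui.output_data_frame("preview"),
)

def sketch_server(input, output, session):
    @reactive.Calc
    def computed_frame():
        # Recomputed only when n_select changes; every output that calls it reuses the cached result.
        n = int(input.n_select())
        return pd.DataFrame({"ranking": range(1, n + 1),
                             "score": [round(1 / i, 3) for i in range(1, n + 1)]})

    @output
    @render.data_frame
    def preview():
        return computed_frame()

sketch_app = App(sketch_ui, sketch_server)  # hypothetical standalone app; run with `shiny run`
```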
dashboard/modules/classification.py
ADDED
@@ -0,0 +1,190 @@
# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')


# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)==True]


# Clean text
def clean_text(text):
    # Use a regular expression to remove non-alphanumeric characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # Remove multiple consecutive spaces and leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())

    # Lowercase the text
    cleaned_text = cleaned_text.lower()

    return cleaned_text


def combine_text(df_single_lead):
    # Changing column types
    df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
    df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
    df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')

    # Combine text columns
    df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'

    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(lambda row: clean_text(row))

    return df_single_lead


# Function to test model performance
def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
    text_test = X_test.to_list()
    labels_test = y_test.to_list()

    # Split the test data into batches to prevent large memory allocation
    batch_size = batch_size
    num_samples = len(text_test)
    num_batches = (num_samples + batch_size - 1) // batch_size  # Calculate the number of batches

    # Initialize an empty list to store predicted labels
    predicted_labels_test = []

    # Initialize an empty list to store predicted probabilities
    predicted_proba_test = []

    # Iterate over batches
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, num_samples)

        # Get a batch of text and labels
        batch_text = text_test[start_idx:end_idx]
        batch_labels = labels_test[start_idx:end_idx]

        # Encode the batch
        encoded_data = tokenizer(batch_text, padding=True, truncation=True, return_tensors='pt')

        # Forward pass through the model
        logits = model(encoded_data['input_ids'], attention_mask=encoded_data['attention_mask']).logits

        # Get predicted labels for the batch
        batch_predicted_labels = logits.argmax(dim=1).tolist()

        # Append the batch predictions to the overall list
        predicted_labels_test.extend(batch_predicted_labels)

        # Apply softmax to logits to retrieve probabilities and put them in a cleaned list
        softmax_proba = F.softmax(logits, dim=-1)
        batch_predicted_proba = [tensor.tolist() for tensor in softmax_proba]

        # Append the batch probabilities to the overall list
        predicted_proba_test.extend(batch_predicted_proba)

    return predicted_labels_test, predicted_proba_test


# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    f1_weighted = round(f1_score(true_labels, predictions, average='weighted'), 3)
    f1 = round(f1_score(true_labels, predictions), 3)
    accuracy = round(accuracy_score(true_labels, predictions), 3)
    recall = round(recall_score(true_labels, predictions, zero_division=np.nan), 3)
    precision = round(precision_score(true_labels, predictions, zero_division=np.nan), 3)
    performance_metrics = {
        'F1 weighted': f1_weighted,
        'F1': f1,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision
    }

    return performance_metrics


# Loading XGB model
with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
    xgb_model_tuned_2 = pickle.load(model_file)

# Loading XGB vectorizer
with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)


# Loading BERT tokenizer
distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')

# Loading BERT model
distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
    'models/distil_bert_tuned_2', num_labels=2)


# Classify single lead data
def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):

    if full_campaign == True:
        # Select full campaign data
        df = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID)]
    else:
        # Selecting single lead data
        df = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID) & (df_final_testing['lead_id']==LEAD_ID)]

    # True labels
    true_labels = df['employee_is_selected'].tolist()

    # Combining text columns
    df = combine_text(df)

    # Vectorize text with tfidf vectorizer
    tfidf_matrix = vectorizer.transform(df['combined_text'])

    # Selecting model
    if model_type=='XGB':
        model = xgb_model_tuned_2
        # Predictions
        predictions = model.predict(tfidf_matrix)
        # Prediction probabilities of being 1 (selected)
        predictions_proba_1 = model.predict_proba(tfidf_matrix)[:, 1].tolist()

    elif model_type=='BERT':
        predictions, predicted_test_proba = model_predict(model = distil_bert_model_tuned_2,
                                                          tokenizer = distil_bert_tokenizer_tuned_2,
                                                          X_test = df['combined_text'],
                                                          y_test = df['employee_is_selected'])
        # Prediction probabilities of being 1 (selected)
        predictions_proba_1 = [lists[1] for lists in predicted_test_proba]

    # Alter predictions based on the proba_cutoff value
    cutoff_predictions = [1 if probability >= (proba_cutoff/100) else 0 for probability in predictions_proba_1]

    # Use argsort to get the indices that would sort the list in descending order
    sorted_indices = np.argsort(predictions_proba_1)[::-1]

    # Create dataframe columns and ranking
    df['cutoff_prediction'] = cutoff_predictions
    df['prediction_proba_1'] = predictions_proba_1
    df = df.sort_values(by='prediction_proba_1', ascending=False)
    df['ranking'] = [i+1 for i in range(len(df['prediction_proba_1']))]
    df['prediction_proba_1'] = df['prediction_proba_1'].round(3)

    df = df[['ranking', 'prediction_proba_1', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction']].sort_values(by='prediction_proba_1', ascending=False)
    df_123 = df[(df['ranking'].isin([1, 2, 3])) & (df['cutoff_prediction'] == 1)].sort_values(by='ranking')

    performance_metrics = compute_metrics(cutoff_predictions, true_labels)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Score'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    return df, df_123, df_performance_metrics
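For reference, a hypothetical usage sketch of `classify()` as dashboard/app.py calls it. It is not part of the commit; the lead id is a placeholder (real ids come from modules/lead_ids.py), and it assumes the module is imported with dashboard/ on the Python path and the data/ and models/ folders available relative to the working directory.

```python
from modules import classification

df_all, df_top3, df_metrics = classification.classify(
    CAMPAIGN_ID=8,        # one of the held-out campaigns [8, 123, 256]
    LEAD_ID=12345,        # placeholder lead id; real ids come from modules/lead_ids.py
    proba_cutoff=80,      # percent; probabilities >= 0.80 become cutoff_prediction 1
    model_type="XGB",     # "XGB" or "BERT"
)

print(df_metrics)         # F1 weighted, F1, Accuracy, Recall, Precision for this lead
print(df_top3)            # at most 3 rows: ranked 1-3 and above the probability cutoff
```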
dashboard/modules/lead_ids.py
ADDED
@@ -0,0 +1,21 @@
import pandas as pd

parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)==True].copy()

def get_unique_lead_ids(df, campaign_id):
    df_campaign = df[df['campaign_id'] == campaign_id].copy()
    lead_ids = list(df_campaign['lead_id'].unique())
    return lead_ids

leads_8 = get_unique_lead_ids(df_final_testing, 8)
leads_123 = get_unique_lead_ids(df_final_testing, 123)
leads_256 = get_unique_lead_ids(df_final_testing, 256)

leads_8 = [str(i) for i in leads_8]
leads_123 = [str(i) for i in leads_123]
leads_256 = [str(i) for i in leads_256]
dashboard/modules/ranking.py
ADDED
@@ -0,0 +1,209 @@
# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ndcg_score


# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')


# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)==True]


# Clean text
def clean_text(text):
    # Use a regular expression to remove non-alphanumeric characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # Remove multiple consecutive spaces and leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())

    # Lowercase the text
    cleaned_text = cleaned_text.lower()

    return cleaned_text


def combine_text(df_single_lead):
    # Changing column types
    df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
    df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
    df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')

    # Combine text columns
    df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'

    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(lambda row: clean_text(row))

    return df_single_lead


# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    f1_weighted = round(f1_score(true_labels, predictions, average='weighted'), 3)
    f1 = round(f1_score(true_labels, predictions), 3)
    accuracy = round(accuracy_score(true_labels, predictions), 3)
    recall = round(recall_score(true_labels, predictions, zero_division=np.nan), 3)
    precision = round(precision_score(true_labels, predictions, zero_division=np.nan), 3)
    performance_metrics = {
        'F1 weighted': f1_weighted,
        'F1': f1,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision
    }

    return performance_metrics


# Loading LGBM models
with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
    lgbm_model_1 = pickle.load(model_file)

with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
    lgbm_model_2 = pickle.load(model_file)

with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
    lgbm_model_3 = pickle.load(model_file)

# Loading LGBM vectorizer
with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)


# Rank the whole campaign (per lead group) to obtain max and min scores used for scaling prediction scores
# Function to properly test a model on the test set by calculating the score per group
def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
    # Create empty lists to store predictions and true labels for each query group (lead id groups)
    campaign_predictions = []
    campaign_predictions_cutoff = []
    campaign_true_labels = []
    campaign_ndcg_scores = []

    campaign_data = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID)]
    query_group_ids = campaign_data['lead_id']

    # Iterate over query groups (in this case lead ids)
    lead_ids = np.unique(query_group_ids)
    for lead_id in lead_ids:
        # Filter the data for the specific lead_id
        single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]

        # Only predict a ranking if the lead contains more than 1 employee
        if len(single_lead_data) > 1:

            single_lead_data = combine_text(single_lead_data)

            # Preprocess the text features for the single lead
            single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])

            # Predict single lead scores
            single_lead_pred = ranker.predict(single_lead_tfidf)

            # Store predictions
            campaign_predictions.extend(single_lead_pred)
            campaign_true_labels.extend(single_lead_data['employee_is_selected'])

            # Store lead NDCG score
            # k is 3 unless single lead data has less than 4 items
            if len(single_lead_data) < 4:
                k = len(single_lead_data)
            else:
                k = 3

            ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k)
            campaign_ndcg_scores.append(ndcg_lead)

        else:
            pass

    # Get max and min value of campaign prediction scores
    campaign_predictions_max = max(campaign_predictions)
    campaign_predictions_min = min(campaign_predictions)

    # Scale predicted scores between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_predictions_scaled = [(prediction - campaign_predictions_min) / (campaign_predictions_max - campaign_predictions_min) for prediction in campaign_predictions]

    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff/100) else 0 for prediction in campaign_predictions_scaled]

    # Get performance metrics using binary cutoff_predictions
    performance_metrics = compute_metrics(true_labels=campaign_true_labels, predictions=cutoff_predictions)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Value'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    # Get average NDCG score
    ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores), 3)
    df_campaign_ndcg = {'NDCG@k score': ndcg_avg, 'k': k}
    df_campaign_ndcg = pd.DataFrame.from_dict(data=df_campaign_ndcg, orient='index', columns=['Value'])
    df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])

    # Merge performance metrics and average NDCG score
    df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg], ignore_index=True)

    return campaign_predictions_max, campaign_predictions_min, df_campaign_rank


# Rank single lead
def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
    if ranker == "Light XGBM 1":
        ranker = lgbm_model_1
    elif ranker == "Light XGBM 2":
        ranker = lgbm_model_2
    elif ranker == "Light XGBM 3":
        ranker = lgbm_model_3

    # Selecting single lead data and combining the text columns used for ranking
    single_lead_data = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID) & (df_final_testing['lead_id']==LEAD_ID)]
    single_lead_data = combine_text(single_lead_data)

    # Preprocess the text features for the single lead
    single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])

    # Predict single lead
    single_lead_pred = ranker.predict(single_lead_tfidf)
    single_lead_data['predicted_score'] = single_lead_pred

    # Scale predicted scores between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(CAMPAIGN_ID, ranker, rank_cutoff)
    single_lead_data['scaled_predicted_score'] = (single_lead_data['predicted_score'] - campaign_min_value) / (campaign_max_value - campaign_min_value)

    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff/100) else 0 for prediction in single_lead_data['scaled_predicted_score']]
    single_lead_data['cutoff_prediction'] = cutoff_predictions

    # Rank employees and create output dataframe
    ranked_list = [i+1 for i in range(len(single_lead_data['predicted_score']))]
    single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
    single_lead_data['ranking'] = ranked_list
    single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
    single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
    single_lead_data = single_lead_data[['ranking', 'scaled_predicted_score', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction', 'predicted_score']]

    # Top 3 dataframe
    df_123 = single_lead_data[(single_lead_data['ranking'].isin([1, 2, 3])) & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')

    # k is 3 unless single lead data has less than 4 items
    if len(single_lead_data) < 4:
        k = len(single_lead_data)
    else:
        k = 3

    # Compute NDCG score (labels and scores both taken from the same, sorted dataframe so they stay aligned)
    ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_data['predicted_score']], k=k), 3)

    df_ndcg_data = {'NDCG@k score': ndcg, 'k': k}
    df_ndcg = pd.DataFrame.from_dict(data=df_ndcg_data, orient='index', columns=['Value'])
    df_ndcg.reset_index(inplace=True, names=['Metric'])

    # Print data and overall ndcg score
    #print(f'NDCG Score on Test Data: {ndcg:.4f}')
    return single_lead_data, df_123, df_ndcg, df_campaign_rank
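The scaling and cutoff step inside `rank_single_lead()` is easiest to see in isolation. A minimal, self-contained sketch with made-up ranker scores (not part of the commit):

```python
import numpy as np

# Hypothetical raw ranker outputs for a whole campaign and for one of its leads.
campaign_scores = np.array([-1.2, 0.4, 2.1, 3.5])
lead_scores = np.array([0.4, 2.1, 3.5])

# Min-max scale the lead's scores against the campaign-wide score range,
# mirroring rank_single_lead(): scaled scores become comparable across leads.
lo, hi = campaign_scores.min(), campaign_scores.max()
scaled = (lead_scores - lo) / (hi - lo)

# Binarise with the sidebar cutoff (in percent), mirroring the cutoff_predictions step.
rank_cutoff = 80
cutoff_predictions = (scaled >= rank_cutoff / 100).astype(int)

print(scaled.round(3))     # approximately [0.34, 0.702, 1.0]
print(cutoff_predictions)  # [0 0 1]
```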
dashboard/modules/support_texts.py
ADDED
@@ -0,0 +1,93 @@
classification_intro_1 = '''
**Model performance**<br>
The first table shows the model performance metrics of the current selection. In other words, how the BERT or XGB classification model performs on the current lead selection.
These performance metrics are calculated by comparing the 'employee_is_selected' column (true labels) to the 'cutoff_prediction' column (predicted labels). The model performance is also dependent
on the minimum relevance probability selected by the user in the sidebar. Important to note, the models were not trained on the campaigns and accompanying leads shown in this dashboard.
This means that these campaigns are 'new' for the models. Model descriptions can be found at the bottom of the page.
'''

classification_intro_2 = '''
**Top 3**<br>
The second table below shows the top 3 relevant employees. It's possible that the top 3 contains fewer than 3 employees, as the relevance probability needs to be above the minimum selected
in the sidebar. In the third table, all employees within the selected lead can be found, including the top 3.
'''

classification_intro_3 = '''
**Prediction/relevance probabilities**<br>
The predicted employee probability/relevance scores can be found in the 'prediction_proba_1' column of the tables. Again, how many of the employees show up in the first top 3 table
depends on the selected minimum probability percentage in the sidebar.
'''

models_test_classification_1 = '''
The model performance metrics below are derived from testing the models on a testing set (this set does not contain the campaigns in this dashboard). Note, these metrics are calculated with a standard probability
cutoff value of 0.5. A higher cutoff value like 0.8 tends to increase model performance for all models.
'''

models_test_classification_2 = '''
The classification models are based on two frameworks. The first is XGBoost, an optimized distributed gradient boosting library. Multiple models were created and only the best performing model is
added to this dashboard. The model variations differ in, for example, how they deal with the unbalanced training data and their hyperparameters. The XGB model added to this dashboard uses oversampling to deal
with the unbalanced data and grid search cross validation to optimize the hyperparameters.
<br><br>
The other classification model uses a BERT transformer to vectorize and classify the text variables used in training. In this approach, the PyTorch library is used to fine-tune the pre-trained BERT
transformer on the training data. Again, multiple models were created and the best performing model is added to this dashboard. This BERT model uses a data imbalance ratio and optimizes the F1 score.
For specifics regarding all models, see the source code.
<br><br>
Comparing the XGB and BERT models, BERT performs slightly better when looking at the weighted F1 score. However, the 'normal' F1 score (harmonic mean of precision and recall) is the same for both models.
That being said, XGB is significantly more efficient. This efficiency difference is not noticeable when classifying small datasets like single leads, but becomes evident when trying to classify larger datasets
like full campaigns on the 'Comparison' tab of this dashboard.
'''


ranking_intro_1 = '''
**Model performance**<br>
The first table shows the model performance metrics of the current selection. For the ranking models, this is the NDCG@k score. Here, the k parameter is set to 3, as for this exercise,
correctly ranking the top 3 relevant employees is most important. The NDCG@k metric is calculated by comparing the 'employee_is_selected' column (true labels) to the 'predicted_score'
column (predicted labels). Note that the predicted scores are not binary like the true labels. Model descriptions can be found at the bottom of the page.
'''

ranking_intro_2 = '''
**Top 3**<br>
The second table shows the top 3 relevant employees. As the 'predicted_score' column contains values with different ranges from lead to lead, these scores are scaled between 0 and 1 to make them
comparable across leads (scaling is explained below). These scaled scores are then used to determine a top 3 relevant employee ranking by checking whether the scaled score is above the minimum
prediction score selected by the user in the sidebar. Note that ranking models like these prioritize the correct employee order above individual employee relevance scoring. Therefore, the top 3 and its dependence on
the selected minimum relevance probability is a bit less important compared to the total ranking order in the third table.
'''

ranking_intro_3 = '''
**Predicted relevance scores and score scaling**<br>
The predicted employee relevance scores can be found in the 'predicted_score' column of the tables. The predicted score range of the whole campaign is used to scale the predicted scores in single leads.
The scores are scaled between values of 0 and 1. This approach means that the score scaling is dependent on the campaign the lead falls into. The scaled relevance scores are stored in the 'scaled_predicted_score'
column of the tables. Again, how many of the employees show up in the first top 3 table depends on the selected minimum prediction score in the sidebar.
'''

models_test_ranking_1 = '''
The ranking models are based on LightGBM, which is a gradient boosting framework that uses tree-based learning algorithms. From this framework, models are created using the LGBMRanker. Here, the objective
parameter is set to 'lambdarank' and the boosting type parameter is set to 'Gradient Boosting Decision Tree'. The combination of these two parameters means that the models use a LambdaMART algorithm. The models
available in this dashboard are described below. More information about, for example, the specific training parameters can be found in the source code.
<br>
- **Light XGBM 1:** First implementation of the LambdaMART algorithm. Hyperparameters are not tuned. Performs the worst of the three models.
- **Light XGBM 2:** Hyperparameters are tuned by testing multiple hyperparameter values using trials. These trials try to optimize the NDCG@k score on a testing set. For this model, k is set to 10.
- **Light XGBM 3:** Hyperparameters are tuned with different value ranges. Also, an unbalanced data flag is added. Finally, k is set to 3 to prioritize the top 3 ranking score. This model performs the best.
'''


comparison_1 = '''
**Classification vs Ranking**<br>
In order to make the classification and ranking models comparable, performance metrics like F1 and recall are calculated for both. To calculate these, the predicted relevance scores
are converted to a binary classification (0 for non-relevant and 1 for relevant) with the use of the selected minimum relevance probabilities/scores. Therefore, the performance metrics are dependent
on the selected campaign, model and minimum relevance probabilities/scores. It should be noted that of all the performance metrics, the NDCG score is best suited to estimate the ranking model performance.
More about this is explained below.<br><br>
Comparing the two techniques, the classification models seem to perform better in terms of individual item scoring. This means that the classification models are more accurate
in determining whether or not a single employee is relevant (0 or 1). This is evident when looking at the F1 score (harmonic mean of the precision and recall score), which is higher for the classification models
when compared to the F1 scores of the ranking models. However, this is to be expected, as the learning-to-rank models are more focused on the relative ordering of the items (employees) than on the most
accurate relevance score of each individual item. This means that metrics like F1, accuracy, precision and recall are less suitable for assessing the ranking performance of the model. The NDCG score is better
suited for this purpose.
'''

ranking_ndcg = '''
**Ranking (NDCG scores)**<br>
The ranking models are best evaluated by their NDCG@k scores. The k parameter is set to 3, as for this exercise, correctly ranking the top 3 relevant employees is most important. On this page, the average
NDCG scores of a full campaign can be found. This means that the individual lead NDCG scores are summed and divided by the total number of leads within the selected campaign. That some campaigns have low average
NDCG scores, as with campaign 256, can be explained by the fact that these campaigns contain many leads with zero relevant employees in them. When this is the case, the NDCG score of such a lead is
likely 0. Of course, many of these 0 scores will lower the average NDCG score of the full campaign.
'''
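A small worked sketch of the NDCG@3 metric referenced throughout these texts, using the same sklearn function as dashboard/modules/ranking.py. The labels and scores are made up for illustration.

```python
from sklearn.metrics import ndcg_score

# Made-up data for one lead: binary true relevance and the ranker's predicted scores.
employee_is_selected = [[0, 1, 0, 0, 1]]
predicted_scores = [[0.2, 0.9, 0.1, 0.4, 0.3]]

# One selected employee lands at rank 1 and the other at rank 3 -> NDCG@3 is roughly 0.92.
print(round(ndcg_score(y_true=employee_is_selected, y_score=predicted_scores, k=3), 3))

# A lead with no selected employees at all scores 0, which is what pulls down the
# campaign-level averages mentioned above for campaign 256.
print(ndcg_score(y_true=[[0, 0, 0]], y_score=[[0.5, 0.2, 0.1]], k=3))
```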
data/OLD_data_dump_ai_assingment.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:499bdc1a2fc3a6a08cf917b48d49e43a0d85d59808a2f6b7140ea40ebddf86ee
size 23894789
data/data_dump_ai_assingment.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e25edcff8b8eeccd92cd456fa19609247644b5f2c4893f188b39111d6ff22847
size 18923746
ranking.ipynb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dd1d85c31a928033f0e92cafd44c35507748cef8cc75589d5f6ef07d9a798890
size 9421959
requirements.txt
ADDED
@@ -0,0 +1,8 @@
lightgbm==4.1.0
numpy==1.25.1
pandas==2.0.3
scikit_learn==1.3.1
shiny==0.5.0
shinyswatch==0.2.4
torch==2.0.1
transformers==4.33.3