Commit
·
7e4c438
1
Parent(s):
b09510c
Added SMOTE k neighbors to hparam search
Browse files
notebooks/protac_degradation_predictor.py
CHANGED
|
@@ -707,7 +707,7 @@ def train_model(
|
|
| 707 |
learning_rate=2e-5,
|
| 708 |
max_epochs=50,
|
| 709 |
smiles_emb_dim=1024,
|
| 710 |
-
|
| 711 |
use_ored_activity=False if active_col == 'Active' else True,
|
| 712 |
fast_dev_run=False,
|
| 713 |
disabled_embeddings=[],
|
|
@@ -723,7 +723,7 @@ def train_model(
|
|
| 723 |
learning_rate (float): The learning rate.
|
| 724 |
max_epochs (int): The maximum number of epochs.
|
| 725 |
smiles_emb_dim (int): The dimension of the SMILES embeddings.
|
| 726 |
-
|
| 727 |
use_ored_activity (bool): Whether to use the ORED activity column.
|
| 728 |
fast_dev_run (bool): Whether to run a fast development run.
|
| 729 |
disabled_embeddings (list): The list of disabled embeddings.
|
|
@@ -731,7 +731,7 @@ def train_model(
|
|
| 731 |
Returns:
|
| 732 |
tuple: The trained model, the trainer, and the metrics.
|
| 733 |
"""
|
| 734 |
-
oversampler = SMOTE(k_neighbors=5, random_state=42)
|
| 735 |
train_ds = PROTAC_Dataset(
|
| 736 |
train_df,
|
| 737 |
protein_embeddings,
|
|
@@ -821,6 +821,7 @@ def objective(
|
|
| 821 |
batch_size_options,
|
| 822 |
learning_rate_options,
|
| 823 |
max_epochs_options,
|
|
|
|
| 824 |
fast_dev_run=False,
|
| 825 |
) -> float:
|
| 826 |
# Generate the hyperparameters
|
|
@@ -828,6 +829,7 @@ def objective(
|
|
| 828 |
batch_size = trial.suggest_categorical('batch_size', batch_size_options)
|
| 829 |
learning_rate = trial.suggest_loguniform('learning_rate', *learning_rate_options)
|
| 830 |
max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
|
|
|
|
| 831 |
|
| 832 |
# Train the model with the current set of hyperparameters
|
| 833 |
_, _, metrics = train_model(
|
|
@@ -837,6 +839,7 @@ def objective(
|
|
| 837 |
batch_size=batch_size,
|
| 838 |
learning_rate=learning_rate,
|
| 839 |
max_epochs=max_epochs,
|
|
|
|
| 840 |
fast_dev_run=fast_dev_run,
|
| 841 |
)
|
| 842 |
|
|
@@ -872,6 +875,7 @@ def hyperparameter_tuning_and_training(
|
|
| 872 |
batch_size_options = [8, 16, 32]
|
| 873 |
learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
|
| 874 |
max_epochs_options = [10, 20, 50]
|
|
|
|
| 875 |
|
| 876 |
# Create an Optuna study object
|
| 877 |
study = optuna.create_study(direction='minimize')
|
|
@@ -883,6 +887,7 @@ def hyperparameter_tuning_and_training(
|
|
| 883 |
batch_size_options,
|
| 884 |
learning_rate_options,
|
| 885 |
max_epochs_options,
|
|
|
|
| 886 |
fast_dev_run=fast_dev_run,),
|
| 887 |
n_trials=n_trials,
|
| 888 |
)
|
|
@@ -893,6 +898,7 @@ def hyperparameter_tuning_and_training(
|
|
| 893 |
best_batch_size = best_params['batch_size']
|
| 894 |
best_learning_rate = best_params['learning_rate']
|
| 895 |
best_max_epochs = best_params['max_epochs']
|
|
|
|
| 896 |
|
| 897 |
# Retrain the model with the best hyperparameters
|
| 898 |
model, trainer, metrics = train_model(
|
|
@@ -906,6 +912,13 @@ def hyperparameter_tuning_and_training(
|
|
| 906 |
fast_dev_run=fast_dev_run,
|
| 907 |
)
|
| 908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
# Return the best metrics
|
| 910 |
return model, trainer, metrics
|
| 911 |
|
|
|
|
| 707 |
learning_rate=2e-5,
|
| 708 |
max_epochs=50,
|
| 709 |
smiles_emb_dim=1024,
|
| 710 |
+
smote_k_neighbors=5,
|
| 711 |
use_ored_activity=False if active_col == 'Active' else True,
|
| 712 |
fast_dev_run=False,
|
| 713 |
disabled_embeddings=[],
|
|
|
|
| 723 |
learning_rate (float): The learning rate.
|
| 724 |
max_epochs (int): The maximum number of epochs.
|
| 725 |
smiles_emb_dim (int): The dimension of the SMILES embeddings.
|
| 726 |
+
smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
|
| 727 |
use_ored_activity (bool): Whether to use the ORED activity column.
|
| 728 |
fast_dev_run (bool): Whether to run a fast development run.
|
| 729 |
disabled_embeddings (list): The list of disabled embeddings.
|
|
|
|
| 731 |
Returns:
|
| 732 |
tuple: The trained model, the trainer, and the metrics.
|
| 733 |
"""
|
| 734 |
+
oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
|
| 735 |
train_ds = PROTAC_Dataset(
|
| 736 |
train_df,
|
| 737 |
protein_embeddings,
|
|
|
|
| 821 |
batch_size_options,
|
| 822 |
learning_rate_options,
|
| 823 |
max_epochs_options,
|
| 824 |
+
smote_k_neighbors_options,
|
| 825 |
fast_dev_run=False,
|
| 826 |
) -> float:
|
| 827 |
# Generate the hyperparameters
|
|
|
|
| 829 |
batch_size = trial.suggest_categorical('batch_size', batch_size_options)
|
| 830 |
learning_rate = trial.suggest_loguniform('learning_rate', *learning_rate_options)
|
| 831 |
max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
|
| 832 |
+
smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
|
| 833 |
|
| 834 |
# Train the model with the current set of hyperparameters
|
| 835 |
_, _, metrics = train_model(
|
|
|
|
| 839 |
batch_size=batch_size,
|
| 840 |
learning_rate=learning_rate,
|
| 841 |
max_epochs=max_epochs,
|
| 842 |
+
smote_k_neighbors=smote_k_neighbors,
|
| 843 |
fast_dev_run=fast_dev_run,
|
| 844 |
)
|
| 845 |
|
|
|
|
| 875 |
batch_size_options = [8, 16, 32]
|
| 876 |
learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
|
| 877 |
max_epochs_options = [10, 20, 50]
|
| 878 |
+
smote_k_neighbors_options = list(range(3, 16))
|
| 879 |
|
| 880 |
# Create an Optuna study object
|
| 881 |
study = optuna.create_study(direction='minimize')
|
|
|
|
| 887 |
batch_size_options,
|
| 888 |
learning_rate_options,
|
| 889 |
max_epochs_options,
|
| 890 |
+
smote_k_neighbors_options=smote_k_neighbors_options,
|
| 891 |
fast_dev_run=fast_dev_run,),
|
| 892 |
n_trials=n_trials,
|
| 893 |
)
|
|
|
|
| 898 |
best_batch_size = best_params['batch_size']
|
| 899 |
best_learning_rate = best_params['learning_rate']
|
| 900 |
best_max_epochs = best_params['max_epochs']
|
| 901 |
+
best_smote_k_neighbors = best_params['smote_k_neighbors']
|
| 902 |
|
| 903 |
# Retrain the model with the best hyperparameters
|
| 904 |
model, trainer, metrics = train_model(
|
|
|
|
| 912 |
fast_dev_run=fast_dev_run,
|
| 913 |
)
|
| 914 |
|
| 915 |
+
# Report the best hyperparameters found
|
| 916 |
+
metrics['hidden_dim'] = best_hidden_dim
|
| 917 |
+
metrics['batch_size'] = best_batch_size
|
| 918 |
+
metrics['learning_rate'] = best_learning_rate
|
| 919 |
+
metrics['max_epochs'] = best_max_epochs
|
| 920 |
+
metrics['smote_k_neighbors'] = best_smote_k_neighbors
|
| 921 |
+
|
| 922 |
# Return the best metrics
|
| 923 |
return model, trainer, metrics
|
| 924 |
|