Commit
·
7e4c438
1
Parent(s):
b09510c
Added SMOTE k neighbors to hparam search
Browse files
notebooks/protac_degradation_predictor.py
CHANGED
@@ -707,7 +707,7 @@ def train_model(
|
|
707 |
learning_rate=2e-5,
|
708 |
max_epochs=50,
|
709 |
smiles_emb_dim=1024,
|
710 |
-
|
711 |
use_ored_activity=False if active_col == 'Active' else True,
|
712 |
fast_dev_run=False,
|
713 |
disabled_embeddings=[],
|
@@ -723,7 +723,7 @@ def train_model(
|
|
723 |
learning_rate (float): The learning rate.
|
724 |
max_epochs (int): The maximum number of epochs.
|
725 |
smiles_emb_dim (int): The dimension of the SMILES embeddings.
|
726 |
-
|
727 |
use_ored_activity (bool): Whether to use the ORED activity column.
|
728 |
fast_dev_run (bool): Whether to run a fast development run.
|
729 |
disabled_embeddings (list): The list of disabled embeddings.
|
@@ -731,7 +731,7 @@ def train_model(
|
|
731 |
Returns:
|
732 |
tuple: The trained model, the trainer, and the metrics.
|
733 |
"""
|
734 |
-
oversampler = SMOTE(k_neighbors=5, random_state=42)
|
735 |
train_ds = PROTAC_Dataset(
|
736 |
train_df,
|
737 |
protein_embeddings,
|
@@ -821,6 +821,7 @@ def objective(
|
|
821 |
batch_size_options,
|
822 |
learning_rate_options,
|
823 |
max_epochs_options,
|
|
|
824 |
fast_dev_run=False,
|
825 |
) -> float:
|
826 |
# Generate the hyperparameters
|
@@ -828,6 +829,7 @@ def objective(
|
|
828 |
batch_size = trial.suggest_categorical('batch_size', batch_size_options)
|
829 |
learning_rate = trial.suggest_loguniform('learning_rate', *learning_rate_options)
|
830 |
max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
|
|
|
831 |
|
832 |
# Train the model with the current set of hyperparameters
|
833 |
_, _, metrics = train_model(
|
@@ -837,6 +839,7 @@ def objective(
|
|
837 |
batch_size=batch_size,
|
838 |
learning_rate=learning_rate,
|
839 |
max_epochs=max_epochs,
|
|
|
840 |
fast_dev_run=fast_dev_run,
|
841 |
)
|
842 |
|
@@ -872,6 +875,7 @@ def hyperparameter_tuning_and_training(
|
|
872 |
batch_size_options = [8, 16, 32]
|
873 |
learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
|
874 |
max_epochs_options = [10, 20, 50]
|
|
|
875 |
|
876 |
# Create an Optuna study object
|
877 |
study = optuna.create_study(direction='minimize')
|
@@ -883,6 +887,7 @@ def hyperparameter_tuning_and_training(
|
|
883 |
batch_size_options,
|
884 |
learning_rate_options,
|
885 |
max_epochs_options,
|
|
|
886 |
fast_dev_run=fast_dev_run,),
|
887 |
n_trials=n_trials,
|
888 |
)
|
@@ -893,6 +898,7 @@ def hyperparameter_tuning_and_training(
|
|
893 |
best_batch_size = best_params['batch_size']
|
894 |
best_learning_rate = best_params['learning_rate']
|
895 |
best_max_epochs = best_params['max_epochs']
|
|
|
896 |
|
897 |
# Retrain the model with the best hyperparameters
|
898 |
model, trainer, metrics = train_model(
|
@@ -906,6 +912,13 @@ def hyperparameter_tuning_and_training(
|
|
906 |
fast_dev_run=fast_dev_run,
|
907 |
)
|
908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
# Return the best metrics
|
910 |
return model, trainer, metrics
|
911 |
|
|
|
707 |
learning_rate=2e-5,
|
708 |
max_epochs=50,
|
709 |
smiles_emb_dim=1024,
|
710 |
+
smote_k_neighbors=5,
|
711 |
use_ored_activity=False if active_col == 'Active' else True,
|
712 |
fast_dev_run=False,
|
713 |
disabled_embeddings=[],
|
|
|
723 |
learning_rate (float): The learning rate.
|
724 |
max_epochs (int): The maximum number of epochs.
|
725 |
smiles_emb_dim (int): The dimension of the SMILES embeddings.
|
726 |
+
smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
|
727 |
use_ored_activity (bool): Whether to use the ORED activity column.
|
728 |
fast_dev_run (bool): Whether to run a fast development run.
|
729 |
disabled_embeddings (list): The list of disabled embeddings.
|
|
|
731 |
Returns:
|
732 |
tuple: The trained model, the trainer, and the metrics.
|
733 |
"""
|
734 |
+
oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
|
735 |
train_ds = PROTAC_Dataset(
|
736 |
train_df,
|
737 |
protein_embeddings,
|
|
|
821 |
batch_size_options,
|
822 |
learning_rate_options,
|
823 |
max_epochs_options,
|
824 |
+
smote_k_neighbors_options,
|
825 |
fast_dev_run=False,
|
826 |
) -> float:
|
827 |
# Generate the hyperparameters
|
|
|
829 |
batch_size = trial.suggest_categorical('batch_size', batch_size_options)
|
830 |
learning_rate = trial.suggest_loguniform('learning_rate', *learning_rate_options)
|
831 |
max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
|
832 |
+
smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
|
833 |
|
834 |
# Train the model with the current set of hyperparameters
|
835 |
_, _, metrics = train_model(
|
|
|
839 |
batch_size=batch_size,
|
840 |
learning_rate=learning_rate,
|
841 |
max_epochs=max_epochs,
|
842 |
+
smote_k_neighbors=smote_k_neighbors,
|
843 |
fast_dev_run=fast_dev_run,
|
844 |
)
|
845 |
|
|
|
875 |
batch_size_options = [8, 16, 32]
|
876 |
learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
|
877 |
max_epochs_options = [10, 20, 50]
|
878 |
+
smote_k_neighbors_options = list(range(3, 16))
|
879 |
|
880 |
# Create an Optuna study object
|
881 |
study = optuna.create_study(direction='minimize')
|
|
|
887 |
batch_size_options,
|
888 |
learning_rate_options,
|
889 |
max_epochs_options,
|
890 |
+
smote_k_neighbors_options=smote_k_neighbors_options,
|
891 |
fast_dev_run=fast_dev_run,),
|
892 |
n_trials=n_trials,
|
893 |
)
|
|
|
898 |
best_batch_size = best_params['batch_size']
|
899 |
best_learning_rate = best_params['learning_rate']
|
900 |
best_max_epochs = best_params['max_epochs']
|
901 |
+
best_smote_k_neighbors = best_params['smote_k_neighbors']
|
902 |
|
903 |
# Retrain the model with the best hyperparameters
|
904 |
model, trainer, metrics = train_model(
|
|
|
912 |
fast_dev_run=fast_dev_run,
|
913 |
)
|
914 |
|
915 |
+
# Report the best hyperparameters found
|
916 |
+
metrics['hidden_dim'] = best_hidden_dim
|
917 |
+
metrics['batch_size'] = best_batch_size
|
918 |
+
metrics['learning_rate'] = best_learning_rate
|
919 |
+
metrics['max_epochs'] = best_max_epochs
|
920 |
+
metrics['smote_k_neighbors'] = best_smote_k_neighbors
|
921 |
+
|
922 |
# Return the best metrics
|
923 |
return model, trainer, metrics
|
924 |
|