ribesstefano committed · Commit 7e4c438 · 1 Parent(s): b09510c

Added SMOTE k neighbors to hparam search

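In short, train_model previously hard-coded the SMOTE oversampler's neighbor count and spelled the argument smote_n_neighbors; this commit renames it to smote_k_neighbors, matching the k_neighbors keyword of imblearn's SMOTE, and exposes it to the Optuna hyperparameter search. As a minimal illustration of what k_neighbors controls (toy data, not this repo's PROTAC dataset):

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# Imbalanced toy data (~90/10 class split); illustrative only.
X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=42)
print('before:', Counter(y))

# SMOTE synthesizes minority samples by interpolating each one toward
# one of its k_neighbors nearest minority-class neighbors.
X_res, y_res = SMOTE(k_neighbors=5, random_state=42).fit_resample(X, y)
print('after: ', Counter(y_res))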
notebooks/protac_degradation_predictor.py CHANGED
@@ -707,7 +707,7 @@ def train_model(
     learning_rate=2e-5,
     max_epochs=50,
     smiles_emb_dim=1024,
-    smote_n_neighbors=5,
+    smote_k_neighbors=5,
     use_ored_activity=False if active_col == 'Active' else True,
     fast_dev_run=False,
     disabled_embeddings=[],
@@ -723,7 +723,7 @@ def train_model(
         learning_rate (float): The learning rate.
         max_epochs (int): The maximum number of epochs.
         smiles_emb_dim (int): The dimension of the SMILES embeddings.
-        smote_n_neighbors (int): The number of neighbors for the SMOTE oversampler.
+        smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
         use_ored_activity (bool): Whether to use the ORED activity column.
         fast_dev_run (bool): Whether to run a fast development run.
         disabled_embeddings (list): The list of disabled embeddings.
@@ -731,7 +731,7 @@ def train_model(
     Returns:
         tuple: The trained model, the trainer, and the metrics.
     """
-    oversampler = SMOTE(k_neighbors=smote_n_neighbors, random_state=42)
+    oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
     train_ds = PROTAC_Dataset(
         train_df,
         protein_embeddings,
@@ -821,6 +821,7 @@ def objective(
         batch_size_options,
         learning_rate_options,
         max_epochs_options,
+        smote_k_neighbors_options,
         fast_dev_run=False,
 ) -> float:
     # Generate the hyperparameters
@@ -828,6 +829,7 @@ def objective(
     batch_size = trial.suggest_categorical('batch_size', batch_size_options)
     learning_rate = trial.suggest_loguniform('learning_rate', *learning_rate_options)
     max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
+    smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
 
     # Train the model with the current set of hyperparameters
     _, _, metrics = train_model(
@@ -837,6 +839,7 @@ def objective(
         batch_size=batch_size,
         learning_rate=learning_rate,
         max_epochs=max_epochs,
+        smote_k_neighbors=smote_k_neighbors,
         fast_dev_run=fast_dev_run,
     )
 
@@ -872,6 +875,7 @@ def hyperparameter_tuning_and_training(
     batch_size_options = [8, 16, 32]
     learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
     max_epochs_options = [10, 20, 50]
+    smote_k_neighbors_options = list(range(3, 16))
 
     # Create an Optuna study object
     study = optuna.create_study(direction='minimize')
@@ -883,6 +887,7 @@ def hyperparameter_tuning_and_training(
         batch_size_options,
         learning_rate_options,
         max_epochs_options,
+        smote_k_neighbors_options=smote_k_neighbors_options,
         fast_dev_run=fast_dev_run,),
     n_trials=n_trials,
     )
@@ -893,6 +898,7 @@ def hyperparameter_tuning_and_training(
     best_batch_size = best_params['batch_size']
     best_learning_rate = best_params['learning_rate']
     best_max_epochs = best_params['max_epochs']
+    best_smote_k_neighbors = best_params['smote_k_neighbors']
 
     # Retrain the model with the best hyperparameters
     model, trainer, metrics = train_model(
@@ -906,6 +912,13 @@ def hyperparameter_tuning_and_training(
         fast_dev_run=fast_dev_run,
     )
 
+    # Report the best hyperparameters found
+    metrics['hidden_dim'] = best_hidden_dim
+    metrics['batch_size'] = best_batch_size
+    metrics['learning_rate'] = best_learning_rate
+    metrics['max_epochs'] = best_max_epochs
+    metrics['smote_k_neighbors'] = best_smote_k_neighbors
+
     # Return the best metrics
     return model, trainer, metrics
924