ribesstefano committed
Commit 6a5a99e · 1 Parent(s): 4d17fea

Refactored experiments + fixed bug in dataset when applying scaling to val and test sets

README.md CHANGED
@@ -1,5 +1,42 @@
 # PROTAC-Degradation-Predictor
+
 Predicting PROTAC protein degradation activity via machine learning.
 
+## Data Curation
+
+For the data curation code, please refer to the Jupyter notebook [`data_curation.ipynb`](notebooks/data_curation.ipynb).
+
+## Installing the Package
+
+To install the package, run the following command:
+
+```bash
+pip install .
+```
+
+## Running the Package
+
+After installing the package, it can be used as in the following example:
+
+```python
+import protac_degradation_predictor as pdp
+
+protac_smiles = 'CC(C)(C)OC(=O)N1CCN(CC1)C2=CC(=C(C=C2)C(=O)NC3=CC(=C(C=C3)F)Cl)C(=O)NC4=CC=C(C=C4)F'
+e3_ligase = 'VHL'
+target_uniprot = 'P04637'
+cell_line = 'HeLa'
+
+active_protac = pdp.is_protac_active(
+    protac_smiles,
+    e3_ligase,
+    target_uniprot,
+    cell_line,
+    device='gpu',  # Defaults to 'cpu'
+    proba_threshold=0.5,  # Default value
+)
+
+print(f'The given PROTAC is: {"active" if active_protac else "inactive"}')
+```
+
 > If you're coming from my [thesis repo](https://github.com/ribesstefano/Machine-Learning-for-Predicting-Targeted-Protein-Degradation), I just wanted to create a separate and "less generic" repo for fast prototyping new ideas.
 > Stefano.
protac_degradation_predictor/optuna_utils.py CHANGED
@@ -21,6 +21,12 @@ from sklearn.ensemble import (
21
  )
22
  from sklearn.linear_model import LogisticRegression
23
  from sklearn.svm import SVC
24
 
25
 
26
  def pytorch_model_objective(
@@ -28,8 +34,9 @@ def pytorch_model_objective(
28
  protein2embedding: Dict,
29
  cell2embedding: Dict,
30
  smiles2fp: Dict,
31
- train_df: pd.DataFrame,
32
- val_df: pd.DataFrame,
 
33
  hidden_dim_options: List[int] = [256, 512, 768],
34
  batch_size_options: List[int] = [8, 16, 32],
35
  learning_rate_options: Tuple[float, float] = (1e-5, 1e-3),
@@ -55,7 +62,7 @@ def pytorch_model_objective(
55
  active_label (str): The active label column.
56
  disabled_embeddings (List[str]): The list of disabled embeddings.
57
  """
58
- # Generate the hyperparameters
59
  hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
60
  batch_size = trial.suggest_categorical('batch_size', batch_size_options)
61
  learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
@@ -65,49 +72,90 @@ def pytorch_model_objective(
65
  apply_scaling = trial.suggest_categorical('apply_scaling', [True, False])
66
  dropout = trial.suggest_float('dropout', *dropout_options)
67
 
68
- # Train the model with the current set of hyperparameters
69
- _, _, metrics = train_model(
70
- protein2embedding,
71
- cell2embedding,
72
- smiles2fp,
73
- train_df,
74
- val_df,
75
- hidden_dim=hidden_dim,
76
- batch_size=batch_size,
77
- join_embeddings=join_embeddings,
78
- learning_rate=learning_rate,
79
- dropout=dropout,
80
- max_epochs=max_epochs,
81
- smote_k_neighbors=smote_k_neighbors,
82
- apply_scaling=apply_scaling,
83
- use_smote=use_smote,
84
- use_logger=False,
85
- fast_dev_run=fast_dev_run,
86
- active_label=active_label,
87
- disabled_embeddings=disabled_embeddings,
88
- )
89
 
90
- # Metrics is a dictionary containing at least the validation loss
91
- val_loss = metrics['val_loss']
92
- val_acc = metrics['val_acc']
93
- val_roc_auc = metrics['val_roc_auc']
94
 
95
  # Optuna aims to minimize the pytorch_model_objective
96
- return val_loss - val_acc - val_roc_auc
97
 
98
 
99
  def hyperparameter_tuning_and_training(
100
  protein2embedding: Dict,
101
  cell2embedding: Dict,
102
  smiles2fp: Dict,
103
- train_df: pd.DataFrame,
104
- val_df: pd.DataFrame,
105
- test_df: Optional[pd.DataFrame] = None,
106
  fast_dev_run: bool = False,
107
  n_trials: int = 50,
108
  logger_name: str = 'protac_hparam_search',
109
  active_label: str = 'Active',
110
- disabled_embeddings: List[str] = [],
111
  study_filename: Optional[str] = None,
112
  ) -> tuple:
113
  """ Hyperparameter tuning and training of a PROTAC model.
@@ -125,6 +173,8 @@ def hyperparameter_tuning_and_training(
125
  Returns:
126
  tuple: The trained model, the trainer, and the best metrics.
127
  """
 
 
128
  # Define the search space
129
  hidden_dim_options = [256, 512, 768]
130
  batch_size_options = [8, 16, 32]
@@ -151,42 +201,87 @@ def hyperparameter_tuning_and_training(
151
  protein2embedding=protein2embedding,
152
  cell2embedding=cell2embedding,
153
  smiles2fp=smiles2fp,
154
- train_df=train_df,
155
- val_df=val_df,
 
156
  hidden_dim_options=hidden_dim_options,
157
  batch_size_options=batch_size_options,
158
  learning_rate_options=learning_rate_options,
159
  smote_k_neighbors_options=smote_k_neighbors_options,
160
  fast_dev_run=fast_dev_run,
161
  active_label=active_label,
162
- disabled_embeddings=disabled_embeddings,
 
163
  ),
164
  n_trials=n_trials,
165
  )
166
  if study_filename:
167
  joblib.dump(study, study_filename)
 
 
168
 
169
- # Retrain the model with the best hyperparameters
170
- model, trainer, metrics = train_model(
171
- protein2embedding=protein2embedding,
172
- cell2embedding=cell2embedding,
173
- smiles2fp=smiles2fp,
174
- train_df=train_df,
175
- val_df=val_df,
176
- test_df=test_df,
177
- use_logger=True,
178
- logger_name=logger_name,
179
- fast_dev_run=fast_dev_run,
180
- active_label=active_label,
181
- disabled_embeddings=disabled_embeddings,
182
- **study.best_params,
183
- )
184
 
185
- # Report the best hyperparameters found
186
- metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
187
 
188
- # Return the best metrics
189
- return model, trainer, metrics
190
 
191
 
192
  def sklearn_model_objective(
 
21
  )
22
  from sklearn.linear_model import LogisticRegression
23
  from sklearn.svm import SVC
24
+ from sklearn.model_selection import (
25
+ StratifiedKFold,
26
+ StratifiedGroupKFold,
27
+ )
28
+ import numpy as np
29
+ import pytorch_lightning as pl
30
 
31
 
32
  def pytorch_model_objective(
 
34
  protein2embedding: Dict,
35
  cell2embedding: Dict,
36
  smiles2fp: Dict,
37
+ train_val_df: pd.DataFrame,
38
+ kf: StratifiedKFold | StratifiedGroupKFold,
39
+ groups: Optional[np.array] = None,
40
  hidden_dim_options: List[int] = [256, 512, 768],
41
  batch_size_options: List[int] = [8, 16, 32],
42
  learning_rate_options: Tuple[float, float] = (1e-5, 1e-3),
 
62
  active_label (str): The active label column.
63
  disabled_embeddings (List[str]): The list of disabled embeddings.
64
  """
65
+ # Suggest hyperparameters to be used across the CV folds
66
  hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
67
  batch_size = trial.suggest_categorical('batch_size', batch_size_options)
68
  learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
 
72
  apply_scaling = trial.suggest_categorical('apply_scaling', [True, False])
73
  dropout = trial.suggest_float('dropout', *dropout_options)
74
 
75
+ # Start the CV over the folds
76
+ X = train_val_df.drop(columns=active_label)
77
+ y = train_val_df[active_label].tolist()
78
+ report = []
79
+ for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
80
+ logging.info(f'Fold {k + 1}/{kf.get_n_splits()}')
81
+ # Get the train and val sets
82
+ train_df = train_val_df.iloc[train_index]
83
+ val_df = train_val_df.iloc[val_index]
84
 
85
+ # Check for data leakage and get some statistics
86
+ leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
87
+ leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
88
+ stats = {
89
+ 'model_type': 'Pytorch',
90
+ 'fold': k,
91
+ 'train_len': len(train_df),
92
+ 'val_len': len(val_df),
93
+ 'train_perc': len(train_df) / len(train_val_df),
94
+ 'val_perc': len(val_df) / len(train_val_df),
95
+ 'train_active_perc': train_df[active_label].sum() / len(train_df),
96
+ 'train_inactive_perc': (len(train_df) - train_df[active_label].sum()) / len(train_df),
97
+ 'val_active_perc': val_df[active_label].sum() / len(val_df),
98
+ 'val_inactive_perc': (len(val_df) - val_df[active_label].sum()) / len(val_df),
99
+ 'num_leaking_uniprot': len(leaking_uniprot),
100
+ 'num_leaking_smiles': len(leaking_smiles),
101
+ 'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
102
+ 'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
103
+ }
104
+ if groups is not None:
105
+ stats['train_unique_groups'] = len(np.unique(groups[train_index]))
106
+ stats['val_unique_groups'] = len(np.unique(groups[val_index]))
107
+
108
+ # At each fold, train and evaluate the Pytorch model
109
+ # Train the model with the current set of hyperparameters
110
+ _, _, metrics = train_model(
111
+ protein2embedding,
112
+ cell2embedding,
113
+ smiles2fp,
114
+ train_df,
115
+ val_df,
116
+ hidden_dim=hidden_dim,
117
+ batch_size=batch_size,
118
+ join_embeddings=join_embeddings,
119
+ learning_rate=learning_rate,
120
+ dropout=dropout,
121
+ max_epochs=max_epochs,
122
+ smote_k_neighbors=smote_k_neighbors,
123
+ apply_scaling=apply_scaling,
124
+ use_smote=use_smote,
125
+ use_logger=False,
126
+ fast_dev_run=fast_dev_run,
127
+ active_label=active_label,
128
+ disabled_embeddings=disabled_embeddings,
129
+ )
130
+ stats.update(metrics)
131
+ report.append(stats.copy())
132
+
133
+ # Get the average validation accuracy and ROC AUC across the folds
134
+ val_acc = np.mean([r['val_acc'] for r in report])
135
+ val_roc_auc = np.mean([r['val_roc_auc'] for r in report])
136
+
137
+ # Save the report in the trial
138
+ trial.set_user_attr('report', report)
139
 
140
  # Optuna aims to minimize the pytorch_model_objective
141
+ return -val_acc - val_roc_auc
142
 
143
 
144
  def hyperparameter_tuning_and_training(
145
  protein2embedding: Dict,
146
  cell2embedding: Dict,
147
  smiles2fp: Dict,
148
+ train_val_df: pd.DataFrame,
149
+ test_df: pd.DataFrame,
150
+ kf: StratifiedKFold | StratifiedGroupKFold,
151
+ groups: Optional[np.array] = None,
152
+ split_type: str = 'random',
153
+ n_models_for_test: int = 3,
154
  fast_dev_run: bool = False,
155
  n_trials: int = 50,
156
  logger_name: str = 'protac_hparam_search',
157
  active_label: str = 'Active',
158
+ max_epochs: int = 100,
159
  study_filename: Optional[str] = None,
160
  ) -> tuple:
161
  """ Hyperparameter tuning and training of a PROTAC model.
 
173
  Returns:
174
tuple: The cross-validation report, the hyperparameter report, the test report, and the ablation report, as pandas DataFrames.
175
  """
176
+ pl.seed_everything(42)
177
+
178
  # Define the search space
179
  hidden_dim_options = [256, 512, 768]
180
  batch_size_options = [8, 16, 32]
 
201
  protein2embedding=protein2embedding,
202
  cell2embedding=cell2embedding,
203
  smiles2fp=smiles2fp,
204
+ train_val_df=train_val_df,
205
+ kf=kf,
206
+ groups=groups,
207
  hidden_dim_options=hidden_dim_options,
208
  batch_size_options=batch_size_options,
209
  learning_rate_options=learning_rate_options,
210
  smote_k_neighbors_options=smote_k_neighbors_options,
211
  fast_dev_run=fast_dev_run,
212
  active_label=active_label,
213
+ max_epochs=max_epochs,
214
+ disabled_embeddings=[],
215
  ),
216
  n_trials=n_trials,
217
  )
218
  if study_filename:
219
  joblib.dump(study, study_filename)
220
+ cv_report = pd.DataFrame(study.best_trial.user_attrs['report'])
221
+ hparam_report = pd.DataFrame([study.best_params])
222
 
223
+ test_report = []
224
+ # Retrain N models with the best hyperparameters (measure model uncertainty)
225
+ for i in range(n_models_for_test):
226
+ pl.seed_everything(42 + i)
227
+ _, _, metrics = train_model(
228
+ protein2embedding=protein2embedding,
229
+ cell2embedding=cell2embedding,
230
+ smiles2fp=smiles2fp,
231
+ train_df=train_val_df,
232
+ val_df=test_df,
233
+ use_logger=True,
234
+ fast_dev_run=fast_dev_run,
235
+ active_label=active_label,
236
+ max_epochs=max_epochs,
237
+ disabled_embeddings=[],
238
+ logger_name=f'{logger_name}_best_model_{i}',
239
+ enable_checkpointing=True,
240
+ checkpoint_model_name=f'best_model_{split_type}_{i}',
241
+ **study.best_params,
242
+ )
243
+ # Rename the keys in the metrics dictionary
244
+ metrics = {k.replace('val_', 'test_'): v for k, v in metrics.items()}
245
+ metrics = {k.replace('train_', 'train_val_'): v for k, v in metrics.items()}
246
+ metrics['model_type'] = 'Pytorch'
247
+ metrics['test_model_id'] = i
248
+ test_report.append(metrics.copy())
249
+ test_report = pd.DataFrame(test_report)
250
 
251
+ # Ablation study: disable one group of embeddings at a time
252
+ ablation_report = []
253
+ for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
254
+ logging.info('-' * 100)
255
+ logging.info(f'Ablation study with disabled embeddings: {disabled_embeddings}')
256
+ logging.info('-' * 100)
257
+ _, _, metrics = train_model(
258
+ protein2embedding=protein2embedding,
259
+ cell2embedding=cell2embedding,
260
+ smiles2fp=smiles2fp,
261
+ train_df=train_val_df,
262
+ val_df=test_df,
263
+ fast_dev_run=fast_dev_run,
264
+ active_label=active_label,
265
+ max_epochs=max_epochs,
266
+ use_logger=True,
267
+ logger_name=f'{logger_name}_disabled-{"-".join(disabled_embeddings)}',
268
+ disabled_embeddings=disabled_embeddings,
269
+ **study.best_params,
270
+ )
271
+ # Rename the keys in the metrics dictionary
272
+ metrics = {k.replace('val_', 'test_'): v for k, v in metrics.items()}
273
+ metrics = {k.replace('train_', 'train_val_'): v for k, v in metrics.items()}
274
+ metrics['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
275
+ metrics['model_type'] = 'Pytorch'
276
+ ablation_report.append(metrics.copy())
277
+ ablation_report = pd.DataFrame(ablation_report)
278
 
279
+ # Add a column with the split_type to all reports
280
+ for report in [cv_report, hparam_report, test_report, ablation_report]:
281
+ report['split_type'] = split_type
282
+
283
+ # Return the reports
284
+ return cv_report, hparam_report, test_report, ablation_report
285
 
286
 
287
  def sklearn_model_objective(
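
Note on the refactor above: the objective now runs the whole cross-validation loop inside each trial, stores the per-fold statistics via `trial.set_user_attr('report', report)`, and returns `-val_acc - val_roc_auc` averaged over the folds, so the study must minimize. A minimal sketch of how a caller could wire this up; the `run_cv_search` wrapper, the `objective` callable, and the file names here are illustrative, not the package API:

```python
import joblib
import optuna
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold


def run_cv_search(train_val_df: pd.DataFrame, objective, n_trials: int = 50) -> pd.DataFrame:
    # Group-aware, stratified folds: 'Uniprot Group' keeps related targets in the same fold.
    kf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    groups = train_val_df['Uniprot Group'].to_numpy()

    # Lower is better, since the objective returns -val_acc - val_roc_auc.
    study = optuna.create_study(direction='minimize')
    study.optimize(
        lambda trial: objective(trial, train_val_df, kf, groups),
        n_trials=n_trials,
    )
    joblib.dump(study, 'study.pkl')

    # Recover the per-fold report saved with trial.set_user_attr().
    return pd.DataFrame(study.best_trial.user_attrs['report'])
```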
protac_degradation_predictor/protac_dataset.py CHANGED
@@ -146,12 +146,15 @@ class PROTAC_Dataset(Dataset):
146
  scalers (dict): The scalers for each feature.
147
  use_single_scaler (bool): Whether to use a single scaler for all features.
148
  """
149
- if self.use_single_scaler is None:
150
- raise ValueError(
151
- "The fit_scaling method must be called before apply_scaling.")
152
- if use_single_scaler != self.use_single_scaler:
153
- raise ValueError(
154
- f"The use_single_scaler parameter must be the same as the one used in the fit_scaling method. Got {use_single_scaler}, previously {self.use_single_scaler}.")
155
  if use_single_scaler:
156
  embeddings = np.hstack([
157
  np.array(self.data['Smiles'].tolist()),
 
146
  scalers (dict): The scalers for each feature.
147
  use_single_scaler (bool): Whether to use a single scaler for all features.
148
  """
149
+ # TODO: The following check is WRONG: for val and test sets I must NOT
150
+ # run the fit_scaling method, but I must use the scalers from the
151
+ # training set.
152
+ # if self.use_single_scaler is None:
153
+ # raise ValueError(
154
+ # "The fit_scaling method must be called before apply_scaling.")
155
+ # if use_single_scaler != self.use_single_scaler:
156
+ # raise ValueError(
157
+ # f"The use_single_scaler parameter must be the same as the one used in the fit_scaling method. Got {use_single_scaler}, previously {self.use_single_scaler}.")
158
  if use_single_scaler:
159
  embeddings = np.hstack([
160
  np.array(self.data['Smiles'].tolist()),
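
The TODO above describes the intended behaviour behind the commit's bug fix: scalers are fit on the training split only and then reused, not refit, on the validation and test splits. A minimal sketch of that pattern with scikit-learn, independent of the `PROTAC_Dataset` internals (the array names are illustrative):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Illustrative feature matrices for the three splits.
X_train = np.random.rand(100, 16)
X_val = np.random.rand(20, 16)
X_test = np.random.rand(20, 16)

# Fit the scaler on the training data only...
scaler = StandardScaler().fit(X_train)

# ...and only transform the validation and test data with it,
# so no statistics from val/test leak into the preprocessing.
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
```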
protac_degradation_predictor/pytorch_models.py CHANGED
@@ -2,7 +2,7 @@ import warnings
2
  from typing import Literal, List, Tuple, Optional, Dict
3
 
4
  from .protac_dataset import PROTAC_Dataset
5
- from .config import Config
6
 
7
  import pandas as pd
8
  import numpy as np
@@ -28,10 +28,10 @@ class PROTAC_Predictor(nn.Module):
28
  def __init__(
29
  self,
30
  hidden_dim: int,
31
- smiles_emb_dim: int = Config.fingerprint_size,
32
- poi_emb_dim: int = Config.protein_embedding_size,
33
- e3_emb_dim: int = Config.protein_embedding_size,
34
- cell_emb_dim: int = Config.cell_embedding_size,
35
  dropout: float = 0.2,
36
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
37
  disabled_embeddings: list = [],
@@ -131,10 +131,10 @@ class PROTAC_Model(pl.LightningModule):
131
  def __init__(
132
  self,
133
  hidden_dim: int,
134
- smiles_emb_dim: int = 224,
135
- poi_emb_dim: int = 1024,
136
- e3_emb_dim: int = 1024,
137
- cell_emb_dim: int = 768,
138
  batch_size: int = 32,
139
  learning_rate: float = 1e-3,
140
  dropout: float = 0.2,
@@ -330,7 +330,10 @@ def train_model(
330
  learning_rate: float = 2e-5,
331
  dropout: float = 0.2,
332
  max_epochs: int = 50,
333
- smiles_emb_dim: int = 224,
334
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
335
  smote_k_neighbors:int = 5,
336
  use_smote: bool = True,
@@ -339,6 +342,8 @@ def train_model(
339
  fast_dev_run: bool = False,
340
  use_logger: bool = True,
341
  logger_name: str = 'protac',
 
 
342
  disabled_embeddings: List[str] = [],
343
  ) -> tuple:
344
  """ Train a PROTAC model using the given datasets and hyperparameters.
@@ -410,13 +415,14 @@ def train_model(
410
  mode='max',
411
  verbose=False,
412
  ),
413
- # pl.callbacks.ModelCheckpoint(
414
- # monitor='val_acc',
415
- # mode='max',
416
- # verbose=True,
417
- # filename='{epoch}-{val_metrics_opt_score:.4f}',
418
- # ),
419
  ]
420
  # Define Trainer
421
  trainer = pl.Trainer(
422
  logger=logger if use_logger else False,
@@ -424,7 +430,7 @@ def train_model(
424
  max_epochs=max_epochs,
425
  fast_dev_run=fast_dev_run,
426
  enable_model_summary=False,
427
- enable_checkpointing=False,
428
  enable_progress_bar=False,
429
  devices=1,
430
  num_nodes=1,
@@ -432,9 +438,9 @@ def train_model(
432
  model = PROTAC_Model(
433
  hidden_dim=hidden_dim,
434
  smiles_emb_dim=smiles_emb_dim,
435
- poi_emb_dim=1024,
436
- e3_emb_dim=1024,
437
- cell_emb_dim=768,
438
  batch_size=batch_size,
439
  join_embeddings=join_embeddings,
440
  dropout=dropout,
 
2
  from typing import Literal, List, Tuple, Optional, Dict
3
 
4
  from .protac_dataset import PROTAC_Dataset
5
+ from .config import config
6
 
7
  import pandas as pd
8
  import numpy as np
 
28
  def __init__(
29
  self,
30
  hidden_dim: int,
31
+ smiles_emb_dim: int = config.fingerprint_size,
32
+ poi_emb_dim: int = config.protein_embedding_size,
33
+ e3_emb_dim: int = config.protein_embedding_size,
34
+ cell_emb_dim: int = config.cell_embedding_size,
35
  dropout: float = 0.2,
36
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
37
  disabled_embeddings: list = [],
 
131
  def __init__(
132
  self,
133
  hidden_dim: int,
134
+ smiles_emb_dim: int = config.fingerprint_size,
135
+ poi_emb_dim: int = config.protein_embedding_size,
136
+ e3_emb_dim: int = config.protein_embedding_size,
137
+ cell_emb_dim: int = config.cell_embedding_size,
138
  batch_size: int = 32,
139
  learning_rate: float = 1e-3,
140
  dropout: float = 0.2,
 
330
  learning_rate: float = 2e-5,
331
  dropout: float = 0.2,
332
  max_epochs: int = 50,
333
+ smiles_emb_dim: int = config.fingerprint_size,
334
+ poi_emb_dim: int = config.protein_embedding_size,
335
+ e3_emb_dim: int = config.protein_embedding_size,
336
+ cell_emb_dim: int = config.cell_embedding_size,
337
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
338
  smote_k_neighbors:int = 5,
339
  use_smote: bool = True,
 
342
  fast_dev_run: bool = False,
343
  use_logger: bool = True,
344
  logger_name: str = 'protac',
345
+ enable_checkpointing: bool = False,
346
+ checkpoint_model_name: str = 'protac',
347
  disabled_embeddings: List[str] = [],
348
  ) -> tuple:
349
  """ Train a PROTAC model using the given datasets and hyperparameters.
 
415
  mode='max',
416
  verbose=False,
417
  ),
418
  ]
419
+ if enable_checkpointing:
420
+ callbacks.append(pl.callbacks.ModelCheckpoint(
421
+ monitor='val_acc',
422
+ mode='max',
423
+ verbose=False,
424
+ filename=checkpoint_model_name + '-{epoch}-{val_metrics_opt_score:.4f}',
425
+ ))
426
  # Define Trainer
427
  trainer = pl.Trainer(
428
  logger=logger if use_logger else False,
 
430
  max_epochs=max_epochs,
431
  fast_dev_run=fast_dev_run,
432
  enable_model_summary=False,
433
+ enable_checkpointing=enable_checkpointing,
434
  enable_progress_bar=False,
435
  devices=1,
436
  num_nodes=1,
 
438
  model = PROTAC_Model(
439
  hidden_dim=hidden_dim,
440
  smiles_emb_dim=smiles_emb_dim,
441
+ poi_emb_dim=poi_emb_dim,
442
+ e3_emb_dim=e3_emb_dim,
443
+ cell_emb_dim=cell_emb_dim,
444
  batch_size=batch_size,
445
  join_embeddings=join_embeddings,
446
  dropout=dropout,
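
With `enable_checkpointing` and `checkpoint_model_name` now exposed by `train_model`, the best checkpoint written by the `ModelCheckpoint` callback can later be restored through the standard Lightning mechanism. A minimal sketch, assuming `PROTAC_Model` saves its hyperparameters (e.g. via `self.save_hyperparameters()`); the checkpoint path is illustrative and depends on the trainer's default directory:

```python
from protac_degradation_predictor.pytorch_models import PROTAC_Model

# Illustrative path: ModelCheckpoint writes under the trainer's default_root_dir,
# using the filename pattern built from checkpoint_model_name above.
ckpt_path = 'lightning_logs/version_0/checkpoints/best_model_random_0-epoch=12-val_metrics_opt_score=0.8412.ckpt'

# LightningModule.load_from_checkpoint restores hyperparameters and weights,
# provided they were saved with self.save_hyperparameters() at init time.
model = PROTAC_Model.load_from_checkpoint(ckpt_path)
model.eval()
```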
src/run_experiments.py CHANGED
@@ -27,6 +27,16 @@ warnings.filterwarnings("ignore", ".*FixedLocator*")
27
  warnings.filterwarnings("ignore", ".*does not have many workers.*")
28
 
29
 
30
  def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:
31
  """ Get the indices of the test set using a random split.
32
 
@@ -263,120 +273,148 @@ def main(
263
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
264
  group = train_val_df['Uniprot Group'].to_numpy()
265
 
266
- # Start the CV over the folds
267
- X = train_val_df.drop(columns=active_col)
268
- y = train_val_df[active_col].tolist()
269
- for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
270
- print('-' * 100)
271
- print(f'Starting CV for group type: {split_type}, fold: {k}')
272
- print('-' * 100)
273
- train_df = train_val_df.iloc[train_index]
274
- val_df = train_val_df.iloc[val_index]
275
-
276
- leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
277
- leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
278
-
279
- stats = {
280
- 'fold': k,
281
- 'split_type': split_type,
282
- 'train_len': len(train_df),
283
- 'val_len': len(val_df),
284
- 'train_perc': len(train_df) / len(train_val_df),
285
- 'val_perc': len(val_df) / len(train_val_df),
286
- 'train_active_perc': train_df[active_col].sum() / len(train_df),
287
- 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
288
- 'val_active_perc': val_df[active_col].sum() / len(val_df),
289
- 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
290
- 'test_active_perc': test_df[active_col].sum() / len(test_df),
291
- 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
292
- 'num_leaking_uniprot': len(leaking_uniprot),
293
- 'num_leaking_smiles': len(leaking_smiles),
294
- 'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
295
- 'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
296
- }
297
- if split_type != 'random':
298
- stats['train_unique_groups'] = len(np.unique(group[train_index]))
299
- stats['val_unique_groups'] = len(np.unique(group[val_index]))
300
-
301
- # At each fold, train and evaluate the Pytorch model
302
- if split_type != 'tanimoto' or run_sklearn:
303
- logging.info(f'Skipping Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
304
- continue
305
- else:
306
- logging.info(f'Starting Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
307
- # Train and evaluate the model
308
- model, trainer, metrics = pdp.hyperparameter_tuning_and_training(
309
- protein2embedding,
310
- cell2embedding,
311
- smiles2fp,
312
- train_df,
313
- val_df,
314
- test_df,
315
- fast_dev_run=fast_dev_run,
316
- n_trials=n_trials,
317
- logger_name=f'protac_{active_name}_{split_type}_fold_{k}_test_split_{test_split}',
318
- active_label=active_col,
319
- study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}.pkl',
320
- )
321
- hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
322
- stats.update(metrics)
323
- stats['model_type'] = 'Pytorch'
324
- report.append(stats.copy())
325
- del model
326
- del trainer
327
-
328
- # Ablation study: disable embeddings at a time
329
- for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
330
- print('-' * 100)
331
- print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
332
- print('-' * 100)
333
- stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
334
- model, trainer, metrics = pdp.train_model(
335
- protein2embedding,
336
- cell2embedding,
337
- smiles2fp,
338
- train_df,
339
- val_df,
340
- test_df,
341
- fast_dev_run=fast_dev_run,
342
- logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
343
- active_label=active_col,
344
- disabled_embeddings=disabled_embeddings,
345
- **hparams,
346
- )
347
- stats.update(metrics)
348
- report.append(stats.copy())
349
- del model
350
- del trainer
351
-
352
- # At each fold, train and evaluate sklearn models
353
- if run_sklearn:
354
- for model_type in ['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting']:
355
- logging.info(f'Starting sklearn model {model_type} training on fold {k} with split type {split_type} and test split {test_split}.')
356
- # Train and evaluate sklearn models
357
- model, metrics = pdp.hyperparameter_tuning_and_training_sklearn(
358
- protein2embedding=protein2embedding,
359
- cell2embedding=cell2embedding,
360
- smiles2fp=smiles2fp,
361
- train_df=train_df,
362
- val_df=val_df,
363
- test_df=test_df,
364
- model_type=model_type,
365
- active_label=active_col,
366
- n_trials=n_trials,
367
- study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}_{model_type.lower()}.pkl',
368
- )
369
- hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
370
- stats['model_type'] = model_type
371
- stats.update(metrics)
372
- report.append(stats.copy())
373
-
374
- # Save the report at the end of each split type
375
- report_df = pd.DataFrame(report)
376
- report_df.to_csv(
377
- f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}{"_sklearn" if run_sklearn else ""}.csv',
378
- index=False,
379
  )
380
 
381
 
382
  if __name__ == '__main__':
 
27
  warnings.filterwarnings("ignore", ".*does not have many workers.*")
28
 
29
 
30
+ root = logging.getLogger()
31
+ root.setLevel(logging.DEBUG)
32
+
33
+ handler = logging.StreamHandler(sys.stdout)
34
+ handler.setLevel(logging.DEBUG)
35
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
36
+ handler.setFormatter(formatter)
37
+ root.addHandler(handler)
38
+
39
+
40
  def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:
41
  """ Get the indices of the test set using a random split.
42
 
 
273
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
274
  group = train_val_df['Uniprot Group'].to_numpy()
275
 
276
+ # Start the experiment
277
+ experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
278
+ reports = pdp.hyperparameter_tuning_and_training(
279
+ protein2embedding=protein2embedding,
280
+ cell2embedding=cell2embedding,
281
+ smiles2fp=smiles2fp,
282
+ train_val_df=train_val_df,
283
+ test_df=test_df,
284
+ kf=kf,
285
+ groups=group,
286
+ split_type=split_type,
287
+ n_models_for_test=3,
288
+ fast_dev_run=fast_dev_run,
289
+ n_trials=n_trials,
290
+ max_epochs=10,
291
+ logger_name=f'logs_{experiment_name}',
292
+ active_label=active_col,
293
+ study_filename=f'../reports/study_{experiment_name}.pkl',
294
  )
295
+ cv_report, hparam_report, test_report, ablation_report = reports
296
+
297
+ # Save the reports to file
298
+ for report, filename in zip([cv_report, hparam_report, test_report, ablation_report], ['cv_train', 'hparams', 'test', 'ablation']):
299
+ report.to_csv(f'../reports/report_{filename}_{experiment_name}.csv', index=False)
300
+
301
+
302
+
303
+
304
+ # # Start the CV over the folds
305
+ # X = train_val_df.drop(columns=active_col)
306
+ # y = train_val_df[active_col].tolist()
307
+ # for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
308
+ # print('-' * 100)
309
+ # print(f'Starting CV for group type: {split_type}, fold: {k}')
310
+ # print('-' * 100)
311
+ # train_df = train_val_df.iloc[train_index]
312
+ # val_df = train_val_df.iloc[val_index]
313
+
314
+ # leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
315
+ # leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
316
+
317
+ # stats = {
318
+ # 'fold': k,
319
+ # 'split_type': split_type,
320
+ # 'train_len': len(train_df),
321
+ # 'val_len': len(val_df),
322
+ # 'train_perc': len(train_df) / len(train_val_df),
323
+ # 'val_perc': len(val_df) / len(train_val_df),
324
+ # 'train_active_perc': train_df[active_col].sum() / len(train_df),
325
+ # 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
326
+ # 'val_active_perc': val_df[active_col].sum() / len(val_df),
327
+ # 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
328
+ # 'test_active_perc': test_df[active_col].sum() / len(test_df),
329
+ # 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
330
+ # 'num_leaking_uniprot': len(leaking_uniprot),
331
+ # 'num_leaking_smiles': len(leaking_smiles),
332
+ # 'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
333
+ # 'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
334
+ # }
335
+ # if split_type != 'random':
336
+ # stats['train_unique_groups'] = len(np.unique(group[train_index]))
337
+ # stats['val_unique_groups'] = len(np.unique(group[val_index]))
338
+
339
+ # # At each fold, train and evaluate the Pytorch model
340
+ # if split_type != 'tanimoto' or run_sklearn:
341
+ # logging.info(f'Skipping Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
342
+ # continue
343
+ # else:
344
+ # logging.info(f'Starting Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
345
+ # # Train and evaluate the model
346
+ # model, trainer, metrics = pdp.hyperparameter_tuning_and_training(
347
+ # protein2embedding,
348
+ # cell2embedding,
349
+ # smiles2fp,
350
+ # train_df,
351
+ # val_df,
352
+ # test_df,
353
+ # fast_dev_run=fast_dev_run,
354
+ # n_trials=n_trials,
355
+ # logger_name=f'protac_{active_name}_{split_type}_fold_{k}_test_split_{test_split}',
356
+ # active_label=active_col,
357
+ # study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}.pkl',
358
+ # )
359
+ # hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
360
+ # stats.update(metrics)
361
+ # stats['model_type'] = 'Pytorch'
362
+ # report.append(stats.copy())
363
+ # del model
364
+ # del trainer
365
+
366
+ # # Ablation study: disable embeddings at a time
367
+ # for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
368
+ # print('-' * 100)
369
+ # print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
370
+ # print('-' * 100)
371
+ # stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
372
+ # model, trainer, metrics = pdp.train_model(
373
+ # protein2embedding,
374
+ # cell2embedding,
375
+ # smiles2fp,
376
+ # train_df,
377
+ # val_df,
378
+ # test_df,
379
+ # fast_dev_run=fast_dev_run,
380
+ # logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
381
+ # active_label=active_col,
382
+ # disabled_embeddings=disabled_embeddings,
383
+ # **hparams,
384
+ # )
385
+ # stats.update(metrics)
386
+ # report.append(stats.copy())
387
+ # del model
388
+ # del trainer
389
+
390
+ # # At each fold, train and evaluate sklearn models
391
+ # if run_sklearn:
392
+ # for model_type in ['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting']:
393
+ # logging.info(f'Starting sklearn model {model_type} training on fold {k} with split type {split_type} and test split {test_split}.')
394
+ # # Train and evaluate sklearn models
395
+ # model, metrics = pdp.hyperparameter_tuning_and_training_sklearn(
396
+ # protein2embedding=protein2embedding,
397
+ # cell2embedding=cell2embedding,
398
+ # smiles2fp=smiles2fp,
399
+ # train_df=train_df,
400
+ # val_df=val_df,
401
+ # test_df=test_df,
402
+ # model_type=model_type,
403
+ # active_label=active_col,
404
+ # n_trials=n_trials,
405
+ # study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}_{model_type.lower()}.pkl',
406
+ # )
407
+ # hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
408
+ # stats['model_type'] = model_type
409
+ # stats.update(metrics)
410
+ # report.append(stats.copy())
411
+
412
+ # # Save the report at the end of each split type
413
+ # report_df = pd.DataFrame(report)
414
+ # report_df.to_csv(
415
+ # f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}{"_sklearn" if run_sklearn else ""}.csv',
416
+ # index=False,
417
+ # )
418
 
419
 
420
  if __name__ == '__main__':
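
Once the experiments have run, each split type leaves four CSV reports under `../reports/`. A minimal sketch of collecting them for analysis, assuming the `report_test_*.csv` naming used above and the `test_acc` / `test_roc_auc` columns produced by the renamed metrics dictionary:

```python
import glob

import pandas as pd

# Gather all test reports written by run_experiments.py (one per experiment).
paths = sorted(glob.glob('../reports/report_test_*.csv'))
all_tests = pd.concat((pd.read_csv(p) for p in paths), ignore_index=True)

# Every report carries a 'split_type' column, so the retrained best models can
# be compared across split strategies directly.
print(all_tests.groupby('split_type')[['test_acc', 'test_roc_auc']].mean())
```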