ribesstefano committed
Commit 8aec0bb · Parent(s): 1ee75b1

Started working on cell line one-hot encoding experiments

protac_degradation_predictor/protac_dataset.py CHANGED
@@ -1,5 +1,7 @@
 from typing import Literal, List, Tuple, Optional, Dict
 from collections import defaultdict
+import random
+import logging
 
 from .data_utils import (
     get_fingerprint,
@@ -24,15 +26,16 @@ class PROTAC_Dataset(Dataset):
     def __init__(
         self,
         protac_df: pd.DataFrame,
-        protein2embedding: Dict,
-        cell2embedding: Dict,
-        smiles2fp: Dict,
+        protein2embedding: Dict[str, np.ndarray],
+        cell2embedding: Dict[str, np.ndarray],
+        smiles2fp: Dict[str, np.ndarray],
         use_smote: bool = False,
         oversampler: Optional[SMOTE | ADASYN] = None,
         active_label: str = 'Active',
         disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
         scaler: Optional[StandardScaler | Dict[str, StandardScaler]] = None,
         use_single_scaler: Optional[bool] = None,
+        shuffle_embedding_prob: float = 0.0,
     ):
         """ Initialize the PROTAC dataset
 
@@ -47,6 +50,7 @@ class PROTAC_Dataset(Dataset):
             disabled_embeddings (list): The list of embeddings to disable, i.e., return a zero vector
             scaler (StandardScaler | dict): The scaler to use for the embeddings
             use_single_scaler (bool): Whether to use a single scaler for all features
+            shuffle_embedding_prob (float): The probability of shuffling the embeddings. Used for testing whether embeddings act as "barcodes". Defaults to 0.0, i.e., no shuffling.
         """
         # Filter out examples with NaN in active_label column
         self.data = protac_df  # [~protac_df[active_label].isna()]
@@ -84,6 +88,23 @@ class PROTAC_Dataset(Dataset):
         self.oversampler = oversampler
         if self.use_smote:
             self.apply_smote()
+
+        # Always set the attribute, so that __getitem__ can read it safely
+        self.shuffle_embedding_prob = shuffle_embedding_prob
+        if shuffle_embedding_prob > 0.0:
+            # Set random seed for reproducible shuffling
+            random.seed(42)
+            if self.protein_emb_dim != self.cell_emb_dim:
+                logging.warning('Protein and cell embeddings have different dimensions. Shuffling will be on POI and E3 embeddings only.')
+
+    def get_smiles_emb_dim(self):
+        return self.smiles_emb_dim
+
+    def get_protein_emb_dim(self):
+        return self.protein_emb_dim
+
+    def get_cell_emb_dim(self):
+        return self.cell_emb_dim
 
     def apply_smote(self):
         # Prepare the dataset for SMOTE
@@ -269,6 +290,17 @@ class PROTAC_Dataset(Dataset):
         else:
             cell_emb = self.data['Cell Line Identifier'].iloc[idx]
 
+        # Shuffle the embeddings if the probability is met
+        if random.random() < self.shuffle_embedding_prob:
+            if self.protein_emb_dim == self.cell_emb_dim:
+                # Randomly shuffle the embeddings for POI, E3, and cell
+                embeddings = np.vstack([poi_emb, e3_emb, cell_emb])
+                np.random.shuffle(embeddings)
+                poi_emb, e3_emb, cell_emb = embeddings
+            else:
+                # Swap POI and E3 embeddings only, because of different dimensions
+                poi_emb, e3_emb = e3_emb, poi_emb
+
         elem = {
             'smiles_emb': smiles_emb,
             'poi_emb': poi_emb,
@@ -293,6 +325,7 @@ def get_datasets(
         scaler: Optional[StandardScaler | Dict[str, StandardScaler]] = None,
         use_single_scaler: Optional[bool] = None,
         apply_scaling: bool = False,
+        shuffle_embedding_prob: float = 0.0,
 ) -> Tuple[PROTAC_Dataset, PROTAC_Dataset, Optional[PROTAC_Dataset]]:
     """ Get the datasets for training the PROTAC model.
 
@@ -323,6 +356,7 @@ def get_datasets(
         disabled_embeddings=disabled_embeddings,
         scaler=scaler,
         use_single_scaler=use_single_scaler,
+        shuffle_embedding_prob=shuffle_embedding_prob,
     )
     val_ds = PROTAC_Dataset(
         val_df,
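
The shuffling hook introduced above is a control experiment: if validation accuracy survives randomly permuting the POI, E3, and cell embeddings across their slots, the model is likely memorizing embeddings as sample "barcodes" rather than using their content. A minimal standalone sketch of the mechanism in __getitem__, with dummy embeddings (names and shapes are illustrative only):

import random
import numpy as np

random.seed(42)

def maybe_shuffle(poi_emb, e3_emb, cell_emb, shuffle_embedding_prob=0.5):
    # Mirrors PROTAC_Dataset.__getitem__: permute with the given probability
    if random.random() < shuffle_embedding_prob:
        if poi_emb.shape == cell_emb.shape:
            # All dimensions match: permute POI, E3, and cell embeddings
            embeddings = np.vstack([poi_emb, e3_emb, cell_emb])
            np.random.shuffle(embeddings)  # shuffles rows in place
            poi_emb, e3_emb, cell_emb = embeddings
        else:
            # Mismatched dimensions: only POI and E3 can be swapped
            poi_emb, e3_emb = e3_emb, poi_emb
    return poi_emb, e3_emb, cell_emb

poi, e3, cell = (np.random.rand(4) for _ in range(3))
print(maybe_shuffle(poi, e3, cell))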
protac_degradation_predictor/pytorch_models.py CHANGED
@@ -23,7 +23,8 @@ from torchmetrics import (
     MetricCollection,
 )
 from imblearn.over_sampling import SMOTE
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.feature_extraction.text import CountVectorizer
 
 
 class PROTAC_Predictor(nn.Module):
@@ -402,9 +403,9 @@ class PROTAC_Model(pl.LightningModule):
 
 # TODO: Use some sort of **kwargs to pass all the parameters to the model...
 def train_model(
-        protein2embedding: Dict,
-        cell2embedding: Dict,
-        smiles2fp: Dict,
+        protein2embedding: Dict[str, np.ndarray],
+        cell2embedding: Dict[str, np.ndarray],
+        smiles2fp: Dict[str, np.ndarray],
         train_df: pd.DataFrame,
         val_df: pd.DataFrame,
         test_df: Optional[pd.DataFrame] = None,
@@ -414,10 +415,6 @@ def train_model(
         dropout: float = 0.2,
         max_epochs: int = 50,
         use_batch_norm: bool = False,
-        smiles_emb_dim: int = config.fingerprint_size,
-        poi_emb_dim: int = config.protein_embedding_size,
-        e3_emb_dim: int = config.protein_embedding_size,
-        cell_emb_dim: int = config.cell_embedding_size,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
         smote_k_neighbors: int = 5,
         use_smote: bool = True,
@@ -431,29 +428,66 @@ def train_model(
         checkpoint_model_name: str = 'protac',
         disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
         return_predictions: bool = False,
+        shuffle_embedding_prob: float = 0.0,
+        use_cells_one_hot: bool = False,
+        use_amino_acid_count: bool = False,
 ) -> tuple:
     """ Train a PROTAC model using the given datasets and hyperparameters.
 
     Args:
-        protein2embedding (dict): Dictionary of protein embeddings.
-        cell2embedding (dict): Dictionary of cell line embeddings.
-        smiles2fp (dict): Dictionary of SMILES to fingerprint.
-        train_df (pd.DataFrame): The training set. It must include the following columns: 'Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', <active_label>.
-        val_df (pd.DataFrame): The validation set. It must include the following columns: 'Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', <active_label>.
-        test_df (pd.DataFrame): The test set. If provided, the returned metrics will include test performance. It must include the following columns: 'Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', <active_label>.
-        hidden_dim (int): The hidden dimension of the model.
-        batch_size (int): The batch size.
-        learning_rate (float): The learning rate.
-        max_epochs (int): The maximum number of epochs.
-        smiles_emb_dim (int): The dimension of the SMILES embeddings.
-        smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
-        fast_dev_run (bool): Whether to run a fast development run.
-        disabled_embeddings (list): The list of disabled embeddings.
-        return_predictions (bool): Whether to return the predictions after the model, trainer, and metrics.
+        protein2embedding (dict): A dictionary mapping protein identifiers to embeddings.
+        cell2embedding (dict): A dictionary mapping cell line identifiers to embeddings.
+        smiles2fp (dict): A dictionary mapping SMILES strings to fingerprints.
+        train_df (pd.DataFrame): The training dataframe.
+        val_df (pd.DataFrame): The validation dataframe.
+        test_df (Optional[pd.DataFrame]): The test dataframe.
+        hidden_dim (int): The hidden dimension of the model.
+        batch_size (int): The batch size.
+        learning_rate (float): The learning rate.
+        dropout (float): The dropout rate.
+        max_epochs (int): The maximum number of epochs.
+        use_batch_norm (bool): Whether to use batch normalization.
+        join_embeddings (Literal['beginning', 'concat', 'sum']): How to join the embeddings.
+        smote_k_neighbors (int): The number of neighbors to use in SMOTE.
+        use_smote (bool): Whether to use SMOTE.
+        apply_scaling (bool): Whether to apply scaling to the embeddings.
+        active_label (str): The name of the active label. Default: 'Active'.
+        fast_dev_run (bool): Whether to run a fast development run (see the PyTorch Lightning documentation).
+        use_logger (bool): Whether to use a logger.
+        logger_save_dir (str): The directory to save the logs.
+        logger_name (str): The name of the logger.
+        enable_checkpointing (bool): Whether to enable checkpointing.
+        checkpoint_model_name (str): The name of the model for checkpointing.
+        disabled_embeddings (list): The list of disabled embeddings. Can contain 'poi', 'e3', 'cell', 'smiles'.
+        return_predictions (bool): Whether to return predictions on the validation and test sets.
+        shuffle_embedding_prob (float): The probability of shuffling the embeddings. Used for testing whether embeddings act as "barcodes". Defaults to 0.0, i.e., no shuffling.
+        use_cells_one_hot (bool): Whether to replace the cell line embeddings with one-hot encodings.
+        use_amino_acid_count (bool): Whether to replace the protein embeddings with character count vectors.
 
     Returns:
         tuple: The trained model, the trainer, and the metrics over the validation and test sets.
     """
+    if use_cells_one_hot:
+        # Get one-hot encoded embeddings for cell lines
+        # NOTE: OneHotEncoder expects a 2D array, hence the reshape
+        onehotenc = OneHotEncoder(sparse_output=False)
+        cell_embeddings = onehotenc.fit_transform(
+            np.array(list(cell2embedding.keys())).reshape(-1, 1)
+        )
+        cell2embedding = {k: v for k, v in zip(cell2embedding.keys(), cell_embeddings)}
+
+    if use_amino_acid_count:
+        # Get count-vectorized embeddings for proteins
+        # NOTE: Check that protein2embedding is keyed by strings
+        if not all(isinstance(k, str) for k in protein2embedding.keys()):
+            raise ValueError("All keys in `protein2embedding` must be strings.")
+        countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
+        # NOTE: fit_transform returns a sparse matrix, densify it before use
+        protein_embeddings = countvec.fit_transform(
+            list(protein2embedding.keys())
+        ).toarray()
+        protein2embedding = {k: v for k, v in zip(protein2embedding.keys(), protein_embeddings)}
+
     train_ds, val_ds, test_ds = get_datasets(
         train_df,
         val_df,
@@ -465,7 +499,14 @@ def train_model(
         smote_k_neighbors=smote_k_neighbors,
         active_label=active_label,
         disabled_embeddings=disabled_embeddings,
+        shuffle_embedding_prob=shuffle_embedding_prob,
     )
+    # NOTE: The embedding dimensions should already match across all sets
+    smiles_emb_dim = train_ds.get_smiles_emb_dim()
+    poi_emb_dim = train_ds.get_protein_emb_dim()
+    e3_emb_dim = train_ds.get_protein_emb_dim()
+    cell_emb_dim = train_ds.get_cell_emb_dim()
+
     loggers = [
         pl.loggers.TensorBoardLogger(
             save_dir=logger_save_dir,
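
For reference, a self-contained sketch of the two alternative encodings that train_model can now swap in for the learned embeddings; the cell line identifiers and amino acid strings below are invented for illustration:

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

# One-hot encode cell line identifiers (OneHotEncoder requires 2D input)
cell_lines = ['HEK293T', 'HeLa', 'MCF-7']  # hypothetical identifiers
onehot = OneHotEncoder(sparse_output=False).fit_transform(
    np.array(cell_lines).reshape(-1, 1)
)
cell2embedding = dict(zip(cell_lines, onehot))
print(cell2embedding['HeLa'])  # [0. 1. 0.]

# Character 1-gram counts as crude protein features; fit_transform returns
# a sparse matrix, so densify it with .toarray() before use
sequences = ['MKTAYIAKQR', 'MSDNE']  # hypothetical amino acid strings
counts = CountVectorizer(ngram_range=(1, 1), analyzer='char').fit_transform(sequences).toarray()
protein2embedding = dict(zip(sequences, counts))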
src/run_experiments.py CHANGED
@@ -238,10 +238,15 @@ def main(
     """ Train a PROTAC model using the given datasets and hyperparameters.
 
     Args:
-        use_ored_activity (bool): Whether to use the 'Active - OR' column.
-        n_trials (int): The number of hyperparameter optimization trials.
-        n_splits (int): The number of cross-validation splits.
+        active_col (str): The column containing the active/inactive information. Must be in the format 'Active (Dmax N, pDC50 M)'.
+        n_trials (int): The number of hyperparameter tuning trials to run.
         fast_dev_run (bool): Whether to run a fast development run.
+        test_split (float): The percentage of the active PROTACs to use as the test set.
+        cv_n_splits (int): The number of cross-validation splits to use.
+        max_epochs (int): The maximum number of epochs to train the model.
+        run_sklearn (bool): Whether to run sklearn models.
+        force_study (bool): Whether to force the creation of a new Optuna study.
+        experiments (str): The type of experiments to run.
     """
     pl.seed_everything(42)
 
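The 'Active (Dmax N, pDC50 M)' format documented above is also what the experiment scripts sanitize into study and report file names:

# Mirrors the active_name sanitization used in the experiment scripts
active_col = 'Active (Dmax 0.6, pDC50 6.0)'
active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
print(active_name)  # Active_Dmax_0.6_pDC50_6.0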
 
src/run_experiments_cells_onehot.py ADDED
@@ -0,0 +1,137 @@
+import os
+import sys
+from collections import defaultdict
+import warnings
+import logging
+from typing import Literal
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+import protac_degradation_predictor as pdp
+
+import pytorch_lightning as pl
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit import DataStructs
+from jsonargparse import CLI
+import pandas as pd
+from tqdm import tqdm
+import numpy as np
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.model_selection import (
+    StratifiedKFold,
+    StratifiedGroupKFold,
+)
+
+# Ignore UserWarning from Matplotlib
+warnings.filterwarnings("ignore", ".*FixedLocator.*")
+# Ignore UserWarning from PyTorch Lightning
+warnings.filterwarnings("ignore", ".*does not have many workers.*")
+
+
+root = logging.getLogger()
+root.setLevel(logging.DEBUG)
+
+handler = logging.StreamHandler(sys.stdout)
+handler.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+root.addHandler(handler)
+
+def main(
+    active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
+    n_trials: int = 100,
+    fast_dev_run: bool = False,
+    test_split: float = 0.1,
+    cv_n_splits: int = 5,
+    max_epochs: int = 100,
+    force_study: bool = False,
+    experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
+):
+    """ Run experiments with the cell line one-hot encoding model.
+
+    Args:
+        active_col (str): Name of the column containing the active values.
+        n_trials (int): Number of hyperparameter optimization trials.
+        fast_dev_run (bool): Whether to run a fast development run.
+        test_split (float): Percentage of data to use for testing.
+        cv_n_splits (int): Number of cross-validation splits.
+        max_epochs (int): Maximum number of epochs to train the model.
+        force_study (bool): Whether to force the creation of a new study.
+        experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
+    """
+
+    # Make directory ../reports if it does not exist
+    if not os.path.exists('../reports'):
+        os.makedirs('../reports')
+
+    # Load embedding dictionaries
+    protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
+    cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
+
+    studies_dir = '../data/studies'
+    train_val_perc = f'{int((1 - test_split) * 100)}'
+    test_perc = f'{int(test_split * 100)}'
+    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+
+    if experiments == 'all':
+        experiments = ['standard', 'similarity', 'target']
+
+    # Cross-Validation Training
+    reports = defaultdict(list)
+    for split_type in experiments:
+
+        train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
+        test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
+
+        train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
+        test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
+
+        # Get SMILES and precompute fingerprints dictionary
+        unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
+        smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
+
+        # Get the CV object
+        if split_type == 'standard':
+            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = None
+        elif split_type == 'e3_ligase':
+            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['E3 Group'].to_numpy()
+        elif split_type == 'similarity':
+            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['Tanimoto Group'].to_numpy()
+        elif split_type == 'target':
+            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['Uniprot Group'].to_numpy()
+
+        # Start the experiment
+        experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
+        optuna_reports = pdp.hyperparameter_tuning_and_training(
+            protein2embedding=protein2embedding,
+            cell2embedding=cell2embedding,
+            smiles2fp=smiles2fp,
+            train_val_df=train_val_df,
+            test_df=test_df,
+            kf=kf,
+            groups=group,
+            split_type=split_type,
+            n_models_for_test=3,
+            fast_dev_run=fast_dev_run,
+            n_trials=n_trials,
+            max_epochs=max_epochs,
+            logger_save_dir='../logs',
+            logger_name=f'logs_{experiment_name}',
+            active_label=active_col,
+            study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
+            force_study=force_study,
+        )
+
+        # Save the reports to file
+        for report_name, report in optuna_reports.items():
+            report.to_csv(f'../reports/cellsonehot_{report_name}_{experiment_name}.csv', index=False)
+            reports[report_name].append(report.copy())
+
+
+if __name__ == '__main__':
+    cli = CLI(main)
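
Because the script exposes main through jsonargparse's CLI, every keyword argument doubles as a command-line flag (e.g. --n_trials 50 --experiments standard). A hypothetical direct call, assuming src/ is the working directory and the study CSVs exist under ../data/studies:

from run_experiments_cells_onehot import main

main(
    active_col='Active (Dmax 0.6, pDC50 6.0)',  # example values only
    n_trials=50,
    cv_n_splits=5,
    experiments='standard',
)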
src/{run_xgboost_experiments.py → run_experiments_xgboost.py} RENAMED
File without changes