Commit ccc40da
Parent(s): e1370eb

Implemented cell lines one-hot and amino acid sequence count experiments
protac_degradation_predictor/optuna_utils.py
CHANGED
@@ -117,6 +117,8 @@ def pytorch_model_objective(
     logger_save_dir: str = 'logs',
     logger_name: str = 'cv_model',
     enable_checkpointing: bool = False,
+    use_cells_one_hot: bool = False,
+    use_amino_acid_count: bool = False,
 ) -> float:
     """ Objective function for hyperparameter optimization.

@@ -139,6 +141,8 @@ def pytorch_model_objective(
     learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
     smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
     use_smote = trial.suggest_categorical('use_smote', [True, False])
+    if use_cells_one_hot or use_amino_acid_count:
+        use_smote = False
     apply_scaling = True  # trial.suggest_categorical('apply_scaling', [True, False])
     dropout = trial.suggest_float('dropout', *dropout_options)
     use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])

@@ -252,6 +256,8 @@ def hyperparameter_tuning_and_training(
     max_epochs: int = 100,
     study_filename: Optional[str] = None,
     force_study: bool = False,
+    use_cells_one_hot: bool = False,
+    use_amino_acid_count: bool = False,
 ) -> tuple:
     """ Hyperparameter tuning and training of a PROTAC model.

@@ -263,7 +269,7 @@ def hyperparameter_tuning_and_training(
         test_df (pd.DataFrame): The test set.
         kf (StratifiedKFold | StratifiedGroupKFold): The KFold object.
         groups (np.array): The groups for the StratifiedGroupKFold.
-        split_type (str): The split type.
+        split_type (str): The split type of the current study. Used for reporting.
         n_models_for_test (int): The number of models to train for the test set.
         fast_dev_run (bool): Whether to run a fast development run.
         n_trials (int): The number of trials for the hyperparameter search.

@@ -322,6 +328,8 @@ def hyperparameter_tuning_and_training(
             active_label=active_label,
             max_epochs=max_epochs,
             disabled_embeddings=[],
+            use_cells_one_hot=use_cells_one_hot,
+            use_amino_acid_count=use_amino_acid_count,
         ),
         n_trials=n_trials,
     )

@@ -354,6 +362,8 @@ def hyperparameter_tuning_and_training(
         logger_save_dir=logger_save_dir,
         logger_name=f'{logger_name}_{split_type}_cv_model',
         enable_checkpointing=True,
+        use_cells_one_hot=use_cells_one_hot,
+        use_amino_acid_count=use_amino_acid_count,
     )

     # Retrain N models with the best hyperparameters (measure model uncertainty)
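The two new flags are threaded from `hyperparameter_tuning_and_training` down into the Optuna objective, where they force `use_smote = False`: SMOTE synthesizes minority-class samples by interpolating between neighbors, which would yield fractional values for one-hot and count features. A minimal, runnable sketch of the same override pattern; the dummy score and study setup below are illustrative, not the project's actual objective:

import optuna

def objective(trial: optuna.Trial,
              use_cells_one_hot: bool = False,
              use_amino_acid_count: bool = False) -> float:
    # Sample the oversampling choice as usual...
    use_smote = trial.suggest_categorical('use_smote', [True, False])
    # ...then override it when the run uses one-hot or count features,
    # since interpolated "one-hot" vectors would no longer be one-hot.
    if use_cells_one_hot or use_amino_acid_count:
        use_smote = False
    # Dummy score standing in for cross-validation performance.
    return 1.0 if use_smote else 0.5

study = optuna.create_study(direction='maximize')
study.optimize(lambda t: objective(t, use_cells_one_hot=True), n_trials=5)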
protac_degradation_predictor/pytorch_models.py
CHANGED
@@ -23,8 +23,7 @@ from torchmetrics import (
     MetricCollection,
 )
 from imblearn.over_sampling import SMOTE
-from sklearn.preprocessing import StandardScaler
-from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import StandardScaler


 class PROTAC_Predictor(nn.Module):

@@ -429,8 +428,6 @@ def train_model(
     disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
     return_predictions: bool = False,
     shuffle_embedding_prob: float = 0.0,
-    use_cells_one_hot: bool = False,
-    use_amino_acid_count: bool = False,
 ) -> tuple:
     """ Train a PROTAC model using the given datasets and hyperparameters.

@@ -464,25 +461,6 @@ def train_model(
     Returns:
         tuple: The trained model, the trainer, and the metrics over the validation and test sets.
     """
-    if use_cells_one_hot:
-        # Get one-hot encoded embeddings for cell lines
-        onehotenc = OneHotEncoder(sparse_output=False)
-        cell_embeddings = onehotenc.fit_transform(
-            np.array(list(cell2embedding.keys()))
-        )
-        cell2embedding = {k: v for k, v in zip(cell2embedding.keys(), cell_embeddings)}
-
-    if use_amino_acid_count:
-        # Get count vectorized embeddings for proteins
-        # NOTE: Check that the protein2embedding is a dictionary of strings
-        if not all(isinstance(k, str) for k in protein2embedding.keys()):
-            raise ValueError("All keys in `protein2embedding` must be strings.")
-        countvec = CountVectorizer(ngram_range=(1,1), analyzer='char')
-        protein_embeddings = countvec.fit_transform(
-            list(protein2embedding.keys())
-        )
-        protein2embedding = {k: v for k, v in zip(protein2embedding.keys(), protein_embeddings)}
-
     train_ds, val_ds, test_ds = get_datasets(
         train_df,
         val_df,
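With this hunk, `train_model` no longer special-cases the two encodings: the in-function re-encoding is deleted and the encoding choice moves once, up front, into the experiment scripts below. The model then simply consumes whatever vectors `protein2embedding` and `cell2embedding` map to. A small sketch of the design idea with hypothetical dictionaries; that the model infers its input width from the dictionary values is an assumption here, not a claim about `PROTAC_Predictor`:

import numpy as np

# Hypothetical embedding dictionaries: a learned 768-d embedding and a
# 2-d one-hot encoding for the same two cell lines.
learned = {'HeLa': np.random.rand(768), 'HEK293': np.random.rand(768)}
one_hot = {'HeLa': np.array([1.0, 0.0]), 'HEK293': np.array([0.0, 1.0])}

def embedding_dim(cell2embedding: dict) -> int:
    # A consumer that reads its input width off the dictionary values
    # needs no flag to distinguish the two encodings.
    return len(next(iter(cell2embedding.values())))

print(embedding_dim(learned))  # 768
print(embedding_dim(one_hot))  # 2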
src/run_experiments_aminoacid_counts.py
ADDED
@@ -0,0 +1,156 @@
+import os
+import sys
+from collections import defaultdict
+import warnings
+import logging
+from typing import Literal
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+import protac_degradation_predictor as pdp
+
+import pytorch_lightning as pl
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit import DataStructs
+from jsonargparse import CLI
+import pandas as pd
+from tqdm import tqdm
+import numpy as np
+from sklearn.model_selection import (
+    StratifiedKFold,
+    StratifiedGroupKFold,
+)
+from sklearn.feature_extraction.text import CountVectorizer
+
+# Ignore UserWarning from Matplotlib
+warnings.filterwarnings("ignore", ".*FixedLocator*")
+# Ignore UserWarning from PyTorch Lightning
+warnings.filterwarnings("ignore", ".*does not have many workers.*")
+
+root = logging.getLogger()
+root.setLevel(logging.DEBUG)
+
+handler = logging.StreamHandler(sys.stdout)
+handler.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+root.addHandler(handler)
+
+def main(
+    active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
+    n_trials: int = 100,
+    fast_dev_run: bool = False,
+    test_split: float = 0.1,
+    cv_n_splits: int = 5,
+    max_epochs: int = 100,
+    force_study: bool = False,
+    experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
+):
+    """ Run experiments with the amino acid sequence count model.
+
+    Args:
+        active_col (str): Name of the column containing the active values.
+        n_trials (int): Number of hyperparameter optimization trials.
+        fast_dev_run (bool): Whether to run a fast development run.
+        test_split (float): Percentage of data to use for testing.
+        cv_n_splits (int): Number of cross-validation splits.
+        max_epochs (int): Maximum number of epochs to train the model.
+        force_study (bool): Whether to force the creation of a new study.
+        experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
+    """
+
+    # Make directory ../reports if it does not exist
+    if not os.path.exists('../reports'):
+        os.makedirs('../reports')
+
+    # Load embedding dictionaries
+    protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
+    cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
+
+    # Create a new protein2embedding dictionary with amino acid sequences
+    protac_df = pdp.load_curated_dataset()
+    # Create the dictionary mapping 'Uniprot' to 'POI Sequence'
+    protein2embedding = protac_df.set_index('Uniprot')['POI Sequence'].to_dict()
+    # Create the dictionary mapping 'E3 Ligase Uniprot' to 'E3 Ligase Sequence'
+    e32seq = protac_df.set_index('E3 Ligase Uniprot')['E3 Ligase Sequence'].to_dict()
+    # Merge the two dictionaries into a new protein2embedding dictionary
+    protein2embedding.update(e32seq)
+
+    # Get count vectorized embeddings for the protein sequences
+    # NOTE: Check that the protein2embedding values are amino acid sequence strings
+    if not all(isinstance(v, str) for v in protein2embedding.values()):
+        raise ValueError("All values in `protein2embedding` must be sequence strings.")
+    countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
+    protein_embeddings = countvec.fit_transform(
+        list(protein2embedding.values())
+    ).toarray()
+    protein2embedding = {k: v for k, v in zip(protein2embedding.keys(), protein_embeddings)}
+
+    studies_dir = '../data/studies'
+    train_val_perc = f'{int((1 - test_split) * 100)}'
+    test_perc = f'{int(test_split * 100)}'
+    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+
+    if experiments == 'all':
+        experiments = ['standard', 'similarity', 'target']
+
+    # Cross-Validation Training
+    reports = defaultdict(list)
+    for split_type in experiments:
+
+        train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
+        test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
+
+        train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
+        test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
+
+        # Get SMILES and precompute fingerprints dictionary
+        unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
+        smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
+
+        # Get the CV object
+        if split_type == 'standard':
+            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = None
+        elif split_type == 'e3_ligase':
+            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['E3 Group'].to_numpy()
+        elif split_type == 'similarity':
+            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['Tanimoto Group'].to_numpy()
+        elif split_type == 'target':
+            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+            group = train_val_df['Uniprot Group'].to_numpy()
+
+        # Start the experiment
+        experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
+        optuna_reports = pdp.hyperparameter_tuning_and_training(
+            protein2embedding=protein2embedding,
+            cell2embedding=cell2embedding,
+            smiles2fp=smiles2fp,
+            train_val_df=train_val_df,
+            test_df=test_df,
+            kf=kf,
+            groups=group,
+            split_type=split_type,
+            n_models_for_test=3,
+            fast_dev_run=fast_dev_run,
+            n_trials=n_trials,
+            max_epochs=max_epochs,
+            logger_save_dir='../logs',
+            logger_name=f'logs_{experiment_name}',
+            active_label=active_col,
+            study_filename=f'../reports/study_aminoacidcnt_{experiment_name}.pkl',
+            force_study=force_study,
+            use_amino_acid_count=True,
+        )
+
+        # Save the reports to file
+        for report_name, report in optuna_reports.items():
+            report.to_csv(f'../reports/aminoacidcnt_{report_name}_{experiment_name}.csv', index=False)
+            reports[report_name].append(report.copy())
+
+
+if __name__ == '__main__':
+    cli = CLI(main)
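With `analyzer='char'` and unigrams only, `CountVectorizer` reduces to per-letter counting, which over a protein sequence is exactly an amino acid composition vector. A toy demonstration of this reduction; the two sequences below are made up, not real proteins:

from sklearn.feature_extraction.text import CountVectorizer

seqs = ['MKTAYIAKQR', 'MSDNGPQNQR']  # toy sequences
countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
counts = countvec.fit_transform(seqs).toarray()

# One feature per distinct (lowercased) letter seen in the corpus;
# each row holds that letter's count in the corresponding sequence.
print(countvec.get_feature_names_out())
print(counts[0])  # amino acid counts for seqs[0]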
src/run_experiments_cells_onehot.py
CHANGED
@@ -22,6 +22,7 @@ from sklearn.model_selection import (
     StratifiedKFold,
     StratifiedGroupKFold,
 )
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

 # Ignore UserWarning from Matplotlib
 warnings.filterwarnings("ignore", ".*FixedLocator*")

@@ -69,6 +70,13 @@ def main(
     protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
     cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')

+    # Get one-hot encoded embeddings for cell lines
+    onehotenc = OneHotEncoder(sparse_output=False)
+    cell_embeddings = onehotenc.fit_transform(
+        np.array(list(cell2embedding.keys())).reshape(-1, 1)
+    )
+    cell2embedding = {k: v for k, v in zip(cell2embedding.keys(), cell_embeddings)}
+
     studies_dir = '../data/studies'
     train_val_perc = f'{int((1 - test_split) * 100)}'
     test_perc = f'{int(test_split * 100)}'

@@ -125,6 +133,7 @@ def main(
         active_label=active_col,
         study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
         force_study=force_study,
+        use_cells_one_hot=True,
     )

     # Save the reports to file
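The `.reshape(-1, 1)` added here matters: scikit-learn encoders expect a 2-D `(n_samples, n_features)` array, so each cell line name becomes one sample with a single categorical feature. A toy run with made-up cell line names:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

cell_lines = ['HeLa', 'HEK293', 'MCF7']  # illustrative names
enc = OneHotEncoder(sparse_output=False)
# reshape(-1, 1): one sample per cell line, one categorical feature.
embeddings = enc.fit_transform(np.array(cell_lines).reshape(-1, 1))
cell2embedding = dict(zip(cell_lines, embeddings))

# Categories are sorted, so the columns are ['HEK293', 'HeLa', 'MCF7'].
print(cell2embedding['HeLa'])  # [0. 1. 0.]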