Commit 5e01175
Parent(s): 6101de8
Started working on packaging the repository

Changed files:
- notebooks/plotting_dragradation_activity_performance.ipynb +0 -0
- notebooks/protac_degradation_predictor.ipynb +10 -2
- notebooks/protac_degradation_predictor.py +3 -2
- protac_degradation_predictor/__init__.py +7 -0
- protac_degradation_predictor/config.py +37 -0
- protac_degradation_predictor/data/PROTAC-DB.csv +0 -0
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_0_test_split_0.2.pkl → protac_degradation_predictor/data/cell2embedding.pkl +2 -2
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_1_test_split_0.2.pkl → protac_degradation_predictor/data/uniprot2embedding.h5 +2 -2
- protac_degradation_predictor/data_utils.py +46 -0
- protac_degradation_predictor/optuna_utils.py +318 -0
- protac_degradation_predictor/protac_dataset.py +193 -0
- protac_degradation_predictor/protac_degradation_predictor.py +88 -0
- protac_degradation_predictor/pytorch_models.py +471 -0
- protac_degradation_predictor/sklearn_models.py +243 -0
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_0_test_split_0.1.pkl +1 -1
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_1_test_split_0.1.pkl +1 -1
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_2_test_split_0.1.pkl +1 -1
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_2_test_split_0.2.pkl +0 -3
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_3_test_split_0.1.pkl +1 -1
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_3_test_split_0.2.pkl +0 -3
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_4_test_split_0.1.pkl +1 -1
- reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_4_test_split_0.2.pkl +0 -3
- setup.py +21 -0

notebooks/plotting_dragradation_activity_performance.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.

notebooks/protac_degradation_predictor.ipynb
CHANGED
@@ -1719,8 +1719,16 @@
     }
    ],
    "source": [
-    "
-    "
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "from torchmetrics import (\n",
+    "    Accuracy,\n",
+    "    AUROC,\n",
+    "    Precision,\n",
+    "    Recall,\n",
+    "    F1Score,\n",
+    "    MetricCollection,\n",
+    ")\n",
     "\n",
     "# Generic function to fit and evaluate a classifier model (given as argument),\n",
     "# on train and val sets (and optionally a test set) given as dataframes\n",
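
For context, a minimal sketch (not part of the commit) of how the torchmetrics classes imported above are typically combined; the metric names and the binary task setting mirror their later use in pytorch_models.py:

import torch
from torchmetrics import Accuracy, F1Score, MetricCollection

metrics = MetricCollection({
    'acc': Accuracy(task='binary'),
    'f1_score': F1Score(task='binary'),
})
preds = torch.tensor([0.8, 0.2, 0.6])   # predicted probabilities
target = torch.tensor([1, 0, 1])        # ground-truth labels
metrics.update(preds, target)
print(metrics.compute())  # {'acc': ..., 'f1_score': ...}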

notebooks/protac_degradation_predictor.py
CHANGED
@@ -680,7 +680,7 @@ def train_model(
         hidden_dim (int): The hidden dimension of the model.
         batch_size (int): The batch size.
         learning_rate (float): The learning rate.
-        max_epochs (int):
+        max_epochs (int): The maximum number of epochs.
         smiles_emb_dim (int): The dimension of the SMILES embeddings.
         smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
         fast_dev_run (bool): Whether to run a fast development run.
@@ -985,6 +985,8 @@ def main(
     encoder = OrdinalEncoder()
     protac_df['Tanimoto Group'] = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1)).astype(int)
     active_df = protac_df[protac_df[active_col].notna()].copy()
+    # Sort the groups so that samples with the highest tanimoto similarity,
+    # i.e., the "less similar" ones, are placed in the test set first
     tanimoto_groups = active_df.groupby('Tanimoto Group')['Avg Tanimoto'].mean().sort_values(ascending=False).index

     test_df = []
@@ -992,7 +994,6 @@
     # entries to the test_df if: 1) the test_df length + the group entries is less
     # than 20% of the active_df length, and 2) the percentage of True and False entries
     # in the active_col in test_df is roughly 50%.
-    # Start the loop from the groups containing the smallest number of entries.
     for group in tanimoto_groups:
        group_df = active_df[active_df['Tanimoto Group'] == group]
        if test_df == []:
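
The comments in the hunk above describe a group-wise test split; a minimal sketch of that loop under the stated conditions (tanimoto_groups, active_df, and active_col are assumed in scope, and the 20% size cap and rough 50/50 balance come from the comments):

import pandas as pd

test_rows = []
for group in tanimoto_groups:  # groups pre-sorted as in the diff above
    group_df = active_df[active_df['Tanimoto Group'] == group]
    candidate = pd.concat(test_rows + [group_df])
    # 1) keep the test set under 20% of the active data
    if len(candidate) > 0.2 * len(active_df):
        continue
    # 2) keep the True/False balance of active_col roughly 50%
    if not (0.4 <= candidate[active_col].mean() <= 0.6):
        continue
    test_rows.append(group_df)
test_df = pd.concat(test_rows)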

protac_degradation_predictor/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .protac_degradation_predictor import (
+    PROTAC_Model,
+    train_model,
+)
+
+__version__ = "0.0.1"
+__author__ = "Stefano Ribes"
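
With this __init__.py in place, the main entry points become importable from the package root. A minimal usage sketch, assuming the package is installed (e.g. via the setup.py added in this commit) and that the in-progress intra-package imports resolve:

import protac_degradation_predictor as pdp

print(pdp.__version__)  # "0.0.1"
# PROTAC_Model and train_model are re-exported at the top level:
assert callable(pdp.train_model)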

protac_degradation_predictor/config.py
ADDED
@@ -0,0 +1,37 @@
+from dataclasses import dataclass, field
+
+@dataclass(frozen=True)
+class Config:
+    # Embeddings information
+    morgan_radius: int = 15
+    fingerprint_size: int = 224
+    protein_embedding_size: int = 1024
+    cell_embedding_size: int = 768
+
+    # Data information
+    dmax_threshold: float = 0.6
+    pdc50_threshold: float = 6.0
+    # A mutable dict cannot be a plain dataclass default, hence default_factory
+    e3_ligase2uniprot: dict = field(default_factory=lambda: {
+        'VHL': 'P40337',
+        'CRBN': 'Q96SW2',
+        'DCAF11': 'Q8TEB1',
+        'DCAF15': 'Q66K64',
+        'DCAF16': 'Q9NXF7',
+        'MDM2': 'Q00987',
+        'Mdm2': 'Q00987',
+        'XIAP': 'P98170',
+        'cIAP1': 'Q7Z460',
+        'IAP': 'P98170',  # I couldn't find the Uniprot ID for IAP, so it's XIAP instead
+        'Iap': 'P98170',  # I couldn't find the Uniprot ID for IAP, so it's XIAP instead
+        'AhR': 'P35869',
+        'RNF4': 'P78317',
+        'RNF114': 'Q9Y508',
+        'FEM1B': 'Q9UK73',
+        'Ubr1': 'Q8IWV7',
+    })
+
+    def __post_init__(self):
+        # object.__setattr__ is required because the dataclass is frozen
+        object.__setattr__(self, 'active_label', f'Active (Dmax {self.dmax_threshold}, pDC50 {self.pdc50_threshold})')
+
+
+config = Config()
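
A short sketch of how the module-level config singleton is meant to be consumed elsewhere in the package (the printed label assumes the default thresholds):

from protac_degradation_predictor.config import config

print(config.fingerprint_size)           # 224
print(config.e3_ligase2uniprot['VHL'])   # "P40337"
print(config.active_label)               # "Active (Dmax 0.6, pDC50 6.0)"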

protac_degradation_predictor/data/PROTAC-DB.csv
ADDED
The diff for this file is too large to render. See raw diff.

reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_0_test_split_0.2.pkl → protac_degradation_predictor/data/cell2embedding.pkl
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:627e8ce3842afeb6bb7d5caa5ec1ba034c36dc77fab70734e15dca340a7fd718
+size 3550864

reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_1_test_split_0.2.pkl → protac_degradation_predictor/data/uniprot2embedding.h5
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:19f4b8c73652392db7840962d1a7817c7e899716e2bb758e4947c8c2bb265336
+size 51089512

protac_degradation_predictor/data_utils.py
ADDED
@@ -0,0 +1,46 @@
+import os
+import pkg_resources
+import pickle
+from typing import Dict
+
+from config import config
+
+import h5py
+import numpy as np
+import pandas as pd
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from joblib import Memory
+
+
+home_dir = os.path.expanduser('~')
+cachedir = os.path.join(home_dir, '.cache', 'protac_degradation_predictor')
+memory = Memory(cachedir, verbose=0)
+
+
+@memory.cache
+def load_protein2embedding() -> Dict[str, np.ndarray]:
+    embeddings_path = pkg_resources.resource_stream(__name__, 'data/uniprot2embedding.h5')
+    protein2embedding = {}
+    with h5py.File(embeddings_path, "r") as file:
+        for sequence_id in file.keys():
+            embedding = file[sequence_id][:]
+            protein2embedding[sequence_id] = np.array(embedding)
+    return protein2embedding
+
+
+@memory.cache
+def load_cell2embedding() -> Dict[str, np.ndarray]:
+    # resource_stream already returns an open binary stream, usable directly
+    embeddings_path = pkg_resources.resource_stream(__name__, 'data/cell2embedding.pkl')
+    cell2embedding = pickle.load(embeddings_path)
+    return cell2embedding
+
+
+def get_fingerprint(smiles: str) -> np.ndarray:
+    morgan_fpgen = AllChem.GetMorganGenerator(
+        radius=config.morgan_radius,
+        fpSize=config.fingerprint_size,
+        includeChirality=True,
+    )
+    return morgan_fpgen.GetFingerprint(Chem.MolFromSmiles(smiles))
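
A minimal usage sketch for these loaders, assuming the package (with its bundled data files) is installed; the SMILES string is a placeholder:

from protac_degradation_predictor.data_utils import (
    load_protein2embedding,
    load_cell2embedding,
    get_fingerprint,
)

protein2embedding = load_protein2embedding()  # {uniprot_id: np.ndarray}, disk-cached via joblib
cell2embedding = load_cell2embedding()        # {cell_line: np.ndarray}
fp = get_fingerprint('CCO')                   # 224-bit Morgan fingerprint (sizes come from config)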

protac_degradation_predictor/optuna_utils.py
ADDED
@@ -0,0 +1,318 @@
+import os
+from typing import Literal, List, Tuple, Optional, Dict
+
+from pytorch_models import train_model
+from sklearn_models import (
+    train_sklearn_model,
+    suggest_random_forest,
+    suggest_logistic_regression,
+    suggest_svc,
+    suggest_gradient_boosting,
+)
+
+import optuna
+from optuna.samplers import TPESampler
+import joblib
+import pandas as pd
+from sklearn.ensemble import (
+    RandomForestClassifier,
+    GradientBoostingClassifier,
+)
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+
+
+def pytorch_model_objective(
+    trial: optuna.Trial,
+    protein2embedding: Dict,
+    cell2embedding: Dict,
+    smiles2fp: Dict,
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    hidden_dim_options: List[int] = [256, 512, 768],
+    batch_size_options: List[int] = [8, 16, 32],
+    learning_rate_options: Tuple[float, float] = (1e-5, 1e-3),
+    smote_k_neighbors_options: List[int] = list(range(3, 16)),
+    dropout_options: Tuple[float, float] = (0.1, 0.5),
+    fast_dev_run: bool = False,
+    active_label: str = 'Active',
+    disabled_embeddings: List[str] = [],
+    max_epochs: int = 100,
+) -> float:
+    """ Objective function for hyperparameter optimization.
+
+    Args:
+        trial (optuna.Trial): The Optuna trial object.
+        train_df (pd.DataFrame): The training set.
+        val_df (pd.DataFrame): The validation set.
+        hidden_dim_options (List[int]): The hidden dimension options.
+        batch_size_options (List[int]): The batch size options.
+        learning_rate_options (Tuple[float, float]): The learning rate options.
+        smote_k_neighbors_options (List[int]): The SMOTE k neighbors options.
+        dropout_options (Tuple[float, float]): The dropout options.
+        fast_dev_run (bool): Whether to run a fast development run.
+        active_label (str): The active label column.
+        disabled_embeddings (List[str]): The list of disabled embeddings.
+    """
+    # Generate the hyperparameters
+    hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
+    batch_size = trial.suggest_categorical('batch_size', batch_size_options)
+    learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
+    join_embeddings = trial.suggest_categorical('join_embeddings', ['beginning', 'concat', 'sum'])
+    smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
+    use_smote = trial.suggest_categorical('use_smote', [True, False])
+    apply_scaling = trial.suggest_categorical('apply_scaling', [True, False])
+    dropout = trial.suggest_float('dropout', *dropout_options)
+
+    # Train the model with the current set of hyperparameters
+    _, _, metrics = train_model(
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        train_df,
+        val_df,
+        hidden_dim=hidden_dim,
+        batch_size=batch_size,
+        join_embeddings=join_embeddings,
+        learning_rate=learning_rate,
+        dropout=dropout,
+        max_epochs=max_epochs,
+        smote_k_neighbors=smote_k_neighbors,
+        apply_scaling=apply_scaling,
+        use_smote=use_smote,
+        use_logger=False,
+        fast_dev_run=fast_dev_run,
+        active_label=active_label,
+        disabled_embeddings=disabled_embeddings,
+    )
+
+    # Metrics is a dictionary containing at least the validation loss
+    val_loss = metrics['val_loss']
+    val_acc = metrics['val_acc']
+    val_roc_auc = metrics['val_roc_auc']
+
+    # Optuna aims to minimize the pytorch_model_objective
+    return val_loss - val_acc - val_roc_auc
+
+
+def hyperparameter_tuning_and_training(
+    protein2embedding: Dict,
+    cell2embedding: Dict,
+    smiles2fp: Dict,
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    test_df: Optional[pd.DataFrame] = None,
+    fast_dev_run: bool = False,
+    n_trials: int = 50,
+    logger_name: str = 'protac_hparam_search',
+    active_label: str = 'Active',
+    disabled_embeddings: List[str] = [],
+    study_filename: Optional[str] = None,
+) -> tuple:
+    """ Hyperparameter tuning and training of a PROTAC model.
+
+    Args:
+        train_df (pd.DataFrame): The training set.
+        val_df (pd.DataFrame): The validation set.
+        test_df (pd.DataFrame): The test set.
+        fast_dev_run (bool): Whether to run a fast development run.
+        n_trials (int): The number of hyperparameter optimization trials.
+        logger_name (str): The name of the logger.
+        active_label (str): The active label column.
+        disabled_embeddings (List[str]): The list of disabled embeddings.
+
+    Returns:
+        tuple: The trained model, the trainer, and the best metrics.
+    """
+    # Define the search space
+    hidden_dim_options = [256, 512, 768]
+    batch_size_options = [8, 16, 32]
+    learning_rate_options = (1e-5, 1e-3)  # min and max values for loguniform distribution
+    smote_k_neighbors_options = list(range(3, 16))
+
+    # Set the verbosity of Optuna
+    optuna.logging.set_verbosity(optuna.logging.WARNING)
+    # Create an Optuna study object
+    sampler = TPESampler(seed=42, multivariate=True)
+    study = optuna.create_study(direction='minimize', sampler=sampler)
+
+    study_loaded = False
+    if study_filename:
+        if os.path.exists(study_filename):
+            study = joblib.load(study_filename)
+            study_loaded = True
+            print(f'Loaded study from {study_filename}')
+
+    if not study_loaded:
+        study.optimize(
+            lambda trial: pytorch_model_objective(
+                trial=trial,
+                protein2embedding=protein2embedding,
+                cell2embedding=cell2embedding,
+                smiles2fp=smiles2fp,
+                train_df=train_df,
+                val_df=val_df,
+                hidden_dim_options=hidden_dim_options,
+                batch_size_options=batch_size_options,
+                learning_rate_options=learning_rate_options,
+                smote_k_neighbors_options=smote_k_neighbors_options,
+                fast_dev_run=fast_dev_run,
+                active_label=active_label,
+                disabled_embeddings=disabled_embeddings,
+            ),
+            n_trials=n_trials,
+        )
+        if study_filename:
+            joblib.dump(study, study_filename)
+
+    # Retrain the model with the best hyperparameters
+    model, trainer, metrics = train_model(
+        protein2embedding=protein2embedding,
+        cell2embedding=cell2embedding,
+        smiles2fp=smiles2fp,
+        train_df=train_df,
+        val_df=val_df,
+        test_df=test_df,
+        use_logger=True,
+        logger_name=logger_name,
+        fast_dev_run=fast_dev_run,
+        active_label=active_label,
+        disabled_embeddings=disabled_embeddings,
+        **study.best_params,
+    )
+
+    # Report the best hyperparameters found
+    metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
+
+    # Return the best metrics
+    return model, trainer, metrics
+
+
+def sklearn_model_objective(
+    trial: optuna.Trial,
+    protein2embedding: Dict,
+    cell2embedding: Dict,
+    smiles2fp: Dict,
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    model_type: Literal['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'] = 'RandomForest',
+    active_label: str = 'Active',
+) -> float:
+    """ Objective function for hyperparameter optimization.
+
+    Args:
+        trial (optuna.Trial): The Optuna trial object.
+        train_df (pd.DataFrame): The training set.
+        val_df (pd.DataFrame): The validation set.
+        model_type (str): The model type.
+        active_label (str): The active label column.
+    """
+    # Generate the hyperparameters
+    use_single_scaler = trial.suggest_categorical('use_single_scaler', [True, False])
+    if model_type == 'RandomForest':
+        clf = suggest_random_forest(trial)
+    elif model_type == 'SVC':
+        clf = suggest_svc(trial)
+    elif model_type == 'LogisticRegression':
+        clf = suggest_logistic_regression(trial)
+    elif model_type == 'GradientBoosting':
+        clf = suggest_gradient_boosting(trial)
+    else:
+        raise ValueError(f'Invalid model type: {model_type}. Available: RandomForest, SVC, LogisticRegression, GradientBoosting.')
+
+    # Train the model with the current set of hyperparameters
+    _, metrics = train_sklearn_model(
+        clf=clf,
+        protein2embedding=protein2embedding,
+        cell2embedding=cell2embedding,
+        smiles2fp=smiles2fp,
+        train_df=train_df,
+        val_df=val_df,
+        active_label=active_label,
+        use_single_scaler=use_single_scaler,
+    )
+
+    # Metrics is a dictionary containing the validation scores
+    val_acc = metrics['val_acc']
+    val_roc_auc = metrics['val_roc_auc']
+
+    # Optuna aims to minimize the sklearn_model_objective
+    return -val_acc - val_roc_auc
+
+
+def hyperparameter_tuning_and_training_sklearn(
+    protein2embedding: Dict,
+    cell2embedding: Dict,
+    smiles2fp: Dict,
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    test_df: Optional[pd.DataFrame] = None,
+    model_type: Literal['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'] = 'RandomForest',
+    active_label: str = 'Active',
+    n_trials: int = 50,
+    logger_name: str = 'protac_hparam_search',
+    study_filename: Optional[str] = None,
+) -> Tuple:
+    # Set the verbosity of Optuna
+    optuna.logging.set_verbosity(optuna.logging.WARNING)
+    # Create an Optuna study object
+    sampler = TPESampler(seed=42, multivariate=True)
+    study = optuna.create_study(direction='minimize', sampler=sampler)
+
+    study_loaded = False
+    if study_filename:
+        if os.path.exists(study_filename):
+            study = joblib.load(study_filename)
+            study_loaded = True
+            print(f'Loaded study from {study_filename}')
+
+    if not study_loaded:
+        study.optimize(
+            lambda trial: sklearn_model_objective(
+                trial=trial,
+                protein2embedding=protein2embedding,
+                cell2embedding=cell2embedding,
+                smiles2fp=smiles2fp,
+                train_df=train_df,
+                val_df=val_df,
+                model_type=model_type,
+                active_label=active_label,
+            ),
+            n_trials=n_trials,
+        )
+        if study_filename:
+            joblib.dump(study, study_filename)
+
+    # Retrain the model with the best hyperparameters
+    best_hyperparameters = {k.replace('model_', ''): v for k, v in study.best_params.items() if k.startswith('model_')}
+    if model_type == 'RandomForest':
+        clf = RandomForestClassifier(random_state=42, **best_hyperparameters)
+    elif model_type == 'SVC':
+        clf = SVC(random_state=42, probability=True, **best_hyperparameters)
+    elif model_type == 'LogisticRegression':
+        clf = LogisticRegression(random_state=42, max_iter=1000, **best_hyperparameters)
+    elif model_type == 'GradientBoosting':
+        clf = GradientBoostingClassifier(random_state=42, **best_hyperparameters)
+    else:
+        raise ValueError(f'Invalid model type: {model_type}. Available: RandomForest, SVC, LogisticRegression, GradientBoosting.')
+
+    model, metrics = train_sklearn_model(
+        clf=clf,
+        protein2embedding=protein2embedding,
+        cell2embedding=cell2embedding,
+        smiles2fp=smiles2fp,
+        train_df=train_df,
+        val_df=val_df,
+        test_df=test_df,
+        active_label=active_label,
+        use_single_scaler=study.best_params['use_single_scaler'],
+    )
+
+    # Report the best hyperparameters found
+    metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
+
+    # Return the best metrics
+    return model, metrics
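
A hypothetical driver for the PyTorch tuning entry point above, assuming the dataframes and embedding dictionaries are already prepared; the study filename is illustrative:

model, trainer, metrics = hyperparameter_tuning_and_training(
    protein2embedding,
    cell2embedding,
    smiles2fp,
    train_df,
    val_df,
    test_df=test_df,
    n_trials=50,
    active_label='Active (Dmax 0.6, pDC50 6.0)',
    study_filename='reports/study_example.pkl',  # reloaded instead of re-optimized if present
)
best_hparams = {k: v for k, v in metrics.items() if k.startswith('hparam_')}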

protac_degradation_predictor/protac_dataset.py
ADDED
@@ -0,0 +1,193 @@
+from typing import Literal, List, Tuple, Optional, Dict
+
+from torch.utils.data import Dataset
+import numpy as np
+from imblearn.over_sampling import SMOTE, ADASYN
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+
+class PROTAC_Dataset(Dataset):
+    def __init__(
+        self,
+        protac_df: pd.DataFrame,
+        protein2embedding: Dict,
+        cell2embedding: Dict,
+        smiles2fp: Dict,
+        use_smote: bool = False,
+        oversampler: Optional[SMOTE | ADASYN] = None,
+        active_label: str = 'Active',
+    ):
+        """ Initialize the PROTAC dataset
+
+        Args:
+            protac_df (pd.DataFrame): The PROTAC dataframe
+            protein2embedding (dict): Dictionary of protein embeddings
+            cell2embedding (dict): Dictionary of cell line embeddings
+            smiles2fp (dict): Dictionary of SMILES to fingerprint
+            use_smote (bool): Whether to use SMOTE for oversampling
+            active_label (str): The name of the active label column
+        """
+        # Filter out examples with NaN in active_col column
+        self.data = protac_df  # [~protac_df[active_col].isna()]
+        self.protein2embedding = protein2embedding
+        self.cell2embedding = cell2embedding
+        self.smiles2fp = smiles2fp
+        self.active_label = active_label
+        self.use_single_scaler = None
+
+        self.smiles_emb_dim = smiles2fp[list(smiles2fp.keys())[0]].shape[0]
+        self.protein_emb_dim = protein2embedding[list(protein2embedding.keys())[0]].shape[0]
+        self.cell_emb_dim = cell2embedding[list(cell2embedding.keys())[0]].shape[0]
+
+        # Look up the embeddings
+        self.data = pd.DataFrame({
+            'Smiles': self.data['Smiles'].apply(lambda x: smiles2fp[x].astype(np.float32)).tolist(),
+            'Uniprot': self.data['Uniprot'].apply(lambda x: protein2embedding[x].astype(np.float32)).tolist(),
+            'E3 Ligase Uniprot': self.data['E3 Ligase Uniprot'].apply(lambda x: protein2embedding[x].astype(np.float32)).tolist(),
+            'Cell Line Identifier': self.data['Cell Line Identifier'].apply(lambda x: cell2embedding[x].astype(np.float32)).tolist(),
+            self.active_label: self.data[self.active_label].astype(np.float32).tolist(),
+        })
+
+        # Apply SMOTE
+        self.use_smote = use_smote
+        self.oversampler = oversampler
+        if self.use_smote:
+            self.apply_smote()
+
+    def apply_smote(self):
+        # Prepare the dataset for SMOTE
+        features = []
+        labels = []
+        for _, row in self.data.iterrows():
+            features.append(np.hstack([
+                row['Smiles'],
+                row['Uniprot'],
+                row['E3 Ligase Uniprot'],
+                row['Cell Line Identifier'],
+            ]))
+            labels.append(row[self.active_label])
+
+        # Convert to numpy array
+        features = np.array(features).astype(np.float32)
+        labels = np.array(labels).astype(np.float32)
+
+        # Initialize SMOTE and fit
+        if self.oversampler is None:
+            oversampler = SMOTE(random_state=42)
+        else:
+            oversampler = self.oversampler
+        features_smote, labels_smote = oversampler.fit_resample(features, labels)
+
+        # Separate the features back into their respective embeddings
+        smiles_embs = features_smote[:, :self.smiles_emb_dim]
+        poi_embs = features_smote[:, self.smiles_emb_dim:self.smiles_emb_dim+self.protein_emb_dim]
+        e3_embs = features_smote[:, self.smiles_emb_dim+self.protein_emb_dim:self.smiles_emb_dim+2*self.protein_emb_dim]
+        cell_embs = features_smote[:, -self.cell_emb_dim:]
+
+        # Reconstruct the dataframe with oversampled data
+        df_smote = pd.DataFrame({
+            'Smiles': list(smiles_embs),
+            'Uniprot': list(poi_embs),
+            'E3 Ligase Uniprot': list(e3_embs),
+            'Cell Line Identifier': list(cell_embs),
+            self.active_label: labels_smote
+        })
+        self.data = df_smote
+
+    def fit_scaling(self, use_single_scaler: bool = False, **scaler_kwargs) -> dict:
+        """ Fit the scalers for the data.
+
+        Args:
+            use_single_scaler (bool): Whether to use a single scaler for all features.
+            scaler_kwargs: Keyword arguments for the StandardScaler.
+
+        Returns:
+            dict: The fitted scalers.
+        """
+        if use_single_scaler:
+            self.use_single_scaler = True
+            scaler = StandardScaler(**scaler_kwargs)
+            embeddings = np.hstack([
+                np.array(self.data['Smiles'].tolist()),
+                np.array(self.data['Uniprot'].tolist()),
+                np.array(self.data['E3 Ligase Uniprot'].tolist()),
+                np.array(self.data['Cell Line Identifier'].tolist()),
+            ])
+            scaler.fit(embeddings)
+            return scaler
+        else:
+            self.use_single_scaler = False
+            scalers = {}
+            scalers['Smiles'] = StandardScaler(**scaler_kwargs)
+            scalers['Uniprot'] = StandardScaler(**scaler_kwargs)
+            scalers['E3 Ligase Uniprot'] = StandardScaler(**scaler_kwargs)
+            scalers['Cell Line Identifier'] = StandardScaler(**scaler_kwargs)
+
+            scalers['Smiles'].fit(np.stack(self.data['Smiles'].to_numpy()))
+            scalers['Uniprot'].fit(np.stack(self.data['Uniprot'].to_numpy()))
+            scalers['E3 Ligase Uniprot'].fit(np.stack(self.data['E3 Ligase Uniprot'].to_numpy()))
+            scalers['Cell Line Identifier'].fit(np.stack(self.data['Cell Line Identifier'].to_numpy()))
+
+            return scalers
+
+    def apply_scaling(self, scalers: dict, use_single_scaler: bool = False):
+        """ Apply scaling to the data.
+
+        Args:
+            scalers (dict): The scalers for each feature.
+            use_single_scaler (bool): Whether to use a single scaler for all features.
+        """
+        if self.use_single_scaler is None:
+            raise ValueError(
+                "The fit_scaling method must be called before apply_scaling.")
+        if use_single_scaler != self.use_single_scaler:
+            raise ValueError(
+                f"The use_single_scaler parameter must be the same as the one used in the fit_scaling method. Got {use_single_scaler}, previously {self.use_single_scaler}.")
+        if use_single_scaler:
+            embeddings = np.hstack([
+                np.array(self.data['Smiles'].tolist()),
+                np.array(self.data['Uniprot'].tolist()),
+                np.array(self.data['E3 Ligase Uniprot'].tolist()),
+                np.array(self.data['Cell Line Identifier'].tolist()),
+            ])
+            scaled_embeddings = scalers.transform(embeddings)
+            self.data = pd.DataFrame({
+                'Smiles': list(scaled_embeddings[:, :self.smiles_emb_dim]),
+                'Uniprot': list(scaled_embeddings[:, self.smiles_emb_dim:self.smiles_emb_dim+self.protein_emb_dim]),
+                'E3 Ligase Uniprot': list(scaled_embeddings[:, self.smiles_emb_dim+self.protein_emb_dim:self.smiles_emb_dim+2*self.protein_emb_dim]),
+                'Cell Line Identifier': list(scaled_embeddings[:, -self.cell_emb_dim:]),
+                self.active_label: self.data[self.active_label]
+            })
+        else:
+            self.data['Smiles'] = self.data['Smiles'].apply(lambda x: scalers['Smiles'].transform(x[np.newaxis, :])[0])
+            self.data['Uniprot'] = self.data['Uniprot'].apply(lambda x: scalers['Uniprot'].transform(x[np.newaxis, :])[0])
+            self.data['E3 Ligase Uniprot'] = self.data['E3 Ligase Uniprot'].apply(lambda x: scalers['E3 Ligase Uniprot'].transform(x[np.newaxis, :])[0])
+            self.data['Cell Line Identifier'] = self.data['Cell Line Identifier'].apply(lambda x: scalers['Cell Line Identifier'].transform(x[np.newaxis, :])[0])
+
+    def get_numpy_arrays(self):
+        X = np.hstack([
+            np.array(self.data['Smiles'].tolist()),
+            np.array(self.data['Uniprot'].tolist()),
+            np.array(self.data['E3 Ligase Uniprot'].tolist()),
+            np.array(self.data['Cell Line Identifier'].tolist()),
+        ]).copy()
+        y = self.data[self.active_label].values.copy()
+        return X, y
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        elem = {
+            'smiles_emb': self.data['Smiles'].iloc[idx],
+            'poi_emb': self.data['Uniprot'].iloc[idx],
+            'e3_emb': self.data['E3 Ligase Uniprot'].iloc[idx],
+            'cell_emb': self.data['Cell Line Identifier'].iloc[idx],
+            'active': self.data[self.active_label].iloc[idx],
+        }
+        return elem
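
A short sketch of the intended dataset workflow (train_df and the embedding dictionaries are assumed in scope; the label column matches config.active_label):

from torch.utils.data import DataLoader

train_ds = PROTAC_Dataset(
    train_df,
    protein2embedding,
    cell2embedding,
    smiles2fp,
    use_smote=True,  # oversample the minority class at construction time
    active_label='Active (Dmax 0.6, pDC50 6.0)',
)
scalers = train_ds.fit_scaling(use_single_scaler=False)
train_ds.apply_scaling(scalers, use_single_scaler=False)
loader = DataLoader(train_ds, batch_size=32, shuffle=True)
batch = next(iter(loader))  # keys: 'smiles_emb', 'poi_emb', 'e3_emb', 'cell_emb', 'active'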

protac_degradation_predictor/protac_degradation_predictor.py
ADDED
@@ -0,0 +1,88 @@
+import pkg_resources
+import logging
+
+from pytorch_models import PROTAC_Model, load_model
+from data_utils import (
+    load_protein2embedding,
+    load_cell2embedding,
+    get_fingerprint,
+)
+from config import config
+
+import numpy as np
+import torch
+from torch import sigmoid
+
+package_name = 'protac_degradation_predictor'
+
+def get_protac_active_proba(
+    protac_smiles: str,
+    e3_ligase: str,
+    target_uniprot: str,
+    cell_line: str,
+    device: str = 'cpu',
+) -> float:
+    ckpt_path = pkg_resources.resource_stream(__name__, 'data/model.ckpt')
+    model = load_model(ckpt_path).to(device)
+    protein2embedding = load_protein2embedding()
+    cell2embedding = load_cell2embedding()
+
+    # Setup default embeddings
+    if e3_ligase not in config.e3_ligase2uniprot:
+        available_e3_ligases = ', '.join(list(config.e3_ligase2uniprot.keys()))
+        logging.warning(f"The E3 ligase {e3_ligase} is not in the database. Using the default E3 ligase. Available E3 ligases are: {available_e3_ligases}")
+    if target_uniprot not in protein2embedding:
+        logging.warning(f"The target protein {target_uniprot} is not in the database. Using the default target protein.")
+    if cell_line not in cell2embedding:
+        logging.warning(f"The cell line {cell_line} is not in the database. Using the default cell line.")
+
+    default_protein_emb = np.zeros(config.protein_embedding_size)
+    default_cell_emb = np.zeros(config.cell_embedding_size)
+
+    # Convert the E3 ligase to Uniprot ID
+    e3_ligase_uniprot = config.e3_ligase2uniprot.get(e3_ligase, '')
+
+    # Get the embeddings
+    poi_emb = protein2embedding.get(target_uniprot, default_protein_emb)
+    e3_emb = protein2embedding.get(e3_ligase_uniprot, default_protein_emb)
+    cell_emb = cell2embedding.get(cell_line, default_cell_emb)
+    smiles_emb = get_fingerprint(protac_smiles)
+
+    # Convert to float32 torch tensors with a batch dimension
+    poi_emb = torch.tensor(poi_emb, dtype=torch.float32).unsqueeze(0).to(device)
+    e3_emb = torch.tensor(e3_emb, dtype=torch.float32).unsqueeze(0).to(device)
+    cell_emb = torch.tensor(cell_emb, dtype=torch.float32).unsqueeze(0).to(device)
+    smiles_emb = torch.tensor(np.array(smiles_emb), dtype=torch.float32).unsqueeze(0).to(device)
+
+    return model(poi_emb, e3_emb, cell_emb, smiles_emb).item()
+
+
+def is_protac_active(
+    protac_smiles: str,
+    e3_ligase: str,
+    target_uniprot: str,
+    cell_line: str,
+    device: str = 'cpu',
+    proba_threshold: float = 0.5,
+) -> bool:
+    """ Predict whether a PROTAC is active or not.
+
+    Args:
+        protac_smiles (str): The SMILES of the PROTAC.
+        e3_ligase (str): The E3 ligase name, e.g., 'VHL' (mapped to a Uniprot ID via config).
+        target_uniprot (str): The Uniprot ID of the target protein.
+        cell_line (str): The cell line identifier.
+        device (str): The device to run the model on.
+        proba_threshold (float): The probability threshold.
+
+    Returns:
+        bool: Whether the PROTAC is active or not.
+    """
+    pred = get_protac_active_proba(
+        protac_smiles,
+        e3_ligase,
+        target_uniprot,
+        cell_line,
+        device,
+    )
+    # The model outputs a logit, so squash it before thresholding
+    return sigmoid(torch.tensor(pred)).item() > proba_threshold
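
A hypothetical end-to-end inference call; the SMILES is a placeholder and P00533 (EGFR) is used as an example target Uniprot ID:

protac_smiles = '...'  # placeholder PROTAC SMILES string
active = is_protac_active(
    protac_smiles,
    e3_ligase='VHL',
    target_uniprot='P00533',  # EGFR
    cell_line='HEK293',
    proba_threshold=0.5,
)
print('Active' if active else 'Inactive')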
protac_degradation_predictor/pytorch_models.py
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
from typing import Literal, List, Tuple, Optional, Dict
|
| 3 |
+
|
| 4 |
+
from protac_dataset import PROTAC_Dataset
|
| 5 |
+
from config import Config
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
import torch.optim as optim
|
| 13 |
+
import pytorch_lightning as pl
|
| 14 |
+
from torch.utils.data import Dataset, DataLoader
|
| 15 |
+
from torchmetrics import (
|
| 16 |
+
Accuracy,
|
| 17 |
+
AUROC,
|
| 18 |
+
Precision,
|
| 19 |
+
Recall,
|
| 20 |
+
F1Score,
|
| 21 |
+
MetricCollection,
|
| 22 |
+
)
|
| 23 |
+
from imblearn.over_sampling import SMOTE
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class PROTAC_Predictor(nn.Module):
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
hidden_dim: int,
|
| 31 |
+
smiles_emb_dim: int = Config.fingerprint_size,
|
| 32 |
+
poi_emb_dim: int = Config.protein_embedding_size,
|
| 33 |
+
e3_emb_dim: int = Config.protein_embedding_size,
|
| 34 |
+
cell_emb_dim: int = Config.cell_embedding_size,
|
| 35 |
+
dropout: float = 0.2,
|
| 36 |
+
join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
|
| 37 |
+
disabled_embeddings: list = [],
|
| 38 |
+
):
|
| 39 |
+
""" Initialize the PROTAC model.
|
| 40 |
+
|
| 41 |
+
Args:
|
| 42 |
+
hidden_dim (int): The hidden dimension of the model
|
| 43 |
+
smiles_emb_dim (int): The dimension of the SMILES embeddings
|
| 44 |
+
poi_emb_dim (int): The dimension of the POI embeddings
|
| 45 |
+
e3_emb_dim (int): The dimension of the E3 Ligase embeddings
|
| 46 |
+
cell_emb_dim (int): The dimension of the cell line embeddings
|
| 47 |
+
dropout (float): The dropout rate
|
| 48 |
+
join_embeddings (Literal['beginning', 'concat', 'sum']): How to join the embeddings
|
| 49 |
+
disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
|
| 50 |
+
"""
|
| 51 |
+
super().__init__()
|
| 52 |
+
self.poi_emb_dim = poi_emb_dim
|
| 53 |
+
self.e3_emb_dim = e3_emb_dim
|
| 54 |
+
self.cell_emb_dim = cell_emb_dim
|
| 55 |
+
self.smiles_emb_dim = smiles_emb_dim
|
| 56 |
+
self.hidden_dim = hidden_dim
|
| 57 |
+
self.join_embeddings = join_embeddings
|
| 58 |
+
self.disabled_embeddings = disabled_embeddings
|
| 59 |
+
# Set our init args as class attributes
|
| 60 |
+
self.__dict__.update(locals())
|
| 61 |
+
|
| 62 |
+
# Define "surrogate models" branches
|
| 63 |
+
if self.join_embeddings != 'beginning':
|
| 64 |
+
if 'poi' not in self.disabled_embeddings:
|
| 65 |
+
self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
|
| 66 |
+
if 'e3' not in self.disabled_embeddings:
|
| 67 |
+
self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
|
| 68 |
+
if 'cell' not in self.disabled_embeddings:
|
| 69 |
+
self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
|
| 70 |
+
if 'smiles' not in self.disabled_embeddings:
|
| 71 |
+
self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
|
| 72 |
+
|
| 73 |
+
# Define hidden dimension for joining layer
|
| 74 |
+
if self.join_embeddings == 'beginning':
|
| 75 |
+
joint_dim = smiles_emb_dim if 'smiles' not in self.disabled_embeddings else 0
|
| 76 |
+
joint_dim += poi_emb_dim if 'poi' not in self.disabled_embeddings else 0
|
| 77 |
+
joint_dim += e3_emb_dim if 'e3' not in self.disabled_embeddings else 0
|
| 78 |
+
joint_dim += cell_emb_dim if 'cell' not in self.disabled_embeddings else 0
|
| 79 |
+
elif self.join_embeddings == 'concat':
|
| 80 |
+
joint_dim = hidden_dim * (4 - len(self.disabled_embeddings))
|
| 81 |
+
elif self.join_embeddings == 'sum':
|
| 82 |
+
joint_dim = hidden_dim
|
| 83 |
+
|
| 84 |
+
self.fc0 = nn.Linear(joint_dim, joint_dim)
|
| 85 |
+
self.fc1 = nn.Linear(joint_dim, hidden_dim)
|
| 86 |
+
self.fc2 = nn.Linear(hidden_dim, hidden_dim)
|
| 87 |
+
self.fc3 = nn.Linear(hidden_dim, 1)
|
| 88 |
+
|
| 89 |
+
self.dropout = nn.Dropout(p=dropout)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
|
| 93 |
+
embeddings = []
|
| 94 |
+
if self.join_embeddings == 'beginning':
|
| 95 |
+
if 'poi' not in self.disabled_embeddings:
|
| 96 |
+
embeddings.append(poi_emb)
|
| 97 |
+
if 'e3' not in self.disabled_embeddings:
|
| 98 |
+
embeddings.append(e3_emb)
|
| 99 |
+
if 'cell' not in self.disabled_embeddings:
|
| 100 |
+
embeddings.append(cell_emb)
|
| 101 |
+
if 'smiles' not in self.disabled_embeddings:
|
| 102 |
+
embeddings.append(smiles_emb)
|
| 103 |
+
x = torch.cat(embeddings, dim=1)
|
| 104 |
+
x = self.dropout(F.relu(self.fc0(x)))
|
| 105 |
+
else:
|
| 106 |
+
if 'poi' not in self.disabled_embeddings:
|
| 107 |
+
embeddings.append(self.poi_emb(poi_emb))
|
| 108 |
+
if 'e3' not in self.disabled_embeddings:
|
| 109 |
+
embeddings.append(self.e3_emb(e3_emb))
|
| 110 |
+
if 'cell' not in self.disabled_embeddings:
|
| 111 |
+
embeddings.append(self.cell_emb(cell_emb))
|
| 112 |
+
if 'smiles' not in self.disabled_embeddings:
|
| 113 |
+
embeddings.append(self.smiles_emb(smiles_emb))
|
| 114 |
+
if self.join_embeddings == 'concat':
|
| 115 |
+
x = torch.cat(embeddings, dim=1)
|
| 116 |
+
elif self.join_embeddings == 'sum':
|
| 117 |
+
if len(embeddings) > 1:
|
| 118 |
+
embeddings = torch.stack(embeddings, dim=1)
|
| 119 |
+
x = torch.sum(embeddings, dim=1)
|
| 120 |
+
else:
|
| 121 |
+
x = embeddings[0]
|
| 122 |
+
x = self.dropout(F.relu(self.fc1(x)))
|
| 123 |
+
x = self.dropout(F.relu(self.fc2(x)))
|
| 124 |
+
x = self.fc3(x)
|
| 125 |
+
return x
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class PROTAC_Model(pl.LightningModule):
|
| 130 |
+
|
| 131 |
+
def __init__(
|
| 132 |
+
self,
|
| 133 |
+
hidden_dim: int,
|
| 134 |
+
smiles_emb_dim: int = 224,
|
| 135 |
+
poi_emb_dim: int = 1024,
|
| 136 |
+
e3_emb_dim: int = 1024,
|
| 137 |
+
cell_emb_dim: int = 768,
|
| 138 |
+
batch_size: int = 32,
|
| 139 |
+
learning_rate: float = 1e-3,
|
| 140 |
+
dropout: float = 0.2,
|
| 141 |
+
join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
|
| 142 |
+
train_dataset: PROTAC_Dataset = None,
|
| 143 |
+
val_dataset: PROTAC_Dataset = None,
|
| 144 |
+
test_dataset: PROTAC_Dataset = None,
|
| 145 |
+
disabled_embeddings: list = [],
|
| 146 |
+
apply_scaling: bool = False,
|
| 147 |
+
):
|
| 148 |
+
""" Initialize the PROTAC Pytorch Lightning model.
|
| 149 |
+
|
| 150 |
+
Args:
|
| 151 |
+
hidden_dim (int): The hidden dimension of the model
|
| 152 |
+
smiles_emb_dim (int): The dimension of the SMILES embeddings
|
| 153 |
+
poi_emb_dim (int): The dimension of the POI embeddings
|
| 154 |
+
e3_emb_dim (int): The dimension of the E3 Ligase embeddings
|
| 155 |
+
cell_emb_dim (int): The dimension of the cell line embeddings
|
| 156 |
+
batch_size (int): The batch size
|
| 157 |
+
learning_rate (float): The learning rate
|
| 158 |
+
dropout (float): The dropout rate
|
| 159 |
+
join_embeddings (Literal['beginning', 'concat', 'sum']): How to join the embeddings
|
| 160 |
+
train_dataset (PROTAC_Dataset): The training dataset
|
| 161 |
+
val_dataset (PROTAC_Dataset): The validation dataset
|
| 162 |
+
test_dataset (PROTAC_Dataset): The test dataset
|
| 163 |
+
disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
|
| 164 |
+
apply_scaling (bool): Whether to apply scaling to the embeddings
|
| 165 |
+
"""
|
| 166 |
+
super().__init__()
|
| 167 |
+
self.poi_emb_dim = poi_emb_dim
|
| 168 |
+
self.e3_emb_dim = e3_emb_dim
|
| 169 |
+
self.cell_emb_dim = cell_emb_dim
|
| 170 |
+
self.smiles_emb_dim = smiles_emb_dim
|
| 171 |
+
self.hidden_dim = hidden_dim
|
| 172 |
+
self.batch_size = batch_size
|
| 173 |
+
self.learning_rate = learning_rate
|
| 174 |
+
self.join_embeddings = join_embeddings
|
| 175 |
+
self.train_dataset = train_dataset
|
| 176 |
+
self.val_dataset = val_dataset
|
| 177 |
+
self.test_dataset = test_dataset
|
| 178 |
+
self.disabled_embeddings = disabled_embeddings
|
| 179 |
+
self.apply_scaling = apply_scaling
|
| 180 |
+
# Set our init args as class attributes
|
| 181 |
+
self.__dict__.update(locals()) # Add arguments as attributes
|
| 182 |
+
# Save the arguments passed to init
|
| 183 |
+
ignore_args_as_hyperparams = [
|
| 184 |
+
'train_dataset',
|
| 185 |
+
'test_dataset',
|
| 186 |
+
'val_dataset',
|
| 187 |
+
]
|
| 188 |
+
self.save_hyperparameters(ignore=ignore_args_as_hyperparams)
|
| 189 |
+
|
| 190 |
+
self.model = PROTAC_Predictor(
|
| 191 |
+
hidden_dim=hidden_dim,
|
| 192 |
+
smiles_emb_dim=smiles_emb_dim,
|
| 193 |
+
poi_emb_dim=poi_emb_dim,
|
| 194 |
+
e3_emb_dim=e3_emb_dim,
|
| 195 |
+
cell_emb_dim=cell_emb_dim,
|
| 196 |
+
dropout=dropout,
|
| 197 |
+
join_embeddings=join_embeddings,
|
| 198 |
+
disabled_embeddings=disabled_embeddings,
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
stages = ['train_metrics', 'val_metrics', 'test_metrics']
|
| 202 |
+
self.metrics = nn.ModuleDict({s: MetricCollection({
|
| 203 |
+
'acc': Accuracy(task='binary'),
|
| 204 |
+
'roc_auc': AUROC(task='binary'),
|
| 205 |
+
'precision': Precision(task='binary'),
|
| 206 |
+
'recall': Recall(task='binary'),
|
| 207 |
+
'f1_score': F1Score(task='binary'),
|
| 208 |
+
'opt_score': Accuracy(task='binary') + F1Score(task='binary'),
|
| 209 |
+
'hp_metric': Accuracy(task='binary'),
|
| 210 |
+
}, prefix=s.replace('metrics', '')) for s in stages})
|
| 211 |
+
|
| 212 |
+
# Misc settings
|
| 213 |
+
self.missing_dataset_error = \
|
| 214 |
+
'''Class variable `{0}` is None. If the model was loaded from a checkpoint, the dataset must be set manually:
|
| 215 |
+
|
| 216 |
+
model = {1}.load_from_checkpoint('checkpoint.ckpt')
|
| 217 |
+
model.{0} = my_{0}
|
| 218 |
+
'''
|
| 219 |
+
|
| 220 |
+
# Apply scaling in datasets
|
| 221 |
+
if self.apply_scaling:
|
| 222 |
+
use_single_scaler = True if self.join_embeddings == 'beginning' else False
|
| 223 |
+
self.scalers = self.train_dataset.fit_scaling(use_single_scaler)
|
| 224 |
+
self.train_dataset.apply_scaling(self.scalers, use_single_scaler)
|
| 225 |
+
self.val_dataset.apply_scaling(self.scalers, use_single_scaler)
|
| 226 |
+
if self.test_dataset:
|
| 227 |
+
self.test_dataset.apply_scaling(self.scalers, use_single_scaler)
|
| 228 |
+
|
| 229 |
+
def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
|
| 230 |
+
return self.model(poi_emb, e3_emb, cell_emb, smiles_emb)
|
| 231 |
+
|
| 232 |
+
def step(self, batch, batch_idx, stage):
|
| 233 |
+
poi_emb = batch['poi_emb']
|
| 234 |
+
e3_emb = batch['e3_emb']
|
| 235 |
+
cell_emb = batch['cell_emb']
|
| 236 |
+
smiles_emb = batch['smiles_emb']
|
| 237 |
+
y = batch['active'].float().unsqueeze(1)
|
| 238 |
+
|
| 239 |
+
y_hat = self.forward(poi_emb, e3_emb, cell_emb, smiles_emb)
|
| 240 |
+
loss = F.binary_cross_entropy_with_logits(y_hat, y)
|
| 241 |
+
|
| 242 |
+
self.metrics[f'{stage}_metrics'].update(y_hat, y)
|
| 243 |
+
self.log(f'{stage}_loss', loss, on_epoch=True, prog_bar=True)
|
| 244 |
+
self.log_dict(self.metrics[f'{stage}_metrics'], on_epoch=True)
|
| 245 |
+
|
| 246 |
+
return loss
|
| 247 |
+
|
| 248 |
+
def training_step(self, batch, batch_idx):
|
| 249 |
+
return self.step(batch, batch_idx, 'train')
|
| 250 |
+
|
| 251 |
+
def validation_step(self, batch, batch_idx):
|
| 252 |
+
return self.step(batch, batch_idx, 'val')
|
| 253 |
+
|
| 254 |
+
def test_step(self, batch, batch_idx):
|
| 255 |
+
return self.step(batch, batch_idx, 'test')
|
| 256 |
+
|
| 257 |
+
def configure_optimizers(self):
|
| 258 |
+
return optim.Adam(self.parameters(), lr=self.learning_rate)
|
| 259 |
+
|
| 260 |
+
def predict_step(self, batch, batch_idx):
|
| 261 |
+
poi_emb = batch['poi_emb']
|
| 262 |
+
e3_emb = batch['e3_emb']
|
| 263 |
+
cell_emb = batch['cell_emb']
|
| 264 |
+
smiles_emb = batch['smiles_emb']
|
| 265 |
+
|
| 266 |
+
if self.apply_scaling:
|
| 267 |
+
if self.join_embeddings == 'beginning':
|
| 268 |
+
embeddings = np.hstack([
|
| 269 |
+
np.array(smiles_emb.tolist()),
|
| 270 |
+
np.array(poi_emb.tolist()),
|
| 271 |
+
np.array(e3_emb.tolist()),
|
| 272 |
+
np.array(cell_emb.tolist()),
|
| 273 |
+
])
|
| 274 |
+
embeddings = self.scalers.transform(embeddings)
|
| 275 |
+
smiles_emb = embeddings[:, :self.smiles_emb_dim]
|
| 276 |
+
poi_emb = embeddings[:, self.smiles_emb_dim:self.smiles_emb_dim+self.poi_emb_dim]
|
| 277 |
+
e3_emb = embeddings[:, self.smiles_emb_dim+self.poi_emb_dim:self.smiles_emb_dim+2*self.poi_emb_dim]
|
| 278 |
+
cell_emb = embeddings[:, -self.cell_emb_dim:]
|
| 279 |
+
else:
|
| 280 |
+
poi_emb = self.scalers['Uniprot'].transform(poi_emb)
|
| 281 |
+
e3_emb = self.scalers['E3 Ligase Uniprot'].transform(e3_emb)
|
| 282 |
+
cell_emb = self.scalers['Cell Line Identifier'].transform(cell_emb)
|
| 283 |
+
smiles_emb = self.scalers['Smiles'].transform(smiles_emb)
|
| 284 |
+
|
| 285 |
+
y_hat = self.forward(poi_emb, e3_emb, cell_emb, smiles_emb)
|
| 286 |
+
return torch.sigmoid(y_hat)
|
| 287 |
+
+    def train_dataloader(self):
+        if self.train_dataset is None:
+            format = 'train_dataset', self.__class__.__name__
+            raise ValueError(self.missing_dataset_error.format(*format))
+
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            # drop_last=True,
+        )
+
+    def val_dataloader(self):
+        if self.val_dataset is None:
+            format = 'val_dataset', self.__class__.__name__
+            raise ValueError(self.missing_dataset_error.format(*format))
+        return DataLoader(
+            self.val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+        )
+
+    def test_dataloader(self):
+        if self.test_dataset is None:
+            format = 'test_dataset', self.__class__.__name__
+            raise ValueError(self.missing_dataset_error.format(*format))
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+        )
+
+
+def train_model(
+    protein2embedding: Dict,
+    cell2embedding: Dict,
+    smiles2fp: Dict,
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    test_df: Optional[pd.DataFrame] = None,
+    hidden_dim: int = 768,
+    batch_size: int = 8,
+    learning_rate: float = 2e-5,
+    dropout: float = 0.2,
+    max_epochs: int = 50,
+    smiles_emb_dim: int = 224,
+    join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
+    smote_k_neighbors: int = 5,
+    use_smote: bool = True,
+    apply_scaling: bool = False,
+    active_label: str = 'Active',
+    fast_dev_run: bool = False,
+    use_logger: bool = True,
+    logger_name: str = 'protac',
+    disabled_embeddings: List[str] = [],
+) -> tuple:
+    """ Train a PROTAC model using the given datasets and hyperparameters.
+
+    Args:
+        protein2embedding (dict): Dictionary of protein embeddings.
+        cell2embedding (dict): Dictionary of cell line embeddings.
+        smiles2fp (dict): Dictionary of SMILES to fingerprint.
+        train_df (pd.DataFrame): The training set. It must include the following columns: 'Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', <active_label>.
+        val_df (pd.DataFrame): The validation set. It must include the following columns: 'Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', <active_label>.
+        test_df (pd.DataFrame): The test set. If provided, the returned metrics will include test performance. It must include the following columns: 'Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', <active_label>.
+        hidden_dim (int): The hidden dimension of the model.
+        batch_size (int): The batch size.
+        learning_rate (float): The learning rate.
+        max_epochs (int): The maximum number of epochs.
+        smiles_emb_dim (int): The dimension of the SMILES embeddings.
+        smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
+        fast_dev_run (bool): Whether to run a fast development run.
+        disabled_embeddings (list): The list of disabled embeddings.
+
+    Returns:
+        tuple: The trained model, the trainer, and the metrics.
+    """
+    oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
+    train_ds = PROTAC_Dataset(
+        train_df,
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        use_smote=use_smote,
+        oversampler=oversampler if use_smote else None,
+        active_label=active_label,
+    )
+    val_ds = PROTAC_Dataset(
+        val_df,
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        active_label=active_label,
+    )
+    if test_df is not None:
+        test_ds = PROTAC_Dataset(
+            test_df,
+            protein2embedding,
+            cell2embedding,
+            smiles2fp,
+            active_label=active_label,
+        )
+    logger = pl.loggers.TensorBoardLogger(
+        save_dir='../logs',
+        name=logger_name,
+    )
+    callbacks = [
+        pl.callbacks.EarlyStopping(
+            monitor='train_loss',
+            patience=10,
+            mode='min',
+            verbose=False,
+        ),
+        pl.callbacks.EarlyStopping(
+            monitor='val_loss',
+            patience=5,
+            mode='min',
+            verbose=False,
+        ),
+        pl.callbacks.EarlyStopping(
+            monitor='val_acc',
+            patience=10,
+            mode='max',
+            verbose=False,
+        ),
+        # pl.callbacks.ModelCheckpoint(
+        #     monitor='val_acc',
+        #     mode='max',
+        #     verbose=True,
+        #     filename='{epoch}-{val_metrics_opt_score:.4f}',
+        # ),
+    ]
+    # Define Trainer
+    trainer = pl.Trainer(
+        logger=logger if use_logger else False,
+        callbacks=callbacks,
+        max_epochs=max_epochs,
+        fast_dev_run=fast_dev_run,
+        enable_model_summary=False,
+        enable_checkpointing=False,
+        enable_progress_bar=False,
+        devices=1,
+        num_nodes=1,
+    )
+    model = PROTAC_Model(
+        hidden_dim=hidden_dim,
+        smiles_emb_dim=smiles_emb_dim,
+        poi_emb_dim=1024,
+        e3_emb_dim=1024,
+        cell_emb_dim=768,
+        batch_size=batch_size,
+        join_embeddings=join_embeddings,
+        dropout=dropout,
+        learning_rate=learning_rate,
+        apply_scaling=apply_scaling,
+        train_dataset=train_ds,
+        val_dataset=val_ds,
+        test_dataset=test_ds if test_df is not None else None,
+        disabled_embeddings=disabled_embeddings,
+    )
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        trainer.fit(model)
+    metrics = trainer.validate(model, verbose=False)[0]
+    if test_df is not None:
+        test_metrics = trainer.test(model, verbose=False)[0]
+        metrics.update(test_metrics)
+    return model, trainer, metrics
+
+
+def load_model(
+    ckpt_path: str,
+) -> PROTAC_Model:
+    """ Load a PROTAC model from a checkpoint.
+
+    Args:
+        ckpt_path (str): The path to the checkpoint.
+
+    Returns:
+        PROTAC_Model: The loaded model.
+    """
+    model = PROTAC_Model.load_from_checkpoint(ckpt_path)
+    model.eval()
+    return model
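For a quick sanity check of the `train_model()` entry point above, a minimal sketch follows. The random embeddings, the toy dataframe, and the direct module import are illustrative assumptions, not part of this commit; only the embedding dimensions (1024 for proteins, 768 for cell lines, 224 for SMILES fingerprints) and the required dataframe columns come from the code above.

import numpy as np
import pandas as pd
from protac_degradation_predictor.pytorch_models import train_model

# Toy lookup tables matching the dimensions hard-coded in train_model().
protein2embedding = {'P04637': np.random.rand(1024), 'Q96SW2': np.random.rand(1024)}
cell2embedding = {'HEK293': np.random.rand(768)}
smiles2fp = {'CCO': np.random.rand(224), 'CCN': np.random.rand(224)}

# The columns follow the contract spelled out in the docstring.
df = pd.DataFrame({
    'Smiles': ['CCO', 'CCN'] * 4,
    'Uniprot': ['P04637'] * 8,
    'E3 Ligase Uniprot': ['Q96SW2'] * 8,
    'Cell Line Identifier': ['HEK293'] * 8,
    'Active': [True, False] * 4,
})

model, trainer, metrics = train_model(
    protein2embedding, cell2embedding, smiles2fp,
    train_df=df, val_df=df,
    use_smote=False,    # too few samples here for SMOTE's k neighbors
    use_logger=False,
    fast_dev_run=True,  # run a single batch just to validate the wiring
)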
protac_degradation_predictor/sklearn_models.py
ADDED
@@ -0,0 +1,243 @@
+from typing import Literal, List, Tuple, Optional, Dict
+
+from .protac_dataset import PROTAC_Dataset
+
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from sklearn.ensemble import (
+    RandomForestClassifier,
+    GradientBoostingClassifier,
+)
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+
+import torch
+import torch.nn as nn
+from torchmetrics import (
+    Accuracy,
+    AUROC,
+    Precision,
+    Recall,
+    F1Score,
+    MetricCollection,
+)
+import optuna
+
+
+def train_sklearn_model(
+    clf: ClassifierMixin,
+    protein2embedding: Dict,
+    cell2embedding: Dict,
+    smiles2fp: Dict,
+    train_df: pd.DataFrame,
+    val_df: pd.DataFrame,
+    test_df: Optional[pd.DataFrame] = None,
+    active_label: str = 'Active',
+    use_single_scaler: bool = True,
+) -> Tuple[ClassifierMixin, Dict]:
+    """ Train a classifier model on train and val sets and evaluate it on a test set.
+
+    Args:
+        clf: The classifier model to train and evaluate.
+        train_df (pd.DataFrame): The training set.
+        val_df (pd.DataFrame): The validation set.
+        test_df (Optional[pd.DataFrame]): The test set.
+
+    Returns:
+        Tuple[ClassifierMixin, Dict]: The trained model and the metrics.
+    """
+    # Initialize the datasets
+    train_ds = PROTAC_Dataset(
+        train_df,
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        active_label=active_label,
+        use_smote=False,
+    )
+    scaler = train_ds.fit_scaling(use_single_scaler=use_single_scaler)
+    train_ds.apply_scaling(scaler, use_single_scaler=use_single_scaler)
+    val_ds = PROTAC_Dataset(
+        val_df,
+        protein2embedding,
+        cell2embedding,
+        smiles2fp,
+        active_label=active_label,
+        use_smote=False,
+    )
+    val_ds.apply_scaling(scaler, use_single_scaler=use_single_scaler)
+    if test_df is not None:
+        test_ds = PROTAC_Dataset(
+            test_df,
+            protein2embedding,
+            cell2embedding,
+            smiles2fp,
+            active_label=active_label,
+            use_smote=False,
+        )
+        test_ds.apply_scaling(scaler, use_single_scaler=use_single_scaler)
+
+    # Get the numpy arrays
+    X_train, y_train = train_ds.get_numpy_arrays()
+    X_val, y_val = val_ds.get_numpy_arrays()
+    if test_df is not None:
+        X_test, y_test = test_ds.get_numpy_arrays()
+
+    # Train the model
+    clf.fit(X_train, y_train)
+    # Define the metrics as a module dict
+    stages = ['train_metrics', 'val_metrics', 'test_metrics']
+    metrics = nn.ModuleDict({s: MetricCollection({
+        'acc': Accuracy(task='binary'),
+        'roc_auc': AUROC(task='binary'),
+        'precision': Precision(task='binary'),
+        'recall': Recall(task='binary'),
+        'f1_score': F1Score(task='binary'),
+        'opt_score': Accuracy(task='binary') + F1Score(task='binary'),
+        'hp_metric': Accuracy(task='binary'),
+    }, prefix=s.replace('metrics', '')) for s in stages})
+
+    # Get the predictions
+    metrics_out = {}
+
+    y_pred = torch.tensor(clf.predict_proba(X_train)[:, 1])
+    y_true = torch.tensor(y_train)
+    metrics['train_metrics'].update(y_pred, y_true)
+    metrics_out.update(metrics['train_metrics'].compute())
+
+    y_pred = torch.tensor(clf.predict_proba(X_val)[:, 1])
+    y_true = torch.tensor(y_val)
+    metrics['val_metrics'].update(y_pred, y_true)
+    metrics_out.update(metrics['val_metrics'].compute())
+
+    if test_df is not None:
+        y_pred = torch.tensor(clf.predict_proba(X_test)[:, 1])
+        y_true = torch.tensor(y_test)
+        metrics['test_metrics'].update(y_pred, y_true)
+        metrics_out.update(metrics['test_metrics'].compute())
+
+    return clf, metrics_out
+
+
+def suggest_random_forest(
+    trial: optuna.Trial,
+) -> ClassifierMixin:
+    """ Suggest hyperparameters for a Random Forest classifier.
+
+    Args:
+        trial (optuna.Trial): The Optuna trial object.
+
+    Returns:
+        ClassifierMixin: The Random Forest classifier with the suggested hyperparameters.
+    """
+    n_estimators = trial.suggest_int('model_n_estimators', 10, 1000)
+    max_depth = trial.suggest_int('model_max_depth', 2, 100)
+    min_samples_split = trial.suggest_int('model_min_samples_split', 2, 10)
+    min_samples_leaf = trial.suggest_int('model_min_samples_leaf', 1, 10)
+    max_features = trial.suggest_categorical('model_max_features', [None, 'sqrt', 'log2'])
+    criterion = trial.suggest_categorical('model_criterion', ['gini', 'entropy'])
+
+    clf = RandomForestClassifier(
+        n_estimators=n_estimators,
+        max_depth=max_depth,
+        min_samples_split=min_samples_split,
+        min_samples_leaf=min_samples_leaf,
+        max_features=max_features,
+        criterion=criterion,
+        random_state=42,
+    )
+
+    return clf
+
+
+def suggest_logistic_regression(
+    trial: optuna.Trial,
+) -> ClassifierMixin:
+    """ Suggest hyperparameters for a Logistic Regression classifier.
+
+    Args:
+        trial (optuna.Trial): The Optuna trial object.
+
+    Returns:
+        ClassifierMixin: The Logistic Regression classifier with the suggested hyperparameters.
+    """
+    # Suggest values for the logistic regression hyperparameters
+    C = trial.suggest_loguniform('model_C', 1e-4, 1e2)
+    penalty = trial.suggest_categorical('model_penalty', ['l1', 'l2', 'elasticnet', None])
+    solver = trial.suggest_categorical('model_solver', ['newton-cholesky', 'lbfgs', 'liblinear', 'sag', 'saga'])
+
+    # Check solver compatibility
+    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
+        raise optuna.exceptions.TrialPruned()
+    if penalty is None and solver not in ['newton-cholesky', 'lbfgs', 'sag']:
+        raise optuna.exceptions.TrialPruned()
+
+    # Configure the classifier with the trial's suggested parameters
+    clf = LogisticRegression(
+        C=C,
+        penalty=penalty,
+        solver=solver,
+        max_iter=1000,
+        random_state=42,
+    )
+
+    return clf
+
+
+def suggest_svc(
+    trial: optuna.Trial,
+) -> ClassifierMixin:
+    """ Suggest hyperparameters for an SVC classifier.
+
+    Args:
+        trial (optuna.Trial): The Optuna trial object.
+
+    Returns:
+        ClassifierMixin: The SVC classifier with the suggested hyperparameters.
+    """
+    C = trial.suggest_loguniform('model_C', 1e-4, 1e2)
+    kernel = trial.suggest_categorical('model_kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
+    gamma = trial.suggest_categorical('model_gamma', ['scale', 'auto'])
+    degree = trial.suggest_int('model_degree', 2, 5) if kernel == 'poly' else 3
+
+    clf = SVC(
+        C=C,
+        kernel=kernel,
+        gamma=gamma,
+        degree=degree,
+        probability=True,
+        random_state=42,
+    )
+
+    return clf
+
+
+def suggest_gradient_boosting(
+    trial: optuna.Trial,
+) -> ClassifierMixin:
+    """ Suggest hyperparameters for a Gradient Boosting classifier.
+
+    Args:
+        trial (optuna.Trial): The Optuna trial object.
+
+    Returns:
+        ClassifierMixin: The Gradient Boosting classifier with the suggested hyperparameters.
+    """
+    n_estimators = trial.suggest_int('model_n_estimators', 50, 500)
+    learning_rate = trial.suggest_loguniform('model_learning_rate', 0.01, 1)
+    max_depth = trial.suggest_int('model_max_depth', 3, 10)
+    min_samples_split = trial.suggest_int('model_min_samples_split', 2, 10)
+    min_samples_leaf = trial.suggest_int('model_min_samples_leaf', 1, 10)
+    max_features = trial.suggest_categorical('model_max_features', ['sqrt', 'log2', None])
+
+    clf = GradientBoostingClassifier(
+        n_estimators=n_estimators,
+        learning_rate=learning_rate,
+        max_depth=max_depth,
+        min_samples_split=min_samples_split,
+        min_samples_leaf=min_samples_leaf,
+        max_features=max_features,
+        random_state=42,
+    )
+
+    return clf
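The suggest_* helpers above are meant to be paired with train_sklearn_model inside an Optuna objective. A sketch of that wiring follows; the embedding dictionaries and dataframes are assumed to be prepared as for the PyTorch models, and the 'val_acc' key follows the prefix scheme used in train_sklearn_model ('val_metrics' -> 'val_').

import optuna

def objective(trial: optuna.Trial) -> float:
    # Swap in suggest_logistic_regression, suggest_svc or
    # suggest_gradient_boosting to tune a different model family.
    clf = suggest_random_forest(trial)
    _, metrics = train_sklearn_model(
        clf,
        protein2embedding, cell2embedding, smiles2fp,  # assumed to be loaded
        train_df, val_df,                              # assumed to be prepared
    )
    return metrics['val_acc'].item()  # compute() returns tensors

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_params)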
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_0_test_split_0.1.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9d2f036ce141fbeb81930cc9ce49dbd6effc76221b26b92ae0498af1c34289f3
 size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_1_test_split_0.1.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f8ce36b5f52f8f88105c3ec0c5b60f865e1b054aff8f9e96c21f1e037eaa65af
 size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_2_test_split_0.1.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d22f4f54d46ca72b8585645fdfac43683a23dcc00d80fb8bd1f785d4eb4a9594
 size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_2_test_split_0.2.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f7298a04d0888f4de87041efd6b78c42e13c3f1630c43567d582bc7710a40847
-size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_3_test_split_0.1.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:54f869b328af6667567bd5cc805ce63fc5434ed1b77afc1e66d95b8f02e40642
 size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_3_test_split_0.2.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d2a7a9f6ed11e1b5b6f876dd927612d03f4780f9db3e65b9f1ebb8fbd853677f
-size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_4_test_split_0.1.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:27f0e76e7f89950199c843699c000eaad8628441c84aec394e20c23f701b1609
 size 45164
reports/study_Active_Dmax_0.6_pDC50_6.0_tanimoto_fold_4_test_split_0.2.pkl
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fd4c40033da16a1cee16fd998c82e0403a31db4d14ba0604160ea143bae03668
-size 45164
setup.py
ADDED
@@ -0,0 +1,21 @@
+import setuptools
+
+setuptools.setup(
+    name="protac_degradation_predictor",
+    version="0.0.1",
+    author="Stefano Ribes",
+    url="https://github.com/ribesstefano/PROTAC-Degradation-Predictor",
+    author_email="[email protected]",
+    description="A package to predict PROTAC-induced protein degradation.",
+    long_description=open("README.md").read(),
+    packages=setuptools.find_packages(),
+    install_requires=["torch", "pytorch_lightning", "scikit-learn", "imbalanced-learn", "pandas", "joblib", "h5py", "optuna", "torchmetrics"],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    include_package_data=True,
+    package_data={"": ["data/*.h5", "data/*.pkl", "data/*.csv"]},
+)
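Since package_data ships the embedding files alongside the code, a quick post-install check can confirm they are resolvable at runtime. A sketch, assuming an editable install (`pip install -e .`) and Python 3.9+ for importlib.resources.files:

import importlib.resources as resources

# List the data files bundled by the package_data declaration above.
data_dir = resources.files('protac_degradation_predictor') / 'data'
print(sorted(p.name for p in data_dir.iterdir()))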