Spaces:

ailab-bio
/

PROTAC-Degradation-Predictor

Sleeping

App Files Files Community

ribesstefano commited on Jul 16, 2024

Commit

ed339ed

1 Parent(s): 394ed39

Started renaming splits. Added datasets in paper as separate CSV.

Browse files

Files changed (13) hide show

README.md +14 -0
data/studies/e3_ligase_test_10split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/e3_ligase_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/similarity_test_10split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/similarity_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/standard_test_10split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/standard_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/target_test_10split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
data/studies/target_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv +0 -0
notebooks/predict_unknown_protacs.ipynb +0 -0
protac_degradation_predictor/optuna_utils.py +15 -164
src/README.md +70 -0
src/get_studies_datasets.py +289 -0

README.md CHANGED Viewed

@@ -1,3 +1,17 @@
 ![Maturity level-0](https://img.shields.io/badge/Maturity%20Level-ML--0-red)
 <a href="https://colab.research.google.com/github/ribesstefano/PROTAC-Degradation-Predictor/blob/main/notebooks/protac_degradation_predictor_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
 [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/ailab-bio/PROTAC-Degradation-Predictor)

+---
+title: PROTAC-Degradation-Predictor
+emoji: 🧬
+colorFrom: pink
+colorTo: green
+sdk: gradio
+sdk_version: 4.37.2
+app_file: app.py
+pinned: false
+license: mit
+---
 ![Maturity level-0](https://img.shields.io/badge/Maturity%20Level-ML--0-red)
 <a href="https://colab.research.google.com/github/ribesstefano/PROTAC-Degradation-Predictor/blob/main/notebooks/protac_degradation_predictor_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
 [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/ailab-bio/PROTAC-Degradation-Predictor)

data/studies/e3_ligase_test_10split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/e3_ligase_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/similarity_test_10split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/similarity_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/standard_test_10split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/standard_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/target_test_10split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/studies/target_train_val_90split_Active_Dmax_0.6_pDC50_6.0.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/predict_unknown_protacs.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

protac_degradation_predictor/optuna_utils.py CHANGED Viewed

@@ -9,25 +9,11 @@ from .pytorch_models import (
 )
 from .protac_dataset import get_datasets
-from .sklearn_models import (
-    train_sklearn_model,
-    suggest_random_forest,
-    suggest_logistic_regression,
-    suggest_svc,
-    suggest_gradient_boosting,
-)
 import torch
 import optuna
 from optuna.samplers import TPESampler
 import joblib
 import pandas as pd
-from sklearn.ensemble import (
-    RandomForestClassifier,
-    GradientBoostingClassifier,
-)
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import SVC
 from sklearn.model_selection import (
     StratifiedKFold,
     StratifiedGroupKFold,
@@ -270,14 +256,23 @@ def hyperparameter_tuning_and_training(
     """ Hyperparameter tuning and training of a PROTAC model.
     Args:
-        train_df (pd.DataFrame): The training set.
-        val_df (pd.DataFrame): The validation set.
         test_df (pd.DataFrame): The test set.
         fast_dev_run (bool): Whether to run a fast development run.
-        n_trials (int): The number of hyperparameter optimization trials.
-        logger_name (str): The name of the logger.
         active_label (str): The active label column.
-        disabled_embeddings (List[str]): The list of disabled embeddings.
     Returns:
         tuple: The trained model, the trainer, and the best metrics.
@@ -507,148 +502,4 @@ def hyperparameter_tuning_and_training(
     }
     if not fast_dev_run:
         ret['majority_vote_report'] = majority_vote_report
-    return ret
-def sklearn_model_objective(
-        trial: optuna.Trial,
-        protein2embedding: Dict,
-        cell2embedding: Dict,
-        smiles2fp: Dict,
-        train_df: pd.DataFrame,
-        val_df: pd.DataFrame,
-        model_type: Literal['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'] = 'RandomForest',
-        active_label: str = 'Active',
-) -> float:
-    """ Objective function for hyperparameter optimization.
-    Args:
-        trial (optuna.Trial): The Optuna trial object.
-        train_df (pd.DataFrame): The training set.
-        val_df (pd.DataFrame): The validation set.
-        model_type (str): The model type.
-        hyperparameters (Dict): The hyperparameters for the model.
-        fast_dev_run (bool): Whether to run a fast development run.
-        active_label (str): The active label column.
-    """
-    # Generate the hyperparameters
-    use_single_scaler = trial.suggest_categorical('use_single_scaler', [True, False])
-    if model_type == 'RandomForest':
-        clf = suggest_random_forest(trial)
-    elif model_type == 'SVC':
-        clf = suggest_svc(trial)
-    elif model_type == 'LogisticRegression':
-        clf = suggest_logistic_regression(trial)
-    elif model_type == 'GradientBoosting':
-        clf = suggest_gradient_boosting(trial)
-    else:
-        raise ValueError(f'Invalid model type: {model_type}. Available: RandomForest, SVC, LogisticRegression, GradientBoosting.')
-    # Train the model with the current set of hyperparameters
-    _, metrics = train_sklearn_model(
-        clf=clf,
-        protein2embedding=protein2embedding,
-        cell2embedding=cell2embedding,
-        smiles2fp=smiles2fp,
-        train_df=train_df,
-        val_df=val_df,
-        active_label=active_label,
-        use_single_scaler=use_single_scaler,
-    )
-    # Metrics is a dictionary containing at least the validation loss
-    val_acc = metrics['val_acc']
-    val_roc_auc = metrics['val_roc_auc']
-    # Optuna aims to minimize the sklearn_model_objective
-    return - val_acc - val_roc_auc
-def hyperparameter_tuning_and_training_sklearn(
-        protein2embedding: Dict,
-        cell2embedding: Dict,
-        smiles2fp: Dict,
-        train_df: pd.DataFrame,
-        val_df: pd.DataFrame,
-        test_df: Optional[pd.DataFrame] = None,
-        model_type: Literal['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'] = 'RandomForest',
-        active_label: str = 'Active',
-        n_trials: int = 50,
-        logger_name: str = 'protac_hparam_search_sklearn',
-        study_filename: Optional[str] = None,
-) -> Tuple:
-    """ Hyperparameter tuning and training of a PROTAC model.
-    Args:
-        train_df (pd.DataFrame): The training set.
-        val_df (pd.DataFrame): The validation set.
-        test_df (pd.DataFrame): The test set.
-        model_type (str): The model type.
-        n_trials (int): The number of hyperparameter optimization trials.
-        logger_name (str): The name of the logger. Unused, for compatibility with hyperparameter_tuning_and_training.
-        active_label (str): The active label column.
-    Returns:
-        tuple: The trained model and the best metrics.
-    """
-    # Set the verbosity of Optuna
-    optuna.logging.set_verbosity(optuna.logging.WARNING)
-    # Create an Optuna study object
-    sampler = TPESampler(seed=42, multivariate=True)
-    study = optuna.create_study(direction='minimize', sampler=sampler)
-    study_loaded = False
-    if study_filename:
-        if os.path.exists(study_filename):
-            study = joblib.load(study_filename)
-            study_loaded = True
-            logging.info(f'Loaded study from {study_filename}')
-    if not study_loaded:
-        study.optimize(
-            lambda trial: sklearn_model_objective(
-                trial=trial,
-                protein2embedding=protein2embedding,
-                cell2embedding=cell2embedding,
-                smiles2fp=smiles2fp,
-                train_df=train_df,
-                val_df=val_df,
-                model_type=model_type,
-                active_label=active_label,
-            ),
-            n_trials=n_trials,
-        )
-        if study_filename:
-            joblib.dump(study, study_filename)
-    # Retrain the model with the best hyperparameters
-    best_hyperparameters = {k.replace('model_', ''): v for k, v in study.best_params.items() if k.startswith('model_')}
-    if model_type == 'RandomForest':
-        clf = RandomForestClassifier(random_state=42, **best_hyperparameters)
-    elif model_type == 'SVC':
-        clf = SVC(random_state=42, probability=True, **best_hyperparameters)
-    elif model_type == 'LogisticRegression':
-        clf = LogisticRegression(random_state=42, max_iter=1000, **best_hyperparameters)
-    elif model_type == 'GradientBoosting':
-        clf = GradientBoostingClassifier(random_state=42, **best_hyperparameters)
-    else:
-        raise ValueError(f'Invalid model type: {model_type}. Available: RandomForest, SVC, LogisticRegression, GradientBoosting.')
-    model, metrics = train_sklearn_model(
-        clf=clf,
-        protein2embedding=protein2embedding,
-        cell2embedding=cell2embedding,
-        smiles2fp=smiles2fp,
-        train_df=train_df,
-        val_df=val_df,
-        test_df=test_df,
-        active_label=active_label,
-        use_single_scaler=study.best_params['use_single_scaler'],
-    )
-    # Report the best hyperparameters found
-    metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
-    # Return the best metrics
-    return model, metrics

 )
 from .protac_dataset import get_datasets
 import torch
 import optuna
 from optuna.samplers import TPESampler
 import joblib
 import pandas as pd
 from sklearn.model_selection import (
     StratifiedKFold,
     StratifiedGroupKFold,
     """ Hyperparameter tuning and training of a PROTAC model.
     Args:
+        protein2embedding (Dict): The protein to embedding dictionary.
+        cell2embedding (Dict): The cell to embedding dictionary.
+        smiles2fp (Dict): The SMILES to fingerprint dictionary.
+        train_val_df (pd.DataFrame): The training and validation set.
         test_df (pd.DataFrame): The test set.
+        kf (StratifiedKFold | StratifiedGroupKFold): The KFold object.
+        groups (np.array): The groups for the StratifiedGroupKFold.
+        split_type (str): The split type.
+        n_models_for_test (int): The number of models to train for the test set.
         fast_dev_run (bool): Whether to run a fast development run.
+        n_trials (int): The number of trials for the hyperparameter search.
+        logger_save_dir (str): The logger save directory.
+        logger_name (str): The logger name.
         active_label (str): The active label column.
+        max_epochs (int): The maximum number of epochs.
+        study_filename (str): The study filename.
+        force_study (bool): Whether to force the study.
     Returns:
         tuple: The trained model, the trainer, and the best metrics.
     }
     if not fast_dev_run:
         ret['majority_vote_report'] = majority_vote_report
+    return ret

src/README.md ADDED Viewed

	@@ -0,0 +1,70 @@

+# Training Models
+## Dataset Specification
+From the repository top level directory, run the following command to get the datasets reported in the paper:
+```bash
+cd src
+python get_studies_datasets.py
+```
+For training on custom datasets, please refer to the class `PROTAC_Dataset` in the file [`protac_dataset.py`](../protac_degradation_predictor/protac_dataset.py). The class expects a Pandas DataFrame, so please assemble a file that can be parsed into a Pandas DataFrame with the following columns:
+| Column Name | Type | Description |
+| --- | --- | --- |
+| Smiles | str | The SMILES representation of the PROTAC molecule. |
+| Uniprot | str | The Uniprot ID of the target protein. |
+| E3 Ligase Uniprot | str | The Uniprot ID of the E3 ligase. |
+| Cell Line Identifier | str | The cell line identifier, as reported in Cellosaurus. |
+| `<active_label>` | bool | The activity label of the PROTAC molecule to be predicted by the model. |
+The column `<active_label>` is set to _"Active"_ by default in the `PROTAC_Dataset` class and in the `hyperparameter_tuning_and_training` function (see below for how to use it).
+## Training on Custom Data
+For training on custom datasets, please refer to the function `hyperparameter_tuning_and_training` in [`optuna_utils.py`](../protac_degradation_predictor/optuna_utils.py) and the file [`run_experiments.py`](../src/run_experiments.py) for inspiration on how to use the function.
+A skeleton implementation looks as follows:
+```python
+import protac_degradation_predictor as pdp
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import StratifiedKFold
+# Load train/val and test dataframes
+train_val_df = pd.read_csv('path/to/custom_dataset.csv')
+test_df = pd.read_csv('path/to/test_dataset.csv') # Load one of our test datasets
+# NOTE: Make sure to avoid data leakage by removing leaking data in the train/val
+# dataframe. Do NOT do remove/alter the test set, as it would impair comparison
+# with our work. Data leakage can occur if the test set contains any combination
+# of SMILES, Uniprot, E3 Ligase Uniprot, or Cell Line Identifier that is present
+# in the train/val set too.
+# Precompute Morgan fingerprints
+unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
+smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
+# Load embedding dictionaries
+protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
+cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
+# Setup Cross-Validation object
+kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+pdp.hyperparameter_tuning_and_training(
+    protein2embedding=protein2embedding,
+    cell2embedding=cell2embedding,
+    smiles2fp=smiles2fp,
+    train_val_df=train_val_df,
+    test_df=test_df,
+    kf=kf,
+    n_models_for_test=3,
+    n_trials=100,
+    max_epochs=20,
+    logger_save_dir='../logs',
+    logger_name=f'logs_{experiment_name}',
+    study_filename=f'../reports/study_{experiment_name}.pkl',
+)
+```

src/get_studies_datasets.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import protac_degradation_predictor as pdp
+from collections import defaultdict
+import warnings
+import logging
+from typing import Literal
+from sklearn.preprocessing import OrdinalEncoder
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+import pytorch_lightning as pl
+from rdkit import DataStructs
+# Configure the root logger so that every record, from DEBUG level upwards, is
+# written to stdout with a timestamp, the logger name and the level
+root = logging.getLogger()
+root.setLevel(logging.DEBUG)
+handler = logging.StreamHandler(sys.stdout)
+handler.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+handler.setFormatter(formatter)
+root.addHandler(handler)
+def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:
+    """ Get the indices of the test set using a random split.
+    Args:
+        active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
+        test_split (float): The percentage of the active PROTACs to use as the test set.
+    Returns:
+        pd.Index: The indices of the test set.
+    """
+    test_df = active_df.sample(frac=test_split, random_state=42)
+    return test_df.index
+def get_e3_ligase_split_indices(active_df: pd.DataFrame) -> pd.Index:
+    """ Get the indices of the test set using the E3 ligase split.
+    Args:
+        active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
+    Returns:
+        pd.Index: The indices of the test set.
+    """
+    encoder = OrdinalEncoder()
+    active_df['E3 Group'] = encoder.fit_transform(active_df[['E3 Ligase']]).astype(int)
+    test_df = active_df[(active_df['E3 Ligase'] != 'VHL') & (active_df['E3 Ligase'] != 'CRBN')]
+    return test_df.index
+def get_smiles2fp_and_avg_tanimoto(protac_df: pd.DataFrame) -> tuple:
+    """ Precompute the PROTAC fingerprints and the average Tanimoto distance of each SMILES.
+    For every unique SMILES, the Tanimoto distance (1 - similarity) to all the
+    unique SMILES, itself included, is averaged. The result is stored in a new
+    'Avg Tanimoto' column, which is added to `protac_df` in place.
+    Args:
+        protac_df (pd.DataFrame): The DataFrame containing the PROTACs. It must have a 'Smiles' column.
+    Returns:
+        tuple: The SMILES to fingerprint dictionary (fingerprints converted to NumPy arrays) and `protac_df` with the added 'Avg Tanimoto' column.
+    """
+    unique_smiles = protac_df['Smiles'].unique().tolist()
+    smiles2fp = {}
+    # NOTE(review): the fingerprints are presumably RDKit bit vectors, since they
+    # are passed to DataStructs.BulkTanimotoSimilarity below - confirm
+    for smiles in tqdm(unique_smiles, desc='Precomputing fingerprints'):
+        smiles2fp[smiles] = pdp.get_fingerprint(smiles)
+    # Previous pair-wise implementation, superseded by the bulk version below:
+    # # Get the pair-wise tanimoto similarity between the PROTAC fingerprints
+    # tanimoto_matrix = defaultdict(list)
+    # for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
+    #     fp1 = smiles2fp[smiles1]
+    #     # TODO: Use BulkTanimotoSimilarity for better performance
+    #     for j, smiles2 in enumerate(protac_df['Smiles'].unique()[i:]):
+    #         fp2 = smiles2fp[smiles2]
+    #         tanimoto_dist = 1 - DataStructs.TanimotoSimilarity(fp1, fp2)
+    #         tanimoto_matrix[smiles1].append(tanimoto_dist)
+    # avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
+    # protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
+    tanimoto_matrix = defaultdict(list)
+    # Same order as unique_smiles, since the dictionary was filled in that order
+    fps = list(smiles2fp.values())
+    # Compute all-against-all Tanimoto similarity using BulkTanimotoSimilarity
+    for i, (smiles1, fp1) in enumerate(tqdm(zip(unique_smiles, fps), desc='Computing Tanimoto similarity', total=len(fps))):
+        similarities = DataStructs.BulkTanimotoSimilarity(fp1, fps[i:])  # Only compute for i to end, avoiding duplicates
+        for j, similarity in enumerate(similarities):
+            distance = 1 - similarity
+            tanimoto_matrix[smiles1].append(distance)  # Store as distance
+            # j == 0 is the self-comparison: it must not be stored twice
+            if i != i + j:
+                tanimoto_matrix[unique_smiles[i + j]].append(distance)  # Symmetric filling
+    # Calculate average Tanimoto distance for each unique SMILES
+    avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
+    protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
+    # Convert the fingerprints to NumPy arrays only now, after the similarity
+    # computation, which used the original fingerprint objects
+    smiles2fp = {s: np.array(fp) for s, fp in smiles2fp.items()}
+    return smiles2fp, protac_df
+def get_tanimoto_split_indices(
+        active_df: pd.DataFrame,
+        active_col: str,
+        test_split: float,
+        n_bins_tanimoto: int = 200,
+) -> pd.Index:
+    """ Get the indices of the test set using the Tanimoto-based split.
+    The entries are binned by their 'Avg Tanimoto' value and whole bins are
+    greedily added to the test set, starting from the bin with the highest mean
+    'Avg Tanimoto'. As a side effect, an integer-encoded 'Tanimoto Group'
+    column is added to `active_df` in place.
+    Args:
+        active_df (pd.DataFrame): The DataFrame containing the active PROTACs. It must have an 'Avg Tanimoto' column.
+        active_col (str): The column containing the active/inactive information.
+        test_split (float): The fraction of `active_df` entries that the test set must stay below.
+        n_bins_tanimoto (int): The number of bins to use for the 'Avg Tanimoto' values.
+    Returns:
+        pd.Index: The indices of the test set.
+    """
+    tanimoto_groups = pd.cut(active_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
+    encoder = OrdinalEncoder()
+    active_df['Tanimoto Group'] = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1)).astype(int)
+    # Sort the groups by decreasing mean 'Avg Tanimoto', so that the groups with
+    # the highest values are considered for the test set first.
+    # NOTE(review): 'Avg Tanimoto' looks like an average *distance* when computed
+    # by get_smiles2fp_and_avg_tanimoto, i.e., the highest values would be the
+    # "less similar" samples - confirm
+    tanimoto_groups = active_df.groupby('Tanimoto Group')['Avg Tanimoto'].mean().sort_values(ascending=False).index
+    test_df = []
+    # For each group, get the number of active and inactive entries. Then, add those
+    # entries to the test_df if: 1) the test_df length + the group entries is less
+    # than `test_split` times the active_df length, and 2) both the fraction of
+    # True and of False entries in the active_col in test_df stay below 60%.
+    # Groups that do not satisfy the conditions are skipped, not split.
+    for group in tanimoto_groups:
+        group_df = active_df[active_df['Tanimoto Group'] == group]
+        # The first group is always added, regardless of its size
+        if test_df == []:
+            test_df.append(group_df)
+            continue
+        num_entries = len(group_df)
+        num_active_group = group_df[active_col].sum()
+        num_inactive_group = num_entries - num_active_group
+        tmp_test_df = pd.concat(test_df)
+        num_entries_test = len(tmp_test_df)
+        num_active_test = tmp_test_df[active_col].sum()
+        num_inactive_test = num_entries_test - num_active_test
+        # Check if the group entries can be added to the test_df
+        if num_entries_test + num_entries < test_split * len(active_df):
+            # Add anything at the beginning, i.e., until half of the target
+            # test set size is reached
+            if num_entries_test + num_entries < test_split / 2 * len(active_df):
+                test_df.append(group_df)
+                continue
+            # Be more selective and make sure that the percentage of active and
+            # inactive is balanced
+            if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
+                if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
+                    test_df.append(group_df)
+    test_df = pd.concat(test_df)
+    return test_df.index
+def get_target_split_indices(active_df: pd.DataFrame, active_col: str, test_split: float) -> pd.Index:
+    """ Get the indices of the test set using the target-based split.
+    All the entries sharing the same 'Uniprot' value are kept together: whole
+    groups are greedily added to the test set, starting from the least
+    populated ones. As a side effect, an integer-encoded 'Uniprot Group'
+    column is added to `active_df` in place.
+    Args:
+        active_df (pd.DataFrame): The DataFrame containing the active PROTACs. It must have a 'Uniprot' column.
+        active_col (str): The column containing the active/inactive information.
+        test_split (float): The fraction of `active_df` entries that the test set must stay below.
+    Returns:
+        pd.Index: The indices of the test set.
+    """
+    encoder = OrdinalEncoder()
+    active_df['Uniprot Group'] = encoder.fit_transform(active_df[['Uniprot']]).astype(int)
+    test_df = []
+    # For each group, get the number of active and inactive entries. Then, add those
+    # entries to the test_df if: 1) the test_df length + the group entries is less
+    # than `test_split` times the active_df length, and 2) both the fraction of
+    # True and of False entries in the active_col in test_df stay below 60%.
+    # Groups that do not satisfy the conditions are skipped, not split.
+    # Start the loop from the groups containing the smallest number of entries
+    # (value_counts() sorts by decreasing count, hence the reversed()).
+    for group in reversed(active_df['Uniprot'].value_counts().index):
+        group_df = active_df[active_df['Uniprot'] == group]
+        # The first group is always added, regardless of its size
+        if test_df == []:
+            test_df.append(group_df)
+            continue
+        num_entries = len(group_df)
+        num_active_group = group_df[active_col].sum()
+        num_inactive_group = num_entries - num_active_group
+        tmp_test_df = pd.concat(test_df)
+        num_entries_test = len(tmp_test_df)
+        num_active_test = tmp_test_df[active_col].sum()
+        num_inactive_test = num_entries_test - num_active_test
+        # Check if the group entries can be added to the test_df
+        if num_entries_test + num_entries < test_split * len(active_df):
+            # Add anything at the beginning, i.e., until half of the target
+            # test set size is reached
+            if num_entries_test + num_entries < test_split / 2 * len(active_df):
+                test_df.append(group_df)
+                continue
+            # Be more selective and make sure that the percentage of active and
+            # inactive is balanced
+            if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
+                if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
+                    test_df.append(group_df)
+    test_df = pd.concat(test_df)
+    return test_df.index
+def main(
+    active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
+    test_split: float = 0.1,
+    studies: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
+):
+    """ Get and save the datasets for the different studies.
+    Args:
+        active_col (str): The column containing the active/inactive information. It should be in the format 'Active (Dmax N, pDC50 M)', where N and M are the thresholds float values for Dmax and pDC50, respectively.
+        test_split (float): The percentage of the active PROTACs to use as the test set.
+        studies (str): The type of studies to save dataset for. Options: 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
+    """
+    pl.seed_everything(42)
+    # Set the Column to Predict
+    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+    # Get Dmax_threshold from the active_col
+    Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
+    pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
+    # Load the PROTAC dataset
+    protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
+    # Map E3 Ligase Iap to IAP
+    protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
+    protac_df[active_col] = protac_df.apply(
+        lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
+    )
+    _, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
+    ## Get the test sets
+    test_indeces = {}
+    active_df = protac_df[protac_df[active_col].notna()].copy()
+    # Remove legacy column 'Active - OR' if it exists
+    if 'Active - OR' in active_df.columns:
+        active_df.drop(columns='Active - OR', inplace=True)
+    if studies == 'standard' or studies == 'all':
+        test_indeces['standard'] = get_random_split_indices(active_df, test_split)
+    if studies == 'target' or studies == 'all':
+        test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
+    if studies == 'e3_ligase' or studies == 'all':
+        test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
+    if studies == 'similarity' or studies == 'all':
+        test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split)
+    # Make directory for studies datasets if it does not exist
+    data_dir = '../data/studies'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    # Cross-Validation Training
+    for split_type, indeces in test_indeces.items():
+        test_df = active_df.loc[indeces].copy()
+        train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
+        # Save the datasets
+        train_val_perc = f'{int((1 - test_split) * 100)}'
+        test_perc = f'{int(test_split * 100)}'
+        train_val_filename = f'{data_dir}/{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
+        test_filename = f'{data_dir}/{split_type}_test_{test_perc}split_{active_name}.csv'
+        print('')
+        print(f'Saving train_val datasets as: {train_val_filename}')
+        print(f'Saving test datasets as:      {test_filename}')
+        train_val_df.to_csv(train_val_filename, index=False)
+        test_df.to_csv(test_filename, index=False)
+# Generate the datasets with the default settings when executed as a script
+if __name__ == '__main__':
+    main()