In [12]:
import os
import sys
from collections import defaultdict
import warnings
import logging
from typing import Literal

# Python does not expand '~' in sys.path entries, so resolve the home directory explicitly;
# a literal '~/...' entry is never matched by the import system.
# NOTE(review): this adds the package directory itself, whereas importing the package by
# name usually needs its parent directory on the path — confirm against the repo layout.
sys.path.append(os.path.expanduser('~/PROTAC-Degradation-Predictor/protac_degradation_predictor'))
import protac_degradation_predictor as pdp

import pytorch_lightning as pl
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from jsonargparse import CLI
import pandas as pd
# Import tqdm for notebook
from tqdm.notebook import tqdm
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import (
    StratifiedKFold,
    StratifiedGroupKFold,
)


# Cut-offs handed to pdp.is_active when labelling each row, and the name of the
# column that stores the resulting label.
pDC50_threshold = 6.0
Dmax_threshold = 0.6
active_col = 'Active (Dmax 0.6, pDC50 6.0)'

protac_df = pd.read_csv('~/PROTAC-Degradation-Predictor/data/PROTAC-Degradation-DB.csv')
# Rewrite 'Iap' as 'IAP' inside the E3 ligase names.
protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
# Label every row from its DC50 and Dmax measurements.
protac_df[active_col] = protac_df.apply(
    lambda row: pdp.is_active(
        row['DC50 (nM)'],
        row['Dmax (%)'],
        pDC50_threshold=pDC50_threshold,
        Dmax_threshold=Dmax_threshold,
    ),
    axis=1,
)

In [13]:
def get_random_split_indices(
    active_df: pd.DataFrame,
    test_split: float,
    random_state: int = 42,
) -> pd.Index:
    """ Get the indices of the test set using a random split.

    Args:
        active_df (pd.DataFrame): The DataFrame whose rows are split.
        test_split (float): The fraction of rows (e.g. 0.1 for 10%) to draw as the
            test set; it is passed to `DataFrame.sample` as `frac`.
        random_state (int): Seed passed to `DataFrame.sample`. Defaults to 42, the
            value that was previously hard-coded, so existing calls give the same split.

    Returns:
        pd.Index: The index labels of the rows sampled for the test set.
    """
    return active_df.sample(frac=test_split, random_state=random_state).index

# Fraction of the labelled rows held out as the test set.
test_split = 0.1

# Only rows with a non-null activity label take part in the split.
active_df = protac_df.loc[protac_df[active_col].notna()].copy()
test_indices = get_random_split_indices(active_df, test_split)

# Every labelled row outside the test indices goes to the train/validation pool.
train_val_df = active_df.loc[~active_df.index.isin(test_indices)].copy()
len(train_val_df)

771

In [29]:
import optuna

def objective(trial: optuna.Trial, verbose: int = 0) -> float:
    """ Score a Morgan fingerprint configuration by the collisions it produces.

    A fingerprint is computed for every unique SMILES in the module-level
    `train_val_df`. A SMILES counts as "overlapping" when its fingerprint is
    identical to that of a SMILES seen earlier in the iteration; the first SMILES
    of each group of identical fingerprints is therefore not counted.

    Args:
        trial (optuna.Trial): Trial supplying the `radius` (1 to 15) and `fpsize`
            (128 to 2048 in steps of 128) parameters. This notebook also calls the
            function with `study.best_trial` to re-evaluate the best parameters.
        verbose (int): If non-zero, print a summary of the configuration and of the
            collisions found, including the count over the full `protac_df`.

    Returns:
        float: `num_overlaps + radius + fpsize / 100`, where `num_overlaps` is the
            number of rows of `train_val_df` whose SMILES is overlapping. Lower
            values mean fewer collisions, a smaller radius and a shorter fingerprint.
    """
    radius = trial.suggest_int('radius', 1, 15)
    fpsize = trial.suggest_int('fpsize', 128, 2048, step=128)

    morgan_fpgen = AllChem.GetMorganGenerator(
        radius=radius,
        fpSize=fpsize,
        includeChirality=True,
    )

    smiles2fp = {
        smiles: pdp.get_fingerprint(smiles, morgan_fpgen)
        for smiles in train_val_df['Smiles'].unique().tolist()
    }

    # Single pass: remember every fingerprint seen so far and record the SMILES
    # whose fingerprint duplicates an earlier one.
    unique_fps = set()
    overlapping_smiles = []
    for smiles, fp in smiles2fp.items():
        fp_key = tuple(fp)  # hashable form of the fingerprint, built once per SMILES
        if fp_key in unique_fps:
            overlapping_smiles.append(smiles)
        else:
            unique_fps.add(fp_key)
    num_overlaps = len(train_val_df[train_val_df["Smiles"].isin(overlapping_smiles)])

    if verbose:
        # Only needed for the report, so it is not computed on silent trials.
        num_overlaps_tot = len(protac_df[protac_df["Smiles"].isin(overlapping_smiles)])
        print(f'Radius: {radius}')
        print(f'FP length: {fpsize}')
        print(f'Number of unique SMILES: {len(smiles2fp)}')
        print(f'Number of unique fingerprints: {len(unique_fps)}')
        print(f'Number of SMILES with overlapping fingerprints: {len(overlapping_smiles)}')
        print(f'Number of overlapping SMILES in train_val_df: {num_overlaps}')
        print(f'Number of overlapping SMILES in protac_df: {num_overlaps_tot}')
    return num_overlaps + radius + fpsize / 100

In [30]:
# TPE sampler with a fixed seed, driving a study that minimises the objective.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='minimize',
    sampler=sampler,
)
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

[I 2024-04-29 11:28:05,626] A new study created in memory with name: no-name-4db5d822-6220-4ab8-bc3a-c776b0e5cac2


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-04-29 11:28:07,705] Trial 0 finished with value: 39.480000000000004 and parameters: {'radius': 6, 'fpsize': 2048}. Best is trial 0 with value: 39.480000000000004.
[I 2024-04-29 11:28:09,590] Trial 1 finished with value: 23.8 and parameters: {'radius': 11, 'fpsize': 1280}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:10,474] Trial 2 finished with value: 131.84 and parameters: {'radius': 3, 'fpsize': 384}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:11,978] Trial 3 finished with value: 281.92 and parameters: {'radius': 1, 'fpsize': 1792}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:13,994] Trial 4 finished with value: 25.36 and parameters: {'radius': 10, 'fpsize': 1536}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:15,642] Trial 5 finished with value: 284.48 and parameters: {'radius': 1, 'fpsize': 2048}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:17,154] Trial 6 finished with value: 18.12 and parameters: {'radius': 13, 'fpsize': 51

In [31]:
# Re-evaluate the objective for the study's best trial with verbose reporting enabled.
objective(trial=study.best_trial, verbose=1)

Radius: 10
FP length: 256
Number of unique SMILES: 532
Number of unique fingerprints: 532
Number of SMILES with overlapping fingerprints: 0
Number of overlapping SMILES in train_val_df: 0
Number of overlapping SMILES in protac_df: 0


12.56