# %% [markdown]
# # PROTAC-Degradation-Predictor
# %%
import pandas as pd
protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
protac_df.head()
# %%
# Get the unique article DOIs of the entries with NaN values in the Active column
nan_active = protac_df[protac_df['Active'].isna()]['Article DOI'].unique()
nan_active
# %%
# Map E3 Ligase Iap to IAP
protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
# %%
protac_df.columns
# %%
cells = sorted(protac_df['Cell Type'].dropna().unique().tolist())
print(f'Number of non-cleaned cell lines: {len(cells)}')
# %%
cells = sorted(protac_df['Cell Line Identifier'].dropna().unique().tolist())
print(f'Number of cleaned cell lines: {len(cells)}')
# %%
unlabeled_df = protac_df[protac_df['Active'].isna()]
print(f'Number of unlabeled compounds (NaN Active): {len(unlabeled_df)}')
# %% [markdown]
# ## Load Protein Embeddings
# %% [markdown]
# Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).
#
# Please note that running the following cell the first time might take a while.
# %%
import os
import urllib.request
download_link = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5"
embeddings_path = "../data/uniprot2embedding.h5"
if not os.path.exists(embeddings_path):
# Download the file
print(f'Downloading embeddings from {download_link}')
urllib.request.urlretrieve(download_link, embeddings_path)
# %%
import h5py
import numpy as np
from tqdm.auto import tqdm
protein_embeddings = {}
with h5py.File(embeddings_path, "r") as file:
print(f"number of entries: {len(file.items()):,}")
uniprots = protac_df['Uniprot'].unique().tolist()
uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()
for i, sequence_id in tqdm(enumerate(uniprots), desc='Loading protein embeddings'):
try:
embedding = file[sequence_id][:]
protein_embeddings[sequence_id] = np.array(embedding)
if i < 10:
print(
f"\tid: {sequence_id}, "
f"\tembeddings shape: {embedding.shape}, "
f"\tembeddings mean: {np.array(embedding).mean()}"
)
except KeyError:
print(f'KeyError for {sequence_id}')
protein_embeddings[sequence_id] = np.zeros((1024,))
# %% [markdown]
# ## Load Cell Embeddings
# %%
import pickle
cell2embedding_filepath = '../data/cell2embedding.pkl'
with open(cell2embedding_filepath, 'rb') as f:
cell2embedding = pickle.load(f)
print(f'Loaded {len(cell2embedding)} cell lines')
# %%
emb_shape = cell2embedding[list(cell2embedding.keys())[0]].shape
# Assign all-zero vectors to cell lines that are not in the embedding file
for cell_line in protac_df['Cell Line Identifier'].unique():
if cell_line not in cell2embedding:
cell2embedding[cell_line] = np.zeros(emb_shape)
# %% [markdown]
# ## Precompute Molecular Fingerprints
# %%
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
morgan_radius = 15
n_bits = 1024
# fpgen = AllChem.GetAtomPairGenerator()
rdkit_fpgen = AllChem.GetRDKitFPGenerator(maxPath=5, fpSize=512)
morgan_fpgen = AllChem.GetMorganGenerator(radius=morgan_radius, fpSize=n_bits)
smiles2fp = {}
for smiles in tqdm(protac_df['Smiles'].unique().tolist(), desc='Precomputing fingerprints'):
# Get the fingerprint as a bit vector
morgan_fp = morgan_fpgen.GetFingerprint(Chem.MolFromSmiles(smiles))
# rdkit_fp = rdkit_fpgen.GetFingerprint(Chem.MolFromSmiles(smiles))
# fp = np.concatenate([morgan_fp, rdkit_fp])
smiles2fp[smiles] = morgan_fp
# Count the number of unique SMILES and the number of unique Morgan fingerprints
print(f'Number of unique SMILES: {len(smiles2fp)}')
print(f'Number of unique fingerprints: {len(set([tuple(fp) for fp in smiles2fp.values()]))}')
# Get the list of SMILES with overlapping fingerprints
overlapping_smiles = []
unique_fps = set()
for smiles, fp in smiles2fp.items():
if tuple(fp) in unique_fps:
overlapping_smiles.append(smiles)
else:
unique_fps.add(tuple(fp))
print(f'Number of SMILES with overlapping fingerprints: {len(overlapping_smiles)}')
print(f'Number of overlapping SMILES in protac_df: {len(protac_df[protac_df["Smiles"].isin(overlapping_smiles)])}')
# %%
# Get the pair-wise tanimoto similarity between the PROTAC fingerprints
from rdkit import DataStructs
from collections import defaultdict
tanimoto_matrix = defaultdict(list)
unique_smiles_list = protac_df['Smiles'].unique().tolist()
for i, smiles1 in enumerate(tqdm(unique_smiles_list, desc='Computing Tanimoto similarity')):
    fp1 = smiles2fp[smiles1]
    # TODO: Use BulkTanimotoSimilarity (see the sketch below)
    for j, smiles2 in enumerate(unique_smiles_list):
        if j < i:
            continue
        fp2 = smiles2fp[smiles2]
        tanimoto_sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        tanimoto_matrix[smiles1].append(tanimoto_sim)
        if j != i:
            # Store the symmetric entry as well, so that every SMILES is
            # averaged over its similarity to all other SMILES (collecting
            # only the upper triangle would bias the averages)
            tanimoto_matrix[smiles2].append(tanimoto_sim)
avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
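# %% [markdown]
# The TODO above can be addressed with `DataStructs.BulkTanimotoSimilarity`,
# which compares one fingerprint against a whole list at C++ speed. A minimal
# sketch, equivalent to the loop above (assuming `smiles2fp` still holds RDKit
# bit vectors at this point):
# %%
fps = [smiles2fp[s] for s in unique_smiles_list]
avg_tanimoto_bulk = {
    s: np.mean(DataStructs.BulkTanimotoSimilarity(fps[i], fps))
    for i, s in enumerate(unique_smiles_list)
}
# Sanity check: should match the loop-based averages up to float precision
assert np.allclose(
    [avg_tanimoto_bulk[s] for s in unique_smiles_list],
    [avg_tanimoto[s] for s in unique_smiles_list],
)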
# %%
# # Plot the distribution of the average Tanimoto similarity
# import seaborn as sns
# import matplotlib.pyplot as plt
# sns.histplot(protac_df['Avg Tanimoto'], bins=50)
# plt.xlabel('Average Tanimoto similarity')
# plt.ylabel('Count')
# plt.title('Distribution of average Tanimoto similarity')
# plt.grid(axis='y', alpha=0.5)
# plt.show()
# %%
smiles2fp = {s: np.array(fp) for s, fp in smiles2fp.items()}
# %% [markdown]
# ## Set the Column to Predict
# %%
# active_col = 'Active'
active_col = 'Active - OR'
from sklearn.preprocessing import StandardScaler
# %% [markdown]
# ## Define Torch Dataset
# %%
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
# %%
from torch.utils.data import Dataset, DataLoader
class PROTAC_Dataset(Dataset):
def __init__(
self,
protac_df,
protein_embeddings=protein_embeddings,
cell2embedding=cell2embedding,
smiles2fp=smiles2fp,
use_smote=False,
oversampler=None,
use_ored_activity=False,
):
""" Initialize the PROTAC dataset
Args:
protac_df (pd.DataFrame): The PROTAC dataframe
protein_embeddings (dict): Dictionary of protein embeddings
cell2embedding (dict): Dictionary of cell line embeddings
smiles2fp (dict): Dictionary of SMILES to fingerprint
use_smote (bool): Whether to use SMOTE for oversampling
use_ored_activity (bool): Whether to use the 'Active - OR' column
"""
        # NOTE: Entries with NaN in the 'Active' column are expected to be
        # filtered out by the caller
        self.data = protac_df  # [~protac_df['Active'].isna()]
self.protein_embeddings = protein_embeddings
self.cell2embedding = cell2embedding
self.smiles2fp = smiles2fp
self.smiles_emb_dim = smiles2fp[list(smiles2fp.keys())[0]].shape[0]
self.protein_emb_dim = protein_embeddings[list(
protein_embeddings.keys())[0]].shape[0]
self.cell_emb_dim = cell2embedding[list(
cell2embedding.keys())[0]].shape[0]
self.active_label = 'Active - OR' if use_ored_activity else 'Active'
self.use_smote = use_smote
self.oversampler = oversampler
# Apply SMOTE
if self.use_smote:
self.apply_smote()
def apply_smote(self):
# Prepare the dataset for SMOTE
features = []
labels = []
for _, row in self.data.iterrows():
smiles_emb = smiles2fp[row['Smiles']]
poi_emb = protein_embeddings[row['Uniprot']]
e3_emb = protein_embeddings[row['E3 Ligase Uniprot']]
cell_emb = cell2embedding[row['Cell Line Identifier']]
features.append(np.hstack([
smiles_emb.astype(np.float32),
poi_emb.astype(np.float32),
e3_emb.astype(np.float32),
cell_emb.astype(np.float32),
]))
labels.append(row[self.active_label])
# Convert to numpy array
features = np.array(features).astype(np.float32)
labels = np.array(labels).astype(np.float32)
# Initialize SMOTE and fit
if self.oversampler is None:
oversampler = SMOTE(random_state=42)
else:
oversampler = self.oversampler
features_smote, labels_smote = oversampler.fit_resample(features, labels)
# Separate the features back into their respective embeddings
smiles_embs = features_smote[:, :self.smiles_emb_dim]
poi_embs = features_smote[:,
self.smiles_emb_dim:self.smiles_emb_dim+self.protein_emb_dim]
e3_embs = features_smote[:, self.smiles_emb_dim +
self.protein_emb_dim:self.smiles_emb_dim+2*self.protein_emb_dim]
cell_embs = features_smote[:, -self.cell_emb_dim:]
# Reconstruct the dataframe with oversampled data
df_smote = pd.DataFrame({
'Smiles': list(smiles_embs),
'Uniprot': list(poi_embs),
'E3 Ligase Uniprot': list(e3_embs),
'Cell Line Identifier': list(cell_embs),
self.active_label: labels_smote
})
self.data = df_smote
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
if self.use_smote:
# NOTE: We do not need to look up the embeddings anymore
elem = {
'smiles_emb': self.data['Smiles'].iloc[idx],
'poi_emb': self.data['Uniprot'].iloc[idx],
'e3_emb': self.data['E3 Ligase Uniprot'].iloc[idx],
'cell_emb': self.data['Cell Line Identifier'].iloc[idx],
'active': self.data[self.active_label].iloc[idx],
}
else:
elem = {
'smiles_emb': self.smiles2fp[self.data['Smiles'].iloc[idx]].astype(np.float32),
'poi_emb': self.protein_embeddings[self.data['Uniprot'].iloc[idx]].astype(np.float32),
'e3_emb': self.protein_embeddings[self.data['E3 Ligase Uniprot'].iloc[idx]].astype(np.float32),
'cell_emb': self.cell2embedding[self.data['Cell Line Identifier'].iloc[idx]].astype(np.float32),
'active': 1. if self.data[self.active_label].iloc[idx] else 0.,
}
return elem
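# %% [markdown]
# A quick smoke test of the dataset class (a minimal sketch; the actual
# train/validation/test splits are built further down):
# %%
_demo_ds = PROTAC_Dataset(protac_df[protac_df['Active'].notna()])
print(f'Demo dataset length: {len(_demo_ds)}')
print(f'SMILES embedding shape: {_demo_ds[0]["smiles_emb"].shape}')
print(f'POI embedding shape: {_demo_ds[0]["poi_emb"].shape}')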
# %%
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from torchmetrics import (
Accuracy,
AUROC,
Precision,
Recall,
F1Score,
)
from torchmetrics import MetricCollection
# Ignore UserWarning from PyTorch Lightning
warnings.filterwarnings("ignore", ".*does not have many workers.*")
class PROTAC_Model(pl.LightningModule):
def __init__(
self,
hidden_dim,
smiles_emb_dim=1024,
poi_emb_dim=1024,
e3_emb_dim=1024,
cell_emb_dim=768,
batch_size=32,
learning_rate=1e-3,
dropout=0.2,
train_dataset=None,
val_dataset=None,
test_dataset=None,
disabled_embeddings=[],
):
super().__init__()
self.poi_emb_dim = poi_emb_dim
self.e3_emb_dim = e3_emb_dim
self.cell_emb_dim = cell_emb_dim
self.smiles_emb_dim = smiles_emb_dim
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.learning_rate = learning_rate
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.test_dataset = test_dataset
self.disabled_embeddings = disabled_embeddings
# Set our init args as class attributes
self.__dict__.update(locals()) # Add arguments as attributes
# Save the arguments passed to init
ignore_args_as_hyperparams = [
'train_dataset',
'test_dataset',
'val_dataset',
]
self.save_hyperparameters(ignore=ignore_args_as_hyperparams)
if 'poi' not in self.disabled_embeddings:
self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
# # Set the POI surrogate model as a Sequential model
# self.poi_emb = nn.Sequential(
# nn.Linear(poi_emb_dim, hidden_dim),
# nn.GELU(),
# nn.Dropout(p=dropout),
# nn.Linear(hidden_dim, hidden_dim),
# # nn.ReLU(),
# # nn.Dropout(p=dropout),
# )
if 'e3' not in self.disabled_embeddings:
self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
# self.e3_emb = nn.Sequential(
# nn.Linear(e3_emb_dim, hidden_dim),
# # nn.ReLU(),
# nn.Dropout(p=dropout),
# # nn.Linear(hidden_dim, hidden_dim),
# # nn.ReLU(),
# # nn.Dropout(p=dropout),
# )
if 'cell' not in self.disabled_embeddings:
self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
# self.cell_emb = nn.Sequential(
# nn.Linear(cell_emb_dim, hidden_dim),
# # nn.ReLU(),
# nn.Dropout(p=dropout),
# # nn.Linear(hidden_dim, hidden_dim),
# # nn.ReLU(),
# # nn.Dropout(p=dropout),
# )
if 'smiles' not in self.disabled_embeddings:
self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
# self.smiles_emb = nn.Sequential(
# nn.Linear(smiles_emb_dim, hidden_dim),
# # nn.ReLU(),
# nn.Dropout(p=dropout),
# # nn.Linear(hidden_dim, hidden_dim),
# # nn.ReLU(),
# # nn.Dropout(p=dropout),
# )
self.fc1 = nn.Linear(
hidden_dim * (4 - len(self.disabled_embeddings)), hidden_dim)
self.fc2 = nn.Linear(hidden_dim, hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1)
self.dropout = nn.Dropout(p=dropout)
stages = ['train_metrics', 'val_metrics', 'test_metrics']
self.metrics = nn.ModuleDict({s: MetricCollection({
'acc': Accuracy(task='binary'),
'roc_auc': AUROC(task='binary'),
'precision': Precision(task='binary'),
'recall': Recall(task='binary'),
'f1_score': F1Score(task='binary'),
'opt_score': Accuracy(task='binary') + F1Score(task='binary'),
'hp_metric': Accuracy(task='binary'),
}, prefix=s.replace('metrics', '')) for s in stages})
# Misc settings
self.missing_dataset_error = \
'''Class variable `{0}` is None. If the model was loaded from a checkpoint, the dataset must be set manually:
model = {1}.load_from_checkpoint('checkpoint.ckpt')
model.{0} = my_{0}
'''
def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
embeddings = []
if 'poi' not in self.disabled_embeddings:
embeddings.append(self.poi_emb(poi_emb))
if 'e3' not in self.disabled_embeddings:
embeddings.append(self.e3_emb(e3_emb))
if 'cell' not in self.disabled_embeddings:
embeddings.append(self.cell_emb(cell_emb))
if 'smiles' not in self.disabled_embeddings:
embeddings.append(self.smiles_emb(smiles_emb))
x = torch.cat(embeddings, dim=1)
x = self.dropout(F.gelu(self.fc1(x)))
x = self.dropout(F.gelu(self.fc2(x)))
x = self.fc3(x)
return x
def step(self, batch, batch_idx, stage):
poi_emb = batch['poi_emb']
e3_emb = batch['e3_emb']
cell_emb = batch['cell_emb']
smiles_emb = batch['smiles_emb']
y = batch['active'].float().unsqueeze(1)
y_hat = self.forward(poi_emb, e3_emb, cell_emb, smiles_emb)
loss = F.binary_cross_entropy_with_logits(y_hat, y)
self.metrics[f'{stage}_metrics'].update(y_hat, y)
self.log(f'{stage}_loss', loss, on_epoch=True, prog_bar=True)
self.log_dict(self.metrics[f'{stage}_metrics'], on_epoch=True)
return loss
def training_step(self, batch, batch_idx):
return self.step(batch, batch_idx, 'train')
def validation_step(self, batch, batch_idx):
return self.step(batch, batch_idx, 'val')
def test_step(self, batch, batch_idx):
return self.step(batch, batch_idx, 'test')
def configure_optimizers(self):
return optim.Adam(self.parameters(), lr=self.learning_rate)
def predict_step(self, batch, batch_idx):
poi_emb = batch['poi_emb']
e3_emb = batch['e3_emb']
cell_emb = batch['cell_emb']
smiles_emb = batch['smiles_emb']
y_hat = self.forward(poi_emb, e3_emb, cell_emb, smiles_emb)
return torch.sigmoid(y_hat)
def train_dataloader(self):
if self.train_dataset is None:
format = 'train_dataset', self.__class__.__name__
raise ValueError(self.missing_dataset_error.format(*format))
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
shuffle=True,
# drop_last=True,
)
def val_dataloader(self):
if self.val_dataset is None:
format = 'val_dataset', self.__class__.__name__
raise ValueError(self.missing_dataset_error.format(*format))
return DataLoader(
self.val_dataset,
batch_size=self.batch_size,
shuffle=False,
)
def test_dataloader(self):
if self.test_dataset is None:
format = 'test_dataset', self.__class__.__name__
raise ValueError(self.missing_dataset_error.format(*format))
return DataLoader(
self.test_dataset,
batch_size=self.batch_size,
shuffle=False,
)
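# %% [markdown]
# Sanity-check the model wiring with random tensors (a minimal sketch; the
# dimensions match the constructor defaults above):
# %%
_demo_model = PROTAC_Model(hidden_dim=64)
_demo_out = _demo_model(
    torch.randn(2, 1024),  # poi_emb
    torch.randn(2, 1024),  # e3_emb
    torch.randn(2, 768),   # cell_emb
    torch.randn(2, 1024),  # smiles_emb
)
print(f'Output logits shape: {_demo_out.shape}')  # Expected: torch.Size([2, 1])
del _demo_model, _demo_out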
# %% [markdown]
# ## Test Sets
# %% [markdown]
# We want a different test set per Cross-Validation (CV) experiment (see further down). We are interested in three scenarios:
# * Randomly splitting the data into training and test sets. Hence, the test set shall contain unique SMILES and Uniprots
# * Splitting the data according to their Uniprot. Hence, the test set shall contain unique Uniprots
# * Splitting the data according to their SMILES, _i.e._, the test set shall contain unique SMILES
# %%
test_indices = {}
# %% [markdown]
# Isolating the unique SMILES and Uniprots:
# %%
active_df = protac_df[protac_df[active_col].notna()].copy()
# Get the unique SMILES and Uniprot
unique_smiles = active_df['Smiles'].value_counts() == 1
unique_uniprot = active_df['Uniprot'].value_counts() == 1
print(f'Number of unique SMILES: {unique_smiles.sum()}')
print(f'Number of unique Uniprot: {unique_uniprot.sum()}')
# Sample 5% of len(active_df) from the unique SMILES and Uniprot and get the
# indices for a test set
n = int(0.05 * len(active_df)) // 2
unique_smiles = unique_smiles[unique_smiles].sample(n=n, random_state=42)
# unique_uniprot = unique_uniprot[unique_uniprot].sample(n=, random_state=42)
unique_indices = active_df[
active_df['Smiles'].isin(unique_smiles.index) &
active_df['Uniprot'].isin(unique_uniprot.index)
].index
print(f'Number of unique indices: {len(unique_indices)} ({len(unique_indices) / len(active_df):.1%})')
test_indices['random'] = unique_indices
# # Get the test set
# test_df = active_df.loc[unique_indices]
# # Bar plot of the test Active distribution as percentage
# test_df['Active'].value_counts(normalize=True).plot(kind='bar')
# plt.title('Test set Active distribution')
# plt.show()
# # Bar plot of the test Active - OR distribution as percentage
# test_df['Active - OR'].value_counts(normalize=True).plot(kind='bar')
# plt.title('Test set Active - OR distribution')
# plt.show()
# %% [markdown]
# Isolating the unique Uniprots:
# %%
active_df = protac_df[protac_df[active_col].notna()].copy()
unique_uniprot = active_df['Uniprot'].value_counts() == 1
print(f'Number of unique Uniprot: {unique_uniprot.sum()}')
# NOTE: Since there are very few of them, all entries with a unique Uniprot are used as the test set.
# Get the indices for a test set
unique_indices = active_df[active_df['Uniprot'].isin(unique_uniprot.index)].index
test_indices['uniprot'] = unique_indices
print(f'Number of unique indices: {len(unique_indices)} ({len(unique_indices) / len(active_df):.1%})')
# %% [markdown]
# DEPRECATED: The following selection results in too small a test set. Before starting any training, we isolate a small group of test data. Each element in the test set is selected so that all the following conditions are met:
# * its SMILES is unique
# * its POI is unique
# * its (SMILES, POI) pair is unique
# %%
active_df = protac_df[protac_df[active_col].notna()]
# Find the samples that:
# * have their SMILES appearing only once in the dataframe
# * have their Uniprot appearing only once in the dataframe
# * have their (Smiles, Uniprot) pair appearing only once in the dataframe
unique_smiles = active_df['Smiles'].value_counts() == 1
unique_uniprot = active_df['Uniprot'].value_counts() == 1
unique_smiles_uniprot = active_df.groupby(['Smiles', 'Uniprot']).size() == 1
# Get the indices of the unique samples
unique_smiles_idx = active_df['Smiles'].map(unique_smiles)
unique_uniprot_idx = active_df['Uniprot'].map(unique_uniprot)
unique_smiles_uniprot_idx = active_df.set_index(['Smiles', 'Uniprot']).index.map(unique_smiles_uniprot)
# Cross the indices to get the unique samples
# unique_samples = active_df[unique_smiles_idx & unique_uniprot_idx & unique_smiles_uniprot_idx].index
unique_samples = active_df[unique_smiles_idx & unique_uniprot_idx].index
test_df = active_df.loc[unique_samples]
warnings.filterwarnings("ignore", ".*FixedLocator*")
# %% [markdown]
# ## Cross-Validation Training
# %% [markdown]
# Cross validation training with 5 splits. The split operation is done in three different ways:
#
# * Random split
# * POI-wise: a given POI never appears in both splits
# * Least Tanimoto similarity PROTAC-wise
# %% [markdown]
# ### Plotting CV Folds
# %%
from sklearn.model_selection import (
StratifiedKFold,
StratifiedGroupKFold,
)
from sklearn.preprocessing import OrdinalEncoder
# NOTE: When set to 60, it will result in 29 groups, with nice distributions of
# the number of unique groups in the train and validation sets, together with
# the number of active and inactive PROTACs.
n_bins_tanimoto = 60 if active_col == 'Active' else 400
n_splits = 5
# The train and validation sets will be created from the labeled PROTACs only,
# i.e., the ones whose 'Active' column is not NaN and that are NOT in the test set
active_df = protac_df[protac_df[active_col].notna()]
train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
# Make three groups for CV:
# * Random split
# * Split by Uniprot (POI)
# * Split by least tanimoto similarity PROTAC-wise
group_types = [
    'random',
    'uniprot',
    'tanimoto',
]
for group_type in group_types:
if group_type == 'random':
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
groups = None
elif group_type == 'uniprot':
# Split by Uniprot
kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
encoder = OrdinalEncoder()
groups = encoder.fit_transform(train_val_df['Uniprot'].values.reshape(-1, 1))
print(f'Number of unique groups: {len(encoder.categories_[0])}')
elif group_type == 'tanimoto':
        # Split by Tanimoto similarity, i.e., group PROTACs with similar Avg Tanimoto
kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
tanimoto_groups = pd.cut(train_val_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
encoder = OrdinalEncoder()
groups = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1))
print(f'Number of unique groups: {len(encoder.categories_[0])}')
X = train_val_df.drop(columns=active_col)
y = train_val_df[active_col].tolist()
# print(f'Group: {group_type}')
# fig, ax = plt.subplots(figsize=(6, 3))
# plot_cv_indices(kf, X=X, y=y, group=groups, ax=ax, n_splits=n_splits)
# plt.tight_layout()
# plt.show()
stats = []
for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
train_df = train_val_df.iloc[train_index]
val_df = train_val_df.iloc[val_index]
stat = {
'fold': k,
'train_len': len(train_df),
'val_len': len(val_df),
'train_perc': len(train_df) / len(train_val_df),
'val_perc': len(val_df) / len(train_val_df),
'train_active (%)': train_df[active_col].sum() / len(train_df) * 100,
'train_inactive (%)': (len(train_df) - train_df[active_col].sum()) / len(train_df) * 100,
'val_active (%)': val_df[active_col].sum() / len(val_df) * 100,
'val_inactive (%)': (len(val_df) - val_df[active_col].sum()) / len(val_df) * 100,
'num_leaking_uniprot': len(set(train_df['Uniprot']).intersection(set(val_df['Uniprot']))),
'num_leaking_smiles': len(set(train_df['Smiles']).intersection(set(val_df['Smiles']))),
}
if group_type != 'random':
stat['train_unique_groups'] = len(np.unique(groups[train_index]))
stat['val_unique_groups'] = len(np.unique(groups[val_index]))
stats.append(stat)
print('-' * 120)
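# %% [markdown]
# To eyeball the fold balance, the collected statistics can be displayed as a
# table. Note that `stats` is re-initialized for each group type, so only the
# last one ('tanimoto') is shown here:
# %%
pd.DataFrame(stats)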
# %% [markdown]
# ### Run CV
# %%
import warnings
# Seed everything in pytorch lightning
pl.seed_everything(42)
def train_model(
train_df,
val_df,
test_df=None,
hidden_dim=768,
batch_size=8,
learning_rate=2e-5,
max_epochs=50,
smiles_emb_dim=1024,
smote_n_neighbors=5,
use_ored_activity=False if active_col == 'Active' else True,
fast_dev_run=False,
disabled_embeddings=[],
) -> tuple:
""" Train a PROTAC model using the given datasets and hyperparameters.
Args:
train_df (pd.DataFrame): The training set.
val_df (pd.DataFrame): The validation set.
test_df (pd.DataFrame): The test set.
hidden_dim (int): The hidden dimension of the model.
batch_size (int): The batch size.
learning_rate (float): The learning rate.
max_epochs (int): The maximum number of epochs.
smiles_emb_dim (int): The dimension of the SMILES embeddings.
smote_n_neighbors (int): The number of neighbors for the SMOTE oversampler.
use_ored_activity (bool): Whether to use the ORED activity column.
fast_dev_run (bool): Whether to run a fast development run.
disabled_embeddings (list): The list of disabled embeddings.
Returns:
tuple: The trained model, the trainer, and the metrics.
"""
oversampler = SMOTE(k_neighbors=smote_n_neighbors, random_state=42)
train_ds = PROTAC_Dataset(
train_df,
protein_embeddings,
cell2embedding,
smiles2fp,
use_smote=True,
oversampler=oversampler,
use_ored_activity=use_ored_activity,
)
val_ds = PROTAC_Dataset(
val_df,
protein_embeddings,
cell2embedding,
smiles2fp,
use_ored_activity=use_ored_activity,
)
if test_df is not None:
test_ds = PROTAC_Dataset(
test_df,
protein_embeddings,
cell2embedding,
smiles2fp,
use_ored_activity=use_ored_activity,
)
logger = pl.loggers.TensorBoardLogger(
save_dir='../logs',
name='protac',
)
callbacks = [
pl.callbacks.EarlyStopping(
monitor='train_loss',
patience=10,
            mode='min',  # the monitored quantity is a loss, so lower is better
verbose=True,
),
# pl.callbacks.ModelCheckpoint(
# monitor='val_acc',
# mode='max',
# verbose=True,
# filename='{epoch}-{val_metrics_opt_score:.4f}',
# ),
]
# Define Trainer
trainer = pl.Trainer(
logger=logger,
callbacks=callbacks,
max_epochs=max_epochs,
fast_dev_run=fast_dev_run,
enable_model_summary=False,
enable_checkpointing=False,
)
model = PROTAC_Model(
hidden_dim=hidden_dim,
smiles_emb_dim=smiles_emb_dim,
poi_emb_dim=1024,
e3_emb_dim=1024,
cell_emb_dim=768,
batch_size=batch_size,
learning_rate=learning_rate,
train_dataset=train_ds,
val_dataset=val_ds,
test_dataset=test_ds if test_df is not None else None,
disabled_embeddings=disabled_embeddings,
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
trainer.fit(model)
metrics = trainer.validate(model, verbose=False)[0]
if test_df is not None:
test_metrics = trainer.test(model, verbose=False)[0]
metrics.update(test_metrics)
return model, trainer, metrics
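# %% [markdown]
# Example call (hypothetical dataframes; the CV loop further down builds the
# actual folds). Setting `fast_dev_run=True` pushes a single batch through
# training and validation, which is handy for debugging:
# %%
# model, trainer, metrics = train_model(train_df, val_df, fast_dev_run=True)
# print(metrics)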
# %% [markdown]
# Setup hyperparameter optimization:
# %%
import optuna
import pandas as pd
def objective(
trial,
train_df,
val_df,
hidden_dim_options,
batch_size_options,
learning_rate_options,
max_epochs_options,
fast_dev_run=False,
) -> float:
# Generate the hyperparameters
hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
batch_size = trial.suggest_categorical('batch_size', batch_size_options)
    learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
# Train the model with the current set of hyperparameters
_, _, metrics = train_model(
train_df,
val_df,
hidden_dim=hidden_dim,
batch_size=batch_size,
learning_rate=learning_rate,
max_epochs=max_epochs,
fast_dev_run=fast_dev_run,
)
# Metrics is a dictionary containing at least the validation loss
val_loss = metrics['val_loss']
val_acc = metrics['val_acc']
val_roc_auc = metrics['val_roc_auc']
# Optuna aims to minimize the objective
return val_loss - val_acc - val_roc_auc
def hyperparameter_tuning_and_training(
train_df,
val_df,
test_df,
fast_dev_run=False,
n_trials=20,
) -> tuple:
""" Hyperparameter tuning and training of a PROTAC model.
Args:
train_df (pd.DataFrame): The training set.
val_df (pd.DataFrame): The validation set.
test_df (pd.DataFrame): The test set.
fast_dev_run (bool): Whether to run a fast development run.
Returns:
tuple: The trained model, the trainer, and the best metrics.
"""
# Define the search space
hidden_dim_options = [256, 512, 768]
batch_size_options = [8, 16, 32]
learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
max_epochs_options = [10, 20, 50]
# Create an Optuna study object
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(
trial,
train_df,
val_df,
hidden_dim_options,
batch_size_options,
learning_rate_options,
max_epochs_options,
fast_dev_run=fast_dev_run,),
n_trials=n_trials,
)
# Retrieve the best hyperparameters
best_params = study.best_params
best_hidden_dim = best_params['hidden_dim']
best_batch_size = best_params['batch_size']
best_learning_rate = best_params['learning_rate']
best_max_epochs = best_params['max_epochs']
# Retrain the model with the best hyperparameters
model, trainer, metrics = train_model(
train_df,
val_df,
test_df,
hidden_dim=best_hidden_dim,
batch_size=best_batch_size,
learning_rate=best_learning_rate,
max_epochs=best_max_epochs,
fast_dev_run=fast_dev_run,
)
# Return the best metrics
return model, trainer, metrics
# Example usage
# train_df, val_df, test_df = load_your_data() # You need to load your datasets here
# model, trainer, best_metrics = hyperparameter_tuning_and_training(train_df, val_df, test_df)
# %% [markdown]
# Loop over the different splits and train the model:
# %%
n_splits = 5
report = []
active_df = protac_df[protac_df[active_col].notna()]
train_val_df = active_df[~active_df.index.isin(unique_samples)]
# Make directory ../reports if it does not exist
if not os.path.exists('../reports'):
os.makedirs('../reports')
for group_type in ['random', 'uniprot', 'tanimoto']:
print(f'Starting CV for group type: {group_type}')
# Setup CV iterator and groups
if group_type == 'random':
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
groups = None
elif group_type == 'uniprot':
# Split by Uniprot
kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
encoder = OrdinalEncoder()
groups = encoder.fit_transform(train_val_df['Uniprot'].values.reshape(-1, 1))
elif group_type == 'tanimoto':
        # Split by Tanimoto similarity, i.e., group PROTACs with similar Avg Tanimoto
kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
tanimoto_groups = pd.cut(train_val_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
encoder = OrdinalEncoder()
groups = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1))
# Start the CV over the folds
X = train_val_df.drop(columns=active_col)
y = train_val_df[active_col].tolist()
for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
train_df = train_val_df.iloc[train_index]
val_df = train_val_df.iloc[val_index]
stats = {
'fold': k,
'group_type': group_type,
'train_len': len(train_df),
'val_len': len(val_df),
'train_perc': len(train_df) / len(train_val_df),
'val_perc': len(val_df) / len(train_val_df),
'train_active_perc': train_df[active_col].sum() / len(train_df),
'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
'val_active_perc': val_df[active_col].sum() / len(val_df),
'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
'test_active_perc': test_df[active_col].sum() / len(test_df),
'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
'num_leaking_uniprot': len(set(train_df['Uniprot']).intersection(set(val_df['Uniprot']))),
'num_leaking_smiles': len(set(train_df['Smiles']).intersection(set(val_df['Smiles']))),
}
if group_type != 'random':
stats['train_unique_groups'] = len(np.unique(groups[train_index]))
stats['val_unique_groups'] = len(np.unique(groups[val_index]))
# Train and evaluate the model
# model, trainer, metrics = train_model(train_df, val_df, test_df)
model, trainer, metrics = hyperparameter_tuning_and_training(
train_df,
val_df,
test_df,
fast_dev_run=False,
n_trials=50,
)
stats.update(metrics)
del model
del trainer
report.append(stats)
report = pd.DataFrame(report)
report.to_csv(
f'../reports/cv_report_hparam_search_{n_splits}-splits.csv', index=False,
)