# NOTE: removed notebook-export artifact lines ("Spaces:", "Running") that
# were not valid Python.
# Deep learning
import torch
from torch.utils.data import Dataset
from sklearn.metrics import confusion_matrix
# Data
import pandas as pd
import numpy as np
# Standard library
import os
# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors

PandasTools.RenderImagesInAllDataFrames(True)
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return the RDKit-normalized form of a SMILES string, or None on failure.

    Args:
        smi: SMILES string to normalize.
        canonical: emit canonical atom ordering (passed to MolToSmiles).
        isomeric: keep stereochemistry information (passed to MolToSmiles).

    Returns:
        Normalized SMILES string, or None if the input cannot be parsed
        or serialized.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles returns None (rather than raising) for unparsable SMILES.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Still best-effort: any RDKit or
        # type error (e.g. smi=None) yields None, as before.
        return None
class RMSELoss:
    """Callable computing the root-mean-square error between two tensors."""

    def __init__(self):
        # Stateless; kept for drop-in construction compatibility.
        pass

    def __call__(self, yhat, y):
        # RMSE = sqrt(mean((yhat - y)^2))
        diff = yhat - y
        return torch.sqrt(torch.mean(diff * diff))
def RMSE(predictions, targets):
    """Root-mean-square error between two numpy arrays."""
    squared_error = (predictions - targets) ** 2
    return np.sqrt(squared_error.mean())
def sensitivity(y_true, y_pred):
    """True-positive rate (recall) for binary 0/1 labels: tp / (tp + fn).

    Counts are computed directly with numpy rather than via
    sklearn's confusion_matrix: the previous 4-way `.ravel()` unpacking
    raised ValueError whenever only one class was present (1x1 matrix).

    Args:
        y_true: array-like of ground-truth binary labels (0/1).
        y_pred: array-like of predicted binary labels (0/1).

    Returns:
        tp / (tp + fn); numpy emits a warning and returns nan when there
        are no positive ground-truth samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return tp / (tp + fn)
def specificity(y_true, y_pred):
    """True-negative rate for binary 0/1 labels: tn / (tn + fp).

    Counts are computed directly with numpy rather than via
    sklearn's confusion_matrix: the previous 4-way `.ravel()` unpacking
    raised ValueError whenever only one class was present (1x1 matrix).

    Args:
        y_true: array-like of ground-truth binary labels (0/1).
        y_pred: array-like of predicted binary labels (0/1).

    Returns:
        tn / (tn + fp); numpy emits a warning and returns nan when there
        are no negative ground-truth samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    return tn / (tn + fp)
def get_optim_groups(module, keep_decoder=False, weight_decay=0.0):
    """Partition a module's parameters into weight-decay / no-decay groups.

    Linear weights are eligible for weight decay; all biases and
    LayerNorm/Embedding weights are not. Parameters whose full name
    contains 'decoder' are skipped unless keep_decoder is True.

    Args:
        module: torch.nn.Module whose parameters are grouped.
        keep_decoder: if False, exclude any parameter whose full name
            contains 'decoder'.
        weight_decay: decay coefficient applied to the decay group.
            Defaults to 0.0 — the previous hard-coded value — so existing
            callers see no change; pass a nonzero value to actually apply
            decay (the old code set 0.0 for BOTH groups, defeating the
            whole separation).

    Returns:
        List of two param-group dicts suitable for a torch optimizer.
    """
    decay = set()
    no_decay = set()
    whitelist_weight_modules = (torch.nn.Linear,)
    blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
    for mn, m in module.named_modules():
        for pn, p in m.named_parameters():
            fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
            if not keep_decoder and 'decoder' in fpn:  # exclude decoder components
                continue
            if pn.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)
            # NOTE(review): parameters matching none of the rules above end up
            # in NEITHER group (and thus are not optimized) — no validation is
            # performed despite appearing intended; confirm this is acceptable.
    param_dict = {pn: p for pn, p in module.named_parameters()}
    optim_groups = [
        {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
        {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
    ]
    return optim_groups
class CustomDataset(Dataset):
    """Single-task dataset yielding (SMILES string, label) pairs.

    Wraps a pandas DataFrame holding a 'canon_smiles' column plus the
    named target column.
    """

    def __init__(self, dataset, target):
        # dataset: pandas DataFrame with a 'canon_smiles' column
        # target: name of the label column to serve
        self.dataset = dataset
        self.target = target

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row_smiles = self.dataset['canon_smiles'].iloc[idx]
        row_label = self.dataset[self.target].iloc[idx]
        return row_smiles, row_label
class CustomDatasetMultitask(Dataset):
    """Multi-task dataset yielding (SMILES, labels tensor, mask tensor).

    Missing labels (NaN) are replaced with 0.0 and flagged with a 0.0 in
    the mask; present labels keep their value and a 1.0 mask entry.
    """

    def __init__(self, dataset, targets):
        # dataset: pandas DataFrame with a 'canon_smiles' column
        # targets: list of label column names to serve per sample
        self.dataset = dataset
        self.targets = targets

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        smiles = self.dataset['canon_smiles'].iloc[idx]
        raw = self.dataset[self.targets].iloc[idx].to_numpy()
        values, mask = [], []
        for entry in raw:
            if np.isnan(entry):
                values.append(0.0)
                mask.append(0.0)
            else:
                values.append(entry)
                mask.append(1.0)
        return smiles, torch.tensor(values, dtype=torch.float32), torch.tensor(mask)