|
import numpy as np |
|
import joblib |
|
from sklearn.preprocessing import KBinsDiscretizer |
|
from sklearn.feature_selection import VarianceThreshold |
|
from rdkit import Chem |
|
import pandas as pd |
|
from rdkit.Chem import Descriptors |
|
from tqdm import tqdm |
|
|
|
# Fraction used by NanFilter.fit to derive its per-column NaN-count cut-off.
# NOTE(review): the cut-off is computed from (1 - MAX_NA), so with 0.2 a column
# is only dropped when more than 80% of its rows are NaN -- confirm that this,
# rather than "at most 20% NaN", is the intended meaning.
MAX_NA = 0.2


class NanFilter(object):
    """Column selector that discards columns containing too many NaN values."""

    def __init__(self):
        self._name = "nan_filter"

    def fit(self, X):
        """Record which columns of the 2-D array ``X`` are kept.

        A column is kept when its NaN count does not exceed
        ``int((1 - MAX_NA) * n_rows)``. The kept column indices are stored,
        in ascending order, as a list in ``self.col_idxs``.
        """
        threshold = int((1 - MAX_NA) * X.shape[0])
        nan_counts = np.isnan(X).sum(axis=0)
        self.col_idxs = np.flatnonzero(nan_counts <= threshold).tolist()

    def transform(self, X):
        """Return only the columns of ``X`` selected during ``fit``."""
        return X[:, self.col_idxs]

    def save(self, file_name):
        """Serialize this filter to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object stored in ``file_name``; ``self`` is not modified."""
        return joblib.load(file_name)
|
|
|
|
|
class Imputer(object):
    """Median imputer for NaN entries, fitted column by column."""

    def __init__(self):
        self._name = "imputer"
        # Replacement used for columns with no observed (non-NaN) value.
        self._fallback = 0

    def fit(self, X):
        """Learn one replacement value per column of the 2-D array ``X``.

        The replacement is the median of the column's non-NaN entries, or
        ``self._fallback`` when the column has none. The values are stored
        as an array in ``self.impute_values``.
        """
        medians = []
        for j in range(X.shape[1]):
            column = X[:, j]
            observed = column[~np.isnan(column)]
            medians.append(np.median(observed) if observed.size else self._fallback)
        self.impute_values = np.array(medians)

    def transform(self, X):
        """Return a copy of ``X`` with NaN entries replaced per column.

        The input array is left untouched; the returned array keeps the
        dtype of ``X``.
        """
        # Work on a copy so the caller's array is never modified in place.
        filled = np.array(X)
        rows, cols = np.nonzero(np.isnan(filled))
        filled[rows, cols] = self.impute_values[cols]
        return filled

    def save(self, file_name):
        """Serialize this imputer to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object stored in ``file_name``; ``self`` is not modified."""
        return joblib.load(file_name)
|
|
|
|
|
class VarianceFilter(object):
    """Thin wrapper around scikit-learn's ``VarianceThreshold`` selector."""

    def __init__(self):
        self._name = "variance_filter"

    def fit(self, X):
        """Fit a default ``VarianceThreshold`` on ``X``.

        The fitted selector is stored in ``self.sel`` and the integer
        indices of the columns it keeps are stored in ``self.col_idxs``.
        """
        self.sel = VarianceThreshold()
        self.sel.fit(X)
        # Ask the selector for the kept indices directly instead of pushing a
        # synthetic row of column numbers through ``transform``.
        self.col_idxs = self.sel.get_support(indices=True)

    def transform(self, X):
        """Return ``X`` restricted to the columns kept by the fitted selector."""
        return self.sel.transform(X)

    def save(self, file_name):
        """Serialize this filter to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object stored in ``file_name``; ``self`` is not modified."""
        return joblib.load(file_name)
|
|
|
|
|
def physchem_featurizer(smiles_list):
    """Compute every descriptor in ``Descriptors._descList`` for each SMILES.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        A 2-D array with one row per input SMILES and one column per entry of
        ``Descriptors._descList``. A SMILES that RDKit cannot parse yields a
        row of NaN instead of aborting the whole batch, and an empty input
        yields an array of shape ``(0, n_descriptors)``.
    """
    # Hoisted: the descriptor list does not change between molecules.
    descriptor_fns = [fn for _, fn in Descriptors._descList]
    n_descriptors = len(descriptor_fns)

    rows = []
    for smiles in tqdm(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: emit missing values for every descriptor.
            rows.append(np.full(n_descriptors, np.nan))
            continue
        rows.append(np.array([fn(mol) for fn in descriptor_fns]))

    if not rows:
        # Keep the result 2-D so callers can still read ``shape[1]``.
        return np.empty((0, n_descriptors))
    return np.array(rows)
|
|
|
|
|
def physchem_featurizer_as_dataframe(smiles_list):
    """Return the descriptor matrix of ``smiles_list`` as a labelled DataFrame.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        A ``pandas.DataFrame`` with one row per SMILES and one column per
        entry of ``Descriptors._descList``, named after that entry. An empty
        input yields an empty frame that still carries all column names.
    """
    names = [name for name, _ in Descriptors._descList]
    # Reuse the array featurizer rather than duplicating its loop; the reshape
    # keeps an empty result 2-D so the column labels always fit.
    values = np.asarray(physchem_featurizer(smiles_list)).reshape(-1, len(names))
    return pd.DataFrame(values, columns=names)
|
|
|
|
|
class PhyschemDescriptor(object):
    """RDKit descriptor pipeline: NaN filter, imputer, variance filter, binning.

    Args:
        discretize: when true (the default) the filtered descriptors are
            binned by a ``KBinsDiscretizer`` configured with 5 quantile bins
            and ordinal encoding, and ``transform`` returns integer codes.
            When false the discretizer is skipped and ``transform`` returns
            the filtered float matrix.
    """

    def __init__(self, discretize=True):
        self.nan_filter = NanFilter()
        self.imputer = Imputer()
        self.variance_filter = VarianceFilter()
        self.discretizer = KBinsDiscretizer(
            n_bins=5, encode="ordinal", strategy="quantile"
        )
        self.discretize = discretize

    @staticmethod
    def _featurize(smiles):
        """Return the raw descriptor matrix of ``smiles`` as float32."""
        return np.array(physchem_featurizer(smiles), dtype=np.float32)

    def fit(self, smiles):
        """Fit every pipeline stage, in order, on the descriptors of ``smiles``."""
        X = self._featurize(smiles)
        self.nan_filter.fit(X)
        X = self.nan_filter.transform(X)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.variance_filter.fit(X)
        X = self.variance_filter.transform(X)
        # Honour the constructor flag instead of always fitting the binning.
        if self.discretize:
            self.discretizer.fit(X)

    def transform(self, smiles):
        """Featurize ``smiles`` and apply the fitted pipeline stages.

        Returns:
            An integer array of bin codes when ``self.discretize`` is true,
            otherwise the filtered descriptor matrix with its float values.
        """
        X = self._featurize(smiles)
        X = self.nan_filter.transform(X)
        X = self.imputer.transform(X)
        X = self.variance_filter.transform(X)
        if not self.discretize:
            return X
        return np.array(self.discretizer.transform(X), dtype=int)
|
|
|
|
|
class PhyschemDescriptorWithFeatures(object):
    """Descriptor pipeline that also records the names of surviving features.

    Args:
        discretize: when true (the default) the filtered descriptors are
            binned by a ``KBinsDiscretizer`` configured with 5 quantile bins
            and ordinal encoding. When false the discretizer is skipped.
    """

    def __init__(self, discretize=True):
        self.nan_filter = NanFilter()
        self.imputer = Imputer()
        self.variance_filter = VarianceFilter()
        self.discretizer = KBinsDiscretizer(
            n_bins=5, encode="ordinal", strategy="quantile"
        )
        self.discretize = discretize

    def fit(self, smiles):
        """Fit every pipeline stage on ``smiles`` and set ``self.feature_names``.

        ``self.feature_names`` lists, in output-column order, the descriptor
        names that survive both the NaN filter and the variance filter.
        """
        df = physchem_featurizer_as_dataframe(smiles)
        X = np.array(df, dtype=np.float32)
        self.nan_filter.fit(X)
        X = self.nan_filter.transform(X)
        self.imputer.fit(X)
        X = self.imputer.transform(X)
        self.variance_filter.fit(X)
        X = self.variance_filter.transform(X)
        if self.discretize:
            self.discretizer.fit(X)

        # The variance filter's indices refer to the matrix that is left after
        # the NaN filter, so map the names through the NaN filter first;
        # indexing the full descriptor list directly would mislabel columns
        # whenever the NaN filter drops something.
        all_names = list(df.columns)
        after_nan_filter = [all_names[int(i)] for i in self.nan_filter.col_idxs]
        self.feature_names = [
            after_nan_filter[int(i)] for i in self.variance_filter.col_idxs
        ]

    def transform(self, smiles):
        """Featurize ``smiles`` and apply the fitted pipeline stages.

        Returns:
            An integer array of bin codes when ``self.discretize`` is true.
            Otherwise the filtered descriptor matrix is returned with its
            float values intact (casting it to int would truncate them).
        """
        df = physchem_featurizer_as_dataframe(smiles)
        X = np.array(df, dtype=np.float32)
        X = self.nan_filter.transform(X)
        X = self.imputer.transform(X)
        X = self.variance_filter.transform(X)
        if not self.discretize:
            return X
        return np.array(self.discretizer.transform(X), dtype=int)
|
|