from rdkit import Chem, DataStructs import numpy as np from rdkit.Chem import rdFingerprintGenerator from sklearn.feature_selection import VarianceThreshold class MorganFingerprint(object): def __init__(self): self.variance_filter = VarianceThreshold(threshold=0) def get_ecfp_fingerprint(self, smiles_list): R = [] for smiles in smiles_list: mol = Chem.MolFromSmiles(smiles) fingerprints_vect = rdFingerprintGenerator.GetCountFPs( [mol], fpType=rdFingerprintGenerator.MorganFP )[0] fingerprint = np.zeros((0,), np.float32) # Generate target pointer to fill DataStructs.ConvertToNumpyArray(fingerprints_vect, fingerprint) R += [fingerprint] X = np.array(R, dtype=int) return X def fit(self, smiles): X = self.get_ecfp_fingerprint(smiles) self.variance_filter.fit(X) def transform(self, smiles): X = self.get_ecfp_fingerprint(smiles) X = self.variance_filter.transform(X) return X