darabos's picture
Do not put joblib-cache in "..".
b48a3ad
raw
history blame
2.45 kB
"""An expansion for `lynxkite-graph-analytics` that provides algorithms for biological applications."""
from lynxkite_graph_analytics import Bundle, RelationDefinition
from lynxkite.core import ops
import joblib
import numpy as np
import pandas as pd
import rdkit.Chem
import rdkit.Chem.rdFingerprintGenerator
import rdkit.Chem.Fingerprints.ClusterMols
import scipy
mem = joblib.Memory("joblib-cache")
ENV = "LynxKite Graph Analytics"
op = ops.op_registration(ENV)
@op("Parse SMILES")
def parse_smiles(bundle: Bundle, *, table="df", smiles_column="SMILES", save_as="mols"):
"""Parse SMILES strings into RDKit molecules."""
df = bundle.dfs[table]
mols = [rdkit.Chem.MolFromSmiles(smiles) for smiles in df[smiles_column].dropna()]
mols = [mol for mol in mols if mol is not None]
bundle = bundle.copy()
bundle.dfs[table] = df.assign(**{save_as: mols})
return bundle
def _get_similarity_matrix(mols):
mfpgen = rdkit.Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fps = [(0, mfpgen.GetFingerprint(mol)) for mol in mols]
similarity_matrix = rdkit.Chem.Fingerprints.ClusterMols.GetDistanceMatrix(
fps, metric=rdkit.Chem.DataStructs.TanimotoSimilarity, isSimilarity=1
)
return scipy.spatial.distance.squareform(similarity_matrix)
@op("Graph from molecule similarity")
def graph_from_similarity(
bundle: Bundle, *, table="df", mols_column="mols", average_degree=10
):
"""Creates edges for pairs of molecules that are the most similar."""
df = bundle.dfs[table]
mols = df[mols_column]
similarity_matrix = _get_similarity_matrix(mols)
i_idx, j_idx = np.triu_indices_from(similarity_matrix, k=1)
sim_values = similarity_matrix[i_idx, j_idx]
N = int(average_degree * len(mols))
top_n_idx = np.argsort(sim_values)[-N:]
top_n_pairs = [(i_idx[k], j_idx[k], sim_values[k]) for k in top_n_idx]
edges = pd.DataFrame(top_n_pairs, columns=["source", "target", "similarity"])
nodes = df.copy()
nodes.index.name = "id"
bundle = Bundle(
dfs={"edges": edges, "nodes": nodes},
relations=[
RelationDefinition(
df="edges",
source_column="source",
target_column="target",
source_table="nodes",
target_table="nodes",
source_key="id",
target_key="id",
)
],
)
return bundle