darabos committed on
Commit
b95d49e
·
1 Parent(s): ce575c3

Graph from molecule similarity.

Browse files
README.md CHANGED
@@ -14,6 +14,7 @@ original LynxKite. The primary goals of this rewrite are:
14
  - `lynxkite-graph-analytics`: Graph analytics plugin. The classical LynxKite experience!
15
  - `lynxkite-pillow`: A simple example plugin.
16
  - `lynxkite-lynxscribe`: A plugin for building and running LynxScribe applications.
 
17
  - `docs`: User-facing documentation. It's shared between all packages.
18
 
19
  ## Development
@@ -25,7 +26,7 @@ uv venv
25
  source .venv/bin/activate
26
  uvx pre-commit install
27
  # The [dev] tag is only needed if you intend on running tests
28
- uv pip install -e lynxkite-core/[dev] -e lynxkite-app/[dev] -e lynxkite-graph-analytics/[dev] -e lynxkite-lynxscribe/ -e lynxkite-pillow-example/
29
  ```
30
 
31
  This also builds the frontend, hopefully very quickly. To run it:
 
14
  - `lynxkite-graph-analytics`: Graph analytics plugin. The classical LynxKite experience!
15
  - `lynxkite-pillow`: A simple example plugin.
16
  - `lynxkite-lynxscribe`: A plugin for building and running LynxScribe applications.
17
+ - `lynxkite-bio`: Bioinformatics additions for LynxKite Graph Analytics.
18
  - `docs`: User-facing documentation. It's shared between all packages.
19
 
20
  ## Development
 
26
  source .venv/bin/activate
27
  uvx pre-commit install
28
  # The [dev] tag is only needed if you intend on running tests
29
+ uv pip install -e lynxkite-core/[dev] -e lynxkite-app/[dev] -e lynxkite-graph-analytics/[dev] -e lynxkite-bio -e lynxkite-lynxscribe/ -e lynxkite-pillow-example/
30
  ```
31
 
32
  This also builds the frontend, hopefully very quickly. To run it:
lynxkite-app/src/lynxkite_app/crdt.py CHANGED
@@ -3,7 +3,6 @@
3
  import asyncio
4
  import contextlib
5
  import enum
6
- import pathlib
7
  import fastapi
8
  import os.path
9
  import pycrdt
 
3
  import asyncio
4
  import contextlib
5
  import enum
 
6
  import fastapi
7
  import os.path
8
  import pycrdt
lynxkite-bio/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # LynxKite Bio
2
+
3
+ An expansion for `lynxkite-graph-analytics` that provides algorithms for biological applications.
lynxkite-bio/pyproject.toml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "lynxkite-bio"
3
+ version = "0.1.0"
4
+ description = "Additional boxes for LynxKite Graph Analytics that add algorithms for biology."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "fsspec>=2025.2.0",
9
+ "joblib>=1.4.2",
10
+ "lynxkite-core",
11
+ "lynxkite-graph-analytics",
12
+ "pandas>=2.2.3",
13
+ "rdkit>=2024.9.5",
14
+ "scipy>=1.15.2",
15
+ ]
16
+
17
+ [project.optional-dependencies]
18
+ dev = [
19
+ "pytest>=8.3.4",
20
+ ]
21
+
22
+ [tool.uv.sources]
23
+ lynxkite-core = { path = "../lynxkite-core" }
24
+ lynxkite-graph-analytics = { path = "../lynxkite-graph-analytics" }
lynxkite-bio/src/lynxkite_bio/__init__.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bioinformatics operations for LynxKite Graph Analytics. To be split into separate files when we have more."""
2
+
3
+ from lynxkite_graph_analytics import Bundle, RelationDefinition
4
+ from lynxkite.core import ops
5
+ import joblib
6
+ import numpy as np
7
+ import pandas as pd
8
+ import rdkit.Chem
9
+ import rdkit.Chem.rdFingerprintGenerator
10
+ import rdkit.Chem.Fingerprints.ClusterMols
11
+ import scipy
12
+
13
+ mem = joblib.Memory("../joblib-cache")
14
+ ENV = "LynxKite Graph Analytics"
15
+ op = ops.op_registration(ENV)
16
+
17
+
18
@op("Parse SMILES")
def parse_smiles(bundle: Bundle, *, table="df", smiles_column="SMILES", save_as="mols"):
    """Parse SMILES strings into RDKit molecules.

    Reads the ``smiles_column`` of ``bundle.dfs[table]`` and stores the parsed
    molecules in a new ``save_as`` column of the same table.

    Rows whose SMILES value is missing, or which RDKit cannot parse
    (``MolFromSmiles`` returns ``None``), are removed from the table so the
    molecule column always has exactly one valid molecule per remaining row.

    Returns a copy of the bundle; the input bundle's table is not replaced.
    """
    df = bundle.dfs[table]
    smiles = df[smiles_column]
    # Parse row by row so every molecule stays paired with its source row.
    parsed = [
        rdkit.Chem.MolFromSmiles(value) if present else None
        for value, present in zip(smiles, smiles.notna())
    ]
    # Filtering only the molecule list (and not the table) would make the new
    # column shorter than the table and break the assignment below.
    keep = pd.Series([mol is not None for mol in parsed], index=df.index, dtype=bool)
    mols = [mol for mol in parsed if mol is not None]
    bundle = bundle.copy()
    bundle.dfs[table] = df[keep].assign(**{save_as: mols})
    return bundle
27
+
28
+
29
def _get_similarity_matrix(mols):
    """Return a square matrix of pairwise Tanimoto similarities between molecules.

    Each molecule is turned into a Morgan fingerprint (radius 2, 2048 bits).
    The condensed pairwise result from RDKit is expanded to a full square
    matrix with ``scipy.spatial.distance.squareform``.
    """
    generator = rdkit.Chem.rdFingerprintGenerator.GetMorganGenerator(
        radius=2, fpSize=2048
    )
    # NOTE(review): GetDistanceMatrix appears to expect tuples with the
    # fingerprint in position 1; the leading 0 is only a placeholder.
    entries = []
    for molecule in mols:
        entries.append((0, generator.GetFingerprint(molecule)))
    condensed = rdkit.Chem.Fingerprints.ClusterMols.GetDistanceMatrix(
        entries, metric=rdkit.Chem.DataStructs.TanimotoSimilarity, isSimilarity=1
    )
    return scipy.spatial.distance.squareform(condensed)
36
+
37
+
38
@op("Graph from molecule similarity")
def graph_from_similarity(
    bundle: Bundle, *, table="df", mols_column="mols", average_degree=10
):
    """Build a graph that connects the most similar molecule pairs.

    The molecules in ``bundle.dfs[table][mols_column]`` become the nodes. The
    ``average_degree * number_of_molecules`` most similar pairs (by the matrix
    from ``_get_similarity_matrix``) become the edges, each carrying its
    ``similarity`` value.

    Returns a new bundle with a ``nodes`` table (a copy of the input table with
    its index named ``id``), an ``edges`` table with ``source``, ``target`` and
    ``similarity`` columns, and a relation linking the edges to the nodes.
    """
    df = bundle.dfs[table]
    mols = df[mols_column]
    similarity_matrix = _get_similarity_matrix(mols)
    # Only the strict upper triangle: each unordered pair once, no self-pairs.
    i_idx, j_idx = np.triu_indices_from(similarity_matrix, k=1)
    sim_values = similarity_matrix[i_idx, j_idx]
    edge_count = int(average_degree * len(mols))
    edge_count = min(max(edge_count, 0), len(sim_values))
    # Slice from an explicit start position: a "[-0:]" slice would select every
    # pair instead of none when the requested edge count is zero.
    top = np.argsort(sim_values)[len(sim_values) - edge_count :]
    nodes = df.copy()
    nodes.index.name = "id"
    # The matrix is positional, but node ids are the table's index labels.
    # Translate positions to labels so edges still point at the right nodes
    # when the table does not have a default 0..n-1 index.
    ids = nodes.index.to_numpy()
    edges = pd.DataFrame(
        {
            "source": ids[i_idx[top]],
            "target": ids[j_idx[top]],
            "similarity": sim_values[top],
        }
    )
    bundle = Bundle(
        dfs={"edges": edges, "nodes": nodes},
        relations=[
            RelationDefinition(
                df="edges",
                source_column="source",
                target_column="target",
                source_table="nodes",
                target_table="nodes",
                source_key="id",
                target_key="id",
            )
        ],
    )
    return bundle
lynxkite-graph-analytics/src/lynxkite_graph_analytics/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- from . import lynxkite_ops # noqa (imported to trigger registration)
2
  from . import networkx_ops # noqa (imported to trigger registration)
3
  from . import pytorch_model_ops # noqa (imported to trigger registration)
 
1
+ from .lynxkite_ops import * # noqa (imported to trigger registration)
2
  from . import networkx_ops # noqa (imported to trigger registration)
3
  from . import pytorch_model_ops # noqa (imported to trigger registration)
lynxkite-graph-analytics/src/lynxkite_graph_analytics/lynxkite_ops.py CHANGED
@@ -80,9 +80,10 @@ class Bundle:
80
  # TODO: Use relations.
81
  graph = nx.DiGraph()
82
  if "nodes" in self.dfs:
83
- graph.add_nodes_from(
84
- self.dfs["nodes"].set_index("id").to_dict("index").items()
85
- )
 
86
  graph.add_edges_from(
87
  self.dfs["edges"][["source", "target"]].itertuples(index=False, name=None)
88
  )
 
80
  # TODO: Use relations.
81
  graph = nx.DiGraph()
82
  if "nodes" in self.dfs:
83
+ df = self.dfs["nodes"]
84
+ if df.index.name != "id":
85
+ df = df.set_index("id")
86
+ graph.add_nodes_from(df.to_dict("index").items())
87
  graph.add_edges_from(
88
  self.dfs["edges"][["source", "target"]].itertuples(index=False, name=None)
89
  )