Spaces:
Running
Running
Split lynxkite_ops.py into two.
Browse files
lynxkite-graph-analytics/src/lynxkite_graph_analytics/core.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Graph analytics executor and data types."""
|
2 |
+
|
3 |
+
from lynxkite.core import ops
|
4 |
+
import dataclasses
|
5 |
+
import functools
|
6 |
+
import networkx as nx
|
7 |
+
import pandas as pd
|
8 |
+
import polars as pl
|
9 |
+
import traceback
|
10 |
+
import typing
|
11 |
+
|
12 |
+
|
13 |
+
ENV = "LynxKite Graph Analytics"
|
14 |
+
|
15 |
+
|
16 |
+
@dataclasses.dataclass
|
17 |
+
class RelationDefinition:
|
18 |
+
"""Defines a set of edges."""
|
19 |
+
|
20 |
+
df: str # The DataFrame that contains the edges.
|
21 |
+
source_column: (
|
22 |
+
str # The column in the edge DataFrame that contains the source node ID.
|
23 |
+
)
|
24 |
+
target_column: (
|
25 |
+
str # The column in the edge DataFrame that contains the target node ID.
|
26 |
+
)
|
27 |
+
source_table: str # The DataFrame that contains the source nodes.
|
28 |
+
target_table: str # The DataFrame that contains the target nodes.
|
29 |
+
source_key: str # The column in the source table that contains the node ID.
|
30 |
+
target_key: str # The column in the target table that contains the node ID.
|
31 |
+
name: str | None = None # Descriptive name for the relation.
|
32 |
+
|
33 |
+
|
34 |
+
@dataclasses.dataclass
|
35 |
+
class Bundle:
|
36 |
+
"""A collection of DataFrames and other data.
|
37 |
+
|
38 |
+
Can efficiently represent a knowledge graph (homogeneous or heterogeneous) or tabular data.
|
39 |
+
It can also carry other data, such as a trained model.
|
40 |
+
"""
|
41 |
+
|
42 |
+
dfs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict)
|
43 |
+
relations: list[RelationDefinition] = dataclasses.field(default_factory=list)
|
44 |
+
other: dict[str, typing.Any] = None
|
45 |
+
|
46 |
+
@classmethod
|
47 |
+
def from_nx(cls, graph: nx.Graph):
|
48 |
+
edges = nx.to_pandas_edgelist(graph)
|
49 |
+
d = dict(graph.nodes(data=True))
|
50 |
+
nodes = pd.DataFrame(d.values(), index=d.keys())
|
51 |
+
nodes["id"] = nodes.index
|
52 |
+
if "index" in nodes.columns:
|
53 |
+
nodes.drop(columns=["index"], inplace=True)
|
54 |
+
return cls(
|
55 |
+
dfs={"edges": edges, "nodes": nodes},
|
56 |
+
relations=[
|
57 |
+
RelationDefinition(
|
58 |
+
df="edges",
|
59 |
+
source_column="source",
|
60 |
+
target_column="target",
|
61 |
+
source_table="nodes",
|
62 |
+
target_table="nodes",
|
63 |
+
source_key="id",
|
64 |
+
target_key="id",
|
65 |
+
)
|
66 |
+
],
|
67 |
+
)
|
68 |
+
|
69 |
+
@classmethod
|
70 |
+
def from_df(cls, df: pd.DataFrame):
|
71 |
+
return cls(dfs={"df": df})
|
72 |
+
|
73 |
+
def to_nx(self):
|
74 |
+
# TODO: Use relations.
|
75 |
+
graph = nx.DiGraph()
|
76 |
+
if "nodes" in self.dfs:
|
77 |
+
df = self.dfs["nodes"]
|
78 |
+
if df.index.name != "id":
|
79 |
+
df = df.set_index("id")
|
80 |
+
graph.add_nodes_from(df.to_dict("index").items())
|
81 |
+
if "edges" in self.dfs:
|
82 |
+
edges = self.dfs["edges"]
|
83 |
+
graph.add_edges_from(
|
84 |
+
[
|
85 |
+
(
|
86 |
+
e["source"],
|
87 |
+
e["target"],
|
88 |
+
{
|
89 |
+
k: e[k]
|
90 |
+
for k in edges.columns
|
91 |
+
if k not in ["source", "target"]
|
92 |
+
},
|
93 |
+
)
|
94 |
+
for e in edges.to_records()
|
95 |
+
]
|
96 |
+
)
|
97 |
+
return graph
|
98 |
+
|
99 |
+
def copy(self):
|
100 |
+
"""Returns a medium depth copy of the bundle. The Bundle is completely new, but the DataFrames and RelationDefinitions are shared."""
|
101 |
+
return Bundle(
|
102 |
+
dfs=dict(self.dfs),
|
103 |
+
relations=list(self.relations),
|
104 |
+
other=dict(self.other) if self.other else None,
|
105 |
+
)
|
106 |
+
|
107 |
+
def to_dict(self, limit: int = 100):
|
108 |
+
return {
|
109 |
+
"dataframes": {
|
110 |
+
name: {
|
111 |
+
"columns": [str(c) for c in df.columns],
|
112 |
+
"data": df_for_frontend(df, limit).values.tolist(),
|
113 |
+
}
|
114 |
+
for name, df in self.dfs.items()
|
115 |
+
},
|
116 |
+
"relations": [dataclasses.asdict(relation) for relation in self.relations],
|
117 |
+
"other": self.other,
|
118 |
+
}
|
119 |
+
|
120 |
+
|
121 |
+
def nx_node_attribute_func(name):
|
122 |
+
"""Decorator for wrapping a function that adds a NetworkX node attribute."""
|
123 |
+
|
124 |
+
def decorator(func):
|
125 |
+
@functools.wraps(func)
|
126 |
+
def wrapper(graph: nx.Graph, **kwargs):
|
127 |
+
graph = graph.copy()
|
128 |
+
attr = func(graph, **kwargs)
|
129 |
+
nx.set_node_attributes(graph, attr, name)
|
130 |
+
return graph
|
131 |
+
|
132 |
+
return wrapper
|
133 |
+
|
134 |
+
return decorator
|
135 |
+
|
136 |
+
|
137 |
+
def disambiguate_edges(ws):
|
138 |
+
"""If an input plug is connected to multiple edges, keep only the last edge."""
|
139 |
+
seen = set()
|
140 |
+
for edge in reversed(ws.edges):
|
141 |
+
if (edge.target, edge.targetHandle) in seen:
|
142 |
+
ws.edges.remove(edge)
|
143 |
+
seen.add((edge.target, edge.targetHandle))
|
144 |
+
|
145 |
+
|
146 |
+
@ops.register_executor(ENV)
|
147 |
+
async def execute(ws):
|
148 |
+
catalog: dict[str, ops.Op] = ops.CATALOGS[ws.env]
|
149 |
+
disambiguate_edges(ws)
|
150 |
+
outputs = {}
|
151 |
+
failed = 0
|
152 |
+
while len(outputs) + failed < len(ws.nodes):
|
153 |
+
for node in ws.nodes:
|
154 |
+
if node.id in outputs:
|
155 |
+
continue
|
156 |
+
# TODO: Take the input/output handles into account.
|
157 |
+
inputs = [edge.source for edge in ws.edges if edge.target == node.id]
|
158 |
+
if all(input in outputs for input in inputs):
|
159 |
+
# All inputs for this node are ready, we can compute the output.
|
160 |
+
inputs = [outputs[input] for input in inputs]
|
161 |
+
data = node.data
|
162 |
+
params = {**data.params}
|
163 |
+
op = catalog.get(data.title)
|
164 |
+
if not op:
|
165 |
+
data.error = "Operation not found in catalog"
|
166 |
+
failed += 1
|
167 |
+
continue
|
168 |
+
try:
|
169 |
+
# Convert inputs types to match operation signature.
|
170 |
+
for i, (x, p) in enumerate(zip(inputs, op.inputs.values())):
|
171 |
+
if p.type == nx.Graph and isinstance(x, Bundle):
|
172 |
+
inputs[i] = x.to_nx()
|
173 |
+
elif p.type == Bundle and isinstance(x, nx.Graph):
|
174 |
+
inputs[i] = Bundle.from_nx(x)
|
175 |
+
elif p.type == Bundle and isinstance(x, pd.DataFrame):
|
176 |
+
inputs[i] = Bundle.from_df(x)
|
177 |
+
result = op(*inputs, **params)
|
178 |
+
except Exception as e:
|
179 |
+
traceback.print_exc()
|
180 |
+
data.error = str(e)
|
181 |
+
failed += 1
|
182 |
+
continue
|
183 |
+
if len(op.inputs) == 1 and op.inputs.get("multi") == "*":
|
184 |
+
# It's a flexible input. Create n+1 handles.
|
185 |
+
data.inputs = {f"input{i}": None for i in range(len(inputs) + 1)}
|
186 |
+
data.error = None
|
187 |
+
outputs[node.id] = result.output
|
188 |
+
if result.display:
|
189 |
+
data.display = result.display
|
190 |
+
|
191 |
+
|
192 |
+
def df_for_frontend(df: pd.DataFrame, limit: int) -> pd.DataFrame:
|
193 |
+
"""Returns a DataFrame with values that are safe to send to the frontend."""
|
194 |
+
df = df[:limit]
|
195 |
+
if isinstance(df, pl.LazyFrame):
|
196 |
+
df = df.collect()
|
197 |
+
if isinstance(df, pl.DataFrame):
|
198 |
+
df = df.to_pandas()
|
199 |
+
# Convert non-numeric columns to strings.
|
200 |
+
for c in df.columns:
|
201 |
+
if not pd.api.types.is_numeric_dtype(df[c]):
|
202 |
+
df[c] = df[c].astype(str)
|
203 |
+
return df
|
lynxkite-graph-analytics/src/lynxkite_graph_analytics/lynxkite_ops.py
CHANGED
@@ -1,201 +1,21 @@
|
|
1 |
-
"""Graph analytics operations.
|
2 |
|
3 |
import os
|
4 |
import fsspec
|
5 |
from lynxkite.core import ops
|
6 |
from collections import deque
|
7 |
-
import
|
8 |
-
import functools
|
9 |
import grandcypher
|
10 |
import joblib
|
11 |
import matplotlib
|
12 |
import networkx as nx
|
13 |
import pandas as pd
|
14 |
import polars as pl
|
15 |
-
import traceback
|
16 |
-
import typing
|
17 |
import json
|
18 |
|
19 |
|
20 |
mem = joblib.Memory("../joblib-cache")
|
21 |
-
|
22 |
-
op = ops.op_registration(ENV)
|
23 |
-
|
24 |
-
|
25 |
-
@dataclasses.dataclass
|
26 |
-
class RelationDefinition:
|
27 |
-
"""Defines a set of edges."""
|
28 |
-
|
29 |
-
df: str # The DataFrame that contains the edges.
|
30 |
-
source_column: (
|
31 |
-
str # The column in the edge DataFrame that contains the source node ID.
|
32 |
-
)
|
33 |
-
target_column: (
|
34 |
-
str # The column in the edge DataFrame that contains the target node ID.
|
35 |
-
)
|
36 |
-
source_table: str # The DataFrame that contains the source nodes.
|
37 |
-
target_table: str # The DataFrame that contains the target nodes.
|
38 |
-
source_key: str # The column in the source table that contains the node ID.
|
39 |
-
target_key: str # The column in the target table that contains the node ID.
|
40 |
-
name: str | None = None # Descriptive name for the relation.
|
41 |
-
|
42 |
-
|
43 |
-
@dataclasses.dataclass
|
44 |
-
class Bundle:
|
45 |
-
"""A collection of DataFrames and other data.
|
46 |
-
|
47 |
-
Can efficiently represent a knowledge graph (homogeneous or heterogeneous) or tabular data.
|
48 |
-
It can also carry other data, such as a trained model.
|
49 |
-
"""
|
50 |
-
|
51 |
-
dfs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict)
|
52 |
-
relations: list[RelationDefinition] = dataclasses.field(default_factory=list)
|
53 |
-
other: dict[str, typing.Any] = None
|
54 |
-
|
55 |
-
@classmethod
|
56 |
-
def from_nx(cls, graph: nx.Graph):
|
57 |
-
edges = nx.to_pandas_edgelist(graph)
|
58 |
-
d = dict(graph.nodes(data=True))
|
59 |
-
nodes = pd.DataFrame(d.values(), index=d.keys())
|
60 |
-
nodes["id"] = nodes.index
|
61 |
-
if "index" in nodes.columns:
|
62 |
-
nodes.drop(columns=["index"], inplace=True)
|
63 |
-
return cls(
|
64 |
-
dfs={"edges": edges, "nodes": nodes},
|
65 |
-
relations=[
|
66 |
-
RelationDefinition(
|
67 |
-
df="edges",
|
68 |
-
source_column="source",
|
69 |
-
target_column="target",
|
70 |
-
source_table="nodes",
|
71 |
-
target_table="nodes",
|
72 |
-
source_key="id",
|
73 |
-
target_key="id",
|
74 |
-
)
|
75 |
-
],
|
76 |
-
)
|
77 |
-
|
78 |
-
@classmethod
|
79 |
-
def from_df(cls, df: pd.DataFrame):
|
80 |
-
return cls(dfs={"df": df})
|
81 |
-
|
82 |
-
def to_nx(self):
|
83 |
-
# TODO: Use relations.
|
84 |
-
graph = nx.DiGraph()
|
85 |
-
if "nodes" in self.dfs:
|
86 |
-
df = self.dfs["nodes"]
|
87 |
-
if df.index.name != "id":
|
88 |
-
df = df.set_index("id")
|
89 |
-
graph.add_nodes_from(df.to_dict("index").items())
|
90 |
-
if "edges" in self.dfs:
|
91 |
-
edges = self.dfs["edges"]
|
92 |
-
graph.add_edges_from(
|
93 |
-
[
|
94 |
-
(
|
95 |
-
e["source"],
|
96 |
-
e["target"],
|
97 |
-
{
|
98 |
-
k: e[k]
|
99 |
-
for k in edges.columns
|
100 |
-
if k not in ["source", "target"]
|
101 |
-
},
|
102 |
-
)
|
103 |
-
for e in edges.to_records()
|
104 |
-
]
|
105 |
-
)
|
106 |
-
return graph
|
107 |
-
|
108 |
-
def copy(self):
|
109 |
-
"""Returns a medium depth copy of the bundle. The Bundle is completely new, but the DataFrames and RelationDefinitions are shared."""
|
110 |
-
return Bundle(
|
111 |
-
dfs=dict(self.dfs),
|
112 |
-
relations=list(self.relations),
|
113 |
-
other=dict(self.other) if self.other else None,
|
114 |
-
)
|
115 |
-
|
116 |
-
def to_dict(self, limit: int = 100):
|
117 |
-
return {
|
118 |
-
"dataframes": {
|
119 |
-
name: {
|
120 |
-
"columns": [str(c) for c in df.columns],
|
121 |
-
"data": df_for_frontend(df, limit).values.tolist(),
|
122 |
-
}
|
123 |
-
for name, df in self.dfs.items()
|
124 |
-
},
|
125 |
-
"relations": [dataclasses.asdict(relation) for relation in self.relations],
|
126 |
-
"other": self.other,
|
127 |
-
}
|
128 |
-
|
129 |
-
|
130 |
-
def nx_node_attribute_func(name):
|
131 |
-
"""Decorator for wrapping a function that adds a NetworkX node attribute."""
|
132 |
-
|
133 |
-
def decorator(func):
|
134 |
-
@functools.wraps(func)
|
135 |
-
def wrapper(graph: nx.Graph, **kwargs):
|
136 |
-
graph = graph.copy()
|
137 |
-
attr = func(graph, **kwargs)
|
138 |
-
nx.set_node_attributes(graph, attr, name)
|
139 |
-
return graph
|
140 |
-
|
141 |
-
return wrapper
|
142 |
-
|
143 |
-
return decorator
|
144 |
-
|
145 |
-
|
146 |
-
def disambiguate_edges(ws):
|
147 |
-
"""If an input plug is connected to multiple edges, keep only the last edge."""
|
148 |
-
seen = set()
|
149 |
-
for edge in reversed(ws.edges):
|
150 |
-
if (edge.target, edge.targetHandle) in seen:
|
151 |
-
ws.edges.remove(edge)
|
152 |
-
seen.add((edge.target, edge.targetHandle))
|
153 |
-
|
154 |
-
|
155 |
-
@ops.register_executor(ENV)
|
156 |
-
async def execute(ws):
|
157 |
-
catalog: dict[str, ops.Op] = ops.CATALOGS[ENV]
|
158 |
-
disambiguate_edges(ws)
|
159 |
-
outputs = {}
|
160 |
-
failed = 0
|
161 |
-
while len(outputs) + failed < len(ws.nodes):
|
162 |
-
for node in ws.nodes:
|
163 |
-
if node.id in outputs:
|
164 |
-
continue
|
165 |
-
# TODO: Take the input/output handles into account.
|
166 |
-
inputs = [edge.source for edge in ws.edges if edge.target == node.id]
|
167 |
-
if all(input in outputs for input in inputs):
|
168 |
-
# All inputs for this node are ready, we can compute the output.
|
169 |
-
inputs = [outputs[input] for input in inputs]
|
170 |
-
data = node.data
|
171 |
-
params = {**data.params}
|
172 |
-
op = catalog.get(data.title)
|
173 |
-
if not op:
|
174 |
-
data.error = "Operation not found in catalog"
|
175 |
-
failed += 1
|
176 |
-
continue
|
177 |
-
try:
|
178 |
-
# Convert inputs types to match operation signature.
|
179 |
-
for i, (x, p) in enumerate(zip(inputs, op.inputs.values())):
|
180 |
-
if p.type == nx.Graph and isinstance(x, Bundle):
|
181 |
-
inputs[i] = x.to_nx()
|
182 |
-
elif p.type == Bundle and isinstance(x, nx.Graph):
|
183 |
-
inputs[i] = Bundle.from_nx(x)
|
184 |
-
elif p.type == Bundle and isinstance(x, pd.DataFrame):
|
185 |
-
inputs[i] = Bundle.from_df(x)
|
186 |
-
result = op(*inputs, **params)
|
187 |
-
except Exception as e:
|
188 |
-
traceback.print_exc()
|
189 |
-
data.error = str(e)
|
190 |
-
failed += 1
|
191 |
-
continue
|
192 |
-
if len(op.inputs) == 1 and op.inputs.get("multi") == "*":
|
193 |
-
# It's a flexible input. Create n+1 handles.
|
194 |
-
data.inputs = {f"input{i}": None for i in range(len(inputs) + 1)}
|
195 |
-
data.error = None
|
196 |
-
outputs[node.id] = result.output
|
197 |
-
if result.display:
|
198 |
-
data.display = result.display
|
199 |
|
200 |
|
201 |
@op("Import Parquet")
|
@@ -246,14 +66,14 @@ def create_scale_free_graph(*, nodes: int = 10):
|
|
246 |
|
247 |
|
248 |
@op("Compute PageRank")
|
249 |
-
@nx_node_attribute_func("pagerank")
|
250 |
def compute_pagerank(graph: nx.Graph, *, damping=0.85, iterations=100):
|
251 |
# TODO: This requires scipy to be installed.
|
252 |
return nx.pagerank(graph, alpha=damping, max_iter=iterations)
|
253 |
|
254 |
|
255 |
@op("Compute betweenness centrality")
|
256 |
-
@nx_node_attribute_func("betweenness_centrality")
|
257 |
def compute_betweenness_centrality(graph: nx.Graph, *, k=10):
|
258 |
return nx.betweenness_centrality(graph, k=k)
|
259 |
|
@@ -271,7 +91,7 @@ def discard_parallel_edges(graph: nx.Graph):
|
|
271 |
|
272 |
|
273 |
@op("SQL")
|
274 |
-
def sql(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
|
275 |
"""Run a SQL query on the DataFrames in the bundle. Save the results as a new DataFrame."""
|
276 |
bundle = bundle.copy()
|
277 |
if os.environ.get("NX_CUGRAPH_AUTOCONFIG", "").strip().lower() == "true":
|
@@ -292,7 +112,7 @@ def sql(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
|
|
292 |
|
293 |
|
294 |
@op("Cypher")
|
295 |
-
def cypher(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
|
296 |
"""Run a Cypher query on the graph in the bundle. Save the results as a new DataFrame."""
|
297 |
bundle = bundle.copy()
|
298 |
graph = bundle.to_nx()
|
@@ -302,7 +122,7 @@ def cypher(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
|
|
302 |
|
303 |
|
304 |
@op("Organize bundle")
|
305 |
-
def organize_bundle(bundle: Bundle, *, code: ops.LongStr):
|
306 |
"""Lets you rename/copy/delete DataFrames, and modify relations.
|
307 |
|
308 |
TODO: Use a declarative solution instead of Python code. Add UI.
|
@@ -351,13 +171,13 @@ def _map_color(value):
|
|
351 |
|
352 |
@op("Visualize graph", view="visualization")
|
353 |
def visualize_graph(
|
354 |
-
graph: Bundle,
|
355 |
*,
|
356 |
color_nodes_by: ops.NodeAttribute = None,
|
357 |
label_by: ops.NodeAttribute = None,
|
358 |
color_edges_by: ops.EdgeAttribute = None,
|
359 |
):
|
360 |
-
nodes = df_for_frontend(graph.dfs["nodes"], 10_000)
|
361 |
if color_nodes_by:
|
362 |
nodes["color"] = _map_color(nodes[color_nodes_by])
|
363 |
for cols in ["x y", "long lat"]:
|
@@ -387,7 +207,7 @@ def visualize_graph(
|
|
387 |
)
|
388 |
curveness = 0.3
|
389 |
nodes = nodes.to_records()
|
390 |
-
edges = df_for_frontend(
|
391 |
graph.dfs["edges"].drop_duplicates(["source", "target"]), 10_000
|
392 |
)
|
393 |
if color_edges_by:
|
@@ -446,22 +266,8 @@ def visualize_graph(
|
|
446 |
return v
|
447 |
|
448 |
|
449 |
-
def df_for_frontend(df: pd.DataFrame, limit: int) -> pd.DataFrame:
|
450 |
-
"""Returns a DataFrame with values that are safe to send to the frontend."""
|
451 |
-
df = df[:limit]
|
452 |
-
if isinstance(df, pl.LazyFrame):
|
453 |
-
df = df.collect()
|
454 |
-
if isinstance(df, pl.DataFrame):
|
455 |
-
df = df.to_pandas()
|
456 |
-
# Convert non-numeric columns to strings.
|
457 |
-
for c in df.columns:
|
458 |
-
if not pd.api.types.is_numeric_dtype(df[c]):
|
459 |
-
df[c] = df[c].astype(str)
|
460 |
-
return df
|
461 |
-
|
462 |
-
|
463 |
@op("View tables", view="table_view")
|
464 |
-
def view_tables(bundle: Bundle, *, limit: int = 100):
|
465 |
return bundle.to_dict(limit=limit)
|
466 |
|
467 |
|
@@ -470,7 +276,7 @@ def view_tables(bundle: Bundle, *, limit: int = 100):
|
|
470 |
view="graph_creation_view",
|
471 |
outputs=["output"],
|
472 |
)
|
473 |
-
def create_graph(bundle: Bundle, *, relations: str = None) -> Bundle:
|
474 |
"""Replace relations of the given bundle
|
475 |
|
476 |
relations is a stringified JSON, instead of a dict, because complex Yjs types (arrays, maps)
|
@@ -489,6 +295,6 @@ def create_graph(bundle: Bundle, *, relations: str = None) -> Bundle:
|
|
489 |
bundle = bundle.copy()
|
490 |
if not (relations is None or relations.strip() == ""):
|
491 |
bundle.relations = [
|
492 |
-
RelationDefinition(**r) for r in json.loads(relations).values()
|
493 |
]
|
494 |
return ops.Result(output=bundle, display=bundle.to_dict(limit=100))
|
|
|
1 |
+
"""Graph analytics operations."""
|
2 |
|
3 |
import os
|
4 |
import fsspec
|
5 |
from lynxkite.core import ops
|
6 |
from collections import deque
|
7 |
+
from . import core
|
|
|
8 |
import grandcypher
|
9 |
import joblib
|
10 |
import matplotlib
|
11 |
import networkx as nx
|
12 |
import pandas as pd
|
13 |
import polars as pl
|
|
|
|
|
14 |
import json
|
15 |
|
16 |
|
17 |
mem = joblib.Memory("../joblib-cache")
|
18 |
+
op = ops.op_registration(core.ENV)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
21 |
@op("Import Parquet")
|
|
|
66 |
|
67 |
|
68 |
@op("Compute PageRank")
|
69 |
+
@core.nx_node_attribute_func("pagerank")
|
70 |
def compute_pagerank(graph: nx.Graph, *, damping=0.85, iterations=100):
|
71 |
# TODO: This requires scipy to be installed.
|
72 |
return nx.pagerank(graph, alpha=damping, max_iter=iterations)
|
73 |
|
74 |
|
75 |
@op("Compute betweenness centrality")
|
76 |
+
@core.nx_node_attribute_func("betweenness_centrality")
|
77 |
def compute_betweenness_centrality(graph: nx.Graph, *, k=10):
|
78 |
return nx.betweenness_centrality(graph, k=k)
|
79 |
|
|
|
91 |
|
92 |
|
93 |
@op("SQL")
|
94 |
+
def sql(bundle: core.Bundle, *, query: ops.LongStr, save_as: str = "result"):
|
95 |
"""Run a SQL query on the DataFrames in the bundle. Save the results as a new DataFrame."""
|
96 |
bundle = bundle.copy()
|
97 |
if os.environ.get("NX_CUGRAPH_AUTOCONFIG", "").strip().lower() == "true":
|
|
|
112 |
|
113 |
|
114 |
@op("Cypher")
|
115 |
+
def cypher(bundle: core.Bundle, *, query: ops.LongStr, save_as: str = "result"):
|
116 |
"""Run a Cypher query on the graph in the bundle. Save the results as a new DataFrame."""
|
117 |
bundle = bundle.copy()
|
118 |
graph = bundle.to_nx()
|
|
|
122 |
|
123 |
|
124 |
@op("Organize bundle")
|
125 |
+
def organize_bundle(bundle: core.Bundle, *, code: ops.LongStr):
|
126 |
"""Lets you rename/copy/delete DataFrames, and modify relations.
|
127 |
|
128 |
TODO: Use a declarative solution instead of Python code. Add UI.
|
|
|
171 |
|
172 |
@op("Visualize graph", view="visualization")
|
173 |
def visualize_graph(
|
174 |
+
graph: core.Bundle,
|
175 |
*,
|
176 |
color_nodes_by: ops.NodeAttribute = None,
|
177 |
label_by: ops.NodeAttribute = None,
|
178 |
color_edges_by: ops.EdgeAttribute = None,
|
179 |
):
|
180 |
+
nodes = core.df_for_frontend(graph.dfs["nodes"], 10_000)
|
181 |
if color_nodes_by:
|
182 |
nodes["color"] = _map_color(nodes[color_nodes_by])
|
183 |
for cols in ["x y", "long lat"]:
|
|
|
207 |
)
|
208 |
curveness = 0.3
|
209 |
nodes = nodes.to_records()
|
210 |
+
edges = core.df_for_frontend(
|
211 |
graph.dfs["edges"].drop_duplicates(["source", "target"]), 10_000
|
212 |
)
|
213 |
if color_edges_by:
|
|
|
266 |
return v
|
267 |
|
268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
@op("View tables", view="table_view")
|
270 |
+
def view_tables(bundle: core.Bundle, *, limit: int = 100):
|
271 |
return bundle.to_dict(limit=limit)
|
272 |
|
273 |
|
|
|
276 |
view="graph_creation_view",
|
277 |
outputs=["output"],
|
278 |
)
|
279 |
+
def create_graph(bundle: core.Bundle, *, relations: str = None) -> core.Bundle:
|
280 |
"""Replace relations of the given bundle
|
281 |
|
282 |
relations is a stringified JSON, instead of a dict, because complex Yjs types (arrays, maps)
|
|
|
295 |
bundle = bundle.copy()
|
296 |
if not (relations is None or relations.strip() == ""):
|
297 |
bundle.relations = [
|
298 |
+
core.RelationDefinition(**r) for r in json.loads(relations).values()
|
299 |
]
|
300 |
return ops.Result(output=bundle, display=bundle.to_dict(limit=100))
|
lynxkite-graph-analytics/tests/test_lynxkite_ops.py
CHANGED
@@ -2,12 +2,12 @@ import pandas as pd
|
|
2 |
import pytest
|
3 |
import networkx as nx
|
4 |
|
5 |
-
from lynxkite.core import workspace
|
6 |
-
from lynxkite_graph_analytics.
|
7 |
|
8 |
|
9 |
async def test_execute_operation_not_in_catalog():
|
10 |
-
ws = workspace.Workspace(env=
|
11 |
ws.nodes.append(
|
12 |
workspace.WorkspaceNode(
|
13 |
id="1",
|
@@ -23,6 +23,8 @@ async def test_execute_operation_not_in_catalog():
|
|
23 |
async def test_execute_operation_inputs_correct_cast():
|
24 |
# Test that the automatic casting of operation inputs works correctly.
|
25 |
|
|
|
|
|
26 |
@op("Create Bundle")
|
27 |
def create_bundle() -> Bundle:
|
28 |
df = pd.DataFrame({"source": [1, 2, 3], "target": [4, 5, 6]})
|
|
|
2 |
import pytest
|
3 |
import networkx as nx
|
4 |
|
5 |
+
from lynxkite.core import workspace, ops
|
6 |
+
from lynxkite_graph_analytics.core import Bundle, execute, ENV
|
7 |
|
8 |
|
9 |
async def test_execute_operation_not_in_catalog():
|
10 |
+
ws = workspace.Workspace(env=ENV)
|
11 |
ws.nodes.append(
|
12 |
workspace.WorkspaceNode(
|
13 |
id="1",
|
|
|
23 |
async def test_execute_operation_inputs_correct_cast():
|
24 |
# Test that the automatic casting of operation inputs works correctly.
|
25 |
|
26 |
+
op = ops.op_registration("test")
|
27 |
+
|
28 |
@op("Create Bundle")
|
29 |
def create_bundle() -> Bundle:
|
30 |
df = pd.DataFrame({"source": [1, 2, 3], "target": [4, 5, 6]})
|