darabos committed
Commit 560385e · unverified · 2 parents: e9bf53e 0e76f97

Merge pull request #79 from biggraph/darabos-progress

.github/workflows/test.yaml CHANGED
@@ -56,6 +56,11 @@ jobs:
           cd lynxkite-graph-analytics
           pytest
 
+      - name: Run LynxScribe tests
+        run: |
+          cd lynxkite-lynxscribe
+          pytest
+
       - name: Try building the documentation
         run: |
           uv pip install mkdocs-material mkdocstrings[python]
lynxkite-app/src/lynxkite_app/crdt.py CHANGED
@@ -86,6 +86,7 @@ def clean_input(ws_pyd):
     for node in ws_pyd.nodes:
         node.data.display = None
         node.data.error = None
+        node.data.status = workspace.NodeStatus.done
         node.position.x = 0
         node.position.y = 0
         if node.model_extra:
@@ -175,7 +176,6 @@ delayed_executions = {}
 async def workspace_changed(name: str, changes: pycrdt.MapEvent, ws_crdt: pycrdt.Map):
     """Callback to react to changes in the workspace.
 
-
     Args:
         name: Name of the workspace.
         changes: Changes performed to the workspace.
@@ -197,6 +197,7 @@ async def workspace_changed(name: str, changes: pycrdt.MapEvent, ws_crdt: pycrdt
         getattr(change, "keys", {}).get("__execution_delay", {}).get("newValue", 0)
         for change in changes
     )
+    print(f"Running {name} in {ws_pyd.env}...")
     if delay:
         task = asyncio.create_task(execute(name, ws_crdt, ws_pyd, delay))
         delayed_executions[name] = task
@@ -224,17 +225,15 @@ async def execute(
     assert path.is_relative_to(config.DATA_PATH), "Provided workspace path is invalid"
     # Save user changes before executing, in case the execution fails.
     workspace.save(ws_pyd, path)
-    await workspace.execute(ws_pyd)
-    workspace.save(ws_pyd, path)
-    # Execution happened on the Python object, we need to replicate
-    # the results to the CRDT object.
     with ws_crdt.doc.transaction():
         for nc, np in zip(ws_crdt["nodes"], ws_pyd.nodes):
             if "data" not in nc:
                 nc["data"] = pycrdt.Map()
-            # Display is added as a non collaborative field.
-            nc["data"]["display"] = np.data.display
-            nc["data"]["error"] = np.data.error
+            nc["data"]["status"] = "planned"
+            # Nodes get a reference to their CRDT maps, so they can update them as the results come in.
+            np._crdt = nc
+    await workspace.execute(ws_pyd)
+    workspace.save(ws_pyd, path)
 
 
 @contextlib.asynccontextmanager
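The reordering in execute() above is the heart of the change: every node is pre-marked "planned" and handed a reference to its own CRDT map before workspace.execute() runs, so executors can stream per-node status to connected clients mid-run instead of syncing results once at the end. A minimal sketch of the resulting lifecycle, using only the NodeStatus enum added in workspace.py further down (the LIFECYCLE name is illustrative):

from lynxkite.core.workspace import NodeStatus

# Status values written to a node's CRDT map, in order:
LIFECYCLE = [
    NodeStatus.planned,  # set by crdt.execute() before the executor starts
    NodeStatus.active,   # set by WorkspaceNode.publish_started()
    NodeStatus.done,     # set by publish_result() / publish_error()
]
assert [s.value for s in LIFECYCLE] == ["planned", "active", "done"]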
lynxkite-app/src/lynxkite_app/main.py CHANGED
@@ -1,6 +1,5 @@
 """The FastAPI server for serving the LynxKite application."""
 
-import os
 import shutil
 import pydantic
 import fastapi
@@ -13,11 +12,6 @@ from lynxkite.core import ops
 from lynxkite.core import workspace
 from . import crdt, config
 
-if os.environ.get("NX_CUGRAPH_AUTOCONFIG", "").strip().lower() == "true":
-    import cudf.pandas
-
-    cudf.pandas.install()
-
 
 def detect_plugins():
     plugins = {}
lynxkite-app/web/src/apiTypes.ts CHANGED
@@ -41,6 +41,7 @@ export interface WorkspaceNodeData {
   };
   display?: unknown;
   error?: string | null;
+  in_progress?: boolean;
   [k: string]: unknown;
 }
 export interface Position {
lynxkite-app/web/src/index.css CHANGED
@@ -90,9 +90,33 @@ body {
 }
 
 .lynxkite-node .title {
-  /* background: oklch(75% 0.2 55); */
   font-weight: bold;
   padding: 8px;
+  background-image: linear-gradient(
+    to right,
+    var(--status-color-1),
+    var(--status-color-2),
+    var(--status-color-3)
+  );
+  background-size: 180% 180%;
+  --status-color-1: oklch(75% 0.2 55);
+  --status-color-2: oklch(75% 0.2 55);
+  --status-color-3: oklch(75% 0.2 55);
+  transition: --status-color-1 0.3s, --status-color-2 0.3s, --status-color-3
+    0.3s;
+}
+
+.lynxkite-node .title.active {
+  --status-color-1: oklch(75% 0.2 55);
+  --status-color-2: oklch(90% 0.2 55);
+  --status-color-3: oklch(75% 0.1 55);
+  /* animation: active-node-gradient-animation 2s ease-in-out infinite; */
+}
+
+.lynxkite-node .title.planned {
+  --status-color-1: oklch(75% 0.1 55);
+  --status-color-2: oklch(75% 0.1 55);
+  --status-color-3: oklch(75% 0.1 55);
 }
 
 .handle-name {
@@ -322,6 +346,38 @@ body {
   }
 }
 
+@keyframes active-node-gradient-animation {
+  0% {
+    background-position-x: 100%;
+  }
+
+  50% {
+    background-position-x: 0%;
+  }
+
+  100% {
+    background-position-x: 100%;
+  }
+}
+
+@property --status-color-1 {
+  syntax: "<color>";
+  initial-value: red;
+  inherits: false;
+}
+
+@property --status-color-2 {
+  syntax: "<color>";
+  initial-value: red;
+  inherits: false;
+}
+
+@property --status-color-3 {
+  syntax: "<color>";
+  initial-value: red;
+  inherits: false;
+}
+
 .react-flow__edge.selected path.react-flow__edge-path {
   outline: var(--xy-selection-border, var(--xy-selection-border-default));
   outline-offset: 10px;
@@ -379,13 +435,15 @@ body {
   display: flex;
   justify-content: space-between;
   padding: 8px 12px;
-  border-bottom: 1px solid #ccc; /* Adds a separator between rows */
+  /* Adds a separator between rows */
+  border-bottom: 1px solid #ccc;
 }
 
 /* Alternating background colors for table-like effect */
 .graph-creation-view .df-head:nth-child(odd) {
   background-color: #f9f9f9;
 }
+
 .graph-creation-view .df-head:nth-child(even) {
   background-color: #e0e0e0;
 }
@@ -393,7 +451,8 @@ body {
 .graph-relation-attributes {
   display: flex;
   flex-direction: column;
-  gap: 10px; /* Adds space between each label-input pair */
+  /* Adds space between each label-input pair */
+  gap: 10px;
   width: 100%;
 }
@@ -402,7 +461,8 @@ body {
   font-weight: bold;
   display: block;
   margin-bottom: 2px;
-  color: #666; /* Lighter text for labels */
+  /* Lighter text for labels */
+  color: #666;
 }
 
 .graph-relation-attributes input {
@@ -415,7 +475,8 @@ body {
 }
 
 .graph-relation-attributes input:focus {
-  border-color: #007bff; /* Highlight input on focus */
+  /* Highlight input on focus */
+  border-color: #007bff;
 }
 
 .add-relationship-button {
lynxkite-app/web/src/workspace/nodes/LynxKiteNode.tsx CHANGED
@@ -71,7 +71,10 @@ export default function LynxKiteNode(props: LynxKiteNodeProps) {
       }}
     >
       <div className="lynxkite-node" style={props.nodeStyle}>
-        <div className="title bg-primary" onClick={titleClicked}>
+        <div
+          className={`title bg-primary ${data.status}`}
+          onClick={titleClicked}
+        >
           {data.title}
           {data.error && <span className="title-icon">⚠️</span>}
           {expanded || <span className="title-icon">⋯</span>}
lynxkite-core/src/lynxkite/core/executors/one_by_one.py CHANGED
@@ -104,11 +104,11 @@ async def execute(ws: workspace.Workspace, catalog, cache=None):
     tasks = {}
     NO_INPUT = object()  # Marker for initial tasks.
     for node in ws.nodes:
-        node.data.error = None
         op = catalog.get(node.data.title)
         if op is None:
-            node.data.error = f'Operation "{node.data.title}" not found.'
+            node.publish_error(f'Operation "{node.data.title}" not found.')
             continue
+        node.publish_error(None)
        # Start tasks for nodes that have no non-batch inputs.
         if all([i.position in "top or bottom" for i in op.inputs.values()]):
             tasks[node.id] = [NO_INPUT]
@@ -123,12 +123,12 @@ async def execute(ws: workspace.Workspace, catalog, cache=None):
                next_stage.setdefault(n, []).extend(ts)
                continue
            node = nodes[n]
-            data = node.data
-            op = catalog[data.title]
-            params = {**data.params}
+            op = catalog[node.data.title]
+            params = {**node.data.params}
            if has_ctx(op):
                params["_ctx"] = contexts[node.id]
            results = []
+            node.publish_started()
            for task in ts:
                try:
                    inputs = []
@@ -150,7 +150,7 @@ async def execute(ws: workspace.Workspace, catalog, cache=None):
                    output = await await_if_needed(result.output)
                except Exception as e:
                    traceback.print_exc()
-                    data.error = str(e)
+                    node.publish_error(e)
                    break
                contexts[node.id].last_result = output
                # Returned lists and DataFrames are considered multiple tasks.
@@ -161,7 +161,7 @@ async def execute(ws: workspace.Workspace, catalog, cache=None):
                    results.extend(output)
            else:  # Finished all tasks without errors.
                if result.display:
-                    data.display = await await_if_needed(result.display)
+                    result.display = await await_if_needed(result.display)
                for edge in edges[node.id]:
                    t = nodes[edge.target]
                    op = catalog[t.data.title]
@@ -172,5 +172,6 @@ async def execute(ws: workspace.Workspace, catalog, cache=None):
                    ).extend(results)
                else:
                    tasks.setdefault(edge.target, []).extend(results)
+            node.publish_result(result)
        tasks = next_stage
    return contexts
lynxkite-core/src/lynxkite/core/ops.py CHANGED
@@ -9,6 +9,9 @@ import typing
 from dataclasses import dataclass
 from typing_extensions import Annotated
 
+if typing.TYPE_CHECKING:
+    from . import workspace
+
 CATALOGS = {}
 EXECUTORS = {}
 
@@ -94,8 +97,9 @@ class Result:
     JSON-serializable.
     """
 
-    output: typing.Any
+    output: typing.Any = None
     display: ReadOnlyJSON | None = None
+    error: str | None = None
 
 
 MULTI_INPUT = Input(name="multi", type="*")
@@ -232,9 +236,15 @@ def register_passive_op(env: str, name: str, inputs=[], outputs=["output"], para
 
 
 def register_executor(env: str):
-    """Decorator for registering an executor."""
+    """Decorator for registering an executor.
 
-    def decorator(func):
+    The executor is a function that takes a workspace and executes the operations in it.
+    When it starts executing an operation, it should call `node.publish_started()` to indicate
+    the status on the UI. When the execution is finished, it should call `node.publish_result()`.
+    This will update the UI with the result of the operation.
+    """
+
+    def decorator(func: typing.Callable[[workspace.Workspace], typing.Any]):
         EXECUTORS[env] = func
         return func
 
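To make the contract in the new register_executor docstring concrete, here is a minimal sketch of a conforming executor. The "my_env" name and the straight-line scheduling (no edges, no input passing) are simplifications for illustration; the real executors appear in one_by_one.py above and core.py below.

from lynxkite.core import ops

@ops.register_executor("my_env")
async def execute(ws):
    catalog = ops.CATALOGS[ws.env]
    for node in ws.nodes:
        op = catalog.get(node.data.title)
        if op is None:
            node.publish_error(f'Operation "{node.data.title}" not found.')
            continue
        node.publish_started()           # frontend shows the node as "active"
        result = op(**node.data.params)  # inputs/edges are omitted in this sketch
        node.publish_result(result)      # frontend gets display/error, status "done"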
lynxkite-core/src/lynxkite/core/workspace.py CHANGED
@@ -3,7 +3,9 @@
 import json
 from typing import Optional
 import dataclasses
+import enum
 import os
+import pycrdt
 import pydantic
 import tempfile
 from . import ops
@@ -20,11 +22,18 @@ class Position(BaseConfig):
     y: float
 
 
+class NodeStatus(str, enum.Enum):
+    planned = "planned"
+    active = "active"
+    done = "done"
+
+
 class WorkspaceNodeData(BaseConfig):
     title: str
     params: dict
     display: Optional[object] = None
     error: Optional[str] = None
+    status: NodeStatus = NodeStatus.done
     # Also contains a "meta" field when going out.
     # This is ignored when coming back from the frontend.
@@ -36,6 +45,32 @@ class WorkspaceNode(BaseConfig):
     type: str
     data: WorkspaceNodeData
     position: Position
+    _crdt: pycrdt.Map
+
+    def publish_started(self):
+        """Notifies the frontend that work has started on this node."""
+        self.data.error = None
+        self.data.status = NodeStatus.active
+        if hasattr(self, "_crdt"):
+            with self._crdt.doc.transaction():
+                self._crdt["data"]["error"] = None
+                self._crdt["data"]["status"] = NodeStatus.active
+
+    def publish_result(self, result: ops.Result):
+        """Sends the result to the frontend. Call this in an executor when the result is available."""
+        self.data.display = result.display
+        self.data.error = result.error
+        self.data.status = NodeStatus.done
+        if hasattr(self, "_crdt"):
+            with self._crdt.doc.transaction():
+                self._crdt["data"]["display"] = result.display
+                self._crdt["data"]["error"] = result.error
+                self._crdt["data"]["status"] = NodeStatus.done
+
+    def publish_error(self, error: Exception | str | None):
+        """Can be called with None to clear the error state."""
+        result = ops.Result(error=str(error) if error else None)
+        self.publish_result(result)
 
 
 class WorkspaceEdge(BaseConfig):
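Since publish_started() and publish_result() guard the CRDT writes with hasattr(self, "_crdt"), a node behaves the same with no CRDT document attached, which keeps executors unit-testable. A minimal sketch (the field values are illustrative):

from lynxkite.core import ops, workspace

node = workspace.WorkspaceNode(
    id="1",
    type="basic",
    data=workspace.WorkspaceNodeData(title="Example", params={}),
    position=workspace.Position(x=0, y=0),
)
node.publish_started()                      # status -> active, error cleared
assert node.data.status == workspace.NodeStatus.active
node.publish_result(ops.Result(output=42))  # status -> done
assert node.data.status == workspace.NodeStatus.done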
lynxkite-graph-analytics/src/lynxkite_graph_analytics/__init__.py CHANGED
@@ -1,3 +1,16 @@
-from .lynxkite_ops import *  # noqa (imported to trigger registration)
+"""Graph analytics environment for LynxKite. The core types and functions are imported here for easy access."""
+
+import os
+import pandas as pd
+
+if os.environ.get("NX_CUGRAPH_AUTOCONFIG", "").strip().lower() == "true":
+    import cudf.pandas
+
+    cudf.pandas.install()
+
+pd.options.mode.copy_on_write = True  # Prepare for Pandas 3.0.
+
+from .core import *  # noqa (easier access for core classes)
+from . import lynxkite_ops  # noqa (imported to trigger registration)
 from . import networkx_ops  # noqa (imported to trigger registration)
 from . import pytorch_model_ops  # noqa (imported to trigger registration)
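The cuGraph bootstrap moved here from the app server (main.py above), so it now applies to any consumer of the package, not just the FastAPI app. Opting in is a matter of setting the environment variable before the first import; a sketch, assuming cuDF/cuGraph are installed:

import os

os.environ["NX_CUGRAPH_AUTOCONFIG"] = "true"  # must be set before the import below
import lynxkite_graph_analytics  # noqa: E402  (installs cudf.pandas at import time)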
lynxkite-graph-analytics/src/lynxkite_graph_analytics/core.py ADDED
@@ -0,0 +1,198 @@
+"""Graph analytics executor and data types."""
+
+from lynxkite.core import ops
+import dataclasses
+import functools
+import networkx as nx
+import pandas as pd
+import polars as pl
+import traceback
+import typing
+
+
+ENV = "LynxKite Graph Analytics"
+
+
+@dataclasses.dataclass
+class RelationDefinition:
+    """Defines a set of edges."""
+
+    df: str  # The DataFrame that contains the edges.
+    source_column: (
+        str  # The column in the edge DataFrame that contains the source node ID.
+    )
+    target_column: (
+        str  # The column in the edge DataFrame that contains the target node ID.
+    )
+    source_table: str  # The DataFrame that contains the source nodes.
+    target_table: str  # The DataFrame that contains the target nodes.
+    source_key: str  # The column in the source table that contains the node ID.
+    target_key: str  # The column in the target table that contains the node ID.
+    name: str | None = None  # Descriptive name for the relation.
+
+
+@dataclasses.dataclass
+class Bundle:
+    """A collection of DataFrames and other data.
+
+    Can efficiently represent a knowledge graph (homogeneous or heterogeneous) or tabular data.
+    It can also carry other data, such as a trained model.
+    """
+
+    dfs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict)
+    relations: list[RelationDefinition] = dataclasses.field(default_factory=list)
+    other: dict[str, typing.Any] = None
+
+    @classmethod
+    def from_nx(cls, graph: nx.Graph):
+        edges = nx.to_pandas_edgelist(graph)
+        d = dict(graph.nodes(data=True))
+        nodes = pd.DataFrame(d.values(), index=d.keys())
+        nodes["id"] = nodes.index
+        if "index" in nodes.columns:
+            nodes.drop(columns=["index"], inplace=True)
+        return cls(
+            dfs={"edges": edges, "nodes": nodes},
+            relations=[
+                RelationDefinition(
+                    df="edges",
+                    source_column="source",
+                    target_column="target",
+                    source_table="nodes",
+                    target_table="nodes",
+                    source_key="id",
+                    target_key="id",
+                )
+            ],
+        )
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame):
+        return cls(dfs={"df": df})
+
+    def to_nx(self):
+        # TODO: Use relations.
+        graph = nx.DiGraph()
+        if "nodes" in self.dfs:
+            df = self.dfs["nodes"]
+            if df.index.name != "id":
+                df = df.set_index("id")
+            graph.add_nodes_from(df.to_dict("index").items())
+        if "edges" in self.dfs:
+            edges = self.dfs["edges"]
+            graph.add_edges_from(
+                [
+                    (
+                        e["source"],
+                        e["target"],
+                        {
+                            k: e[k]
+                            for k in edges.columns
+                            if k not in ["source", "target"]
+                        },
+                    )
+                    for e in edges.to_records()
+                ]
+            )
+        return graph
+
+    def copy(self):
+        """Returns a medium depth copy of the bundle. The Bundle is completely new, but the DataFrames and RelationDefinitions are shared."""
+        return Bundle(
+            dfs=dict(self.dfs),
+            relations=list(self.relations),
+            other=dict(self.other) if self.other else None,
+        )
+
+    def to_dict(self, limit: int = 100):
+        return {
+            "dataframes": {
+                name: {
+                    "columns": [str(c) for c in df.columns],
+                    "data": df_for_frontend(df, limit).values.tolist(),
+                }
+                for name, df in self.dfs.items()
+            },
+            "relations": [dataclasses.asdict(relation) for relation in self.relations],
+            "other": self.other,
+        }
+
+
+def nx_node_attribute_func(name):
+    """Decorator for wrapping a function that adds a NetworkX node attribute."""
+
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(graph: nx.Graph, **kwargs):
+            graph = graph.copy()
+            attr = func(graph, **kwargs)
+            nx.set_node_attributes(graph, attr, name)
+            return graph
+
+        return wrapper
+
+    return decorator
+
+
+def disambiguate_edges(ws):
+    """If an input plug is connected to multiple edges, keep only the last edge."""
+    seen = set()
+    for edge in reversed(ws.edges):
+        if (edge.target, edge.targetHandle) in seen:
+            ws.edges.remove(edge)
+        seen.add((edge.target, edge.targetHandle))
+
+
+@ops.register_executor(ENV)
+async def execute(ws):
+    catalog: dict[str, ops.Op] = ops.CATALOGS[ws.env]
+    disambiguate_edges(ws)
+    outputs = {}
+    failed = 0
+    while len(outputs) + failed < len(ws.nodes):
+        for node in ws.nodes:
+            if node.id in outputs:
+                continue
+            # TODO: Take the input/output handles into account.
+            inputs = [edge.source for edge in ws.edges if edge.target == node.id]
+            if all(input in outputs for input in inputs):
+                # All inputs for this node are ready, we can compute the output.
+                inputs = [outputs[input] for input in inputs]
+                params = {**node.data.params}
+                op = catalog.get(node.data.title)
+                if not op:
+                    node.publish_error("Operation not found in catalog")
+                    failed += 1
+                    continue
+                node.publish_started()
+                try:
+                    # Convert inputs types to match operation signature.
+                    for i, (x, p) in enumerate(zip(inputs, op.inputs.values())):
+                        if p.type == nx.Graph and isinstance(x, Bundle):
+                            inputs[i] = x.to_nx()
+                        elif p.type == Bundle and isinstance(x, nx.Graph):
+                            inputs[i] = Bundle.from_nx(x)
+                        elif p.type == Bundle and isinstance(x, pd.DataFrame):
+                            inputs[i] = Bundle.from_df(x)
+                    result = op(*inputs, **params)
+                except Exception as e:
+                    traceback.print_exc()
+                    node.publish_error(e)
+                    failed += 1
+                    continue
+                outputs[node.id] = result.output
+                node.publish_result(result)
+
+
+def df_for_frontend(df: pd.DataFrame, limit: int) -> pd.DataFrame:
+    """Returns a DataFrame with values that are safe to send to the frontend."""
+    df = df[:limit]
+    if isinstance(df, pl.LazyFrame):
+        df = df.collect()
+    if isinstance(df, pl.DataFrame):
+        df = df.to_pandas()
+    # Convert non-numeric columns to strings.
+    for c in df.columns:
+        if not pd.api.types.is_numeric_dtype(df[c]):
+            df[c] = df[c].astype(str)
+    return df
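A quick round trip through the Bundle type defined above, using only constructors from this file (the toy graph is arbitrary):

import networkx as nx
from lynxkite_graph_analytics.core import Bundle

g = nx.karate_club_graph()          # any NetworkX graph works here
bundle = Bundle.from_nx(g)          # graph -> DataFrames in bundle.dfs
print(bundle.dfs["nodes"].head())   # node attributes plus an "id" column
print(bundle.dfs["edges"].head())   # "source"/"target" columns
g2 = bundle.to_nx()                 # back to a (directed) NetworkX graph
assert g2.number_of_nodes() == g.number_of_nodes()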
lynxkite-graph-analytics/src/lynxkite_graph_analytics/lynxkite_ops.py CHANGED
@@ -1,201 +1,21 @@
-"""Graph analytics operations. To be split into separate files when we have more."""
+"""Graph analytics operations."""
 
 import os
 import fsspec
 from lynxkite.core import ops
 from collections import deque
-import dataclasses
-import functools
+from . import core
 import grandcypher
 import joblib
 import matplotlib
 import networkx as nx
 import pandas as pd
 import polars as pl
-import traceback
-import typing
 import json
 
 
 mem = joblib.Memory("../joblib-cache")
-ENV = "LynxKite Graph Analytics"
-op = ops.op_registration(ENV)
-
-
-@dataclasses.dataclass
-class RelationDefinition:
-    """Defines a set of edges."""
-
-    df: str  # The DataFrame that contains the edges.
-    source_column: (
-        str  # The column in the edge DataFrame that contains the source node ID.
-    )
-    target_column: (
-        str  # The column in the edge DataFrame that contains the target node ID.
-    )
-    source_table: str  # The DataFrame that contains the source nodes.
-    target_table: str  # The DataFrame that contains the target nodes.
-    source_key: str  # The column in the source table that contains the node ID.
-    target_key: str  # The column in the target table that contains the node ID.
-    name: str | None = None  # Descriptive name for the relation.
-
-
-@dataclasses.dataclass
-class Bundle:
-    """A collection of DataFrames and other data.
-
-    Can efficiently represent a knowledge graph (homogeneous or heterogeneous) or tabular data.
-    It can also carry other data, such as a trained model.
-    """
-
-    dfs: dict[str, pd.DataFrame] = dataclasses.field(default_factory=dict)
-    relations: list[RelationDefinition] = dataclasses.field(default_factory=list)
-    other: dict[str, typing.Any] = None
-
-    @classmethod
-    def from_nx(cls, graph: nx.Graph):
-        edges = nx.to_pandas_edgelist(graph)
-        d = dict(graph.nodes(data=True))
-        nodes = pd.DataFrame(d.values(), index=d.keys())
-        nodes["id"] = nodes.index
-        if "index" in nodes.columns:
-            nodes.drop(columns=["index"], inplace=True)
-        return cls(
-            dfs={"edges": edges, "nodes": nodes},
-            relations=[
-                RelationDefinition(
-                    df="edges",
-                    source_column="source",
-                    target_column="target",
-                    source_table="nodes",
-                    target_table="nodes",
-                    source_key="id",
-                    target_key="id",
-                )
-            ],
-        )
-
-    @classmethod
-    def from_df(cls, df: pd.DataFrame):
-        return cls(dfs={"df": df})
-
-    def to_nx(self):
-        # TODO: Use relations.
-        graph = nx.DiGraph()
-        if "nodes" in self.dfs:
-            df = self.dfs["nodes"]
-            if df.index.name != "id":
-                df = df.set_index("id")
-            graph.add_nodes_from(df.to_dict("index").items())
-        if "edges" in self.dfs:
-            edges = self.dfs["edges"]
-            graph.add_edges_from(
-                [
-                    (
-                        e["source"],
-                        e["target"],
-                        {
-                            k: e[k]
-                            for k in edges.columns
-                            if k not in ["source", "target"]
-                        },
-                    )
-                    for e in edges.to_records()
-                ]
-            )
-        return graph
-
-    def copy(self):
-        """Returns a medium depth copy of the bundle. The Bundle is completely new, but the DataFrames and RelationDefinitions are shared."""
-        return Bundle(
-            dfs=dict(self.dfs),
-            relations=list(self.relations),
-            other=dict(self.other) if self.other else None,
-        )
-
-    def to_dict(self, limit: int = 100):
-        return {
-            "dataframes": {
-                name: {
-                    "columns": [str(c) for c in df.columns],
-                    "data": df_for_frontend(df, limit).values.tolist(),
-                }
-                for name, df in self.dfs.items()
-            },
-            "relations": [dataclasses.asdict(relation) for relation in self.relations],
-            "other": self.other,
-        }
-
-
-def nx_node_attribute_func(name):
-    """Decorator for wrapping a function that adds a NetworkX node attribute."""
-
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(graph: nx.Graph, **kwargs):
-            graph = graph.copy()
-            attr = func(graph, **kwargs)
-            nx.set_node_attributes(graph, attr, name)
-            return graph
-
-        return wrapper
-
-    return decorator
-
-
-def disambiguate_edges(ws):
-    """If an input plug is connected to multiple edges, keep only the last edge."""
-    seen = set()
-    for edge in reversed(ws.edges):
-        if (edge.target, edge.targetHandle) in seen:
-            ws.edges.remove(edge)
-        seen.add((edge.target, edge.targetHandle))
-
-
-@ops.register_executor(ENV)
-async def execute(ws):
-    catalog: dict[str, ops.Op] = ops.CATALOGS[ENV]
-    disambiguate_edges(ws)
-    outputs = {}
-    failed = 0
-    while len(outputs) + failed < len(ws.nodes):
-        for node in ws.nodes:
-            if node.id in outputs:
-                continue
-            # TODO: Take the input/output handles into account.
-            inputs = [edge.source for edge in ws.edges if edge.target == node.id]
-            if all(input in outputs for input in inputs):
-                # All inputs for this node are ready, we can compute the output.
-                inputs = [outputs[input] for input in inputs]
-                data = node.data
-                params = {**data.params}
-                op = catalog.get(data.title)
-                if not op:
-                    data.error = "Operation not found in catalog"
-                    failed += 1
-                    continue
-                try:
-                    # Convert inputs types to match operation signature.
-                    for i, (x, p) in enumerate(zip(inputs, op.inputs.values())):
-                        if p.type == nx.Graph and isinstance(x, Bundle):
-                            inputs[i] = x.to_nx()
-                        elif p.type == Bundle and isinstance(x, nx.Graph):
-                            inputs[i] = Bundle.from_nx(x)
-                        elif p.type == Bundle and isinstance(x, pd.DataFrame):
-                            inputs[i] = Bundle.from_df(x)
-                    result = op(*inputs, **params)
-                except Exception as e:
-                    traceback.print_exc()
-                    data.error = str(e)
-                    failed += 1
-                    continue
-                if len(op.inputs) == 1 and op.inputs.get("multi") == "*":
-                    # It's a flexible input. Create n+1 handles.
-                    data.inputs = {f"input{i}": None for i in range(len(inputs) + 1)}
-                data.error = None
-                outputs[node.id] = result.output
-                if result.display:
-                    data.display = result.display
+op = ops.op_registration(core.ENV)
 
 
 @op("Import Parquet")
@@ -246,14 +66,14 @@ def create_scale_free_graph(*, nodes: int = 10):
 
 
 @op("Compute PageRank")
-@nx_node_attribute_func("pagerank")
+@core.nx_node_attribute_func("pagerank")
 def compute_pagerank(graph: nx.Graph, *, damping=0.85, iterations=100):
     # TODO: This requires scipy to be installed.
     return nx.pagerank(graph, alpha=damping, max_iter=iterations)
 
 
 @op("Compute betweenness centrality")
-@nx_node_attribute_func("betweenness_centrality")
+@core.nx_node_attribute_func("betweenness_centrality")
 def compute_betweenness_centrality(graph: nx.Graph, *, k=10):
     return nx.betweenness_centrality(graph, k=k)
 
@@ -271,7 +91,7 @@ def discard_parallel_edges(graph: nx.Graph):
 
 
 @op("SQL")
-def sql(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
+def sql(bundle: core.Bundle, *, query: ops.LongStr, save_as: str = "result"):
     """Run a SQL query on the DataFrames in the bundle. Save the results as a new DataFrame."""
     bundle = bundle.copy()
     if os.environ.get("NX_CUGRAPH_AUTOCONFIG", "").strip().lower() == "true":
@@ -292,7 +112,7 @@ def sql(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
 
 
 @op("Cypher")
-def cypher(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
+def cypher(bundle: core.Bundle, *, query: ops.LongStr, save_as: str = "result"):
     """Run a Cypher query on the graph in the bundle. Save the results as a new DataFrame."""
     bundle = bundle.copy()
     graph = bundle.to_nx()
@@ -302,7 +122,7 @@ def cypher(bundle: Bundle, *, query: ops.LongStr, save_as: str = "result"):
 
 
 @op("Organize bundle")
-def organize_bundle(bundle: Bundle, *, code: ops.LongStr):
+def organize_bundle(bundle: core.Bundle, *, code: ops.LongStr):
     """Lets you rename/copy/delete DataFrames, and modify relations.
 
     TODO: Use a declarative solution instead of Python code. Add UI.
@@ -332,7 +152,7 @@ def _map_color(value):
     if pd.api.types.is_numeric_dtype(value):
         cmap = matplotlib.cm.get_cmap("viridis")
         value = (value - value.min()) / (value.max() - value.min())
-        rgba = cmap(value)
+        rgba = cmap(value.values)
     return [
         "#{:02x}{:02x}{:02x}".format(int(r * 255), int(g * 255), int(b * 255))
        for r, g, b in rgba[:, :3]
@@ -351,13 +171,13 @@ def _map_color(value):
 
 @op("Visualize graph", view="visualization")
 def visualize_graph(
-    graph: Bundle,
+    graph: core.Bundle,
     *,
     color_nodes_by: ops.NodeAttribute = None,
     label_by: ops.NodeAttribute = None,
     color_edges_by: ops.EdgeAttribute = None,
 ):
-    nodes = df_for_frontend(graph.dfs["nodes"], 10_000)
+    nodes = core.df_for_frontend(graph.dfs["nodes"], 10_000)
     if color_nodes_by:
         nodes["color"] = _map_color(nodes[color_nodes_by])
     for cols in ["x y", "long lat"]:
@@ -387,7 +207,7 @@ def visualize_graph(
     )
     curveness = 0.3
     nodes = nodes.to_records()
-    edges = df_for_frontend(
+    edges = core.df_for_frontend(
         graph.dfs["edges"].drop_duplicates(["source", "target"]), 10_000
     )
     if color_edges_by:
@@ -446,22 +266,8 @@ def visualize_graph(
     return v
 
 
-def df_for_frontend(df: pd.DataFrame, limit: int) -> pd.DataFrame:
-    """Returns a DataFrame with values that are safe to send to the frontend."""
-    df = df[:limit]
-    if isinstance(df, pl.LazyFrame):
-        df = df.collect()
-    if isinstance(df, pl.DataFrame):
-        df = df.to_pandas()
-    # Convert non-numeric columns to strings.
-    for c in df.columns:
-        if not pd.api.types.is_numeric_dtype(df[c]):
-            df[c] = df[c].astype(str)
-    return df
-
-
 @op("View tables", view="table_view")
-def view_tables(bundle: Bundle, *, limit: int = 100):
+def view_tables(bundle: core.Bundle, *, limit: int = 100):
     return bundle.to_dict(limit=limit)
 
 
@@ -470,7 +276,7 @@ def view_tables(bundle: Bundle, *, limit: int = 100):
     view="graph_creation_view",
     outputs=["output"],
 )
-def create_graph(bundle: Bundle, *, relations: str = None) -> Bundle:
+def create_graph(bundle: core.Bundle, *, relations: str = None) -> core.Bundle:
     """Replace relations of the given bundle
 
     relations is a stringified JSON, instead of a dict, because complex Yjs types (arrays, maps)
@@ -489,6 +295,6 @@ def create_graph(bundle: Bundle, *, relations: str = None) -> Bundle:
     bundle = bundle.copy()
     if not (relations is None or relations.strip() == ""):
         bundle.relations = [
-            RelationDefinition(**r) for r in json.loads(relations).values()
+            core.RelationDefinition(**r) for r in json.loads(relations).values()
         ]
     return ops.Result(output=bundle, display=bundle.to_dict(limit=100))
lynxkite-graph-analytics/tests/test_lynxkite_ops.py CHANGED
@@ -2,12 +2,12 @@ import pandas as pd
 import pytest
 import networkx as nx
 
-from lynxkite.core import workspace
-from lynxkite_graph_analytics.lynxkite_ops import Bundle, execute, op
+from lynxkite.core import workspace, ops
+from lynxkite_graph_analytics.core import Bundle, execute, ENV
 
 
 async def test_execute_operation_not_in_catalog():
-    ws = workspace.Workspace(env="test")
+    ws = workspace.Workspace(env=ENV)
     ws.nodes.append(
         workspace.WorkspaceNode(
             id="1",
@@ -23,6 +23,8 @@ async def test_execute_operation_not_in_catalog():
 async def test_execute_operation_inputs_correct_cast():
     # Test that the automatic casting of operation inputs works correctly.
 
+    op = ops.op_registration("test")
+
     @op("Create Bundle")
     def create_bundle() -> Bundle:
         df = pd.DataFrame({"source": [1, 2, 3], "target": [4, 5, 6]})
lynxkite-lynxscribe/tests/test_llm_ops.py CHANGED
@@ -1,5 +1,5 @@
 import unittest
-from lynxscribe.lynxkite import llm_ops  # noqa: F401
+from lynxkite_lynxscribe import llm_ops  # noqa: F401
 from lynxkite.core.executors import one_by_one
 from lynxkite.core import ops, workspace
 