Spaces:

lynx-analytics
/

lynxkite

Running

App Files Community

darabos committed on Jun 8, 2024

Commit

6988728

1 Parent(s): a07e9cb

Add RAG, batch inputs, caching.

Browse files

Files changed (5) hide show

requirements.txt +2 -0
server/llm_ops.py +118 -55
server/ops.py +0 -3
server/test_llm_ops.py +36 -14
server/workspace.py +0 -1

requirements.txt CHANGED Viewed

@@ -6,4 +6,6 @@ pandas
 scipy
 uvicorn[standard]
 # For llm_ops
 openai

 scipy
 uvicorn[standard]
 # For llm_ops
+chromadb
+Jinja2
 openai

server/llm_ops.py CHANGED Viewed

@@ -1,48 +1,50 @@
 '''For specifying an LLM agent logic flow.'''
 from . import ops
-import dataclasses
 import inspect
 import json
 import openai
 import pandas as pd
 import traceback
 from . import workspace
 client = openai.OpenAI(base_url="http://localhost:11434/v1")
-CACHE = {}
 ENV = 'LLM logic'
 op = ops.op_registration(ENV)
-@dataclasses.dataclass
-class Context:
   '''Passed to operation functions as "_ctx" if they have such a parameter.'''
   node: workspace.WorkspaceNode
-  last_result = None
-@dataclasses.dataclass
-class Output:
   '''Return this to send values to specific outputs of a node.'''
   output_handle: str
   value: dict
 def chat(*args, **kwargs):
   key = json.dumps({'args': args, 'kwargs': kwargs})
-  if key not in CACHE:
     completion = client.chat.completions.create(*args, **kwargs)
-    CACHE[key] = [c.message.content for c in completion.choices]
-  return CACHE[key]
 @op("Input")
 def input(*, filename: ops.PathStr, key: str):
   return pd.read_csv(filename).rename(columns={key: 'text'})
 @op("Create prompt")
-def create_prompt(input, *, template: ops.LongStr):
-  assert template, 'Please specify the template. Refer to columns using their names in uppercase.'
-  p = template
-  for k, v in input.items():
-    p = p.replace(k.upper(), str(v))
-  return p
 @op("Ask LLM")
 def ask_llm(input, *, model: str, accepted_regex: str = None, max_tokens: int = 100):
@@ -74,7 +76,7 @@ def view(input, *, _ctx: Context):
     v = {
       'dataframes': { 'df': {
         'columns': columns,
-        'data': [input[c] for c in columns],
       }}
     }
   return v
@@ -92,12 +94,30 @@ def loop(input, *, max_iterations: int = 3, _ctx: Context):
 @op('Branch', outputs=['true', 'false'])
 def branch(input, *, expression: str):
   res = eval(expression, input)
-  return Output(str(bool(res)).lower(), input)
 @ops.input_position(db="top")
 @op('RAG')
-def rag(input, db, *, closest_n: int=10):
-  return input
 @op('Run Python')
 def run_python(input, *, template: str):
@@ -107,16 +127,16 @@ def run_python(input, *, template: str):
     p = p.replace(k.upper(), str(v))
   return p
 @ops.register_executor(ENV)
 def execute(ws):
   catalog = ops.CATALOGS[ENV]
   nodes = {n.id: n for n in ws.nodes}
-  contexts = {n.id: Context(n) for n in ws.nodes}
   edges = {n.id: [] for n in ws.nodes}
   for e in ws.edges:
-    edges[e.source].append(e.target)
   tasks = {}
   NO_INPUT = object() # Marker for initial tasks.
   for node in ws.nodes:
@@ -125,39 +145,54 @@ def execute(ws):
     # Start tasks for nodes that have no inputs.
     if not op.inputs:
       tasks[node.id] = [NO_INPUT]
   # Run the rest until we run out of tasks.
-  while tasks:
-    n, ts = tasks.popitem()
-    node = nodes[n]
-    data = node.data
-    op = catalog[data.title]
-    params = {**data.params}
-    if has_ctx(op):
-      params['_ctx'] = contexts[node.id]
-    results = []
-    for task in ts:
-      try:
-        if task is NO_INPUT:
-          result = op(**params)
-        else:
-          # TODO: Tasks with multiple inputs?
-          result = op(task, **params)
-      except Exception as e:
-        traceback.print_exc()
-        data.error = str(e)
-        break
-      contexts[node.id].last_result = result
-      # Returned lists and DataFrames are considered multiple tasks.
-      if isinstance(result, pd.DataFrame):
-        result = df_to_list(result)
-      elif not isinstance(result, list):
-        result = [result]
-      results.extend(result)
-    else: # Finished all tasks without errors.
-      if op.type == 'visualization' or op.type == 'table_view':
-        data.display = results
-      for target in edges[node.id]:
-        tasks.setdefault(target, []).extend(results)
 def df_to_list(df):
   return [dict(zip(df.columns, row)) for row in df.values]
@@ -165,3 +200,31 @@ def df_to_list(df):
 def has_ctx(op):
   sig = inspect.signature(op.func)
   return '_ctx' in sig.parameters

 '''For specifying an LLM agent logic flow.'''
 from . import ops
+import chromadb
+import fastapi.encoders
 import inspect
+import jinja2
 import json
 import openai
 import pandas as pd
 import traceback
+import typing
 from . import workspace
 client = openai.OpenAI(base_url="http://localhost:11434/v1")
+jinja = jinja2.Environment()
+chroma_client = chromadb.Client()
+LLM_CACHE = {}
 ENV = 'LLM logic'
 op = ops.op_registration(ENV)
+class Context(ops.BaseConfig):
   '''Passed to operation functions as "_ctx" if they have such a parameter.'''
   node: workspace.WorkspaceNode
+  last_result: typing.Any = None
+class Output(ops.BaseConfig):
   '''Return this to send values to specific outputs of a node.'''
   output_handle: str
   value: dict
 def chat(*args, **kwargs):
   '''Runs a chat completion and returns the message content of every returned choice.

   Results are memoized in LLM_CACHE under the JSON encoding of the arguments, so
   repeating a call with the same arguments does not contact the model again.
   '''
   cache_key = json.dumps({'args': args, 'kwargs': kwargs})
   try:
     return LLM_CACHE[cache_key]
   except KeyError:
     pass  # Not cached yet. Ask the model below.
   completion = client.chat.completions.create(*args, **kwargs)
   answers = [choice.message.content for choice in completion.choices]
   LLM_CACHE[cache_key] = answers
   return answers
 @op("Input")
 def input(*, filename: ops.PathStr, key: str):
   '''Loads a CSV file into a DataFrame and renames the column called `key` to "text".'''
   table = pd.read_csv(filename)
   renamed = table.rename(columns={key: 'text'})
   return renamed
 @op("Create prompt")
 def create_prompt(input, *, save_as='prompt', template: ops.LongStr):
   '''Renders the Jinja2 `template` with the input's fields as template variables.

   Returns a copy of the input with the rendered text stored under `save_as`.
   '''
   assert template, 'Please specify the template. Refer to columns using the Jinja2 syntax.'
   rendered = jinja.from_string(template).render(**input)
   output = {**input}
   output[save_as] = rendered
   return output
 @op("Ask LLM")
 def ask_llm(input, *, model: str, accepted_regex: str = None, max_tokens: int = 100):
     v = {
       'dataframes': { 'df': {
         'columns': columns,
+        'data': [[input[c] for c in columns]],
       }}
     }
   return v
 @op('Branch', outputs=['true', 'false'])
 def branch(input, *, expression: str):
   '''Evaluates `expression` and sends the input to the "true" or "false" output accordingly.

   The fields of `input` are visible to the expression as variables. The
   truthiness of the expression's value selects the output handle.
   '''
   # NOTE(review): eval() executes arbitrary code. This is only acceptable while
   # the expression comes from a trusted workspace author.
   # eval() inserts a "__builtins__" entry into the globals dict it is given.
   # Evaluate against a copy so that entry does not leak into the value we pass on.
   res = eval(expression, dict(input))
   return Output(output_handle=str(bool(res)).lower(), value=input)
 @ops.input_position(db="top")
 @op('RAG')
 def rag(input, db, *, input_field='text', db_field='text', num_matches: int=10, _ctx: Context):
   '''Queries a Chroma collection built from `db` with the text in input[input_field].

   When the context has no previous result, a collection named after the node ID
   is (re)created and filled with the `db_field` of every `db` record. Otherwise
   the collection stored in the previous result is reused.

   Returns a copy of the input extended with "rag" (the matching `db` records)
   and "_collection" (the collection that was queried).
   '''
   previous = _ctx.last_result
   if not previous:
     name = _ctx.node.id.replace(' ', '_')
     # Drop any collection left over under the same name before creating a fresh one.
     for existing in chroma_client.list_collections():
       if existing.name == name:
         chroma_client.delete_collection(name=name)
     collection = chroma_client.create_collection(name=name)
     documents = [record[db_field] for record in db]
     # The IDs are positions in `db`, so matches can be mapped back to records below.
     ids = [str(position) for position in range(len(db))]
     collection.add(documents=documents, ids=ids)
   else:
     collection = previous['_collection']
   query_text = input[input_field]
   response = collection.query(query_texts=[query_text], n_results=num_matches)
   matches = [db[int(doc_id)] for doc_id in response['ids'][0]]
   return {**input, 'rag': matches, '_collection': collection}
 @op('Run Python')
 def run_python(input, *, template: str):
     p = p.replace(k.upper(), str(v))
   return p
+EXECUTOR_OUTPUT_CACHE = {}
 @ops.register_executor(ENV)
 def execute(ws):
   catalog = ops.CATALOGS[ENV]
   nodes = {n.id: n for n in ws.nodes}
+  contexts = {n.id: Context(node=n) for n in ws.nodes}
   edges = {n.id: [] for n in ws.nodes}
   for e in ws.edges:
+    edges[e.source].append(e)
   tasks = {}
   NO_INPUT = object() # Marker for initial tasks.
   for node in ws.nodes:
     # Start tasks for nodes that have no inputs.
     if not op.inputs:
       tasks[node.id] = [NO_INPUT]
+  batch_inputs = {}
   # Run the rest until we run out of tasks.
+  for stage in get_stages(ws):
+    next_stage = {}
+    while tasks:
+      n, ts = tasks.popitem()
+      if n not in stage:
+        next_stage.setdefault(n, []).extend(ts)
+        continue
+      node = nodes[n]
+      data = node.data
+      op = catalog[data.title]
+      params = {**data.params}
+      if has_ctx(op):
+        params['_ctx'] = contexts[node.id]
+      results = []
+      for task in ts:
+        try:
+          inputs = [
+            batch_inputs[(n, i.name)] if i.position == 'top' else task
+            for i in op.inputs.values()]
+          key = json.dumps(fastapi.encoders.jsonable_encoder((inputs, params)))
+          if key not in EXECUTOR_OUTPUT_CACHE:
+            EXECUTOR_OUTPUT_CACHE[key] = op.func(*inputs, **params)
+          result = EXECUTOR_OUTPUT_CACHE[key]
+        except Exception as e:
+          traceback.print_exc()
+          data.error = str(e)
+          break
+        contexts[node.id].last_result = result
+        # Returned lists and DataFrames are considered multiple tasks.
+        if isinstance(result, pd.DataFrame):
+          result = df_to_list(result)
+        elif not isinstance(result, list):
+          result = [result]
+        results.extend(result)
+      else: # Finished all tasks without errors.
+        if op.type == 'visualization' or op.type == 'table_view':
+          data.display = results[0]
+        for edge in edges[node.id]:
+          t = nodes[edge.target]
+          op = catalog[t.data.title]
+          i = op.inputs[edge.targetHandle]
+          if i.position == 'top':
+            batch_inputs.setdefault((edge.target, edge.targetHandle), []).extend(results)
+          else:
+            tasks.setdefault(edge.target, []).extend(results)
+    tasks = next_stage
 def df_to_list(df):
   '''Converts a DataFrame into a list with one dict per row, keyed by column name.'''
   column_names = df.columns
   records = []
   for row in df.values:
     records.append(dict(zip(column_names, row)))
   return records
 def has_ctx(op):
   '''Tells whether the function behind the operation declares a "_ctx" parameter.'''
   parameter_names = inspect.signature(op.func).parameters
   return '_ctx' in parameter_names
+def get_stages(ws):
+  '''Inputs on top are batch inputs. We decompose the graph into a DAG of components along these edges.'''
+  catalog = ops.CATALOGS[ENV]
+  nodes = {n.id: n for n in ws.nodes}
+  batch_inputs = {}
+  inputs = {}
+  for edge in ws.edges:
+    inputs.setdefault(edge.target, []).append(edge.source)
+    node = nodes[edge.target]
+    op = catalog[node.data.title]
+    i = op.inputs[edge.targetHandle]
+    if i.position == 'top':
+      batch_inputs.setdefault(edge.target, []).append(edge.source)
+  stages = []
+  for bt, bss in batch_inputs.items():
+    upstream = set(bss)
+    new = set(bss)
+    while new:
+      n = new.pop()
+      for i in inputs.get(n, []):
+        if i not in upstream:
+          upstream.add(i)
+          new.add(i)
+    stages.append(upstream)
+  stages.sort(key=lambda s: len(s))
+  stages.append(set(nodes))
+  return stages

server/ops.py CHANGED Viewed

@@ -1,11 +1,8 @@
 '''API for implementing LynxKite operations.'''
 from __future__ import annotations
-import dataclasses
 import enum
 import functools
 import inspect
-import networkx as nx
-import pandas as pd
 import pydantic
 import typing
 from typing_extensions import Annotated

 '''API for implementing LynxKite operations.'''
 from __future__ import annotations
 import enum
 import functools
 import inspect
 import pydantic
 import typing
 from typing_extensions import Annotated

server/test_llm_ops.py CHANGED Viewed

@@ -2,27 +2,49 @@ import unittest
 from . import llm_ops
 from . import workspace
 class LLMOpsTest(unittest.TestCase):
   def testExecute(self):
     ws = workspace.Workspace(env='LLM logic', nodes=[
-      workspace.WorkspaceNode(
-        id='0',
-        type='basic',
-        position=workspace.Position(x=0, y=0),
-        data=workspace.WorkspaceNodeData(title='Input', params={
-          'filename': '/Users/danieldarabos/Downloads/aimo-train.csv',
-          'key': 'problem',
-        })),
-      workspace.WorkspaceNode(
-        id='1',
-        type='table_view',
-        position=workspace.Position(x=0, y=0),
-        data=workspace.WorkspaceNodeData(title='View', params={})),
     ], edges=[
-      workspace.WorkspaceEdge(id='0-1', source='0', target='1', sourceHandle='', targetHandle=''),
     ])
     llm_ops.execute(ws)
     self.assertEqual('', ws.nodes[1].data.display)
 if __name__ == '__main__':
   unittest.main()

 from . import llm_ops
 from . import workspace
+def make_node(id, op, type='basic', **params):
+  return workspace.WorkspaceNode(
+    id=id,
+    type=type,
+    position=workspace.Position(x=0, y=0),
+    data=workspace.WorkspaceNodeData(title=op, params=params),
+  )
+def make_input(id):
+  return make_node(
+    id, 'Input',
+    filename='/Users/danieldarabos/Downloads/aimo-train.csv',
+    key='problem')
+def make_edge(source, target, targetHandle='input'):
+  return workspace.WorkspaceEdge(
+    id=f'{source}-{target}', source=source, target=target, sourceHandle='', targetHandle=targetHandle)
 class LLMOpsTest(unittest.TestCase):
   '''Tests for the "LLM logic" executor and its decomposition of workspaces into stages.'''

   def testExecute(self):
     '''Runs an Input node wired into a View node.'''
     ws = workspace.Workspace(env='LLM logic', nodes=[
       make_input('0'),
       make_node('1', 'View', type='table_view'),
     ], edges=[
       make_edge('0', '1'),
     ])
     llm_ops.execute(ws)
     # NOTE(review): '' looks like a placeholder expectation. The real expected
     # display depends on the contents of the CSV file. Replace once it is pinned down.
     self.assertEqual('', ws.nodes[1].data.display)

   def testStages(self):
     '''Checks the stages of a workspace where one RAG feeds the batch input of another.'''
     ws = workspace.Workspace(env='LLM logic', nodes=[
       make_input('in1'), make_input('in2'), make_input('in3'),
       make_node('rag1', 'RAG'), make_node('rag2', 'RAG'),
       make_node('p1', 'Create prompt'), make_node('p2', 'Create prompt'),
     ], edges=[
       make_edge('in1', 'rag1', 'db'), make_edge('in2', 'rag1'),
       make_edge('rag1', 'p1'), make_edge('p1', 'rag2', 'db'),
       # The second prompt node is 'p2'. There is no 'p3' node in this workspace.
       make_edge('in3', 'p2'), make_edge('p2', 'rag2'),
     ])
     stages = llm_ops.get_stages(ws)
     # One stage per node with a batch ("db") input, smallest first, then all nodes.
     self.assertEqual([
       {'in1'},
       {'in1', 'in2', 'rag1', 'p1'},
       {'in1', 'in2', 'in3', 'rag1', 'rag2', 'p1', 'p2'},
     ], stages)
 if __name__ == '__main__':
   unittest.main()

server/workspace.py CHANGED Viewed

@@ -4,7 +4,6 @@ import dataclasses
 import os
 import pydantic
 import tempfile
-import traceback
 from . import ops
 class BaseConfig(pydantic.BaseModel):

 import os
 import pydantic
 import tempfile
 from . import ops
 class BaseConfig(pydantic.BaseModel):