Spaces:

sail
/

zero-bubble-pipeline-parallellism

Running

App Files Files Community

Wan Xinyi committed on Nov 29, 2023

Commit

4b2c8d9

1 Parent(s): 933f413

initial commit

Browse files

Files changed (3) hide show

app.py +126 -0
auto_schedule.py +564 -0
v_schedule.py +461 -0

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import gradio as gr
+import auto_schedule
+import v_schedule
+def greet(name, is_morning, temperature):
+    salutation = "Good morning" if is_morning else "Good evening"
+    greeting = f"{salutation} {name}. It is {temperature} degrees today"
+    celsius = (temperature - 32) * 5 / 9
+    return greeting, round(celsius, 2)
+def percentage(x):
+  return f"{x*100:.2f}%"
+def get_schedule_time_and_image(result):
+  result = [
+    list(filter(lambda x: x.type in {'F', 'B', 'W'}, r)) for r in result
+  ]
+  time = max(
+    [
+      max([x.completion_time for x in stage]) - min([x.start_time for x in stage]) for stage in result
+    ]
+  )
+  return time, None
+def calculate(p, m, f, b, w, c, mem):
+  baseline_time=(f+b+w)*m + (f+b+w+c)*(p-1)
+  baseline_bubble=percentage(baseline_time/(f+b+w)/m - 1)
+  baseline_acceleration=percentage(0)
+  baseline_image=None
+  zb_result = auto_schedule.auto_schedule(p, m, auto_schedule.GraphConfig(
+        cost_f=f,
+        cost_b=b,
+        cost_w=w,
+        cost_comm=c,
+        max_mem=mem * 2,
+        print_scaling=1000
+  ))
+  zb_time,zb_image=get_schedule_time_and_image(zb_result)
+  zb_bubble=percentage(zb_time/(f+b+w)/m - 1)
+  zb_acceleration=percentage(baseline_time/zb_time - 1)
+  zbv_graph = v_schedule.PipelineGraph(
+                n_stage=p,
+                n_micro=m,
+                f_cost=f/2,
+                b_cost=b/2,
+                w_cost=w/2,
+                c_cost=c,
+                f_mem=2,
+                b_mem=-1,
+                w_mem=-1,
+                max_mem=mem * 4,
+  )
+  zbv_result = zbv_graph.get_v_schedule()
+  zbv_time,zbv_image = get_schedule_time_and_image(zbv_result)
+  zbv_bubble=percentage(zbv_time/(f+b+w)/m - 1)
+  zbv_acceleration=percentage(baseline_time/zbv_time - 1)
+  zbv_image=None
+  return [baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image]
+with gr.Blocks() as demo:
+  gr.Markdown("Zero bubble pipeline parallel bubble calculator")
+  with gr.Row():
+    with gr.Column(scale=1):
+      with gr.Group():
+        gr.Markdown("Basic Parameters")
+        with gr.Row():
+          p=gr.Number(label="Number of stages (p)", value=4, interactive=True, precision=0)
+          m=gr.Number(label="Number of microbatches (m)", value=12, interactive=True, precision=0)
+    with gr.Column(scale=2):
+      with gr.Group():
+        gr.Markdown("Costs. All costs are used as integers. For ZBV schedules, this is the time of two virtual stages on a stage combined.")
+        with gr.Row():
+          f=gr.Number(label="Time of F", value=8, interactive=True, precision=0)
+          b=gr.Number(label="Time of B", value=8, interactive=True, precision=0)
+          w=gr.Number(label="Time of W", value=8, interactive=True, precision=0)
+          c=gr.Number(label="Time of one P2P communication", value=1, interactive=True, precision=0)
+  with gr.Group():
+    gr.Markdown("Activation memory limit.")
+    def update_mem(p, s, mem):
+      print("update")
+      if s=="custom":
+        return mem
+      return p*int(s[:-1])
+    memsel=gr.Radio(choices=["1p", "2p", "3p", "custom"], value="1p")
+    mem=gr.Number(label="Custom memory limit in terms of pending F on a stage. For ZBV schedules, this is relative to two virtual stages on a stage combined.", value=p.value, interactive=True, precision=0)
+    memsel.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
+    p.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
+  button=gr.Button("Calculate")
+  with gr.Group():
+    gr.Markdown("1F1B")
+    with gr.Row():
+      with gr.Column(scale=1):
+        baseline_time=gr.Textbox("", label="Longest Stage Time")
+        baseline_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
+        baseline_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+      with gr.Column(scale=4):
+        baseline_image=gr.Image(None, interactive=False, label="Schedule Image")
+  with gr.Group():
+    gr.Markdown("Zero Bubble Schedule")
+    with gr.Row():
+      with gr.Column(scale=1):
+        zb_time=gr.Textbox("", label="Longest Stage Time")
+        zb_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
+        zb_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+      with gr.Column(scale=4):
+        zb_image=gr.Image(None, interactive=False, label="Schedule Image")
+  with gr.Group():
+    gr.Markdown("Zero Bubble V Schedule")
+    with gr.Row():
+      with gr.Column(scale=1):
+        zbv_time=gr.Textbox("", label="Longest Stage Time")
+        zbv_bubble=gr.Textbox("", label="Bubble Rate. Calculated as (1 - longest stage time/(F+B+W)/m).")
+        zbv_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
+      with gr.Column(scale=4):
+        zbv_image=gr.Image(None, interactive=False, label="Schedule Image")
+    button.click(calculate, inputs=[p, m, f, b, w, c, mem], outputs=[baseline_time, baseline_bubble, baseline_acceleration, baseline_image, zb_time, zb_bubble, zb_acceleration, zb_image, zbv_time, zbv_bubble, zbv_acceleration, zbv_image])
+demo.launch()

auto_schedule.py ADDED Viewed

	@@ -0,0 +1,564 @@

+from dataclasses import dataclass
+from typing import List, Set
+@dataclass
+class GraphConfig:
+    mem_f: float = 2
+    mem_b: float = -1
+    mem_w: float = -1
+    max_mem: float = None
+    cost_f: int = 1
+    cost_b: int = 1
+    cost_w: int = 1
+    cost_comm: int = 0
+    print_scaling: int = 1
+    def __post_init__(self):
+        assert type(self.cost_f) is int
+        assert type(self.cost_b) is int
+        assert type(self.cost_w) is int
+        assert type(self.cost_comm) is int
+        assert self.mem_f + self.mem_b + self.mem_w == 0
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    type: str
+    stage: int
+    minibatch: int
+    start_time: int
+    completion_time: int
+    rollback: bool = False
+# Dependency graph over the F, B and W passes of every (stage, microbatch) pair,
+# together with a greedy scheduler (manual_order) that assigns completion times.
+@dataclass
+class Graph:
+    # Number of pipeline stages.
+    nstages: int
+    # Number of microbatches.
+    nmb: int
+    # Total number of nodes; build_graph sets it to nstages * nmb * 3.
+    nnodes: int
+    config: GraphConfig
+    # parents[id] is the set of node ids that node `id` depends on (filled by build_graph).
+    parents: List[Set[int]] = None
+    # name[id] is a label such as 'F3', 'B0' or 'W7' (filled by build_graph).
+    name: List[str] = None
+    # ID mapping:
+    # F[stage][minibatch]: 0..STAGE* MB
+    # B[stage][minibatch]: STAGE* MB .. 2 * STAGE * MB
+    # W[stage][minibatch]: 2 * STAGE* MB .. 3 * STAGE * MB
+    # `type` is 0 for F, 1 for B and 2 for W.
+    def get_id(self, type, stage, mb):
+        return type * (self.nstages * self.nmb) + stage * self.nmb + mb
+    # Inverse of get_id for the stage component.
+    def get_stage(self, id):
+        return (id // self.nmb) % self.nstages
+    # Time cost of a node, selected by its type (F/B/W) from the config.
+    def get_cost(self, id):
+        type = id // (self.nstages * self.nmb)
+        return [self.config.cost_f, self.config.cost_b, self.config.cost_w][type]
+    # Memory delta of a node, selected by its type (F/B/W) from the config.
+    def get_mem(self, id):
+        type = id // (self.nstages * self.nmb)
+        return [self.config.mem_f, self.config.mem_b, self.config.mem_w][type]
+    # Builds the graph and fills `name` and `parents` in node-id order
+    # (type-major, then stage, then microbatch, matching get_id).
+    @classmethod
+    def build_graph(cls, nstages, nmb, config):
+        nnodes = nstages * nmb * 3
+        g = Graph(nstages=nstages, nmb=nmb, nnodes=nnodes, config=config)
+        parents = []
+        name = []
+        for type in range(3):
+            for stage in range(nstages):
+                for mb in range(nmb):
+                    p = set()
+                    if type == 0:
+                        # F depends on F of the previous stage and on the previous
+                        # microbatch's F on the same stage.
+                        name.append(f'F{mb}')
+                        if stage > 0:
+                            p.add(g.get_id(type, stage - 1, mb))
+                        if mb > 0:
+                            p.add(g.get_id(type, stage, mb - 1))
+                    elif type == 1:
+                        # B depends on F of the same microbatch on the last stage,
+                        # otherwise on B of the next stage; plus the previous B here.
+                        name.append(f'B{mb}')
+                        if stage == nstages - 1:
+                            p.add(g.get_id(0, stage, mb))
+                        else:
+                            p.add(g.get_id(type, stage + 1, mb))
+                        if mb > 0:
+                            p.add(g.get_id(type, stage, mb - 1))
+                    elif type == 2:
+                        # W depends on its own B and on the previous W of the stage.
+                        name.append(f'W{mb}')
+                        p.add(g.get_id(1, stage, mb))
+                        if mb > 0:
+                            p.add(g.get_id(type, stage, mb - 1))
+                    else:
+                        assert False
+                    parents.append(p)
+        g.name = name
+        g.parents = parents
+        return g
+    # Manual ordering producing this kind of schedule:
+    # fffffffbfbfbfbfbfbwbwbwbwbwbwbwwwwww
+    #  fffffbfbfbfbfbfbfbfbwbwbwbwbwwwwwwww
+    #   fffbfbfbfbfbfbfbfbfbfbwbwbwwwwwwwwww
+    #    fbfbfbfbfbfbfbfbfbfbfbfbwwwwwwwwwwww
+    # Returns the order index of each node on its own stage
+    # The full return value is (order, t, best_time): `t` holds the completion time of
+    # every node and `best_time` is the longest per-stage span (see the end of the method).
+    # Flags:
+    #   allow_bubble_before_first_b: changes the stop condition when counting warm-up F's.
+    #   prioritize_b: enables the check that may skip scheduling an F in a round.
+    #   no_bubble_greedy: whether a W is still inserted when the wait before a dependency
+    #     is within the slack (max stage bubble - this stage's bubble).
+    # Raises ValueError("Cannot fit memory") when no pending W can free memory.
+    def manual_order(
+        self, allow_bubble_before_first_b=False, prioritize_b=False, no_bubble_greedy=True
+    ):
+        # Per-node results: order index on its stage.
+        order = [0] * self.nnodes
+        # Per-stage counters of F, B and W passes placed so far.
+        f = [0] * self.nstages
+        b = [0] * self.nstages
+        w = [0] * self.nstages
+        # o: next order index, m: current memory, e: end time of the last placed pass.
+        o = [0] * self.nstages
+        m = [0] * self.nstages
+        e = [0] * self.nstages
+        # Completion time of every node.
+        t = [0] * self.nnodes
+        # Falls back to mem_f * nmb * 3 when config.max_mem is None (or 0).
+        max_mem = self.config.max_mem or self.get_mem(self.get_id(0, 0, 0)) * self.nmb * 3
+        comm = self.config.cost_comm
+        order_str = [""] * self.nstages
+        # Idle time accumulated per stage after its first F was placed.
+        stage_bubble = [0] * self.nstages
+        def get_max_bubble():
+            max_bubble = 0
+            for bb in stage_bubble:
+                max_bubble = max(max_bubble, bb)
+            return max_bubble
+        # Places the next pass of type_k (0=F, 1=B, 2=W) on stage_j and updates all state.
+        def put(stage_j, type_k):
+            if type_k == 0:
+                _i = f[stage_j]
+            elif type_k == 1:
+                _i = b[stage_j]
+            else:
+                _i = w[stage_j]
+            _j = stage_j
+            _id = self.get_id(type_k, _j, _i)
+            _mem = self.get_mem(_id)
+            _cost = self.get_cost(_id)
+            assert m[_j] + _mem <= max_mem
+            # Completion time if the pass starts right after the previous one.
+            tmp = e[_j] + _cost
+            no_bubble = tmp
+            # F waits for the previous stage's F, B for the next stage's B (plus comm).
+            if _j > 0 and type_k == 0:
+                tmp = max(tmp, t[self.get_id(0, _j - 1, _i)] + comm + _cost)
+            if _j < self.nstages - 1 and type_k == 1:
+                tmp = max(tmp, t[self.get_id(1, _j + 1, _i)] + comm + _cost)
+            # Waiting before the very first F of a stage is not counted as bubble.
+            if f[_j] > 0:
+                stage_bubble[_j] += tmp - no_bubble
+            e[_j] = tmp
+            t[_id] = tmp
+            m[_j] += _mem
+            order[_id] = o[_j]
+            if type_k == 0:
+                f[_j] += 1
+            elif type_k == 1:
+                b[_j] += 1
+            else:
+                w[_j] += 1
+            o[_j] += 1
+            fbw = "fbw"
+            order_str[stage_j] += fbw[type_k]
+        # One round per microbatch i: each round ends by placing B of microbatch i
+        # on every stage (last stage first).
+        for i in range(self.nmb):
+            if i == 0:
+                # Warm-up round: first F everywhere, then extra F's before the first B.
+                for j in range(self.nstages):
+                    put(j, 0)
+                f_required = [0] * self.nstages
+                # last_t tracks when the first B finishes on the stage after j.
+                last_t = 0
+                for j in range(self.nstages - 1, -1, -1):
+                    if j == self.nstages - 1:
+                        last_t = t[self.get_id(0, j, i)] + self.get_cost(self.get_id(1, j, i))
+                        continue
+                    mem = m[j]
+                    cost = e[j]
+                    # Count how many more F's fit (memory and time) before the first B arrives.
+                    while True:
+                        f_id = self.get_id(0, j, f[j] + f_required[j])
+                        if f[j] + f_required[j] < self.nmb and mem + self.get_mem(f_id) <= max_mem:
+                            if allow_bubble_before_first_b:
+                                if cost + self.get_cost(f_id) > last_t + comm:
+                                    break
+                            else:
+                                if cost >= last_t + comm:
+                                    break
+                            mem += self.get_mem(f_id)
+                            cost += self.get_cost(f_id)
+                            f_required[j] += 1
+                        else:
+                            break
+                    last_t = max(cost, last_t + comm) + self.get_cost(self.get_id(1, j, i))
+                # Reduce f_required[j] until it is below the previous stage's count
+                # (or hits zero / the microbatch limit).
+                for j in range(self.nstages):
+                    while j > 0 and f_required[j] > 0 and f_required[j] >= f_required[j - 1] and f[j] + f_required[j] < self.nmb:
+                        f_required[j] -= 1
+                for j in range(self.nstages - 1, -1, -1):
+                    for _ in range(f_required[j]):
+                        put(j, 0)
+                    put(j, 1)
+                continue
+            # Steady-state round: decide per stage whether to place one F (f_required is 0/1).
+            f_required = [0] * self.nstages
+            for j in range(self.nstages):
+                if f[j] >= self.nmb:
+                    continue
+                if j + 1 < self.nstages and f[j] >= f[j + 1] + 2 and prioritize_b:
+                    # End time of the next stage after one more F and one more B, plus comm.
+                    next_plus_fw = (
+                        e[j + 1]
+                        + self.get_cost(self.get_id(0, j + 1, f[j + 1]))
+                        + self.get_cost(self.get_id(1, j + 1, b[j + 1]))
+                        + comm
+                    )
+                    if e[j] >= next_plus_fw:
+                        continue
+                    f_id = self.get_id(0, j, f[j])
+                    f_mem = self.get_mem(f_id)
+                    # Cost of the W's that would have to run first to make room for this F.
+                    w_cost, w_cnt = 0, 0
+                    mem_with_w = m[j] + f_mem
+                    while mem_with_w > max_mem and w[j] + w_cnt < b[j]:
+                        w_id = self.get_id(2, j, w[j] + w_cnt)
+                        w_cost += self.get_cost(w_id)
+                        mem_with_w += self.get_mem(w_id)
+                        w_cnt += 1
+                    if e[j] + self.get_cost(f_id) + w_cost <= next_plus_fw:
+                        f_required[j] = 1
+                        continue
+                    w_cost, w_cnt = 0, 0
+                    # mem_with_w = m[j]
+                    # while w[j] + w_cnt < b[j]:
+                    #     w_id = self.get_id(2, j, w[j] + w_cnt)
+                    #     w_cost += self.get_cost(w_id)
+                    #     mem_with_w += self.get_mem(w_id)
+                    #     w_cnt += 1
+                    # if e[j] + w_cost >= next_plus_fw:
+                    #     continue
+                    if next_plus_fw - (e[j] + w_cost) <= get_max_bubble() - stage_bubble[j]:
+                        # TODO: can sample here
+                        continue
+                f_required[j] = 1
+            # A stage only gets an F this round if every later stage gets one too.
+            for j in range(self.nstages - 2, -1, -1):
+                f_required[j] = min(f_required[j], f_required[j + 1])
+            for j in range(self.nstages):
+                if f_required[j] == 0:
+                    continue
+                f_id = self.get_id(0, j, f[j])
+                mem = self.get_mem(f_id)
+                # Run pending W's until the F fits in memory.
+                while m[j] + mem > max_mem:
+                    if w[j] >= b[j]:
+                        raise ValueError("Cannot fit memory")
+                    put(j, 2)
+                if j > 0:
+                    # Fill the wait for the previous stage's F with W's that finish in time.
+                    while (
+                        w[j] < b[j]
+                        and e[j] + self.get_cost(self.get_id(2, j, w[j]))
+                        <= t[self.get_id(0, j - 1, f[j])] + comm
+                    ):
+                        put(j, 2)
+                    if w[j] < b[j] and e[j] < t[self.get_id(0, j - 1, f[j])] + comm:
+                        # TODO: e[j] + self.get_cost(self.get_id(2, j, w[j])) > t[self.get_id(0, j - 1, f[j])] + comm
+                        if (
+                            t[self.get_id(0, j - 1, f[j])] + comm - e[j]
+                            <= get_max_bubble() - stage_bubble[j]
+                        ):
+                            # TODO: can sample here
+                            if no_bubble_greedy:
+                                put(j, 2)
+                        else:
+                            put(j, 2)
+                put(j, 0)
+            # Place B of microbatch i on every stage, last stage first.
+            for j in range(self.nstages - 1, -1, -1):
+                assert b[j] == i
+                b_id = self.get_id(1, j, b[j])
+                mem = self.get_mem(b_id)
+                while m[j] + mem > max_mem:
+                    if w[j] >= b[j]:
+                        raise ValueError("Cannot fit memory")
+                    put(j, 2)
+                if j + 1 < self.nstages:
+                    # Same gap-filling as above, but waiting on the next stage's B.
+                    while (
+                        w[j] < b[j]
+                        and e[j] + self.get_cost(self.get_id(2, j, w[j]))
+                        <= t[self.get_id(1, j + 1, i)] + comm
+                    ):
+                        put(j, 2)
+                    if w[j] < b[j] and e[j] < t[self.get_id(1, j + 1, i)] + comm:
+                        # TODO: e[j] + self.get_cost(self.get_id(2, j, w[j])) > t[self.get_id(1, j + 1, i)] + comm
+                        if (
+                            t[self.get_id(1, j + 1, i)] + comm - e[j]
+                            <= get_max_bubble() - stage_bubble[j]
+                        ):
+                            # TODO: can sample here
+                            if no_bubble_greedy:
+                                put(j, 2)
+                        else:
+                            put(j, 2)
+                # On stage 0, once all F's are placed, flush pending W's before the B.
+                if j == 0 and f[j] == self.nmb:
+                    while w[j] < b[j]:
+                        put(j, 2)
+                put(j, 1)
+        # Flush every remaining W.
+        for i in range(self.nstages):
+            while w[i] < self.nmb:
+                put(i, 2)
+            # print(f"{' ' * i}{order_str[i]}  -> {e[i]}")
+        # Sanity-check that the resulting times respect every dependency.
+        for i in range(self.nstages):
+            for j in range(self.nmb):
+                f_id = self.get_id(0, i, j)
+                b_id = self.get_id(1, i, j)
+                w_id = self.get_id(2, i, j)
+                f_cost = self.get_cost(f_id)
+                b_cost = self.get_cost(b_id)
+                w_cost = self.get_cost(w_id)
+                assert t[b_id] >= t[f_id] + b_cost
+                assert t[w_id] >= t[b_id] + w_cost, f"{i}-{j}, {t[w_id]} >= {t[b_id]} + {w_cost}"
+                if i > 0:
+                    assert t[f_id] >= t[self.get_id(0, i - 1, j)] + comm + f_cost, f"{i}-{j}"
+                if i < self.nstages - 1:
+                    assert t[b_id] >= t[self.get_id(1, i + 1, j)] + comm + b_cost
+        # print(order)
+        # best_time: longest span from the start of a stage's first F to the end of its last W.
+        best_time = 0
+        for i in range(self.nstages):
+            time_i = (
+                t[self.get_id(2, i, self.nmb - 1)]
+                - t[self.get_id(0, i, 0)]
+                + self.get_cost(self.get_id(0, i, 0))
+            )
+            best_time = max(best_time, time_i)
+        return order, t, best_time
+def initial_solution(graph):
+    best_time, order, complete_time = None, None, None
+    for allow_bubble_before_first_b in [True, False]:
+        for prioritize_b in [True, False]:
+            for no_bubble_greedy in [True, False]:
+                order_t, complete_time_t, best_time_t = graph.manual_order(
+                    allow_bubble_before_first_b=allow_bubble_before_first_b,
+                    prioritize_b=prioritize_b,
+                    no_bubble_greedy=no_bubble_greedy,
+                )
+                if best_time is None or best_time_t < best_time:
+                    best_time = best_time_t
+                    order = order_t
+                    complete_time = complete_time_t
+    print_detail(graph, complete_time)
+    print("-" * 20, best_time, "-" * 20)
+    return best_time, order, complete_time
+def print_detail(graph, F):
+    typenames = ['F', 'B', 'W']
+    times = []
+    for stage in range(graph.nstages):
+        stage_str = ['.'] * int(F[graph.get_id(2, stage, graph.nmb - 1)] / graph.config.print_scaling)
+        for _type in range(3):
+            for _mb in range(graph.nmb):
+                _id = graph.get_id(_type, stage, _mb)
+                end = int(F[_id] / graph.config.print_scaling)
+                start = int((F[_id] - graph.get_cost(_id)) / graph.config.print_scaling)
+                for j in range(start, end):
+                    if j == start or j == end - 1:
+                        stage_str[j] = typenames[_type]
+                    elif j == start + 1:
+                        if _mb >= 10:
+                            stage_str[j] = str(_mb // 10)
+                        else:
+                            stage_str[j] = str(_mb)
+                    elif j == start + 2 and _mb >= 10:
+                        stage_str[j] = str(_mb % 10)
+                    else:
+                        stage_str[j] = "-"
+        _str = ""
+        for _c in stage_str:
+            _str += _c
+        times.append(
+            F[graph.get_id(2, stage, graph.nmb - 1)]
+            - F[graph.get_id(0, stage, 0)]
+            + graph.get_cost(graph.get_id(0, stage, 0))
+        )
+        print(_str)
+    print('Longest stage time: ', max(times))
+# Turns the completion times F (indexed by node id) into a per-stage list of
+# ScheduledNode entries: compute nodes, post-validation markers and send/recv
+# communication nodes, sorted by start time, with RECV_FORWARD nodes flagged for
+# rollback where applicable. Also prints the schedule via print_detail.
+# Returns a list with one ordered list of ScheduledNode per stage.
+def ilp_results(graph, F):
+    typenames = ['F', 'B', 'W']
+    local_order = []
+    end_time = []
+    for i in range(graph.nnodes):
+        end_time.append(F[i])
+    # One ScheduledNode per compute pass; start time is completion time minus cost.
+    for stage in range(graph.nstages):
+        order = []
+        for type in range(3):
+            for mb in range(graph.nmb):
+                id = graph.get_id(type, stage, mb)
+                order.append(
+                    ScheduledNode(
+                        type=typenames[type],
+                        stage=stage,
+                        minibatch=mb,
+                        start_time=end_time[id] - graph.get_cost(id),
+                        completion_time=F[id],
+                    )
+                )
+        local_order.append(order)
+    # For each F/B, append a send/recv node. The timestamp of recv node is the same as send node to guarantee a global order.
+    comm_id = {}
+    comm_id_counter = 0
+    # Running maximum across stages, walking from the last stage to the first.
+    post_validation_time = 0
+    for i in range(graph.nstages - 1, -1, -1):
+        # Starts at -1 so that it ends up as (number of F's finishing before the
+        # first B finishes) - 1, i.e. the index of the last such F.
+        warmup_f_count = -1
+        first_b_end = end_time[graph.get_id(1, i, 0)]
+        for j in range(graph.nmb):
+            if end_time[graph.get_id(0, i, j)] < first_b_end:
+                warmup_f_count += 1
+        assert warmup_f_count >= 0
+        pv_id = warmup_f_count
+        _id = graph.get_id(0, i, pv_id)
+        _cost = graph.get_cost(_id)
+        # Start time of that F, moved earlier by one communication cost.
+        post_validation_time = max(post_validation_time, end_time[_id] - _cost - graph.config.cost_comm)
+        # post_validation_time = 0
+        # print(i, pv_id, post_validation_time)
+        # Stage 0 gets no SEND_ marker and the last stage gets no RECV_ marker.
+        for it in ["RECV_", "SEND_", ""]:
+            if i == 0 and it == "SEND_":
+                continue
+            if i == graph.nstages - 1 and it == "RECV_":
+                continue
+            # stage_ = i - 1 if it == "RECV_" else i
+            stage_ = i
+            local_order[stage_].append(ScheduledNode(
+                type=it + "POST_VALIDATION",
+                stage=stage_,
+                minibatch=0,
+                start_time=post_validation_time,
+                completion_time=post_validation_time,
+            ))
+            comm_id[local_order[stage_][-1]] = comm_id_counter
+            comm_id_counter += 1
+    # NOTE: the loops below append to local_order[stage] while iterating over it; the
+    # appended SEND_*/RECV_* nodes match neither 'F' nor 'B', so they add nothing further.
+    for stage in range(graph.nstages):
+        for node in local_order[stage]:
+            if node.type == 'F' and node.stage != graph.nstages - 1:
+                local_order[stage].append(
+                    ScheduledNode(
+                        type='SEND_FORWARD',
+                        stage=stage,
+                        minibatch=node.minibatch,
+                        start_time=node.completion_time,
+                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
+                    )
+                )
+                local_order[stage + 1].append(
+                    ScheduledNode(
+                        type='RECV_FORWARD',
+                        stage=stage + 1,
+                        minibatch=node.minibatch,
+                        start_time=node.completion_time,
+                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
+                    )
+                )
+                # The send and its matching recv share one comm id.
+                comm_id[local_order[stage][-1]] = comm_id_counter
+                comm_id[local_order[stage + 1][-1]] = comm_id_counter
+                comm_id_counter += 1
+            if node.type == 'B' and node.stage != 0:
+                local_order[stage].append(
+                    ScheduledNode(
+                        type='SEND_BACKWARD',
+                        stage=stage,
+                        minibatch=node.minibatch,
+                        start_time=node.completion_time,
+                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
+                    )
+                )
+                local_order[stage - 1].append(
+                    ScheduledNode(
+                        type='RECV_BACKWARD',
+                        stage=stage - 1,
+                        minibatch=node.minibatch,
+                        start_time=node.completion_time,
+                        completion_time=node.completion_time,  # TODO: consider comm cost in completion time
+                    )
+                )
+                comm_id[local_order[stage][-1]] = comm_id_counter
+                comm_id[local_order[stage - 1][-1]] = comm_id_counter
+                comm_id_counter += 1
+    for stage in range(graph.nstages):
+        # For nodes with the same timestamp on the same stage, communication will be prioritized.
+        def even_breaker(x: ScheduledNode):
+            # Compute nodes are always delayed.
+            if x.type in ['F', 'B', 'W']:
+                return comm_id_counter
+            # For comm nodes, order by their unique comm id
+            return comm_id[x]
+        local_order[stage] = list(sorted(
+            local_order[stage], key=lambda x: (x.start_time, even_breaker(x))
+        ))
+        # If a recv intersects with the previous computation, reorder them so that recv
+        # is executed before computation and hence can be overlapped.
+        # (Single pass of adjacent swaps; POST_VALIDATION recvs are never moved.)
+        for i in range(len(local_order[stage])):
+            if i > 0 and local_order[stage][i - 1].type in {'F', 'B', 'W'} and \
+                local_order[stage][i].type.startswith('RECV') and \
+                "POST_VALIDATION" not in local_order[stage][i].type and \
+                local_order[stage][i].start_time <= local_order[stage][i - 1].completion_time:
+                (local_order[stage][i], local_order[stage][i - 1]) = (local_order[stage][i - 1], local_order[stage][i])
+        # print([(x.type, x.start_time, x.completion_time) for x in local_order[stage]])
+    # Rebuild every node with its rollback flag (ScheduledNode is frozen).
+    local_order_with_rollback = [[] for _ in range(graph.nstages)]
+    for rank in range(graph.nstages):
+        # Microbatches whose SEND_FORWARD on the previous rank comes before that
+        # rank's POST_VALIDATION node.
+        rollback_comm = set()
+        if rank > 0:
+            for node in local_order[rank - 1]:
+                if node.type == "POST_VALIDATION":
+                    break
+                if node.type == "SEND_FORWARD":
+                    rollback_comm.add(node.minibatch)
+        for node in local_order[rank]:
+            if node.type == "RECV_FORWARD" and node.minibatch in rollback_comm:
+                rollback = True
+                rollback_comm.remove(node.minibatch)
+            else:
+                rollback = False
+            local_order_with_rollback[rank].append(ScheduledNode(
+                type=node.type,
+                stage=node.stage,
+                minibatch=node.minibatch,
+                start_time=node.start_time,
+                completion_time=node.completion_time,
+                rollback=rollback,
+            ))
+        # Every early send must have been matched by a recv on this rank.
+        assert len(rollback_comm) == 0
+        # for node in local_order_with_rollback[rank]:
+        #     print(f"{node.type}-{node.minibatch}-{int(node.rollback)}", end=', ')
+        # print()
+    print_detail(graph, end_time)
+    return local_order_with_rollback
+def auto_schedule(nstages, nmb, config):
+    graph = Graph.build_graph(nstages, nmb, config)
+    best_time, order, complete_time = initial_solution(graph)
+    return ilp_results(graph, complete_time)
+if __name__ == "__main__":
+    # auto_schedule(4, 12, GraphConfig(cost_f=5, cost_b=6, cost_w=4, cost_comm=0, max_mem=10))
+    # auto_schedule(4, 12, GraphConfig(cost_f=5, cost_b=6, cost_w=4, cost_comm=0, max_mem=14))
+    auto_schedule(24, 72, GraphConfig(cost_f=5, cost_b=6, cost_w=4, cost_comm=0, max_mem=100))
+    auto_schedule(4, 12, GraphConfig(
+        cost_f=5478,
+        cost_b=5806,
+        cost_w=3534,
+        cost_comm=200,
+        max_mem=32,
+        print_scaling=1000
+    ))
+    auto_schedule(32, 16, GraphConfig(
+        cost_f=1,
+        cost_b=1,
+        cost_w=1,
+        cost_comm=0,
+        max_mem=64,
+    ))

v_schedule.py ADDED Viewed

	@@ -0,0 +1,461 @@

+from collections import deque
+from dataclasses import dataclass
+@dataclass(eq=True, frozen=True)
+class ScheduledNode:
+    type: str
+    chunk: int
+    stage: int
+    minibatch: int
+    start_time: int
+    completion_time: int
+    rollback: bool = False
+class PipelineGraph(object):
+    def __init__(
+        self, n_stage, n_micro, f_cost, b_cost, w_cost, c_cost,
+        f_mem, b_mem, w_mem, max_mem=None,
+    ):
+        self.n_node = 6 * n_stage * n_micro
+        self.n_stage = n_stage
+        self.n_micro = n_micro
+        self.f_cost = f_cost
+        self.b_cost = b_cost
+        self.w_cost = w_cost
+        self.c_cost = c_cost
+        self.f_mem = f_mem
+        self.b_mem = b_mem
+        self.w_mem = w_mem
+        self.fbw_cost = [f_cost, b_cost, w_cost]
+        self.fbw_mem = [f_mem, b_mem, w_mem]
+        self.max_mem = max_mem or f_mem * self.n_stage * 2
+    def get_id(self, cat, chunk, stage, micro):
+        return cat * 2 * self.n_stage * self.n_micro + \
+               chunk * self.n_stage * self.n_micro + \
+               stage * self.n_micro + \
+               micro
+    def try_v_schedule(self, fill_f=True, fill_b=True, approved_bubble=None):
+        """Greedily build one candidate schedule and report its largest bubble.
+
+        Nodes are identified by ``(cat, chunk, stage, micro)`` where ``cat`` is
+        0/1/2 for F/B/W (see the ``"FfBbWw"`` label lookup in ``put``) and
+        ``chunk`` is 0 or 1.
+
+        Args:
+            fill_f: if True, one extra pending W may be placed in front of an F
+                whose wait exceeds the bubble this stage can still absorb.
+            fill_b: same switch for a chunk-0 B (a chunk-1 B always gets the
+                extra W when the condition holds).
+            approved_bubble: per-stage bubble totals of a previous attempt, or
+                None for the first attempt (treated as ``[-1] * n_stage``).
+
+        Returns:
+            ``(schedule, end_time, max_bubble)``: ``schedule[stage]`` lists
+            ``(cat, chunk, micro)`` tuples in placement order, ``end_time`` is
+            indexed by ``self.get_id(...)`` and holds -1 for nodes never
+            placed, and ``max_bubble`` is the largest accumulated per-stage
+            bubble.
+        """
+        # count[stage][cat * 2 + chunk]: how many nodes of each kind have been placed on the stage.
+        count = []
+        for i in range(self.n_stage):
+            count.append([0] * 6)
+        # -1 marks a node that has not been scheduled yet.
+        end_time = [-1] * self.n_node
+        cur_time = [0] * self.n_stage
+        mem = [0] * self.n_stage
+        stage_bubble = [0] * self.n_stage
+        # W nodes whose B has already been placed, in FIFO order per stage.
+        pending_w = [deque() for _ in range(self.n_stage)]
+        schedule = [[] for _ in range(self.n_stage)]
+        # Textual trace of each stage; only consumed by the commented-out debug print near the end.
+        stage_str = ["    " * i for i in range(self.n_stage)]
+        if approved_bubble is None:
+            approved_bubble = [-1] * self.n_stage
+        max_approved_bubble = max(approved_bubble)
+        def get_max_stage_bubble(stage=-1):
+            # Largest bubble accumulated so far over all stages (never below 0). When a stage is given,
+            # the gap between that stage's approved bubble and the largest approved bubble also counts.
+            max_stage_bubble = 0
+            for bb in stage_bubble:
+                max_stage_bubble = max(max_stage_bubble, bb)
+            if stage >= 0:
+                max_stage_bubble = max(max_stage_bubble, max_approved_bubble - approved_bubble[stage])
+            return max_stage_bubble
+        def put_w(stage):
+            # Place the oldest pending W of this stage.
+            assert len(pending_w[stage]) > 0
+            _, chunk_, _ = pending_w[stage].popleft()
+            put(2, chunk_, stage)
+        def put(cat, chunk, stage, assert_cnt=True):
+            # Place the next node of kind (cat, chunk) on `stage` and update all bookkeeping.
+            # _no_bubble is the completion time if the node started right away; _tmp may be pushed
+            # later by a dependency on a neighbouring stage.
+            _tmp = _no_bubble = cur_time[stage] + self.fbw_cost[cat]
+            _cnt = count[stage][cat * 2 + chunk]
+            # assert _cnt < self.n_micro
+            if _cnt >= self.n_micro:
+                # All micro-batches of this kind are already placed. With assert_cnt=False the slot is
+                # still consumed: the stage clock advances by the node cost but nothing is recorded.
+                if not assert_cnt:
+                    stage_str[stage] += "    "
+                    cur_time[stage] = _tmp  # TODO
+                    return
+                assert False
+            assert mem[stage] + self.fbw_mem[cat] <= self.max_mem
+            stage_str[stage] += "FfBbWw"[cat * 2 + chunk] + str(_cnt + 1) + " " * (3 - len(str(_cnt + 1)))
+            # Same-stage dependency: for F/B the preceding kind in "FfBb" order must be done for this
+            # micro-batch; for W the B of the same chunk must be done.
+            if cat > 0 or chunk > 0:
+                last_id = cat * 2 + chunk - 1
+                if cat < 2:
+                    # if end_time[self.get_id(last_id // 2, last_id % 2, stage, _cnt)] < 0:
+                    #     print(cat, chunk, stage, _cnt)
+                    #     self.print_details(end_time)
+                    assert end_time[self.get_id(last_id // 2, last_id % 2, stage, _cnt)] >= 0
+                else:
+                    assert end_time[self.get_id(1, chunk, stage, _cnt)] >= 0
+            # Cross-stage dependency for F/B: chunk 1 waits for the same node on stage + 1,
+            # chunk 0 waits for the same node on stage - 1, plus the communication cost.
+            if chunk == 1 and cat < 2:
+                if stage < self.n_stage - 1:
+                    _fa_id = self.get_id(cat, chunk, stage + 1, _cnt)
+                    assert end_time[_fa_id] >= 0
+                    _tmp = max(_tmp, end_time[_fa_id] + self.c_cost + self.fbw_cost[cat])
+            if chunk == 0 and cat < 2:
+                if stage > 0:
+                    _fa_id = self.get_id(cat, chunk, stage - 1, _cnt)
+                    # if end_time[_fa_id] < 0:
+                    #     print(cat, chunk, stage, _cnt)
+                    #     self.print_details(end_time)
+                    assert end_time[_fa_id] >= 0, f"{cat}, {chunk}, {stage}, {_cnt}"
+                    _tmp = max(_tmp, end_time[_fa_id] + self.c_cost + self.fbw_cost[cat])
+            _id = self.get_id(cat, chunk, stage, _cnt)
+            # Idle time only counts once the stage has already placed at least one chunk-0 F.
+            if count[stage][0] > 0:
+                stage_bubble[stage] += _tmp - _no_bubble
+            end_time[_id] = _tmp
+            cur_time[stage] = _tmp
+            mem[stage] += self.fbw_mem[cat]
+            # noinspection PyTypeChecker
+            schedule[stage].append((cat, chunk, _cnt))
+            if cat == 1:
+                # Every placed B leaves a W to be scheduled later.
+                pending_w[stage].append((2, chunk, _cnt))
+            count[stage][cat * 2 + chunk] += 1
+        # Initial phase: 2 * n_stage rounds in which every stage takes one F slot.
+        for _ in range(2 * self.n_stage):
+            for i in range(self.n_stage):
+                # Chunk 1 never gets ahead of chunk 0 on a stage.
+                if count[i][1] >= count[i][0]:
+                    put(0, 0, i, assert_cnt=False)
+                    continue
+                if i == self.n_stage - 1:
+                    put(0, 1, i, assert_cnt=False)
+                    continue
+                # Take chunk 1 only if the next stage finished the matching chunk-1 F before its current clock.
+                fa_id = self.get_id(0, 1, i + 1, count[i][1])
+                if 0 <= end_time[fa_id] < cur_time[i + 1]:  # TODO
+                    put(0, 1, i, assert_cnt=False)
+                else:
+                    put(0, 0, i, assert_cnt=False)
+        # for i in range(self.n_stage):
+        #     put(0, 0, i)
+        # for i in range(self.n_stage - 1, -1, -1):
+        #     if i == self.n_stage - 1:
+        #         put(0, 1, i)
+        #         continue
+        #     tmp = end_time[self.get_id(0, 1, i + 1, 0)] + self.c_cost
+        #     while mem[i] + self.fbw_mem[0] * (2 + i * 2) <= self.max_mem and cur_time[i] + self.fbw_cost[0] <= tmp and count[i][0] < self.n_micro:
+        #         for j in range(i + 1):
+        #             put(0, 0, j)
+        #     put(0, 1, i)
+        # iter_chunk_ = 0
+        # end_tmp = 0
+        # for i in range(self.n_stage):
+        #     if i == 0:
+        #         end_tmp = cur_time[0] + self.fbw_cost[1]
+        #         continue
+        #     tmp = end_tmp + self.c_cost
+        #     while count[i][0] + count[i][1] < count[i - 1][0] + count[i - 1][1]:
+        #         for j in range(self.n_stage - 1, i - 1, -1):
+        #             if count[j][iter_chunk_] < self.n_micro:
+        #                 put(0, iter_chunk_, j)
+        #         iter_chunk_ = 1 - iter_chunk_
+        #     # while mem[i] + self.fbw_mem[0] <= self.max_mem and cur_time[i] + self.fbw_cost[0] <= tmp:
+        #     #     if iter_chunk_ == 0 and count[i][0] >= count[i - 1][0]:
+        #     #         break
+        #     #     for j in range(self.n_stage - 1, i - 1, -1):
+        #     #         if count[j][iter_chunk_] < self.n_micro:
+        #     #             put(0, iter_chunk_, j)
+        #     #     iter_chunk_ = 1 - iter_chunk_
+        #     # end_tmp = max(tmp, cur_time[i]) + self.fbw_cost[1]
+        # init_bubble = get_max_stage_bubble()
+        # print(stage_bubble)
+        # Main phase: each of the 2 * n_micro rounds places one B on every stage and at most one F.
+        for _ in range(2 * self.n_micro):
+            # check mem before putting b
+            for i in range(self.n_stage):
+                while mem[i] + self.fbw_mem[1] > self.max_mem:
+                    assert len(pending_w[i]) > 0
+                    put_w(i)
+            # Decide per stage which B chunk comes next (count[i][2] / count[i][3] are the chunk-0 / chunk-1 B counts).
+            b0_ranks, b1_ranks = [], []
+            for i in range(self.n_stage):
+                if count[i][3] >= count[i][2]:
+                    b0_ranks.append(i)
+                elif i == self.n_stage - 1:
+                    b1_ranks.append(i)
+                else:
+                    fa_id = self.get_id(1, 1, i + 1, count[i][3])
+                    if end_time[fa_id] >= 0 or count[i][2] >= self.n_micro:
+                        b1_ranks.append(i)
+                    else:
+                        b0_ranks.append(i)
+            # Placement order follows the dependency direction: chunk 1 from the last stage backwards,
+            # then chunk 0 from the first stage forwards.
+            b_ranks = []
+            # put b1
+            for i in reversed(b1_ranks):
+                b_ranks.append((i, 1))
+            # put b0
+            for i in b0_ranks:
+                b_ranks.append((i, 0))
+            for i, _chunk_ in b_ranks:
+                # fa_id stays -1 when the B has no cross-stage dependency (boundary stage).
+                fa_id = -1
+                if _chunk_ == 1 and i < self.n_stage - 1:
+                    fa_id = self.get_id(1, 1, i + 1, count[i][3])
+                if _chunk_ == 0 and i > 0:
+                    fa_id = self.get_id(1, 0, i - 1, count[i][2])
+                while len(pending_w[i]) > 0 and fa_id >= 0 and end_time[fa_id] + self.c_cost >= cur_time[i] + self.fbw_cost[2]:
+                    # fill the bubble
+                    put_w(i)
+                # Remaining wait is shorter than one W; place one more W anyway if the wait exceeds the
+                # bubble this stage can still absorb without becoming the worst stage.
+                # NOTE(review): when fa_id is still -1 this reads end_time[-1], i.e. the last entry of
+                # end_time rather than a dependency - confirm this is intended.
+                if len(pending_w[i]) > 0 and end_time[fa_id] + self.c_cost - cur_time[i] > get_max_stage_bubble(i) - stage_bubble[i]:
+                    if _chunk_ == 1:
+                        put_w(i)
+                    elif fill_b:
+                        put_w(i)
+                put(1, _chunk_, i)
+            # put f
+            for i in range(self.n_stage):
+                # Nothing left once all chunk-1 F nodes are placed.
+                if count[i][1] >= self.n_micro:
+                    continue
+                # put_item is the F chunk to place, or None when no F is ready on this stage.
+                put_item = None
+                if count[i][1] >= count[i][0]:
+                    put_item = 0
+                elif i == self.n_stage - 1:
+                    put_item = 1
+                else:
+                    if end_time[self.get_id(0, 1, i + 1, count[i][1])] >= 0:
+                        put_item = 1
+                    elif count[i][0] < self.n_micro:
+                        if i == 0:
+                            put_item = 0
+                        elif end_time[self.get_id(0, 0, i - 1, count[i][0])] >= 0:
+                            put_item = 0
+                if put_item is None:
+                    continue
+                # check mem before putting f
+                while mem[i] + self.fbw_mem[0] > self.max_mem:
+                    assert len(pending_w[i]) > 0
+                    put_w(i)
+                fa_id = -1
+                if put_item == 0 and i > 0:
+                    fa_id = self.get_id(0, 0, i - 1, count[i][0])
+                if put_item == 1 and i < self.n_stage - 1:
+                    fa_id = self.get_id(0, 1, i + 1, count[i][1])
+                while len(pending_w[i]) > 0 and fa_id >= 0 and end_time[fa_id] + self.c_cost >= cur_time[i] + self.fbw_cost[2]:
+                    # fill the bubble
+                    put_w(i)
+                # Same extra-W rule as for B, gated by fill_f.
+                # NOTE(review): fa_id may be -1 here as well, in which case end_time[-1] is read - confirm.
+                if len(pending_w[i]) > 0 and end_time[fa_id] + self.c_cost - cur_time[i] > get_max_stage_bubble(i) - stage_bubble[i]:
+                    if fill_f:
+                        put_w(i)
+                put(0, put_item, i)
+        # Flush every W that is still pending.
+        for i in range(self.n_stage):
+            while len(pending_w[i]) > 0:
+                put_w(i)
+        # for i in range(self.n_stage):
+        #     print(stage_str[i])
+        max_bubble = get_max_stage_bubble()
+        # expected_time and bubble_rate are only used by the commented-out prints below.
+        expected_time = sum(self.fbw_cost) * self.n_micro * 2
+        bubble_rate = max_bubble / expected_time
+        # print("%6.4f" % bubble_rate, "->", stage_bubble)
+        # On the first attempt, or when this attempt beat the approved bubble, retry with this attempt's
+        # per-stage bubbles as the new approved values; keep the retry only if it is strictly better.
+        if max_approved_bubble < 0 or max_bubble < max_approved_bubble:
+            _schedule, _end_time, _max_bubble = self.try_v_schedule(
+                fill_f=fill_f, fill_b=fill_b,
+                approved_bubble=stage_bubble,
+            )
+            if _max_bubble < max_bubble:
+                return _schedule, _end_time, _max_bubble
+        # print("%2d %3d, [%5d %5d %5d], %6d -> %6.4f %6.4f" % \
+        #       (self.n_stage, self.n_micro, *self.fbw_cost, self.max_mem // self.f_mem, init_bubble / expected_time, bubble_rate), max_bubble)
+        return schedule, end_time, max_bubble
+    def print_details(self, end_time, print_scaling=1):
+        for stage in range(self.n_stage):
+            stage_str = ['.'] * int(max(end_time) / print_scaling)
+            for _cat in range(3):
+                for _chunk in range(2):
+                    for _micro in range(self.n_micro):
+                        _id = self.get_id(_cat, _chunk, stage, _micro)
+                        if end_time[_id] < 0:
+                            continue
+                        end = int(end_time[_id] / print_scaling)
+                        start = int((end_time[_id] - self.fbw_cost[_cat]) / print_scaling)
+                        for j in range(start, end):
+                            if j == start or j == end - 1:
+                                stage_str[j] = "FfBbWw"[_cat * 2 + _chunk]
+                            elif j == start + 1:
+                                if _micro >= 10:
+                                    stage_str[j] = str(_micro // 10)
+                                else:
+                                    stage_str[j] = str(_micro)
+                            elif j == start + 2 and _micro >= 10:
+                                stage_str[j] = str(_micro % 10)
+                            else:
+                                stage_str[j] = "-"
+            _str = ""
+            for _c in stage_str:
+                _str += _c
+            print(_str)
+    def get_v_schedule(self):
+        schedule, end_time, max_bubble = None, None, None
+        expected_time = sum(self.fbw_cost) * self.n_micro * 2
+        for fill_b in [True, False]:
+            for fill_f in [True, False]:
+                _schedule, _end_time, _max_bubble = self.try_v_schedule(
+                    fill_b=fill_b, fill_f=fill_f
+                )
+                # print("")
+                if max_bubble is None or _max_bubble < max_bubble:
+                    max_bubble = _max_bubble
+                    schedule = _schedule
+                    end_time = _end_time
+        # self.print_details(end_time, print_scaling=1)
+        bubble_rate = max_bubble / expected_time
+        print("%2d %3d, [%5d %5d %5d], %6d -> %6.4f" % \
+              (self.n_stage, self.n_micro, *self.fbw_cost, self.max_mem // self.f_mem, bubble_rate))
+        local_order = [[] for _ in range(self.n_stage)]
+        comm_id = {}
+        comm_id_counter = 0
+        post_validation_time = 0
+        for i in range(self.n_stage - 1, -1, -1):
+            pv_id = min(2 * (self.n_stage - 1 - i), self.n_micro - 1)
+            post_validation_time = max(post_validation_time, end_time[self.get_id(0, 0, i, pv_id)] - self.fbw_cost[0] - self.c_cost)
+            # post_validation_time = 0
+            # print(i, pv_id, post_validation_time)
+            for it in ["RECV_", "SEND_", ""]:
+                if i == 0 and it == "SEND_":
+                    continue
+                if i == self.n_stage - 1 and it == "RECV_":
+                    continue
+                # stage_ = i - 1 if it == "RECV_" else i
+                stage_ = i
+                local_order[stage_].append(ScheduledNode(
+                    type=it + "POST_VALIDATION",
+                    chunk=0,
+                    stage=stage_,
+                    minibatch=0,
+                    start_time=post_validation_time,
+                    completion_time=post_validation_time,
+                ))
+                comm_id[local_order[stage_][-1]] = comm_id_counter
+                comm_id_counter += 1
+        for i in range(self.n_stage):
+            for _cat_, _chunk_, _micro_ in schedule[i]:
+                complete_time = end_time[self.get_id(_cat_, _chunk_, i, _micro_)]
+                local_order[i].append(ScheduledNode(
+                    type="FBW"[_cat_],
+                    chunk=_chunk_ if _cat_ == 0 else 1 - _chunk_,
+                    stage=i,
+                    minibatch=_micro_,
+                    start_time=complete_time - self.fbw_cost[_cat_],
+                    completion_time=complete_time,
+                ))
+                if _cat_ == 2: # no communication for W
+                    continue
+                cat_str = "FORWARD" if _cat_ == 0 else "BACKWARD"
+                def communicate(send_recv, stage_):
+                   # noinspection PyTypeChecker
+                    local_order[stage_].append(ScheduledNode(
+                        type=send_recv + cat_str,
+                        chunk=_chunk_ if _cat_ == 0 else 1 - _chunk_,
+                        stage=stage_,
+                        minibatch=_micro_,
+                        start_time=complete_time,
+                        completion_time=complete_time,
+                    ))
+                    comm_id[local_order[stage_][-1]] = comm_id_counter
+                if _chunk_ == 1 and i > 0:
+                    communicate("SEND_", i)
+                    communicate("RECV_", i - 1)
+                if _chunk_ == 0 and i < self.n_stage - 1:
+                    communicate("SEND_", i)
+                    communicate("RECV_", i + 1)
+                comm_id_counter += 1
+        for rank in range(self.n_stage):
+            # For nodes with the same timestamp on the same stage, communication will be prioritized.
+            def even_breaker(x: ScheduledNode):
+                # Compute nodes are always delayed.
+                if x.type in ['F', 'B', 'W']:
+                    return comm_id_counter
+                # For comm nodes, order by their unique comm id
+                return comm_id[x]
+            local_order[rank] = list(sorted(
+                local_order[rank],
+                key=lambda x: (x.start_time, even_breaker(x))
+            ))
+            # If a recv with intersects with previous computation, reorder them so that recv
+            # is executed before computation and hence can be overlapped.
+            for i in range(len(local_order[rank])):
+                if i > 0 and local_order[rank][i - 1].type in {'F', 'B', 'W'} and \
+                    local_order[rank][i].type.startswith('RECV') and \
+                    "POST_VALIDATION" not in local_order[rank][i].type and \
+                    local_order[rank][i].start_time <= local_order[rank][i - 1].completion_time:
+                    local_order[rank][i], local_order[rank][i - 1] = local_order[rank][i - 1], local_order[rank][i]
+        local_order_with_rollback = [[] for _ in range(self.n_stage)]
+        for rank in range(self.n_stage):
+            rollback_comm = set()
+            if rank > 0:
+                for node in local_order[rank - 1]:
+                    if node.type == "POST_VALIDATION":
+                        break
+                    if node.type == "SEND_FORWARD":
+                        assert node.chunk == 0
+                        rollback_comm.add(node.minibatch)
+            for node in local_order[rank]:
+                if node.type == "RECV_FORWARD" and node.chunk == 0 and node.minibatch in rollback_comm:
+                    rollback = True
+                    rollback_comm.remove(node.minibatch)
+                else:
+                    rollback = False
+                local_order_with_rollback[rank].append(ScheduledNode(
+                    type=node.type,
+                    chunk=node.chunk,
+                    stage=node.stage,
+                    minibatch=node.minibatch,
+                    start_time=node.start_time,
+                    completion_time=node.completion_time,
+                    rollback=rollback,
+                ))
+            assert len(rollback_comm) == 0
+            for node in local_order_with_rollback[rank]:
+                print(f"{node.type}-{node.minibatch}-{int(node.rollback)}", end=', ')
+            print()
+        return local_order_with_rollback
+if __name__ == '__main__':
+    settings = [
+        # p,   n,     f,     b,     w,   c,    h,  a,  l
+        # (8, 24, 18522, 18086, 9337, 601, 2304, 24, 24),
+        # (8, 32, 18513, 18086, 9331, 626, 2304, 24, 24),
+        # (8, 64, 18546, 18097, 9321, 762, 2304, 24, 24),
+        # (8, 24, 29718, 29444, 19927, 527, 4096, 32, 32),
+        # (8, 32, 29802, 29428, 19530, 577, 4096, 32, 32),
+        # (8, 64, 29935, 29621, 19388, 535, 4096, 32, 32),
+        # (16, 48, 11347, 11248, 8132, 377, 5120, 40, 48),
+        # (16, 64, 11307, 11254, 8101, 379, 5120, 40, 48),
+        # (16, 128, 11325, 11308, 8109, 378, 5120, 40, 48),
+        # (32, 96, 10419, 10207, 7715, 408, 6144, 48, 64),
+        # (32, 128, 10408, 10204, 7703, 408, 6144, 48, 64),
+        # (32, 256, 10402, 10248, 7698, 460, 6144, 48, 64),
+        (4, 8, 6, 4, 4, 1, 4096, 32, 32),
+        # (8, 24, 29444, 29718, 19927, 527, 4096, 32, 32),
+    ]
+    s = 1024
+    # h, a, s = 4096, 32, 1024
+    # cost_f, cost_b, cost_w, cost_c = 29718, 29444, 19927, 527
+    for p, n, f, b, w, c, h, a, l in settings:
+        mem_f = 34 * h + 5 * a * s
+        mem_w = - 32 * h
+        mem_b = - mem_w - mem_f
+        for m_offset in range(p + 1):
+            graph = PipelineGraph(
+                n_stage=p,
+                n_micro=n,
+                f_cost=f,
+                b_cost=b,
+                w_cost=w,
+                c_cost=c,
+                f_mem=mem_f,
+                b_mem=mem_b,
+                w_mem=mem_w,
+                max_mem=mem_f * (p * 2 + m_offset),
+            )
+            graph.get_v_schedule()