File size: 19,859 Bytes
d1ed09d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
from __future__ import annotations

import asyncio
import logging
from collections import defaultdict, deque
from collections.abc import Container
from functools import partial
from math import log2
from time import time
from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast

from tlz import topk

import dask
from dask.utils import parse_timedelta

from distributed.compatibility import PeriodicCallback
from distributed.core import CommClosedError
from distributed.diagnostics.plugin import SchedulerPlugin
from distributed.utils import log_errors, recursive_to_dict

if TYPE_CHECKING:
    # Recursive imports
    from distributed.scheduler import Scheduler, SchedulerState, TaskState, WorkerState

# Stealing requires multiple network bounces and, if successful, also task
# submission, which may include code serialization. Therefore, be very
# conservative in the latency estimation to suppress overly aggressive
# stealing of small tasks.
# This constant is added to every transfer-time estimate in
# `WorkStealing.steal_time_ratio`.
LATENCY = 0.1

logger = logging.getLogger(__name__)


# When truthy, unexpected errors in the steal request/confirm paths drop into
# pdb before being re-raised
LOG_PDB = dask.config.get("distributed.admin.pdb-on-err")

# Task states reported by the victim for which the steal is enacted, i.e. the
# task is reassigned to the thief (unless the thief is no longer registered,
# in which case the task is rescheduled)
_WORKER_STATE_CONFIRM = {
    "ready",
    "constrained",
    "waiting",
}

# Task states reported by the victim for which the steal is abandoned and
# only logged as "already-computing"
_WORKER_STATE_REJECT = {
    "memory",
    "executing",
    "long-running",
    "cancelled",
    "resumed",
}
# Task states reported by the victim for which the task is rescheduled
_WORKER_STATE_UNDEFINED = {
    "released",
    None,
}


class InFlightInfo(TypedDict):
    """Bookkeeping record for one steal request awaiting the victim's response."""

    # Worker the task is being taken from
    victim: WorkerState
    # Worker the task is meant to be moved to
    thief: WorkerState
    # Task duration plus communication cost estimated for the victim
    victim_duration: float
    # Task duration plus communication cost estimated for the thief
    thief_duration: float
    # Unique ID of the request; the response must carry the same ID
    stimulus_id: str


class WorkStealing(SchedulerPlugin):
    """Scheduler plugin that moves queued tasks from busy workers to idle ones.

    ``balance`` is installed as a periodic callback by ``start``. A steal is a
    two-step handshake: ``move_task_request`` asks the victim to release a
    task and ``move_task_confirm`` handles the victim's answer.
    """

    scheduler: Scheduler
    # {worker: ({ task states for level 0}, ..., {task states for level 14})}
    stealable: dict[str, tuple[set[TaskState], ...]]
    # { task state: (worker, level) }
    key_stealable: dict[TaskState, tuple[str, int]]
    # (multiplier for level 0, ... multiplier for level 14)
    cost_multipliers: ClassVar[tuple[float, ...]] = (1.0,) + tuple(
        1 + 2 ** (i - 6) for i in range(1, 15)
    )
    # Interval between `balance` runs; multiplied by 1000 in `start`
    _callback_time: float
    # Number of `balance` runs that issued at least one steal request
    count: int
    # { task state: <stealing info dict> }
    in_flight: dict[TaskState, InFlightInfo]
    # { worker state: occupancy }
    in_flight_occupancy: defaultdict[WorkerState, float]
    # { worker state: net change in task count caused by in-flight steals }
    in_flight_tasks: defaultdict[WorkerState, int]
    # { metric name: { level: accumulated value } }
    metrics: dict[str, dict[int, float]]
    # Set while no steal request is in flight; `stop` waits on it
    _in_flight_event: asyncio.Event
    # Monotonic counter used to build unique stimulus IDs
    _request_counter: int

    def __init__(self, scheduler: Scheduler):
        """Attach the work-stealing extension to ``scheduler``.

        Registers this instance as a scheduler plugin, creates the "stealing"
        event log and installs ``move_task_confirm`` as the handler for
        ``steal-response`` stream messages. The periodic callback that runs
        ``balance`` is created in ``start``, not here.
        """
        self.scheduler = scheduler
        self.stealable = {}
        self.key_stealable = {}

        # Create the per-level stealable sets for workers that already exist
        for worker in scheduler.workers:
            self.add_worker(worker=worker)

        # Interval between two `balance` runs, read from the dask config
        self._callback_time = cast(
            float,
            parse_timedelta(
                dask.config.get("distributed.scheduler.work-stealing-interval"),
                default="ms",
            ),
        )
        # `start` multiplies `_callback_time` by 1000 before passing it to
        # PeriodicCallback as `callback_time`
        self.scheduler.add_plugin(self)
        self.scheduler.events["stealing"] = deque(maxlen=100000)
        self.count = 0
        self.in_flight = {}
        self.in_flight_occupancy = defaultdict(lambda: 0)
        self.in_flight_tasks = defaultdict(lambda: 0)
        self._in_flight_event = asyncio.Event()
        self.metrics = {
            "request_count_total": defaultdict(lambda: 0),
            "request_cost_total": defaultdict(lambda: 0),
        }
        self._request_counter = 0
        # Responses of victims to "steal-request" messages arrive here
        self.scheduler.stream_handlers["steal-response"] = self.move_task_confirm

    async def start(self, scheduler: Any = None) -> None:
        """Start the background coroutine to balance the tasks on the cluster.
        Idempotent.
        The scheduler argument is ignored. It is merely required to satisfy the
        plugin interface. Since this class is simultaneously an extension, the
        scheduler instance is already registered during initialization
        """
        if "stealing" in self.scheduler.periodic_callbacks:
            return
        pc = PeriodicCallback(
            callback=self.balance, callback_time=self._callback_time * 1000
        )
        pc.start()
        self.scheduler.periodic_callbacks["stealing"] = pc
        self._in_flight_event.set()

    async def stop(self) -> None:
        """Stop the background task balancing tasks on the cluster.
        This will block until all currently running stealing requests are
        finished. Idempotent
        """
        pc = self.scheduler.periodic_callbacks.pop("stealing", None)
        if pc:
            pc.stop()
        await self._in_flight_event.wait()

    def _to_dict_no_nest(self, *, exclude: Container[str] = ()) -> dict:
        """Build a debugging snapshot of this extension as a dictionary.

        The result is not type stable and is not meant to be round-tripped.

        See also
        --------
        Client.dump_cluster_state
        distributed.utils.recursive_to_dict
        """
        snapshot = recursive_to_dict(self, exclude=exclude, members=True)
        return snapshot

    def log(self, msg: Any) -> None:
        return self.scheduler.log_event("stealing", msg)

    def add_worker(self, scheduler: Any = None, worker: Any = None) -> None:
        self.stealable[worker] = tuple(set() for _ in range(15))

    def remove_worker(self, scheduler: Scheduler, worker: str) -> None:
        del self.stealable[worker]

    def teardown(self) -> None:
        pcs = self.scheduler.periodic_callbacks
        if "stealing" in pcs:
            pcs["stealing"].stop()
            del pcs["stealing"]

    def transition(
        self,
        key: str,
        start: str,
        finish: str,
        compute_start: Any = None,
        compute_stop: Any = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        if finish == "processing":
            ts = self.scheduler.tasks[key]
            self.put_key_in_stealable(ts)
        elif start == "processing":
            ts = self.scheduler.tasks[key]
            self.remove_key_from_stealable(ts)
            self._remove_from_in_flight(ts)

    def _add_to_in_flight(self, ts: TaskState, info: InFlightInfo) -> None:
        self.in_flight[ts] = info
        self._in_flight_event.clear()
        thief = info["thief"]
        victim = info["victim"]
        self.in_flight_occupancy[victim] -= info["victim_duration"]
        self.in_flight_occupancy[thief] += info["thief_duration"]
        self.in_flight_tasks[victim] -= 1
        self.in_flight_tasks[thief] += 1

    def _remove_from_in_flight(self, ts: TaskState) -> InFlightInfo | None:
        info = self.in_flight.pop(ts, None)
        if info:
            thief = info["thief"]
            victim = info["victim"]
            self.in_flight_occupancy[thief] -= info["thief_duration"]
            self.in_flight_occupancy[victim] += info["victim_duration"]
            self.in_flight_tasks[victim] += 1
            self.in_flight_tasks[thief] -= 1
            if not self.in_flight:
                self.in_flight_occupancy.clear()
                self._in_flight_event.set()
        return info

    def recalculate_cost(self, ts: TaskState) -> None:
        if ts not in self.in_flight:
            self.remove_key_from_stealable(ts)
            self.put_key_in_stealable(ts)

    def put_key_in_stealable(self, ts: TaskState) -> None:
        cost_multiplier, level = self.steal_time_ratio(ts)
        if cost_multiplier is not None:
            assert level is not None
            assert ts.processing_on
            ws = ts.processing_on
            worker = ws.address
            self.stealable[worker][level].add(ts)
            self.key_stealable[ts] = (worker, level)

    def remove_key_from_stealable(self, ts: TaskState) -> None:
        result = self.key_stealable.pop(ts, None)
        if result is None:
            return

        worker, level = result
        try:
            self.stealable[worker][level].remove(ts)
        except KeyError:
            pass

    def steal_time_ratio(self, ts: TaskState) -> tuple[float, int] | tuple[None, None]:
        """Estimate how much more expensive ``ts`` gets when it is moved.

        The estimate is the ratio of the time needed to transfer the task's
        dependencies to the time needed to compute the task.

        Returns
        -------
        cost_multiplier: The increased cost from moving this task as a factor.
        For example a result of zero implies a task without dependencies.
        level: The location within a stealable list to place this value

        Both are ``None`` when the task must not be stolen.
        """
        if ts.prefix.name in fast_tasks:
            return None, None

        # Nothing has to be transferred, so moving the task is free
        if not ts.dependencies:
            return 0, 0

        duration = self.scheduler.get_task_duration(ts)
        if not duration:
            # occupancy/ws.processing[ts] is only allowed to be zero for
            # long running tasks which cannot be stolen
            assert ts.processing_on
            assert ts in ts.processing_on.long_running
            return None, None

        transfer_time = ts.get_nbytes_deps() / self.scheduler.bandwidth + LATENCY
        cost_multiplier = transfer_time / duration

        level = int(round(log2(cost_multiplier) + 6))
        # Too expensive to be filed under any level
        if level >= len(self.cost_multipliers):
            return None, None

        # Level 0 is reserved for tasks without dependencies
        return cost_multiplier, max(level, 1)

    def move_task_request(
        self, ts: TaskState, victim: WorkerState, thief: WorkerState
    ) -> str:
        """Ask ``victim`` to give up ``ts`` so that it can be moved to ``thief``.

        Sends a ``steal-request`` message to the victim and records the
        request in ``in_flight``. The task is only reassigned once the
        victim's answer is processed by ``move_task_confirm``.

        Returns
        -------
        The stimulus ID of the new request, ``"in-flight"`` if a request for
        this task is already pending, or ``"comm-closed"`` if sending raised
        ``CommClosedError``.
        """
        try:
            # Never issue a second request while one is still pending
            if ts in self.in_flight:
                return "in-flight"
            # Stimulus IDs are used to verify the response, see
            # `move_task_confirm`. Therefore, this must be truly unique.
            stimulus_id = f"steal-{self._request_counter}"
            self._request_counter += 1

            key = ts.key
            self.remove_key_from_stealable(ts)
            # NOTE(review): `%2f` is a minimum field width of 2, not two
            # decimal places (`%.2f`) -- confirm which one is intended
            logger.debug(
                "Request move %s, %s: %2f -> %s: %2f",
                key,
                victim,
                victim.occupancy,
                thief,
                thief.occupancy,
            )

            # TODO: occupancy no longer concats linearly so we can't easily
            # assume that the network cost would go down by that much
            victim_duration = self.scheduler.get_task_duration(
                ts
            ) + self.scheduler.get_comm_cost(ts, victim)
            thief_duration = self.scheduler.get_task_duration(
                ts
            ) + self.scheduler.get_comm_cost(ts, thief)

            # Send before registering: if the send raises, the task is not
            # recorded as in flight (it has already left `stealable` though)
            self.scheduler.stream_comms[victim.address].send(
                {"op": "steal-request", "key": key, "stimulus_id": stimulus_id}
            )
            info: InFlightInfo = {
                "victim": victim,  # guaranteed to be processing_on
                "thief": thief,
                "victim_duration": victim_duration,
                "thief_duration": thief_duration,
                "stimulus_id": stimulus_id,
            }
            self._add_to_in_flight(ts, info)
            return stimulus_id
        except CommClosedError:
            logger.info("Worker comm %r closed while stealing: %r", victim, ts)
            return "comm-closed"
        except Exception as e:  # pragma: no cover
            logger.exception(e)
            # Optional post-mortem debugging, controlled by LOG_PDB
            if LOG_PDB:
                import pdb

                pdb.set_trace()
            raise

    async def move_task_confirm(
        self, *, key: str, state: str, stimulus_id: str, worker: str | None = None
    ) -> None:
        """Handle a victim's ``steal-response`` and conclude the steal.

        Parameters
        ----------
        key
            Key of the task the response refers to.
        state
            Task state reported in the response; it decides whether the task
            is moved, left alone or rescheduled.
        stimulus_id
            Must match the ID stored for the pending request, otherwise the
            response is logged as stale and ignored.
        worker
            Only used in log messages.
        """
        try:
            ts = self.scheduler.tasks[key]
        except KeyError:
            logger.debug("Key released between request and confirm: %s", key)
            return
        try:
            # A response to an older request for the same task is ignored
            if self.in_flight[ts]["stimulus_id"] != stimulus_id:
                self.log(("stale-response", key, state, worker, stimulus_id))
                return
        except KeyError:
            # The request is no longer tracked in `in_flight`
            self.log(("already-aborted", key, state, worker, stimulus_id))
            return

        info = self._remove_from_in_flight(ts)
        assert info
        thief = info["thief"]
        victim = info["victim"]
        logger.debug("Confirm move %s, %s -> %s.  State: %s", key, victim, thief, state)

        if self.scheduler.validate:
            assert ts.processing_on == victim

        try:
            _log_msg = [key, state, victim.address, thief.address, stimulus_id]

            # `and` binds tighter than `or`: reschedule if the state is
            # undefined, or if the steal would be confirmed but the thief is
            # no longer the worker registered under its address
            if (
                state in _WORKER_STATE_UNDEFINED
                # If our steal information is somehow stale we need to reschedule
                or state in _WORKER_STATE_CONFIRM
                and thief != self.scheduler.workers.get(thief.address)
            ):
                self.log(
                    (
                        "reschedule",
                        thief.address not in self.scheduler.workers,
                        *_log_msg,
                    )
                )
                self.scheduler._reschedule(key, stimulus_id=stimulus_id)
            # Victim had already started execution
            elif state in _WORKER_STATE_REJECT:
                self.log(("already-computing", *_log_msg))
            # Victim was waiting, has given up task, enact steal
            elif state in _WORKER_STATE_CONFIRM:
                self.remove_key_from_stealable(ts)
                ts.processing_on = thief
                victim.remove_from_processing(ts)
                thief.add_to_processing(ts)
                # Re-file the task under its new worker
                self.put_key_in_stealable(ts)

                self.scheduler.send_task_to_worker(thief.address, ts)
                self.log(("confirm", *_log_msg))
            else:
                raise ValueError(f"Unexpected task state: {state}")
        except Exception as e:  # pragma: no cover
            logger.exception(e)
            # Optional post-mortem debugging, controlled by LOG_PDB
            if LOG_PDB:
                import pdb

                pdb.set_trace()
            raise
        finally:
            # Runs on every outcome, including errors
            self.scheduler.check_idle_saturated(thief)
            self.scheduler.check_idle_saturated(victim)

    def balance(self) -> None:
        """Issue steal requests that move tasks from busy workers to idle ones.

        Thieves are the scheduler's idle workers. Victims are the saturated
        workers or, if there are none, the most occupied workers that pass a
        minimum-load filter. Stealable tasks are visited level by level,
        starting at level 0, and a request is sent whenever the move is
        estimated to pay off. Requests made in one run are logged together as
        a single "request" event.
        """
        s = self.scheduler
        log = []
        start = time()

        with log_errors():
            # NOTE(review): `i` counts the examined tasks but is never read
            i = 0
            # Paused and closing workers must never become thieves
            potential_thieves = set(s.idle.values())
            # Nothing to do if no worker is idle or if all of them are
            if not potential_thieves or len(potential_thieves) == len(s.workers):
                return
            victim: WorkerState | None
            potential_victims: set[WorkerState] | list[WorkerState] = s.saturated
            if not potential_victims:
                # Fall back to the ten workers with the highest combined
                # occupancy and keep those that carry enough load
                potential_victims = topk(
                    10, s.workers.values(), key=self._combined_occupancy
                )
                potential_victims = [
                    ws
                    for ws in potential_victims
                    if self._combined_occupancy(ws) > 0.2
                    and self._combined_nprocessing(ws) > ws.nthreads
                    and ws not in potential_thieves
                ]
                if not potential_victims:
                    return
            # Visit the most occupied victims first; sorting is skipped for
            # 20 or more victims
            if len(potential_victims) < 20:
                potential_victims = sorted(
                    potential_victims, key=self._combined_occupancy, reverse=True
                )
            assert potential_victims
            assert potential_thieves
            for level, _ in enumerate(self.cost_multipliers):
                if not potential_thieves:
                    break
                for victim in list(potential_victims):
                    stealable = self.stealable[victim.address][level]
                    if not stealable or not potential_thieves:
                        continue

                    # Iterate over a copy, the set is modified below
                    for ts in list(stealable):
                        if not potential_thieves:
                            break
                        if (
                            ts not in self.key_stealable
                            or ts.processing_on is not victim
                            or ts not in victim.processing
                        ):
                            # FIXME: Instead of discarding here, clean up stealable properly
                            stealable.discard(ts)
                            continue
                        i += 1
                        if not (thief := _get_thief(s, ts, potential_thieves)):
                            continue

                        occ_thief = self._combined_occupancy(thief)
                        occ_victim = self._combined_occupancy(victim)
                        comm_cost_thief = self.scheduler.get_comm_cost(ts, thief)
                        comm_cost_victim = self.scheduler.get_comm_cost(ts, victim)
                        compute = self.scheduler.get_task_duration(ts)

                        # Steal only if the thief, including this task, would
                        # be no busier than the victim minus half of the
                        # task's cost on the victim
                        if (
                            occ_thief + comm_cost_thief + compute
                            <= occ_victim - (comm_cost_victim + compute) / 2
                        ):
                            self.move_task_request(ts, victim, thief)
                            cost = compute + comm_cost_victim
                            log.append(
                                (
                                    start,
                                    level,
                                    ts.key,
                                    cost,
                                    victim.address,
                                    occ_victim,
                                    thief.address,
                                    occ_thief,
                                )
                            )
                            self.metrics["request_count_total"][level] += 1
                            self.metrics["request_cost_total"][level] += cost

                            # Recompute with the request that was just issued
                            occ_thief = self._combined_occupancy(thief)
                            nproc_thief = self._combined_nprocessing(thief)

                            # A thief that is no longer unoccupied stops
                            # receiving tasks in this run
                            if not self.scheduler.is_unoccupied(
                                thief, occ_thief, nproc_thief
                            ):
                                potential_thieves.discard(thief)
                            # FIXME: move_task_request already implements some logic
                            # for removing ts from stealable. If we made sure to
                            # properly clean up, we would not need this
                            stealable.discard(ts)
                    self.scheduler.check_idle_saturated(
                        victim, occ=self._combined_occupancy(victim)
                    )

            if log:
                self.log(("request", log))
                self.count += 1
            stop = time()
            # Not reached when one of the early returns above is taken
            if s.digests:
                s.digests["steal-duration"].add(stop - start)

    def _combined_occupancy(self, ws: WorkerState) -> float:
        return ws.occupancy + self.in_flight_occupancy[ws]

    def _combined_nprocessing(self, ws: WorkerState) -> int:
        return len(ws.processing) + self.in_flight_tasks[ws]

    def restart(self, scheduler: Any) -> None:
        for stealable in self.stealable.values():
            for s in stealable:
                s.clear()

        self.key_stealable.clear()

    def story(self, *keys_or_ts: str | TaskState) -> list:
        keys = {key.key if not isinstance(key, str) else key for key in keys_or_ts}
        out = []
        for _, L in self.scheduler.get_events(topic="stealing"):
            if L[0] == "request":
                L = L[1]
            else:
                L = [L]
            for t in L:
                if any(x in keys for x in t):
                    out.append(t)
        return out


def _get_thief(
    scheduler: SchedulerState, ts: TaskState, potential_thieves: set[WorkerState]
) -> WorkerState | None:
    valid_workers = scheduler.valid_workers(ts)
    if valid_workers is not None:
        valid_thieves = potential_thieves & valid_workers
        if valid_thieves:
            potential_thieves = valid_thieves
        elif not ts.loose_restrictions:
            return None
    return min(potential_thieves, key=partial(scheduler.worker_objective, ts))


# Task prefix names for which `WorkStealing.steal_time_ratio` returns
# (None, None), so these tasks are never filed as stealable
fast_tasks = {"split-shuffle"}