-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathtask_worker.py
More file actions
707 lines (626 loc) · 24.4 KB
/
task_worker.py
File metadata and controls
707 lines (626 loc) · 24.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
import logging
import uuid
from collections.abc import Iterable, Mapping
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass
from functools import partial
from queue import Full, Queue
from threading import Event, RLock
from typing import Any, TypeVar
from bluesky.protocols import Status
from observability_utils.tracing import (
add_span_attributes,
get_tracer,
setup_tracing,
start_as_current_span,
)
from opentelemetry.baggage import get_baggage
from opentelemetry.context import Context, get_current
from opentelemetry.trace import SpanKind
from pydantic import Field
from pydantic.json_schema import SkipJsonSchema
from super_state_machine.errors import TransitionError
from blueapi.core import (
OTLP_EXPORT_ENABLED,
BlueskyContext,
DataEvent,
EventPublisher,
EventStream,
WatchableStatus,
)
from blueapi.core.bluesky_event_loop import configure_bluesky_event_loop
from blueapi.log import plan_tag_filter_context
from blueapi.utils.base_model import BlueapiBaseModel
from blueapi.utils.thread_exception import handle_all_exceptions
from .event import (
ProgressEvent,
RawRunEngineState,
StatusView,
TaskStatus,
TaskStatusEnum,
WorkerEvent,
WorkerState,
)
from .task import Task
from .worker_errors import WorkerAlreadyStartedError, WorkerBusyError
LOGGER = logging.getLogger(__name__)
TRACER = get_tracer("task_worker")
""" Initialise a Tracer for this module provided by the app's global TracerProvider. """
# Default seconds start()/stop() wait for the worker thread to come up / shut down.
DEFAULT_START_STOP_TIMEOUT: float = 30.0
# Span attribute key used when recording worker thread state (see stop()).
WORKER_THREAD_STATE = "worker thread state"
# NOTE(review): T appears unused in this module as shown — possibly a leftover
# from when TrackableTask was generic; confirm before removing.
T = TypeVar("T")
class TrackableTask(BlueapiBaseModel):
    """
    A representation of a task that the worker recognizes
    """

    # Unique ID assigned by the worker at submission time (uuid4 string)
    task_id: str
    # The underlying task description to run against the BlueskyContext
    task: Task
    # Correlation id taken from OTEL baggage at submission, if it was a string
    request_id: str | SkipJsonSchema[None] = None
    # True once the task has finished running (successfully or not)
    is_complete: bool = False
    # True until the worker picks the task up for execution
    is_pending: bool = True
    # Accumulated error messages from running this task
    errors: list[str] = Field(default_factory=list)
class TaskWorker:
    """
    Worker wrapping BlueskyContext that can work in its own thread/process

    Args:
        ctx: Context to work with
        start_stop_timeout: If the worker is told to stop, number of seconds
            to wait for graceful shutdown before raising an exception.
            Defaults to 30.0.
        broadcast_statuses: If True, hook the RunEngine so device Status
            objects are watched and re-published as ProgressEvents.
    """

    _ctx: BlueskyContext
    _start_stop_timeout: float
    # Tasks submitted but not yet finished, keyed by task_id
    _pending_tasks: dict[str, TrackableTask]
    # Tasks that have run (successfully or not), keyed by task_id
    _completed_tasks: dict[str, TrackableTask]
    _state: WorkerState
    _errors: list[str]
    _warnings: list[str]
    # The queue is actually a channel between 2 threads
    # most programming languages have a separate abstraction for this
    # but Python reuses Queue
    # So it's not used as a standard queue,
    # but as a box in which to put the "current" task and nothing else
    # So the calling thread can only ever submit one plan at a time.
    _task_channel: Queue  # type: ignore
    _current: TrackableTask | None
    # Guards _status_snapshot against concurrent mutation from status callbacks
    _status_lock: RLock
    _status_snapshot: dict[str, StatusView]
    _completed_statuses: set[str]
    _worker_events: EventPublisher[WorkerEvent]
    _progress_events: EventPublisher[ProgressEvent]
    _data_events: EventPublisher[DataEvent]
    # Lifecycle flags coordinating start()/stop() with the worker thread
    _started: Event
    _stopping: Event
    _stopped: Event
    # OTEL context captured at task submission so spans emitted while the task
    # runs (documents, _cycle) join the submitter's trace
    _current_task_otel_context: Context | None

    def __init__(
        self,
        ctx: BlueskyContext,
        start_stop_timeout: float = DEFAULT_START_STOP_TIMEOUT,
        broadcast_statuses: bool = True,
    ) -> None:
        self._ctx = ctx
        self._start_stop_timeout = start_stop_timeout
        self._pending_tasks = {}
        self._completed_tasks = {}
        assert ctx.run_engine.state is not None, "RunEngine state is not set"
        state: RawRunEngineState = str(ctx.run_engine.state)
        self._state = WorkerState.from_bluesky_state(state)
        self._errors = []
        self._warnings = []
        # maxsize=1 enforces "one task in flight": a second put_nowait raises Full
        self._task_channel = Queue(maxsize=1)
        self._current = None
        self._worker_events = EventPublisher()
        self._progress_events = EventPublisher()
        self._data_events = EventPublisher()
        self._status_lock = RLock()
        self._status_snapshot = {}
        self._completed_statuses = set()
        self._started = Event()
        self._stopping = Event()
        self._stopped = Event()
        # Until run() is invoked the worker counts as already stopped
        self._stopped.set()
        self._broadcast_statuses = broadcast_statuses
        self._current_task_otel_context = None
        setup_tracing("BlueAPIWorker", OTLP_EXPORT_ENABLED)

    @start_as_current_span(TRACER, "task_id")
    def clear_task(self, task_id: str) -> str:
        """
        Remove a task from the worker

        Args:
            task_id: The ID of the task to be removed

        Returns:
            task_id of the removed task

        Raises:
            KeyError: If the ID is neither a pending nor a completed task
        """
        pending = self._pending_tasks.pop(task_id, None)
        # Fall back to completed tasks; the plain pop raises KeyError if the
        # task is unknown everywhere, which is the documented failure mode.
        task = pending or self._completed_tasks.pop(task_id)
        return task.task_id

    @start_as_current_span(TRACER)
    def cancel_active_task(
        self,
        failure: bool = False,
        reason: str | None = None,
    ) -> str:
        """
        Remove the currently active task from the worker if there is one

        Args:
            failure: Flag cancellation as error
            reason: Reason for cancellation

        Returns:
            The task_id of the active task

        Raises:
            TransitionError: If no task is currently active
        """
        if self._current is None:
            # Persuades type checker that self._current is not None
            # We only allow this method to be called if a Plan is active
            raise TransitionError("Attempted to cancel while no active Task")
        if failure:
            default_reason = "Task failed for unknown reason"
            # abort() marks the run as errored in the RunEngine
            self._ctx.run_engine.abort(reason or default_reason)
            add_span_attributes({"Task aborted": reason or default_reason})
        else:
            # stop() ends the run cleanly without flagging an error
            self._ctx.run_engine.stop()
            default_reason = "Cancellation successful: Task stopped without error"
            add_span_attributes({"Task stopped": reason or default_reason})
        return self._current.task_id

    @start_as_current_span(TRACER)
    def get_tasks(self) -> list[TrackableTask]:
        """
        Return a list of all tasks on the worker,
        any one of which can be triggered with begin_task.

        Returns:
            List[TrackableTask[T]]: List of task objects
        """
        return list(self._pending_tasks.values()) + list(self._completed_tasks.values())

    @start_as_current_span(TRACER, "task_id")
    def get_task_by_id(self, task_id: str) -> TrackableTask | None:
        """
        Returns a task matching the task ID supplied,
        if the worker knows of it.

        Args:
            task_id: The ID of the task

        Returns:
            Optional[TrackableTask[T]]: The task matching the ID,
                None if the task ID is unknown to the worker.
        """
        # .get on BOTH maps so an unknown ID yields None as documented;
        # previously the completed-tasks lookup raised KeyError instead.
        return self._pending_tasks.get(task_id) or self._completed_tasks.get(task_id)

    @start_as_current_span(TRACER, "status")
    def get_tasks_by_status(self, status: TaskStatusEnum) -> list[TrackableTask]:
        """
        Retrieve a list of tasks based on their status.

        Args:
            status: The status to filter tasks by.

        Returns:
            list[TrackableTask]: A list of tasks that match the given status.
        """
        if status == TaskStatusEnum.RUNNING:
            # Running = picked up (no longer pending) but not yet complete
            return [
                task
                for task in self._pending_tasks.values()
                if not task.is_pending and not task.is_complete
            ]
        elif status == TaskStatusEnum.PENDING:
            return [task for task in self._pending_tasks.values() if task.is_pending]
        elif status == TaskStatusEnum.COMPLETE:
            return list(self._completed_tasks.values())
        return []

    @start_as_current_span(TRACER)
    def get_active_task(self) -> TrackableTask | None:
        """
        Returns the task the worker is currently running

        Returns:
            Optional[TrackableTask[Task]]: The current task,
                None if the worker is idle.
        """
        current = self._current
        if current is not None:
            add_span_attributes({"Active Task": current.task_id})
        return current

    @start_as_current_span(TRACER, "task_id")
    def begin_task(self, task_id: str) -> None:
        """
        Trigger a pending task. Will fail if the worker is busy.

        Args:
            task_id: The ID of the task to be triggered

        Raises:
            KeyError: If the task ID does not exist
        """
        task = self._pending_tasks.get(task_id)
        if task is None:
            raise KeyError(f"No pending task with ID {task_id}")
        else:
            # Tag log records with the plan name for the duration of submission
            with plan_tag_filter_context(task.task.name, LOGGER):
                self._submit_trackable_task(task)

    @start_as_current_span(TRACER, "task.name", "task.params")
    def submit_task(self, task: Task) -> str:
        """
        Submit a task to be run on begin_task

        Args:
            task: A description of the task

        Returns:
            str: A unique ID to refer to this task
        """
        task.prepare_params(self._ctx)  # Will raise if parameters are invalid
        task_id: str = str(uuid.uuid4())
        add_span_attributes({"TaskId": task_id})
        request_id = get_baggage("correlation_id")
        # If request id is not a string, we do not pass it into a TrackableTask
        if not isinstance(request_id, str):
            LOGGER.warning(f"Invalid correlation id detected: {request_id}")
            request_id = None
        trackable_task = TrackableTask(
            task_id=task_id,
            request_id=request_id,
            task=task,
        )
        self._pending_tasks[task_id] = trackable_task
        return task_id

    @start_as_current_span(
        TRACER,
        "trackable_task.task_id",
        "trackable_task.task.name",
        "trackable_task.task.params",
    )
    def _submit_trackable_task(self, trackable_task: TrackableTask) -> None:
        """
        Hand a task to the worker thread via the single-slot channel and wait
        for it to report as started.

        Raises:
            WorkerBusyError: If the worker is not idle or a task is running
            ValueError: If the task has already been run
            TimeoutError: If the worker does not report start within 5 seconds
        """
        if self.state is not WorkerState.IDLE:
            raise WorkerBusyError(f"Worker is in state {self.state}")
        if trackable_task.is_complete:
            raise ValueError("Task has already been run")

        task_started = Event()

        def mark_task_as_started(event: WorkerEvent, _: str | None) -> None:
            if (
                event.task_status is not None
                and event.task_status.task_id == trackable_task.task_id
            ):
                task_started.set()

        LOGGER.info(f"Submitting: {trackable_task}")
        # Subscribe exactly once, before the try, so `sub` is always bound for
        # the finally-unsubscribe. (A previous version subscribed twice and
        # leaked the first callback forever.)
        sub = self.worker_events.subscribe(mark_task_as_started)
        try:
            # Cache the current trace context as the one for this task id
            self._current_task_otel_context = get_current()
            self._task_channel.put_nowait(trackable_task)
            task_started.wait(timeout=5.0)
            if not task_started.is_set():
                raise TimeoutError("Failed to start plan within timeout")
        except Full as f:
            LOGGER.error("Cannot submit task while another is running")
            raise WorkerBusyError("Cannot submit task while another is running") from f
        finally:
            self.worker_events.unsubscribe(sub)

    @start_as_current_span(TRACER)
    def start(self) -> None:
        """
        Start worker in a new thread. Does not block, configures the bluesky
        event loop in the new thread.

        Raises:
            WorkerAlreadyStartedError: If the worker is already running
            TimeoutError: If the worker does not start within the timeout
        """
        if self._started.is_set():
            raise WorkerAlreadyStartedError("Worker is already running")
        # Ensure any previous incarnation has fully shut down first
        self._wait_until_stopped()
        run_worker_in_own_thread(self)
        self._wait_until_started()

    @start_as_current_span(TRACER)
    def stop(self) -> None:
        """
        Command the worker to gracefully stop. Blocks until it has shut down.

        Raises:
            TimeoutError: If the worker does not stop within the timeout
        """
        LOGGER.info("Attempting to stop worker")

        # If the worker has not yet started there is nothing to do.
        if self._started.is_set():
            self._task_channel.put(KillSignal())
        else:
            LOGGER.info("Stopping worker: nothing to do")
        self._wait_until_stopped()
        add_span_attributes({WORKER_THREAD_STATE: "STOPPED"})
        LOGGER.info("Stopped")

    @start_as_current_span(TRACER)
    def _wait_until_started(self) -> None:
        # Event.wait returns False on timeout rather than raising
        if not self._started.wait(timeout=self._start_stop_timeout):
            raise TimeoutError(
                f"Worker did not start within {self._start_stop_timeout} seconds"
            )

    @start_as_current_span(TRACER)
    def _wait_until_stopped(self) -> None:
        if not self._stopped.wait(timeout=self._start_stop_timeout):
            raise TimeoutError(
                f"Worker did not stop within {self._start_stop_timeout} seconds"
            )

    @property
    def state(self) -> WorkerState:
        """
        :return: state of the worker
        """
        return self._state

    @start_as_current_span(TRACER)
    def run(self) -> None:
        """
        Run all tasks that are submitted to the worker. Blocks thread.
        """
        LOGGER.info("Worker starting")
        self._ctx.run_engine.state_hook = self._on_state_change  # type: ignore
        subs = self._ctx.run_engine.subscribe(self._on_document)
        if self._broadcast_statuses:
            self._ctx.run_engine.waiting_hook = self._waiting_hook  # type: ignore

        self._stopped.clear()
        self._started.set()
        # Loop until stop() sends a KillSignal, which sets _stopping in _cycle
        while not self._stopping.is_set():
            self._cycle_with_error_handling()
        self._started.clear()
        self._stopping.clear()
        self._stopped.set()
        self._ctx.run_engine.unsubscribe(subs)

    @start_as_current_span(TRACER, "defer")
    def pause(self, defer=False):
        """
        Command the worker to pause.

        Args:
            defer: Optional, if true wait till next checkpoint
        """
        LOGGER.info("Requesting to pause the worker")
        self._ctx.run_engine.request_pause(defer)

    @start_as_current_span(TRACER)
    def resume(self):
        """
        Command the worker to resume
        """
        LOGGER.info("Requesting to resume the worker")
        self._ctx.run_engine.resume()

    @start_as_current_span(TRACER)
    def _cycle_with_error_handling(self) -> None:
        # Catch-all boundary: keep the worker loop alive on any task failure
        try:
            self._cycle()
        except Exception as ex:
            self._report_error(ex)

    @start_as_current_span(TRACER)
    def _cycle(self) -> None:
        """
        Block until the next task (or KillSignal) arrives, run it, then move
        it from pending to completed and publish a final status.
        """
        try:
            LOGGER.info("Awaiting task")
            next_task: TrackableTask | KillSignal = self._task_channel.get()
            if isinstance(next_task, TrackableTask):

                def process_task():
                    LOGGER.info(f"Got new task: {next_task}")
                    self._current = next_task  # Informing mypy that the task is not None
                    self._current.is_pending = False
                    self._current.task.do_task(self._ctx)

                with plan_tag_filter_context(next_task.task.name, LOGGER):
                    if self._current_task_otel_context is not None:
                        # Re-enter the trace context cached at submission so
                        # this span joins the submitter's trace
                        with TRACER.start_as_current_span(
                            "_cycle",
                            context=self._current_task_otel_context,
                            kind=SpanKind.SERVER,
                        ):
                            self._current_task_otel_context = get_current()
                            add_span_attributes(
                                {"next_task.task_id": next_task.task_id}
                            )
                            process_task()
                    else:
                        process_task()
            elif isinstance(next_task, KillSignal):
                # If we receive a kill signal we begin to shut the worker down.
                # Note that the kill signal is explicitly not a type of task as we don't
                # want it to be part of the worker's public API
                self._stopping.set()
                add_span_attributes({"server shutting down": "true"})
            else:
                raise KeyError(f"Unknown command: {next_task}")
        except Exception as err:
            self._report_error(err)
        finally:
            if self._current_task_otel_context is not None:
                self._current_task_otel_context = None
            if self._current is not None:
                self._current.is_complete = True
                # Promote the task from pending to completed bookkeeping
                self._pending_tasks.pop(self._current.task_id)
                self._completed_tasks[self._current.task_id] = self._current
                self._report_status()
            # Reset per-task accumulators for the next cycle
            self._errors.clear()
            self._warnings.clear()
            self._completed_statuses.clear()

    @property
    def worker_events(self) -> EventStream[WorkerEvent, int]:
        """
        Events representing changes/errors in worker state

        Returns:
            EventStream[WorkerEvent, int]: Subscribable stream of events
        """
        return self._worker_events

    @property
    def progress_events(self) -> EventStream[ProgressEvent, int]:
        """
        Events representing progress in running a task

        Returns:
            EventStream[ProgressEvent, int]: Subscribable stream of events
        """
        return self._progress_events

    @property
    def data_events(self) -> EventStream[DataEvent, int]:
        """
        Events representing collection of data

        Returns:
            EventStream[DataEvent, int]: Subscribable stream of events
        """
        return self._data_events

    @start_as_current_span(TRACER, "raw_new_state", "raw_old_state")
    def _on_state_change(
        self,
        raw_new_state: RawRunEngineState,
        raw_old_state: RawRunEngineState | None = None,
    ) -> None:
        # RunEngine state hook: translate bluesky states and broadcast
        new_state = WorkerState.from_bluesky_state(raw_new_state)
        if raw_old_state:
            old_state = WorkerState.from_bluesky_state(raw_old_state)
        else:
            old_state = WorkerState.UNKNOWN
        LOGGER.debug(f"Notifying state change {old_state} -> {new_state}")
        self._state = new_state
        self._report_status()

    def _report_error(self, err: Exception) -> None:
        # Record the error against both the current task and the worker itself
        LOGGER.error(err, exc_info=True)
        if self._current is not None:
            self._current.errors.append(str(err))
        self._errors.append(str(err))

    @start_as_current_span(TRACER)
    def _report_status(
        self,
    ) -> None:
        """Publish a WorkerEvent describing the worker's current status."""
        task_status: TaskStatus | None
        errors = self._errors
        warnings = self._warnings
        if self._current is not None:
            task_status = TaskStatus(
                task_id=self._current.task_id,
                task_complete=self._current.is_complete,
                task_failed=bool(self._current.errors),
            )
            correlation_id = self._current.task_id
            add_span_attributes(
                {
                    "task_id": self._current.task_id,
                    "task_complete": self._current.is_complete,
                    "task_failed": self._current.errors,
                }
            )
        else:
            task_status = None
            correlation_id = None
        event = WorkerEvent(
            state=self._state,
            task_status=task_status,
            errors=errors,
            warnings=warnings,
        )
        self._worker_events.publish(event, correlation_id)

    def _on_document(self, name: str, document: Mapping[str, Any]) -> None:
        """
        RunEngine document callback: re-publish the document as a DataEvent,
        inside the trace context cached for the current task.

        Raises:
            KeyError: If a document arrives while no task is active
            ValueError: If a task is active but no trace context was cached
        """
        if self._current is not None:
            if self._current_task_otel_context is not None:
                # Start a new span but inject the context cached when the
                # current task was created. This makes the documents received
                # part of the same trace.
                with TRACER.start_as_current_span(
                    "_on_document",
                    context=self._current_task_otel_context,
                    kind=SpanKind.PRODUCER,
                ):
                    add_span_attributes(
                        {
                            "task_id": self._current.task_id,
                            "name": name,
                            "document": str(document),
                        }
                    )
                    correlation_id = self._current.request_id
                    self._data_events.publish(
                        DataEvent(
                            name=name, task_id=self._current.task_id, doc=document
                        ),
                        correlation_id,
                    )
            else:
                raise ValueError(
                    "There is no context set for tracing despite the fact that a task"
                    " is running, something has gone wrong..."
                )
        else:
            raise KeyError(
                "Trying to emit a document despite the fact that the RunEngine is idle"
            )

    def _waiting_hook(self, statuses: Iterable[Status] | None) -> None:
        # RunEngine waiting hook: called with the statuses being waited on
        if statuses is not None:
            with self._status_lock:
                for status in statuses:
                    self._monitor_status(status)

    def _monitor_status(self, status: Status) -> None:
        """Attach watch/completion callbacks to a newly-seen Status object."""
        status_uuid = str(uuid.uuid4())

        if isinstance(status, WatchableStatus) and not status.done:
            LOGGER.info(f"Watching new status: {status_uuid}")
            self._status_snapshot[status_uuid] = StatusView()
            status.watch(partial(self._on_status_event, status, status_uuid))

            # TODO: Maybe introduce an initial event, in which case move
            # all of this code out of the if statement
            def on_complete(status: Status) -> None:
                # Emit one final event, then retire this status so late
                # callbacks cannot re-add it to the snapshot
                self._on_status_event(status, status_uuid)
                del self._status_snapshot[status_uuid]
                self._completed_statuses.add(status_uuid)

            status.add_callback(on_complete)  # type: ignore

    def _on_status_event(
        self,
        status: Status,
        status_uuid: str,
        *,
        name: str | None = None,
        current: float | None = None,
        initial: float | None = None,
        target: float | None = None,
        unit: str | None = None,
        precision: int | None = None,
        fraction: float | None = None,
        time_elapsed: float | None = None,
        time_remaining: float | None = None,
    ) -> None:
        """Translate a Status watch callback into a StatusView and publish."""
        if not status.done:
            # NOTE(review): assumes `fraction` is the fraction *remaining*
            # (bluesky watch convention), hence completion = 1 - fraction
            percentage = float(1.0 - fraction) if fraction is not None else None
        else:
            percentage = 1.0
        view = StatusView(
            display_name=name or "UNKNOWN",
            current=current,
            initial=initial,
            target=target,
            unit=unit or "units",
            precision=precision or 3,
            done=status.done,
            percentage=percentage,
            time_elapsed=time_elapsed,
            time_remaining=time_remaining,
        )
        # Ensure completed statues are not re-added and published
        if status_uuid not in self._completed_statuses:
            self._status_snapshot[status_uuid] = view
            self._publish_status_snapshot()

    def _publish_status_snapshot(self) -> None:
        if self._current is None:
            raise ValueError("Got a status update without an active task!")
        else:
            self._progress_events.publish(
                ProgressEvent(
                    task_id=self._current.task_id,
                    statuses=self._status_snapshot,
                ),
                self._current.task_id,
            )
@dataclass
class KillSignal:
    """
    Sentinel placed on the worker's task channel to request shutdown.

    Deliberately not a kind of Task: shutdown is an internal mechanism and
    must not leak into the worker's public task API.
    """
def run_worker_in_own_thread(
    worker: TaskWorker, executor: ThreadPoolExecutor | None = None
) -> Future:
    """
    Helper function, make a worker run in a new thread managed by a ThreadPoolExecutor

    Args:
        worker (TaskWorker): The worker to run
        executor (Optional[ThreadPoolExecutor], optional): The executor to manage
            the thread; when None, a fresh single-thread executor named
            "run-engine-worker" is created.

    Returns:
        Future: Future representing worker stopping
    """
    pool = executor or ThreadPoolExecutor(1, "run-engine-worker")
    return pool.submit(_run_worker_thread, worker)
@handle_all_exceptions
def _run_worker_thread(worker: TaskWorker) -> None:
    """
    Helper function, run a worker forever, includes support for
    printing exceptions to stdout from a non-main thread.

    Args:
        worker (TaskWorker): The worker to run
    """
    LOGGER.info("Setting up event loop")
    # The bluesky event loop must be configured in this (worker) thread
    # before run() starts consuming tasks.
    configure_bluesky_event_loop()
    LOGGER.info("Worker starting")
    # Blocks this thread until the worker is told to stop.
    worker.run()