forked from patroni/patroni
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmultisite.py
More file actions
506 lines (421 loc) · 21.4 KB
/
multisite.py
File metadata and controls
506 lines (421 loc) · 21.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
import abc
import json
import logging
import time
from datetime import datetime, timezone
from threading import Event, Thread
from typing import Any, Callable, Dict, Optional, Tuple, TYPE_CHECKING, Union
import six
import kubernetes
from . import global_config
from .dcs import AbstractDCS, Cluster, Member
from .dcs.kubernetes import catch_kubernetes_errors
from .exceptions import DCSError
if TYPE_CHECKING: # pragma: no cover
from .config import Config
logger = logging.getLogger(__name__)
class AbstractSiteController(abc.ABC):
    """Interface for multisite leadership controllers.

    A concrete implementation decides whether the local Patroni cluster acts
    as the leader site or follows another site, which it expresses through
    :meth:`get_active_standby_config`.
    """

    # Whether Patroni should rely on this controller for providing standby config.
    is_active = False
    # DCS client used for cross-site coordination (set by concrete subclasses).
    dcs: 'AbstractDCS'

    def start(self) -> None:
        """Start any background machinery; no-op by default."""

    def shutdown(self) -> None:
        """Stop any background machinery; no-op by default."""

    def get_active_standby_config(self) -> Optional[Dict[str, Any]]:
        """Return the currently active configuration for a standby leader.

        ``None`` means this site is the leader site; a dict describes how to
        replicate from the leader site.
        """
        return {}

    def is_leader_site(self) -> bool:
        """Return ``True`` when no standby configuration is active."""
        return self.get_active_standby_config() is None

    def resolve_leader(self) -> Optional[str]:
        """Try to become leader, update active config correspondingly.

        :returns: an error message when unable to resolve leadership,
                  otherwise ``None``.
        """
        return None

    def heartbeat(self) -> None:
        """Notify multisite mechanism that this site has a properly operating cluster mechanism.

        Implementations need to send out an async lease update.  If that fails
        to complete within a safety margin of the ttl running out, the site
        configuration needs to be updated.
        """

    def release(self) -> None:
        """Voluntarily give up multisite leadership; no-op by default."""

    def status(self) -> Dict[str, Any]:
        """Return a dict describing the multisite state (e.g. for the REST API)."""
        return {}

    def should_failover(self) -> bool:
        """Return ``True`` when a failover to another site has been requested."""
        return False

    def on_shutdown(self, checkpoint_location: int, prev_location: int) -> None:
        """Called when shutdown for multisite failover has completed; no-op by default."""
class SingleSiteController(AbstractSiteController):
    """No-op controller used when multisite operation is not configured."""

    def status(self):
        """Report this site as the (only) leader, with multisite inactive."""
        report = {}
        report["status"] = "Leader"
        report["active"] = False
        return report
class MultisiteController(Thread, AbstractSiteController):
    """Background thread that arbitrates multisite leadership via a dedicated DCS.

    The thread runs a leader election against a cross-site DCS.  Patroni's main
    thread consumes the result through :meth:`get_active_standby_config`:
    ``None`` while this site holds the multisite leader lock, otherwise a
    standby configuration describing how to replicate from the leader site.
    The two threads communicate through the ``_heartbeat`` and
    ``_leader_resolved`` events.
    """

    # This controller supplies the standby config, so Patroni must consult it.
    is_active = True

    def __init__(self, config: 'Config', on_change: Optional[Callable[..., None]] = None):
        """
        :param config: main Patroni configuration containing a ``multisite`` section.
        :param on_change: callback fired (from this thread) whenever leadership
                          state changes and the main loop should be woken up.
        """
        super().__init__()
        self.stop_requested = False
        self.on_change = on_change
        msconfig, self.dcs = self.get_dcs_config(config)
        self.config = msconfig
        self.name = msconfig['name']
        if msconfig.get('update_crd'):
            # Mirror state transitions into a Kubernetes CRD status and events.
            self._state_updater = KubernetesStateManagement(
                msconfig.get('update_crd'),  # pyright: ignore [reportArgumentType]
                msconfig.get('crd_uid'),  # pyright: ignore [reportArgumentType]
                reporter=self.name,  # Use pod name?
                crd_api=msconfig.get('crd_api', 'acid.zalan.do/v1'))
        else:
            self._state_updater = None
        self.switchover_timeout = msconfig.get('switchover_timeout', 300)
        # Set by the main thread to request an immediate leader-resolution round.
        self._heartbeat = Event()
        # None while we are the leader site; otherwise a standby config dict.
        self._standby_config = None
        # Signaled once a round triggered via resolve_leader() has finished.
        self._leader_resolved = Event()
        self._has_leader = False
        # Set when the main thread asks us to give up multisite leadership.
        self._release = False
        # Last state reported to the CRD updater (True=Leader, False=Standby).
        self._status = None
        self._failover_target = None
        self._failover_timeout = None
        # Site switch counter taken from the multisite history, if any.
        self.site_switches = None
        # Error message from the last DCS access attempt; None when healthy.
        self._dcs_error = None

    @staticmethod
    def get_dcs_config(config: 'Config') -> Tuple[Dict[str, Any], AbstractDCS]:
        """Build the multisite DCS configuration and instantiate its DCS client.

        :param config: main Patroni configuration.
        :returns: tuple of (multisite config dict, DCS client built from it).
        :raises Exception: when ``host`` or ``port`` is missing from the
                           multisite section.
        """
        msconfig = config['multisite']
        # Multisite configuration inherits values from main configuration
        inherited_keys = ['name', 'scope', 'namespace', 'loop_wait', 'ttl', 'retry_timeout']
        for key in inherited_keys:
            if key not in msconfig and key in config:
                msconfig[key] = config[key]
        msconfig.setdefault('observe_interval', config.get('loop_wait'))
        # TODO: fetch default host/port from postgresql section
        if 'host' not in msconfig or 'port' not in msconfig:
            raise Exception("Missing host or port from multisite configuration")
        # Disable etcd3 lease ownership detection warning
        msconfig['multisite'] = True
        from .dcs import get_dcs
        return msconfig, get_dcs(msconfig)

    def status(self):
        """Return the multisite status dict exposed through the REST API."""
        return {
            "status": "Leader" if self._has_leader or self._standby_config is None else "Standby",
            "active": True,
            "name": self.name,
            "standby_config": self.get_active_standby_config(),
        }

    def get_active_standby_config(self):
        # Read without locking: the worker thread replaces the whole dict
        # atomically rather than mutating it in place.
        return self._standby_config

    def resolve_leader(self):
        """Try to become leader, update active config correspondingly.

        Must be called from Patroni main thread. After a successful return
        get_active_standby_config() will return a value corresponding to a
        multisite status that was active after start of the call.

        :returns: error message encountered when unable to resolve leader status.
        """
        self._leader_resolved.clear()
        # Wake the worker thread and block until it completes one round.
        self._heartbeat.set()
        self._leader_resolved.wait()
        return self._dcs_error

    def heartbeat(self):
        """Notify multisite mechanism that this site has a properly operating cluster mechanism.

        Need to send out an async lease update. If that fails to complete within
        safety margin of ttl running out then we need to demote.
        """
        logger.info("Triggering multisite heartbeat")
        self._heartbeat.set()

    def release(self):
        """Ask the worker thread to voluntarily give up multisite leadership."""
        self._release = True
        self._heartbeat.set()

    def should_failover(self):
        """Return ``True`` when a failover targeting a different site is pending."""
        return self._failover_target is not None and self._failover_target != self.name

    def on_shutdown(self, checkpoint_location: int, prev_location: int):
        """Called when shutdown for multisite failover has completed."""
        # TODO: check if we replicated everything to standby site
        self.release()

    @property
    def _replication_slot(self) -> Optional[str]:
        # Replication slot name configured for this site, if any.
        site_config = global_config.sites.get(self.name)
        return site_config and site_config.get('slot')

    @property
    def _restore_command(self) -> Optional[str]:
        # Site-specific restore_command wins over the multisite-level default.
        site_config = global_config.sites.get(self.name)
        if site_config and 'restore_command' in site_config:
            return site_config['restore_command']
        return self.config.get('restore_command')

    def _disconnected_operation(self):
        # Deliberately unusable standby config: 'false' always fails, so the
        # cluster stays a standby without replicating from anyone.
        self._standby_config = {'restore_command': 'false'}

    @property
    def is_follower(self):
        """Returns true if this site is following another site"""
        cfg = self._standby_config  # Fetch once for atomic access
        return cfg is not None and 'host' in cfg

    def _set_standby_config(self, other: Member):
        """Point the standby configuration at site *other*.

        :returns: ``True`` when the effective standby config changed.
        """
        # NOTE(review): touch_member() stores 'host' as a comma-separated
        # string, so iterating other.data['host'] here without .split(',')
        # looks like it iterates characters — confirm the stored shape.
        other_address = ','.join([':'.join([i, other.data['port']]) for i in other.data['host']])
        logger.info(f"Setting standby config to replicate from site {other.name} ({other_address})")
        # TODO: add support for replication slots
        try:
            new_config = {
                'host': other.data['host'],
                'port': other.data['port'],
                'create_replica_methods': ['basebackup'],
                'leader_site': other.name,
            }
            slot = self._replication_slot
            if slot:
                new_config['primary_slot_name'] = slot
            restore_command = self._restore_command
            if restore_command:
                new_config['restore_command'] = restore_command
            old_conf, self._standby_config = self._standby_config, new_config
        except KeyError:
            # Leader member record is missing host/port: fall back to the
            # disconnected (non-replicating) standby config.
            old_conf = self._standby_config
            self._disconnected_operation()
        if old_conf != self._standby_config:
            logger.info(f"Setting standby configuration to: {self._standby_config}")
        return old_conf != self._standby_config

    def _check_transition(self, leader: bool, note: str = ''):
        """Record a leader/standby transition, waking Patroni and updating the CRD."""
        if self._has_leader != leader:
            logger.info("State transition")
            self._has_leader = leader
            if self.on_change:
                self.on_change()
        # _status tracks what was last reported to Kubernetes, which may lag
        # behind _has_leader (e.g. after a DCS outage).
        if self._state_updater and self._status != leader:
            self._state_updater.state_transition('Leader' if leader else 'Standby', note)
            self._status = leader

    def _resolve_multisite_leader(self):
        """Run one round of the multisite leader election (worker thread only)."""
        logger.info("Running multisite consensus")
        try:
            # Refresh the latest known state
            cluster = self.dcs.get_cluster()
            self._dcs_error = None
            if not cluster.has_member(self.name):
                self.touch_member()
            if cluster.is_unlocked():
                # No site holds the leader lock.
                if self._release:
                    # We were asked to release and the lock is already gone.
                    self._release = False
                    self._disconnected_operation()
                    return
                if self._failover_target and self._failover_timeout and self._failover_timeout > time.time():
                    # Stand aside so the designated target can take the lock.
                    logger.info("Waiting for multisite failover to complete")
                    self._disconnected_operation()
                    return
                # Became leader of unlocked cluster
                if self.dcs.attempt_to_acquire_leader():
                    logger.info("Became multisite leader")
                    self._standby_config = None
                    self._check_transition(leader=True, note="Acquired multisite leader status")
                    if cluster.failover and cluster.failover.target_site and cluster.failover.target_site == self.name:
                        logger.info("Cleaning up multisite failover key after acquiring leader status")
                        self.dcs.manual_failover('', '')
                # Failed to become leader, maybe someone else acquired lock, maybe we just failed
                else:
                    logger.info("Failed to acquire multisite lock")
                    # Non-working standby config while we are resolving who to connect to
                    self._disconnected_operation()
                    self._check_transition(leader=False, note="Lost multisite leader status")
                    # Try to get new leader
                    cluster = self.dcs.get_cluster()
                    if cluster.leader and cluster.leader.name != self.name:
                        self._set_standby_config(cluster.leader.member)
            else:
                # There is a leader cluster
                lock_owner = cluster.leader and cluster.leader.name
                # The leader is us
                if lock_owner == self.name:
                    logger.info("Multisite has a leader and it is us")
                    if self._release:
                        logger.info("Releasing multisite leader status")
                        self.dcs.delete_leader(cluster.leader)
                        self._release = False
                        self._disconnected_operation()
                        self._check_transition(leader=False, note="Released multisite leader status upon request")
                        return
                    if self.dcs.update_leader(cluster, None):
                        logger.info("Updated multisite leader lease")
                        # Make sure we are disabled from standby mode
                        self._standby_config = None
                        self._check_transition(leader=True, note="Already have multisite leader status")
                        self._check_for_failover(cluster)
                    else:
                        logger.error("Failed to update multisite leader status")
                        self._disconnected_operation()
                        self._check_transition(leader=False, note="Failed to update multisite leader status")
                # Current leader is someone else
                else:
                    logger.info(f"Multisite has a leader and it is {lock_owner}")
                    self._release = False
                    # Failover successful or someone else took over
                    if self._failover_target is not None:
                        self._failover_target = None
                        self._failover_timeout = None
                    if self._set_standby_config(cluster.leader.member):  # pyright: ignore [reportOptionalMemberAccess]
                        # Wake up anyway to notice that we need to replicate from new leader. For the other case
                        # _check_transition() handles the wake.
                        if not self._has_leader:
                            self.on_change()  # pyright: ignore [reportOptionalCall]
                    # note must be computed before _check_transition() mutates _has_leader.
                    note = (f"Lost leader lock to {lock_owner}" if self._has_leader else
                            f"Current leader is {lock_owner}")
                    self._check_transition(leader=False, note=note)
        except DCSError as e:
            logger.error(f"Error accessing multisite DCS: {e}")
            self._dcs_error = 'Multisite DCS cannot be reached'
            if self._has_leader:
                # Can't confirm our lease: fail safe by demoting locally.
                self._disconnected_operation()
                self._has_leader = False
                self.on_change()  # pyright: ignore [reportOptionalCall]
                if self._state_updater:
                    self._state_updater.state_transition('Standby', 'Unable to access multisite DCS')
        else:
            # Only executed when the election round itself succeeded.
            try:
                self._update_history(cluster)
                self.touch_member()
            except DCSError:
                pass

    def _observe_leader(self):
        """
        Observe multisite state and make sure standby_cluster setting gets updated
        """
        try:
            cluster = self.dcs.get_cluster()
            if cluster.is_unlocked():
                logger.info("Multisite has no leader because the cluster is unlocked")
                self._disconnected_operation()
            else:
                # There is a leader cluster
                lock_owner = cluster.leader and cluster.leader.name
                # The leader is us
                if lock_owner == self.name:
                    logger.info("Multisite leader is us")
                    self._standby_config = None
                else:
                    logger.info(f"Multisite leader is {lock_owner}")
                    self._set_standby_config(cluster.leader.member)  # pyright: ignore [reportOptionalMemberAccess]
        except DCSError as e:
            # On replicas we need to know the multisite status only for rewinding.
            logger.warning(f"Error accessing multisite DCS: {e}")

    def _update_history(self, cluster: Cluster):
        """Read the site-switch history and, as leader, append our takeover to it."""
        # The history lines are of type dcs._HistoryTuple to match normal timeline history. The data stored by tuple
        # index:
        # 0: site switch count
        # 1: 0 (constant) TODO: maybe store the LSN when the switch happened - in that case it will match the LSN of the
        # timeline switch
        # 2: site switch timestamp
        # 3: new leader site name
        #
        # The full history is a list of the tuples described above, the latest one being the last element.
        # The older implementation was a single item list of dict, we replace it with the list of tuples.
        # TODO: once we are sure there are no such instances, the dict references can be removed alongside the ugly
        # pyright repellant comments.
        if cluster.history and cluster.history.lines:
            if isinstance(cluster.history.lines[0], dict):  # older implementation, will get replaced by this update
                self.site_switches = cluster.history.lines[0].get('switches')  # noqa: E501 # pyright: ignore [reportUnknownMemberType]
            else:
                self.site_switches = cluster.history.lines[-1][0]
        if self._has_leader:
            if cluster.history and cluster.history.lines:
                if isinstance(cluster.history.lines[0], dict):
                    # Legacy single-dict format: rewrite it as a tuple list.
                    history_state = cluster.history.lines[0]
                    if history_state.get('last_leader') != self.name:  # pyright: ignore [reportUnknownMemberType]
                        new_state = (history_state.get('switches', 0) + 1, 0, '', self.name)  # noqa: E501 # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType]
                        self.dcs.set_history_value(json.dumps([new_state]))
                    return
                else:
                    history_state = cluster.history.lines[-1]
                    if isinstance(history_state, (list, tuple)) and len(history_state) > 3:  # noqa: E501 # pyright: ignore[reportUnnecessaryIsInstance]
                        if history_state[3] != self.name:
                            # Append our takeover with an incremented switch count.
                            new_state = (history_state[0] + 1, 0, '', self.name)
                            self.dcs.set_history_value(json.dumps(cluster.history.lines + [new_state]))
                        return
            # no history yet or broken history, set initial item
            self.dcs.set_history_value(json.dumps([(0, 0, '', self.name)]))

    def _check_for_failover(self, cluster: Cluster):
        """Inspect the failover key (leader only) and arm/disarm a site switchover."""
        if cluster.failover and cluster.failover.target_site:
            if cluster.failover.target_site == self.name:
                logger.info("Cleaning up failover key targeting us")
                self.dcs.manual_failover('', '')
            elif not any(m.name == cluster.failover.target_site for m in cluster.members):
                logger.info(f"Multisite failover target {cluster.failover.target_site} is not registered")
            else:
                if self._failover_target != cluster.failover.target_site:
                    logger.info(f"Initiating multisite failover to {cluster.failover.target_site}")
                    self._failover_timeout = time.time() + self.switchover_timeout
                # TODO: need to set timeout in DCS for more than two sites to avoid wrong site taking over
                self._failover_target = cluster.failover.target_site
        else:
            self._failover_target = None
            self._failover_timeout = None

    def touch_member(self):
        """Register/refresh this site's member record (host, port) in the multisite DCS."""
        data = {
            'host': self.config['host'],
            'port': self.config['port'],
        }
        address = ','.join([':'.join([i, data['port']]) for i in data['host'].split(',')])
        logger.info(f"Registering site {self.name} in DCS with address {address}")
        self.dcs.touch_member(data)

    def run(self):
        """Thread main loop: observe until the first heartbeat, then alternate
        between resolving leadership (on heartbeat) and passively observing."""
        self._observe_leader()
        # Observe-only until Patroni's main thread sends the first heartbeat.
        while not self._heartbeat.wait(self.config['observe_interval']):
            # Keep track of who is the leader even when we are not the primary node to be able to rewind from them
            self._observe_leader()
        while not self.stop_requested:
            self._resolve_multisite_leader()
            self._heartbeat.clear()
            # Unblock resolve_leader() waiting in the main thread.
            self._leader_resolved.set()
            if self._state_updater:
                self._state_updater.store_updates()
            # Observe passively until the next heartbeat (or shutdown) arrives.
            while not self._heartbeat.wait(self.config['observe_interval']):
                self._observe_leader()

    def shutdown(self):
        """Stop the worker thread and wait for it to exit."""
        self.stop_requested = True
        self._heartbeat.set()
        self.join()
class KubernetesStateManagement:
    """Publishes multisite state transitions to a Kubernetes CRD status and as events."""

    def __init__(self, crd_name: str, crd_uid: str, reporter: str, crd_api: str):
        """
        :param crd_name: CRD object name, optionally prefixed ``namespace.name``.
        :param crd_uid: UID of the CRD object, used in event references.
        :param reporter: reporting instance name for emitted events.
        :param crd_api: API group/version string, e.g. ``acid.zalan.do/v1``.
        """
        # A bare name is assumed to live in the "default" namespace.
        if '.' in crd_name:
            self.crd_namespace, self.crd_name = crd_name.rsplit('.', 1)
        else:
            self.crd_namespace, self.crd_name = 'default', crd_name
        self.crd_uid = crd_uid
        self.reporter = reporter
        self.crd_api_group, self.crd_api_version = crd_api.rsplit('/', 1)
        # TODO: handle config loading when main DCS is not Kubernetes based
        kubernetes.config.load_incluster_config()  # pyright: ignore [reportUnknownMemberType]
        apiclient = kubernetes.client.ApiClient()
        self._customobj_api = kubernetes.client.CustomObjectsApi(apiclient)
        self._events_api = kubernetes.client.EventsV1Api(apiclient)
        # Pending updates, flushed later by store_updates() from the worker thread.
        self._status_update = None
        self._event_obj = None

    def state_transition(self, new_state: str, note: str):
        """Queue a CRD status patch and a failover event describing *new_state*."""
        self._status_update = {"status": {"Multisite": new_state}}
        became_leader = new_state == 'Leader'
        if note == '':
            note = 'Acquired multisite leader lock' if became_leader else 'Became a standby site'
        event_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        # NOTE(review): api_version is hard-coded here although crd_api is
        # configurable — confirm whether it should derive from crd_api.
        regarding = kubernetes.client.V1ObjectReference(
            api_version="acid.zalan.do/v1",
            kind="postgresql",
            name=self.crd_name,
            namespace=self.crd_namespace,
            uid=self.crd_uid,
        )
        self._event_obj = kubernetes.client.EventsV1Event(
            action='Failover',
            event_time=event_time,
            type="Normal",
            reporting_controller="patroni",
            reporting_instance=self.reporter,
            regarding=regarding,
            reason='Promote' if became_leader else 'Demote',
            note=note,
            metadata=kubernetes.client.V1ObjectMeta(namespace=self.crd_namespace, generate_name=self.crd_name)
        )

    def store_updates(self):
        """Flush queued status patch and event; on failure keep them queued for retry."""
        try:
            pending_status = self._status_update
            if pending_status:
                self.update_crd_state(pending_status)
                self._status_update = None
            pending_event = self._event_obj
            if pending_event:
                self.create_failover_event(pending_event)
                self._event_obj = None
        except Exception as e:
            logger.warning("Unable to store Kubernetes status update: %s", e)

    @catch_kubernetes_errors
    def update_crd_state(self, update: Dict[str, Any]):
        """Patch the postgresql CRD's /status subresource with *update*."""
        self._customobj_api.patch_namespaced_custom_object_status(  # pyright: ignore [reportUnknownMemberType]
            self.crd_api_group, self.crd_api_version, self.crd_namespace, 'postgresqls', self.crd_name + '/status',
            update, field_manager='patroni')
        return True

    def create_failover_event(self, event: kubernetes.client.EventsV1Event):
        """Create the queued failover event in the CRD's namespace."""
        self._events_api.create_namespaced_event(self.crd_namespace, event)  # pyright: ignore [reportUnknownMemberType]