Skip to content

Commit f2a8850

Browse files
authored
[!] refactor etcd checker with etcd/client/v3.Watch(), closes #196 (#199)
1 parent f56cab7 commit f2a8850

3 files changed

Lines changed: 55 additions & 70 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ This is a list of all available configuration items:
125125
`etcd-user` | `VIP_ETCD_USER` | no | patroni | A username that is allowed to look at the `trigger-key` in an etcd DCS. Optional when using `dcs-type=etcd` .
126126
`etcd-password` | `VIP_ETCD_PASSWORD` | no | snakeoil | The password for `etcd-user`. Optional when using `dcs-type=etcd` . Requires that `etcd-user` is also set.
127127
`consul-token` | `VIP_CONSUL_TOKEN` | no | snakeoil | A token that can be used with the consul-API for authentication. Optional when using `dcs-type=consul` .
128-
`interval` | `VIP_INTERVAL` | no | 1000 | The time vip-manager main loop sleeps before checking for changes. Measured in ms. Defaults to `1000`.
128+
`interval` | `VIP_INTERVAL` | no | 1000 | The time vip-manager main loop sleeps before checking for changes. Measured in ms. Defaults to `1000`. Has no effect on the etcd checker since v2.3.0, which watches the trigger key instead of polling it.
129129
`retry-after` | `VIP_RETRY_AFTER` | no | 250 | The time to wait before retrying interactions with components outside of vip-manager. Measured in ms. Defaults to `250`.
130130
`retry-num` | `VIP_RETRY_NUM` | no | 3 | The number of times interactions with components outside of vip-manager are retried. Defaults to `3`.
131131
`etcd-ca-file` | `VIP_ETCD_CA_FILE` | no | /etc/etcd/ca.cert.pem | A certificate authority file that can be used to verify the certificate provided by etcd endpoints. Make sure to change `dcs-endpoints` to reflect that `https` is used.

checker/etcd_leader_checker.go

Lines changed: 52 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -10,22 +10,35 @@ import (
1010
"time"
1111

1212
"github.com/cybertec-postgresql/vip-manager/vipconfig"
13-
client "go.etcd.io/etcd/client/v3"
13+
clientv3 "go.etcd.io/etcd/client/v3"
1414
)
1515

1616
// EtcdLeaderChecker is used to check state of the leader key in Etcd
1717
type EtcdLeaderChecker struct {
18-
key string
19-
nodename string
20-
kapi client.KV
18+
*vipconfig.Config
19+
*clientv3.Client
2120
}
2221

23-
// naming this c_conf to avoid conflict with conf in etcd_leader_checker.go
24-
var eConf *vipconfig.Config
22+
// NewEtcdLeaderChecker returns a new instance
23+
func NewEtcdLeaderChecker(conf *vipconfig.Config) (*EtcdLeaderChecker, error) {
24+
tlsConfig, err := getTransport(conf)
25+
if err != nil {
26+
return nil, err
27+
}
28+
cfg := clientv3.Config{
29+
Endpoints: conf.Endpoints,
30+
TLS: tlsConfig,
31+
DialKeepAliveTimeout: 5 * time.Second,
32+
DialKeepAliveTime: 5 * time.Second,
33+
Username: conf.EtcdUser,
34+
Password: conf.EtcdPassword,
35+
}
36+
c, err := clientv3.New(cfg)
37+
return &EtcdLeaderChecker{conf, c}, err
38+
}
2539

2640
func getTransport(conf *vipconfig.Config) (*tls.Config, error) {
2741
var caCertPool *x509.CertPool
28-
2942
// create valid CertPool only if the ca certificate file exists
3043
if conf.EtcdCAFile != "" {
3144
caCert, err := os.ReadFile(conf.EtcdCAFile)
@@ -36,9 +49,7 @@ func getTransport(conf *vipconfig.Config) (*tls.Config, error) {
3649
caCertPool = x509.NewCertPool()
3750
caCertPool.AppendCertsFromPEM(caCert)
3851
}
39-
4052
var certificates []tls.Certificate
41-
4253
// create a valid []Certificate only if both the client cert and key files are configured
4354
if conf.EtcdCertFile != "" && conf.EtcdKeyFile != "" {
4455
cert, err := tls.LoadX509KeyPair(conf.EtcdCertFile, conf.EtcdKeyFile)
@@ -48,83 +59,57 @@ func getTransport(conf *vipconfig.Config) (*tls.Config, error) {
4859

4960
certificates = []tls.Certificate{cert}
5061
}
51-
5262
tlsClientConfig := new(tls.Config)
53-
5463
if caCertPool != nil {
5564
tlsClientConfig.RootCAs = caCertPool
5665
if certificates != nil {
5766
tlsClientConfig.Certificates = certificates
5867
}
5968
}
60-
6169
return tlsClientConfig, nil
6270
}
6371

64-
// NewEtcdLeaderChecker returns a new instance
65-
func NewEtcdLeaderChecker(con *vipconfig.Config) (*EtcdLeaderChecker, error) {
66-
eConf = con
67-
e := &EtcdLeaderChecker{key: eConf.Key, nodename: eConf.Nodename}
68-
69-
tlsConfig, err := getTransport(eConf)
72+
// init gets the current leader from etcd
73+
func (elc *EtcdLeaderChecker) init(ctx context.Context, out chan<- bool) {
74+
resp, err := elc.Get(ctx, elc.Key)
7075
if err != nil {
71-
return nil, err
72-
}
73-
74-
cfg := client.Config{
75-
Endpoints: eConf.Endpoints,
76-
TLS: tlsConfig,
77-
DialKeepAliveTimeout: 5 * time.Second,
78-
DialKeepAliveTime: 5 * time.Second,
79-
Username: eConf.EtcdUser,
80-
Password: eConf.EtcdPassword,
76+
log.Printf("etcd error: %s", err)
77+
out <- false
78+
return
8179
}
82-
c, err := client.New(cfg)
83-
if err != nil {
84-
return nil, err
80+
for _, kv := range resp.Kvs {
81+
log.Printf("Current Leader from DCS: %s", kv.Value)
82+
out <- string(kv.Value) == elc.Nodename
8583
}
86-
e.kapi = c.KV
87-
return e, nil
8884
}
8985

90-
// GetChangeNotificationStream checks the status in the loop
91-
func (e *EtcdLeaderChecker) GetChangeNotificationStream(ctx context.Context, out chan<- bool) error {
92-
var state bool
93-
var alreadyConnected = false
94-
checkLoop:
86+
// watch monitors the leader change from etcd
87+
func (elc *EtcdLeaderChecker) watch(ctx context.Context, out chan<- bool) error {
88+
watchChan := elc.Watch(ctx, elc.Key)
89+
log.Println("set WATCH on " + elc.Key)
9590
for {
96-
resp, err := e.kapi.Get(ctx, e.key)
97-
98-
if err != nil {
99-
if ctx.Err() != nil {
100-
break checkLoop
101-
}
102-
log.Printf("etcd error: %s", err)
103-
out <- false
104-
time.Sleep(time.Duration(eConf.Interval) * time.Millisecond)
105-
continue
106-
}
107-
108-
if !alreadyConnected {
109-
log.Printf("etcd checker started up, found key %s", e.key)
110-
alreadyConnected = true
111-
}
112-
113-
for _, kv := range resp.Kvs {
114-
if eConf.Verbose {
115-
log.Println("Leader from DCS:", string(kv.Value))
116-
}
117-
state = string(kv.Value) == e.nodename
118-
}
119-
12091
select {
12192
case <-ctx.Done():
122-
break checkLoop
123-
case out <- state:
124-
time.Sleep(time.Duration(eConf.Interval) * time.Millisecond)
125-
continue
93+
return ctx.Err()
94+
case watchResp := <-watchChan:
95+
if err := watchResp.Err(); err != nil {
96+
log.Printf("etcd watcher returned error: %s", err)
97+
out <- false
98+
continue
99+
}
100+
for _, event := range watchResp.Events {
101+
out <- string(event.Kv.Value) == elc.Nodename
102+
log.Printf("Current Leader from DCS: %s", event.Kv.Value)
103+
}
126104
}
127105
}
106+
}
128107

129-
return ctx.Err()
108+
// GetChangeNotificationStream monitors the leader in etcd
109+
func (elc *EtcdLeaderChecker) GetChangeNotificationStream(ctx context.Context, out chan<- bool) error {
110+
defer elc.Close()
111+
go elc.init(ctx, out)
112+
wctx, cancel := context.WithCancel(ctx)
113+
defer cancel()
114+
return elc.watch(wctx, out)
130115
}

test/behaviour_test.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ trap cleanup EXIT
5353

5454
# run etcd with podman/docker maybe?
5555
# podman rm etcd || true
56-
# podman run -d --name etcd -p 2379:2379 -e "ETCD_ENABLE_V2=true" -e "ALLOW_NONE_AUTHENTICATION=yes" bitnami/etcd
56+
# podman run -d --name etcd -p 2379:2379 -e "ALLOW_NONE_AUTHENTICATION=yes" bitnami/etcd
5757

5858
# run etcd locally maybe?
5959
etcd &
@@ -67,7 +67,7 @@ echo $! > .ncatPid
6767
etcdctl del service/pgcluster/leader || true
6868

6969
touch .failed
70-
./vip-manager --interface $dev --ip $vip --netmask 32 --trigger-key service/pgcluster/leader --trigger-value $HOSTNAME & #2>&1 &
70+
./vip-manager --interval 3000 --interface $dev --ip $vip --netmask 32 --trigger-key service/pgcluster/leader --trigger-value $HOSTNAME & #2>&1 &
7171
echo $! > .vipPid
7272
sleep 2
7373

0 commit comments

Comments
 (0)