From 520a9ce75fb03d7ad1a8e1c50cd594a97eb24997 Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Thu, 30 Apr 2026 11:16:43 +0200 Subject: [PATCH 1/8] kms/health: add KMSv2 health monitor library Introduces pkg/operator/encryption/kms/health/: - Probe calls Status() on an injected kmsservice.Service (mirrors preflight's pattern) and classifies the response as ok, unhealthy:not-ok:, or unhealthy:rpc-error:. Never returns an error; failures are baked into HealthStatus.Healthz so the monitor loop stays flat. - StatusWriter / ConfigMapWriter updates an existing ConfigMap in place. Hard-fails on miss so misconfiguration is loud. - Monitor drives the tick loop: probe, stamp observer pod, write, sleep on healthy/unhealthy interval, repeat until ctx cancel. Write errors are logged and tolerated. - health.NewCommand(ctx) wires flags, validates inputs, builds a rest.Config, dials the UDS via the same kmsv2 client preflight uses, and hands a Probe + ConfigMapWriter to the Monitor. --output-mode=condition is reserved and explicitly rejected so the OpenShift track can add it without silent behavior change. Tests use FakeClock with the HasWaiters() barrier for race-free stepping through healthy->unhealthy->healthy transitions. --- pkg/operator/encryption/kms/health/cmd.go | 238 ++++++++++++++++ .../encryption/kms/health/cmd_test.go | 180 ++++++++++++ pkg/operator/encryption/kms/health/doc.go | 5 + pkg/operator/encryption/kms/health/monitor.go | 78 ++++++ .../encryption/kms/health/monitor_test.go | 261 ++++++++++++++++++ pkg/operator/encryption/kms/health/probe.go | 89 ++++++ .../encryption/kms/health/probe_test.go | 219 +++++++++++++++ pkg/operator/encryption/kms/health/types.go | 49 ++++ pkg/operator/encryption/kms/health/writer.go | 10 + .../encryption/kms/health/writer_configmap.go | 100 +++++++ .../kms/health/writer_configmap_test.go | 244 ++++++++++++++++ 11 files changed, 1473 insertions(+) create mode 100644 pkg/operator/encryption/kms/health/cmd.go create mode 100644 pkg/operator/encryption/kms/health/cmd_test.go create mode 100644 pkg/operator/encryption/kms/health/doc.go create mode 100644 pkg/operator/encryption/kms/health/monitor.go create mode 100644 pkg/operator/encryption/kms/health/monitor_test.go create mode 100644 pkg/operator/encryption/kms/health/probe.go create mode 100644 pkg/operator/encryption/kms/health/probe_test.go create mode 100644 pkg/operator/encryption/kms/health/types.go create mode 100644 pkg/operator/encryption/kms/health/writer.go create mode 100644 pkg/operator/encryption/kms/health/writer_configmap.go create mode 100644 pkg/operator/encryption/kms/health/writer_configmap_test.go diff --git a/pkg/operator/encryption/kms/health/cmd.go b/pkg/operator/encryption/kms/health/cmd.go new file mode 100644 index 0000000000..afbb7bec51 --- /dev/null +++ b/pkg/operator/encryption/kms/health/cmd.go @@ -0,0 +1,238 @@ +package health + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/spf13/cobra" + "github.com/spf13/pflag" + + k8senvelopekmsv2 "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/klog/v2" +) + +const ( + outputModeConfigMap = "configmap" + outputModeCondition = "condition" + + providerName = "kms-health-monitor" +) + +type commandOptions struct { + kmsSocket string + probeInterval time.Duration + probeIntervalUnhealthy time.Duration + probeTimeout time.Duration + writeTimeout time.Duration + outputMode string + configmapNamespace string + configmapName string + observerPodName string + kubeconfig string +} + +// NewCommand wires the cobra command. ctx is owned by main() so signal +// handling lives there. +func NewCommand(ctx context.Context) *cobra.Command { + o := &commandOptions{ + kmsSocket: "/var/run/kmsplugin/kms.sock", + probeInterval: 60 * time.Second, + probeIntervalUnhealthy: 10 * time.Second, + probeTimeout: 3 * time.Second, + writeTimeout: 5 * time.Second, + outputMode: outputModeConfigMap, + } + cmd := &cobra.Command{ + Use: "kms-health-monitor", + Short: "Observes a co-located KMSv2 plugin and publishes a health status.", + RunE: func(cmd *cobra.Command, args []string) error { + if err := o.validate(); err != nil { + return err + } + return o.run(ctx) + }, + } + o.addFlags(cmd.Flags()) + return cmd +} + +func (o *commandOptions) addFlags(fs *pflag.FlagSet) { + fs.StringVar( + &o.kmsSocket, + "kms-socket", + o.kmsSocket, + "filesystem path to the KMSv2 plugin UDS", + ) + fs.DurationVar( + &o.probeInterval, + "probe-interval", + o.probeInterval, + "probe cadence while the plugin is healthy", + ) + fs.DurationVar( + &o.probeIntervalUnhealthy, + "probe-interval-unhealthy", + o.probeIntervalUnhealthy, + "probe cadence after an unhealthy result until recovery", + ) + fs.DurationVar( + &o.probeTimeout, + "probe-timeout", + o.probeTimeout, + "deadline for each Status RPC", + ) + fs.DurationVar( + &o.writeTimeout, + "write-timeout", + o.writeTimeout, + "deadline for each status write (e.g. ConfigMap Update); should fit inside --probe-interval-unhealthy to preserve cadence under apiserver slowness", + ) + fs.StringVar( + &o.outputMode, + "output-mode", + o.outputMode, + "status sink: configmap (condition is reserved for the OpenShift track)", + ) + fs.StringVar( + &o.configmapNamespace, + "configmap-namespace", + "", + "namespace of the status ConfigMap (required when output-mode=configmap; "+ + "caller must have RBAC to patch ConfigMaps in this namespace)", + ) + fs.StringVar( + &o.configmapName, + "configmap-name", + "", + "name of the status ConfigMap; defaults to \"kms-health-${POD_NAME}\". "+ + "MUST be unique per monitor instance: concurrent writers on one CM "+ + "produce last-writer-wins flap on every key", + ) + fs.StringVar( + &o.observerPodName, + "observer-pod-name", + os.Getenv("POD_NAME"), + "pod name recorded in the status; defaults to $POD_NAME", + ) + fs.StringVar( + &o.kubeconfig, + "kubeconfig", + "", + "path to a kubeconfig; empty uses in-cluster config", + ) +} + +func (o *commandOptions) validate() error { + if o.kmsSocket == "" { + return fmt.Errorf("--kms-socket is required") + } + if o.probeInterval <= 0 { + return fmt.Errorf("--probe-interval must be positive") + } + if o.probeIntervalUnhealthy <= 0 { + return fmt.Errorf("--probe-interval-unhealthy must be positive") + } + if o.probeTimeout <= 0 { + return fmt.Errorf("--probe-timeout must be positive") + } + if o.writeTimeout <= 0 { + return fmt.Errorf("--write-timeout must be positive") + } + // $POD_NAME defaulting happens at flag-registration time in addFlags; + // by here observerPodName is the flag, env, or genuinely empty. + if o.observerPodName == "" { + return fmt.Errorf( + "--observer-pod-name is required (or set $POD_NAME)", + ) + } + switch o.outputMode { + case outputModeConfigMap: + if o.configmapNamespace == "" { + return fmt.Errorf( + "--configmap-namespace is required when --output-mode=%s", + outputModeConfigMap, + ) + } + if o.configmapName == "" { + // Wrap pod identity rather than using it bare. Advertises + // ownership and avoids colliding with a same-namespaced CM + // that some other component happens to name after the pod. + o.configmapName = "kms-health-" + o.observerPodName + } + case outputModeCondition: + return fmt.Errorf( + "--output-mode=%s is reserved for the OpenShift track and not implemented", + outputModeCondition, + ) + default: + return fmt.Errorf( + "--output-mode must be %q or %q (got %q)", + outputModeConfigMap, + outputModeCondition, + o.outputMode, + ) + } + return nil +} + +func (o *commandOptions) run(ctx context.Context) error { + cfg, err := buildRESTConfig(o.kubeconfig) + if err != nil { + return fmt.Errorf("build rest config: %w", err) + } + client, err := kubernetes.NewForConfig(cfg) + if err != nil { + return fmt.Errorf("build kubernetes client: %w", err) + } + + writer := NewConfigMapWriter(client, o.configmapNamespace, o.configmapName) + + // WaitForReady(true) is set inside the kmsv2 client, so Dial returns + // immediately even if the plugin isn't listening yet. + endpoint := "unix://" + o.kmsSocket + service, err := k8senvelopekmsv2.NewGRPCService( + ctx, + endpoint, + providerName, + o.probeTimeout, + ) + if err != nil { + return fmt.Errorf("dial KMS plugin at %q: %w", endpoint, err) + } + probe := NewProbe(service, 0) + + monitor := NewMonitor( + probe, + writer, + o.observerPodName, + o.probeInterval, + o.probeIntervalUnhealthy, + o.writeTimeout, + ) + + klog.InfoS("kms-health-monitor starting", + "socket", o.kmsSocket, + "configmap", o.configmapNamespace+"/"+o.configmapName, + "observerPod", o.observerPodName, + "healthyInterval", o.probeInterval, + "unhealthyInterval", o.probeIntervalUnhealthy, + "probeTimeout", o.probeTimeout, + "writeTimeout", o.writeTimeout, + ) + + monitor.Run(ctx) + klog.Info("kms-health-monitor stopping") + return nil +} + +func buildRESTConfig(kubeconfig string) (*rest.Config, error) { + if kubeconfig != "" { + return clientcmd.BuildConfigFromFlags("", kubeconfig) + } + return rest.InClusterConfig() +} diff --git a/pkg/operator/encryption/kms/health/cmd_test.go b/pkg/operator/encryption/kms/health/cmd_test.go new file mode 100644 index 0000000000..712a4bdd0c --- /dev/null +++ b/pkg/operator/encryption/kms/health/cmd_test.go @@ -0,0 +1,180 @@ +package health + +import ( + "strings" + "testing" + "time" + + "github.com/spf13/pflag" +) + +// validOpts is the minimal fully-valid commandOptions. Every test case +// below mutates a copy of this to isolate one validation rule at a time. +func validOpts() commandOptions { + return commandOptions{ + kmsSocket: "/var/run/kms.sock", + probeInterval: 60 * time.Second, + probeIntervalUnhealthy: 10 * time.Second, + probeTimeout: 3 * time.Second, + writeTimeout: 5 * time.Second, + outputMode: outputModeConfigMap, + configmapNamespace: "kms-health-test", + configmapName: "kms-health-master-0", + observerPodName: "master-0", + } +} + +func TestValidate_acceptsValidOptions(t *testing.T) { + o := validOpts() + if err := o.validate(); err != nil { + t.Fatalf("validate(valid): %v", err) + } +} + +func TestValidate_rejectsMissingOrZeroFields(t *testing.T) { + cases := []struct { + name string + mut func(*commandOptions) + wants string // substring expected in the error + }{ + {"empty kmsSocket", func(o *commandOptions) { o.kmsSocket = "" }, "--kms-socket"}, + {"zero probeInterval", func(o *commandOptions) { o.probeInterval = 0 }, "--probe-interval"}, + {"zero probeIntervalUnhealthy", func(o *commandOptions) { o.probeIntervalUnhealthy = 0 }, "--probe-interval-unhealthy"}, + {"zero probeTimeout", func(o *commandOptions) { o.probeTimeout = 0 }, "--probe-timeout"}, + {"zero writeTimeout", func(o *commandOptions) { o.writeTimeout = 0 }, "--write-timeout"}, + {"empty configmapNamespace", func(o *commandOptions) { o.configmapNamespace = "" }, "--configmap-namespace"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + o := validOpts() + tc.mut(&o) + err := o.validate() + if err == nil { + t.Fatalf("validate: want error containing %q, got nil", tc.wants) + } + if !strings.Contains(err.Error(), tc.wants) { + t.Errorf("validate: got %q, want substring %q", err.Error(), tc.wants) + } + }) + } +} + +func TestValidate_outputModeEnum(t *testing.T) { + cases := []struct { + name string + mode string + wantErr bool + wantSubstr string + }{ + {"configmap accepted", outputModeConfigMap, false, ""}, + {"condition rejected as reserved", outputModeCondition, true, "reserved for the OpenShift track"}, + {"unknown rejected with both options listed", "bogus", true, `must be "configmap" or "condition"`}, + {"empty rejected with both options listed", "", true, `must be "configmap" or "condition"`}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + o := validOpts() + o.outputMode = tc.mode + err := o.validate() + if tc.wantErr { + if err == nil { + t.Fatalf("validate: want error containing %q, got nil", tc.wantSubstr) + } + if !strings.Contains(err.Error(), tc.wantSubstr) { + t.Errorf("validate: got %q, want substring %q", err.Error(), tc.wantSubstr) + } + return + } + if err != nil { + t.Errorf("validate: got %v, want nil", err) + } + }) + } +} + +func TestAddFlags_observerPodNameDefaultsFromPodNameEnv(t *testing.T) { + t.Setenv("POD_NAME", "from-env-0") + o := commandOptions{} + fs := pflag.NewFlagSet("test", pflag.ContinueOnError) + o.addFlags(fs) + if err := fs.Parse(nil); err != nil { + t.Fatalf("parse: %v", err) + } + if o.observerPodName != "from-env-0" { + t.Errorf("observerPodName: got %q, want %q (env-derived flag default)", o.observerPodName, "from-env-0") + } +} + +func TestValidate_observerPodNameRequiredWhenEmpty(t *testing.T) { + o := validOpts() + o.observerPodName = "" + + err := o.validate() + if err == nil { + t.Fatal("validate: want error, got nil") + } + if !strings.Contains(err.Error(), "--observer-pod-name") { + t.Errorf("validate: got %q, want substring --observer-pod-name", err.Error()) + } +} + +func TestValidate_configmapNameDefaultsFromObserverPodName(t *testing.T) { + o := validOpts() + o.configmapName = "" // force the wrapped default + + if err := o.validate(); err != nil { + t.Fatalf("validate: %v", err) + } + want := "kms-health-master-0" + if o.configmapName != want { + t.Errorf("configmapName: got %q, want %q (wrapped default from observerPodName)", o.configmapName, want) + } +} + +func TestAddFlagsAndValidate_configmapNameDefaultChainsThroughEnv(t *testing.T) { + // Verifies the full chain (env → observerPodName → configmapName) + // across the two layers it now spans: addFlags reads $POD_NAME at + // flag-registration time, validate derives configmapName from the + // resolved observerPodName. + t.Setenv("POD_NAME", "kube-apiserver-3") + // Field defaults that addFlags wires to non-empty values + // (kmsSocket, probeInterval, etc.) come from this struct literal — + // addFlags references them as the flag default. Fields that addFlags + // defaults to "" (configmapNamespace, configmapName, observerPodName) + // have to be supplied via flag args, since addFlags ignores the + // struct value for those. + o := commandOptions{ + kmsSocket: "/var/run/kms.sock", + probeInterval: 60 * time.Second, + probeIntervalUnhealthy: 10 * time.Second, + probeTimeout: 3 * time.Second, + writeTimeout: 5 * time.Second, + outputMode: outputModeConfigMap, + } + fs := pflag.NewFlagSet("test", pflag.ContinueOnError) + o.addFlags(fs) + if err := fs.Parse([]string{"--configmap-namespace=kms-health-test"}); err != nil { + t.Fatalf("parse: %v", err) + } + if err := o.validate(); err != nil { + t.Fatalf("validate: %v", err) + } + if o.observerPodName != "kube-apiserver-3" { + t.Errorf("observerPodName: got %q, want kube-apiserver-3", o.observerPodName) + } + if o.configmapName != "kms-health-kube-apiserver-3" { + t.Errorf("configmapName: got %q, want kms-health-kube-apiserver-3", o.configmapName) + } +} + +func TestValidate_explicitConfigmapNameNotOverridden(t *testing.T) { + o := validOpts() + o.configmapName = "explicit-cm" + + if err := o.validate(); err != nil { + t.Fatalf("validate: %v", err) + } + if o.configmapName != "explicit-cm" { + t.Errorf("configmapName: got %q, want explicit-cm (default must not override caller-set value)", o.configmapName) + } +} diff --git a/pkg/operator/encryption/kms/health/doc.go b/pkg/operator/encryption/kms/health/doc.go new file mode 100644 index 0000000000..7b24795f2d --- /dev/null +++ b/pkg/operator/encryption/kms/health/doc.go @@ -0,0 +1,5 @@ +// Package health probes a co-located KMSv2 plugin on a cadence and +// publishes a classified, timestamped HealthStatus through a StatusWriter. +// Used by operators and condition reporters that need plugin health +// without dialing the socket themselves. +package health diff --git a/pkg/operator/encryption/kms/health/monitor.go b/pkg/operator/encryption/kms/health/monitor.go new file mode 100644 index 0000000000..5d16ee1314 --- /dev/null +++ b/pkg/operator/encryption/kms/health/monitor.go @@ -0,0 +1,78 @@ +package health + +import ( + "context" + "time" + + "k8s.io/klog/v2" + "k8s.io/utils/clock" +) + +type Prober interface { + Probe(ctx context.Context) HealthStatus +} + +// Monitor never exits on probe or write failures; the next tick is the +// retry, and ctx cancellation is the only exit path. Skipped writes +// (apiserver Forbidden, NotFound, transient network failure) leave the +// published status one tick stale, up to one healthyInterval; the next +// tick republishes. +type Monitor struct { + probe Prober + writer StatusWriter + observerPod string + healthyInterval time.Duration + unhealthyInterval time.Duration + // writeTimeout bounds each writer.Write call so apiserver slowness + // cannot stall the probe loop. probeTimeout + writeTimeout stack + // per tick. Keep their sum below unhealthyInterval, otherwise a + // stuck Write delays the next tick past the unhealthy cadence and + // the "tighten on unhealthy" mechanism collapses. Must be > 0. + writeTimeout time.Duration + clock clock.Clock +} + +func NewMonitor( + prober Prober, + writer StatusWriter, + observerPod string, + healthyInterval, unhealthyInterval, writeTimeout time.Duration, +) *Monitor { + return &Monitor{ + probe: prober, + writer: writer, + observerPod: observerPod, + healthyInterval: healthyInterval, + unhealthyInterval: unhealthyInterval, + writeTimeout: writeTimeout, + clock: clock.RealClock{}, + } +} + +// Run blocks until ctx is cancelled. +func (m *Monitor) Run(ctx context.Context) { + for { + status := m.probe.Probe(ctx) + status.ObserverPod = m.observerPod + + // Cancel inline rather than via defer: defers in unbounded loops + // accumulate until the function returns. + writeCtx, writeCancel := context.WithTimeout(ctx, m.writeTimeout) + err := m.writer.Write(writeCtx, status) + writeCancel() + if err != nil { + klog.ErrorS(err, "kms-health: writer failed; continuing") + } + + interval := m.healthyInterval + if !status.Healthz.IsOK() { + interval = m.unhealthyInterval + } + + select { + case <-ctx.Done(): + return + case <-m.clock.After(interval): + } + } +} diff --git a/pkg/operator/encryption/kms/health/monitor_test.go b/pkg/operator/encryption/kms/health/monitor_test.go new file mode 100644 index 0000000000..439266a0cd --- /dev/null +++ b/pkg/operator/encryption/kms/health/monitor_test.go @@ -0,0 +1,261 @@ +package health + +import ( + "context" + "runtime" + "sync" + "testing" + "time" + + testclock "k8s.io/utils/clock/testing" +) + +// recordingWriter buffers every HealthStatus it receives and lets the +// test block on each write via a channel. Bounded buffer is fine for +// unit-test scope; if a Monitor change starts writing way more than +// expected the test will deadlock — that's a useful failure. +type recordingWriter struct { + writes chan HealthStatus +} + +func newRecordingWriter() *recordingWriter { + return &recordingWriter{writes: make(chan HealthStatus, 32)} +} + +func (w *recordingWriter) Write(ctx context.Context, status HealthStatus) error { + w.writes <- status + return nil +} + +// scriptedProber hands out pre-arranged HealthStatus values in order. +// Each Probe call consumes one. Designed to block if the Monitor asks +// for more than the test scripted — surfaces unintended extra probes. +type scriptedProber struct { + mu sync.Mutex + results []HealthStatus + calls int +} + +func (s *scriptedProber) Probe(ctx context.Context) HealthStatus { + s.mu.Lock() + defer s.mu.Unlock() + if s.calls >= len(s.results) { + // Return something sane rather than panicking mid-test; the + // writer buffer will catch unexpected writes. + s.calls++ + return HealthStatus{Healthz: Healthz{Class: HealthClassOK}, Timestamp: time.Now()} + } + r := s.results[s.calls] + s.calls++ + return r +} + +// waitForWaiters polls until the FakeClock has a registered waiter, so +// the next Step() actually triggers the Monitor's next-tick timer +// rather than being lost to the void. Bounded so misbehaving Monitors +// fail the test instead of hanging. +func waitForWaiters(t *testing.T, clk *testclock.FakeClock) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for !clk.HasWaiters() { + if time.Now().After(deadline) { + t.Fatal("Monitor never registered a clock waiter — not scheduling next tick") + } + time.Sleep(time.Millisecond) + } +} + +func TestMonitor_transitionsHealthyUnhealthyHealthy(t *testing.T) { + t0 := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) + clk := testclock.NewFakeClock(t0) + prober := &scriptedProber{ + results: []HealthStatus{ + {Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0}, + {Healthz: Healthz{Class: HealthClassNotOK, Detail: "boom"}, Timestamp: t0.Add(60 * time.Second)}, + {Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0.Add(70 * time.Second)}, + }, + } + writer := newRecordingWriter() + + m := &Monitor{ + probe: prober, + writer: writer, + observerPod: "test-pod-1", + healthyInterval: 60 * time.Second, + unhealthyInterval: 10 * time.Second, + writeTimeout: 5 * time.Second, + clock: clk, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := make(chan struct{}) + go func() { + m.Run(ctx) + close(done) + }() + + // Tick 1: healthy. Probed immediately, no clock step needed first. + got1 := <-writer.writes + if !got1.Healthz.IsOK() { + t.Errorf("tick 1 Healthz: got %q, want IsOK", got1.Healthz) + } + if got1.ObserverPod != "test-pod-1" { + t.Errorf("tick 1 ObserverPod: got %q, want test-pod-1", got1.ObserverPod) + } + + // Should now be waiting for healthyInterval. + waitForWaiters(t, clk) + clk.Step(60 * time.Second) + + // Tick 2: unhealthy. Monitor must now switch to unhealthyInterval. + got2 := <-writer.writes + wantTick2 := Healthz{Class: HealthClassNotOK, Detail: "boom"} + if got2.Healthz != wantTick2 { + t.Errorf("tick 2 Healthz: got %q, want %q", got2.Healthz, wantTick2) + } + + // Should be waiting for unhealthyInterval (10s), not 60s. Verify by + // stepping only 10s and seeing the next probe fire. + waitForWaiters(t, clk) + clk.Step(10 * time.Second) + + // Tick 3: healthy again. + got3 := <-writer.writes + if !got3.Healthz.IsOK() { + t.Errorf("tick 3 Healthz: got %q, want IsOK", got3.Healthz) + } + + cancel() + <-done + + // Timestamps must be strictly advancing across the three recorded writes. + if !got2.Timestamp.After(got1.Timestamp) { + t.Errorf("timestamps not advancing: t1=%v t2=%v", got1.Timestamp, got2.Timestamp) + } + if !got3.Timestamp.After(got2.Timestamp) { + t.Errorf("timestamps not advancing: t2=%v t3=%v", got2.Timestamp, got3.Timestamp) + } +} + +func TestMonitor_stopsOnContextCancel(t *testing.T) { + t0 := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) + clk := testclock.NewFakeClock(t0) + prober := &scriptedProber{ + results: []HealthStatus{{Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0}}, + } + writer := newRecordingWriter() + + m := &Monitor{ + probe: prober, + writer: writer, + observerPod: "p", + healthyInterval: 60 * time.Second, + unhealthyInterval: 10 * time.Second, + writeTimeout: 5 * time.Second, + clock: clk, + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + m.Run(ctx) + close(done) + }() + + <-writer.writes // first probe happened + waitForWaiters(t, clk) + cancel() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("Monitor did not stop within 2s of context cancel") + } +} + +// runMonitorCycle starts a Monitor, waits for the first write + clock +// waiter, then cancels and waits for Run to return. Shared by the leak +// test so its iteration body stays focused on the leak assertion. +func runMonitorCycle(t *testing.T) { + t.Helper() + t0 := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) + clk := testclock.NewFakeClock(t0) + prober := &scriptedProber{ + results: []HealthStatus{ + {Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0}, + }, + } + writer := newRecordingWriter() + + m := &Monitor{ + probe: prober, + writer: writer, + observerPod: "p", + healthyInterval: 60 * time.Second, + unhealthyInterval: 10 * time.Second, + writeTimeout: 5 * time.Second, + clock: clk, + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + m.Run(ctx) + close(done) + }() + + <-writer.writes + waitForWaiters(t, clk) + cancel() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("Monitor did not stop within 2s of context cancel") + } +} + +// TestMonitor_noGoroutineLeak verifies Run does not leak goroutines +// after ctx cancellation. Plan §14.2 calls out goroutine leakage as a +// real-class-of-bug for the gRPC dial path; the same risk applies to +// the loop that owns the long-lived select on ctx.Done() and +// clock.After. +// +// The test runs many cycles after a warm-up snapshot. A single-shot +// check would miss per-iteration accumulation (e.g. an orphaned +// goroutine spawned inside Run that exits on the *next* tick rather +// than on ctx.Done()). +func TestMonitor_noGoroutineLeak(t *testing.T) { + // Warm up before snapshotting: first run pulls in lazy klog buffers, + // fake-clock test internals, etc. Snapshotting beforehand would + // surface a one-time bump as a leak. + runMonitorCycle(t) + + runtime.GC() + baseline := runtime.NumGoroutine() + + const iterations = 20 + for range iterations { + runMonitorCycle(t) + } + + // Cleanup is scheduler-dependent; poll up to a deadline rather than + // using a fixed sleep that flakes on a loaded CI runner. + runtime.GC() + deadline := time.Now().Add(2 * time.Second) + for runtime.NumGoroutine() > baseline && time.Now().Before(deadline) { + time.Sleep(20 * time.Millisecond) + runtime.GC() + } + + if got := runtime.NumGoroutine(); got > baseline { + // Dump live stacks so a future regression is diagnosable from CI + // output alone. Without this, "leaked N goroutines" gives no clue + // to the leaking site. + buf := make([]byte, 1<<16) + n := runtime.Stack(buf, true) + t.Errorf("goroutine leak after %d cycles: baseline=%d, got=%d\nstacks:\n%s", + iterations, baseline, got, buf[:n]) + } +} diff --git a/pkg/operator/encryption/kms/health/probe.go b/pkg/operator/encryption/kms/health/probe.go new file mode 100644 index 0000000000..962741dbbb --- /dev/null +++ b/pkg/operator/encryption/kms/health/probe.go @@ -0,0 +1,89 @@ +package health + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "strings" + "time" + + "google.golang.org/grpc/status" + kmsservice "k8s.io/kms/pkg/service" +) + +// healthzMaxBodyLen caps the plugin's Healthz body. Without it a +// misbehaving plugin could push multi-MB strings into our ConfigMap on +// every tick. +const healthzMaxBodyLen = 200 + +// Probe never returns an error; failures are classified into +// HealthStatus.Healthz so the Monitor loop stays flat. +type Probe struct { + service kmsservice.Service + timeout time.Duration + now func() time.Time +} + +// NewProbe wraps the per-probe Status RPC with timeout if positive; 0 +// relies solely on ctx (or the kmsv2 client's own deadline). +func NewProbe(service kmsservice.Service, timeout time.Duration) *Probe { + return &Probe{ + service: service, + timeout: timeout, + now: time.Now, + } +} + +// Probe blocks until Status returns or ctx / the per-probe timeout fires. +func (p *Probe) Probe(ctx context.Context) HealthStatus { + if p.timeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, p.timeout) + defer cancel() + } + resp, err := p.service.Status(ctx) + timestamp := p.now() + if err != nil { + return HealthStatus{ + Healthz: classifyRPCError(err), + Timestamp: timestamp, + } + } + return HealthStatus{ + Healthz: classifyHealthz(resp.Healthz), + KeyIDHash: hashKeyID(resp.KeyID), + Timestamp: timestamp, + } +} + +// classifyHealthz truncates overlong bodies on a UTF-8-safe boundary: +// the proto permits free-form text, ConfigMap.Data must be valid UTF-8, +// and naive byte-slicing at healthzMaxBodyLen could split a multi-byte +// rune. ToValidUTF8 drops any partial sequence at the cut. +func classifyHealthz(body string) Healthz { + if body == string(HealthClassOK) { + return Healthz{Class: HealthClassOK} + } + if len(body) > healthzMaxBodyLen { + body = strings.ToValidUTF8(body[:healthzMaxBodyLen], "") + } + return Healthz{Class: HealthClassNotOK, Detail: body} +} + +// classifyRPCError uses status.Code, which returns codes.Unknown for +// non-gRPC errors and the real code for gRPC status errors. Covers +// transport failures and deadline expirations without extra branching. +func classifyRPCError(err error) Healthz { + return Healthz{ + Class: HealthClassRPCError, + Detail: status.Code(err).String(), + } +} + +func hashKeyID(keyID string) string { + if keyID == "" { + return "" + } + sum := sha256.Sum256([]byte(keyID)) + return hex.EncodeToString(sum[:]) +} diff --git a/pkg/operator/encryption/kms/health/probe_test.go b/pkg/operator/encryption/kms/health/probe_test.go new file mode 100644 index 0000000000..7966c9f56e --- /dev/null +++ b/pkg/operator/encryption/kms/health/probe_test.go @@ -0,0 +1,219 @@ +package health + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "strings" + "testing" + "time" + "unicode/utf8" + + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + kmsservice "k8s.io/kms/pkg/service" +) + +// testTimeout is a generous per-probe timeout for fast-fake tests that +// should never hit it. Tests that DO exercise the deadline path use a +// short timeout locally. +const testTimeout = 10 * time.Second + +// fakeService injects scripted Status responses. Same shape as +// preflight's fake (checker_test.go:14-30) — only StatusFn is exercised +// here because the Probe never calls Encrypt/Decrypt. +type fakeService struct { + StatusFn func(ctx context.Context) (*kmsservice.StatusResponse, error) +} + +func (f *fakeService) Status(ctx context.Context) (*kmsservice.StatusResponse, error) { + return f.StatusFn(ctx) +} + +func (f *fakeService) Encrypt(ctx context.Context, uid string, data []byte) (*kmsservice.EncryptResponse, error) { + panic("Encrypt not expected") +} + +func (f *fakeService) Decrypt(ctx context.Context, uid string, req *kmsservice.DecryptRequest) ([]byte, error) { + panic("Decrypt not expected") +} + +func sha256Hex(s string) string { + sum := sha256.Sum256([]byte(s)) + return hex.EncodeToString(sum[:]) +} + +func TestProbe_healthy(t *testing.T) { + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + return &kmsservice.StatusResponse{ + Healthz: "ok", + Version: "v2", + KeyID: "test-key-1", + }, nil + }, + } + + probe := NewProbe(fake, testTimeout) + got := probe.Probe(context.Background()) + + if !got.Healthz.IsOK() { + t.Errorf("Healthz: got %q, want IsOK", got.Healthz) + } + wantHash := sha256Hex("test-key-1") + if got.KeyIDHash != wantHash { + t.Errorf("KeyIDHash: got %q, want %q", got.KeyIDHash, wantHash) + } + if got.Timestamp.IsZero() { + t.Error("Timestamp: expected non-zero") + } +} + +func TestProbe_notOk_classifiesAsUnhealthy(t *testing.T) { + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + return &kmsservice.StatusResponse{ + Healthz: "not-ready", + KeyID: "test-key-1", + }, nil + }, + } + + got := NewProbe(fake, testTimeout).Probe(context.Background()) + + want := Healthz{Class: HealthClassNotOK, Detail: "not-ready"} + if got.Healthz != want { + t.Errorf("Healthz: got %q, want %q", got.Healthz, want) + } +} + +func TestProbe_notOk_truncatesTo200Chars(t *testing.T) { + // 260-char ASCII body — exercise the byte-length cap. + longBody := strings.Repeat("x", 260) + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + return &kmsservice.StatusResponse{ + Healthz: longBody, + KeyID: "test-key-1", + }, nil + }, + } + + got := NewProbe(fake, testTimeout).Probe(context.Background()) + + if got.Healthz.Class != HealthClassNotOK { + t.Errorf("Healthz class: got %q, want %q", got.Healthz.Class, HealthClassNotOK) + } + if len(got.Healthz.Detail) != healthzMaxBodyLen { + t.Errorf("Healthz detail length: got %d, want %d", len(got.Healthz.Detail), healthzMaxBodyLen) + } +} + +func TestProbe_notOk_truncatesSafelyAtMultiByteBoundary(t *testing.T) { + // 198 ASCII bytes + thumbs-up emoji (4 bytes) + tail. Naive byte- + // slicing at healthzMaxBodyLen=200 would land between bytes 2 and 3 + // of the emoji, producing invalid UTF-8 — apiserver rejects + // ConfigMap Updates with invalid UTF-8 in cm.Data values, which + // would freeze the published status indefinitely. + body := strings.Repeat("x", 198) + "👍" + "tail" + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + return &kmsservice.StatusResponse{ + Healthz: body, + KeyID: "test-key-1", + }, nil + }, + } + + got := NewProbe(fake, testTimeout).Probe(context.Background()) + + if !utf8.ValidString(got.Healthz.Detail) { + t.Errorf("Healthz detail is not valid UTF-8: %q", got.Healthz.Detail) + } + if len(got.Healthz.Detail) > healthzMaxBodyLen { + t.Errorf("Healthz detail too long: got %d bytes, want <= %d", len(got.Healthz.Detail), healthzMaxBodyLen) + } +} + +func TestProbe_respectsPerProbeTimeout(t *testing.T) { + // Fake blocks until ctx fires; Probe's own WithTimeout should cut it + // off after probeTimeout. The fake wraps ctx.Err() exactly the way + // gRPC's transport does in production (status.FromContextError), so + // classification sees DeadlineExceeded. + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + <-ctx.Done() + return nil, status.FromContextError(ctx.Err()).Err() + }, + } + + const probeTimeout = 20 * time.Millisecond + probe := NewProbe(fake, probeTimeout) + + start := time.Now() + // Parent context has NO deadline — if Probe doesn't apply its own + // timeout, this test hangs and we'd notice via go test -timeout. + got := probe.Probe(context.Background()) + elapsed := time.Since(start) + + // Budget: probeTimeout + generous slack for CI. If we see >1s we're + // honoring the parent ctx instead of our own timeout. + if elapsed > 500*time.Millisecond { + t.Errorf("probe elapsed %v — per-probe timeout not applied", elapsed) + } + want := Healthz{Class: HealthClassRPCError, Detail: "DeadlineExceeded"} + if got.Healthz != want { + t.Errorf("Healthz: got %q, want %q", got.Healthz, want) + } +} + +func TestProbe_rpcError(t *testing.T) { + // The real k8senvelopekmsv2 client returns status errors from gRPC + // calls; generic (non-status) errors are treated by grpc/status as code + // Unknown. See vendor/google.golang.org/grpc/status/status.go. + scenarios := []struct { + name string + fakeErr error + wantCode string + }{ + { + name: "generic non-gRPC error classifies as Unknown", + fakeErr: errors.New("connection refused"), + wantCode: "Unknown", + }, + { + name: "gRPC Unavailable propagates through classification", + fakeErr: status.Error(codes.Unavailable, "server gone"), + wantCode: "Unavailable", + }, + { + name: "gRPC DeadlineExceeded propagates through classification", + fakeErr: status.Error(codes.DeadlineExceeded, "too slow"), + wantCode: "DeadlineExceeded", + }, + } + + for _, tc := range scenarios { + t.Run(tc.name, func(t *testing.T) { + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + return nil, tc.fakeErr + }, + } + + got := NewProbe(fake, testTimeout).Probe(context.Background()) + + want := Healthz{Class: HealthClassRPCError, Detail: tc.wantCode} + if got.Healthz != want { + t.Errorf("Healthz: got %q, want %q", got.Healthz, want) + } + if got.KeyIDHash != "" { + t.Errorf("KeyIDHash: got %q, want empty (unreachable plugin)", got.KeyIDHash) + } + if got.Timestamp.IsZero() { + t.Error("Timestamp: expected non-zero even on RPC error") + } + }) + } +} diff --git a/pkg/operator/encryption/kms/health/types.go b/pkg/operator/encryption/kms/health/types.go new file mode 100644 index 0000000000..0825f6595a --- /dev/null +++ b/pkg/operator/encryption/kms/health/types.go @@ -0,0 +1,49 @@ +package health + +import "time" + +// HealthClass is the closed-set classifier for Healthz. +type HealthClass string + +const ( + HealthClassOK HealthClass = "ok" + HealthClassNotOK HealthClass = "not-ok" + HealthClassRPCError HealthClass = "rpc-error" +) + +type Healthz struct { + Class HealthClass + Detail string +} + +// IsOK is the canonical health predicate. Prefer this over comparing +// Class directly so consumers don't depend on the internal shape. +func (h Healthz) IsOK() bool { return h.Class == HealthClassOK } + +// String renders the wire format: "ok" or "unhealthy::". +func (h Healthz) String() string { + if h.Class == HealthClassOK { + return string(HealthClassOK) + } + return "unhealthy:" + string(h.Class) + ":" + h.Detail +} + +type HealthStatus struct { + Healthz Healthz + + // KeyIDHash is the sha256 hex of the KeyID the plugin returned, or + // empty if the probe could not reach Status. Hashing avoids leaking + // key material; consumers can still diff hashes across instances to + // detect rotation skew. + KeyIDHash string + + // Timestamp: RFC3339 on the wire. + Timestamp time.Time + + // ObserverPod is stable across deployment stages. ConfigMapWriter + // does not strictly need it (one CM per monitor, identity is in the + // CM name); the OpenShift CRD-condition writer does (one CR + // aggregates conditions from N pods, and identity must travel with + // each entry). + ObserverPod string +} diff --git a/pkg/operator/encryption/kms/health/writer.go b/pkg/operator/encryption/kms/health/writer.go new file mode 100644 index 0000000000..a14b45a5fe --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer.go @@ -0,0 +1,10 @@ +package health + +import "context" + +// StatusWriter ships each HealthStatus produced by the Monitor. Write errors +// should be logged and tolerated by the Monitor. A missed publish is strictly +// less bad than crashing the sidecar. +type StatusWriter interface { + Write(ctx context.Context, status HealthStatus) error +} diff --git a/pkg/operator/encryption/kms/health/writer_configmap.go b/pkg/operator/encryption/kms/health/writer_configmap.go new file mode 100644 index 0000000000..f831035d29 --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer_configmap.go @@ -0,0 +1,100 @@ +package health + +import ( + "context" + "encoding/json" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" +) + +// ConfigMapWriter uses merge-patch so a tick of class X updates data.X +// and leaves the other classes' keys alone. Consumers determine the +// "current" class by max data..timestamp; there is no separate +// pointer key. +// +// On the wire (data is map[string]string; values shown decoded): +// +// data.ok = {"timestamp":"...","observerPod":"...","keyIDHash":"..."} +// data.not-ok = {"timestamp":"...","observerPod":"...","detail":"...","keyIDHash":"..."} +// data.rpc-error = {"timestamp":"...","observerPod":"...","detail":"..."} +// +// Concurrency contract: one ConfigMap per monitor instance. Two +// monitors writing the same CM produce last-writer-wins on every key. +// Callers MUST encode instance identity in the CM name (the cmd-layer +// default is "kms-health-${POD_NAME}"). +type ConfigMapWriter struct { + client kubernetes.Interface + namespace string + name string +} + +func NewConfigMapWriter(client kubernetes.Interface, namespace, name string) *ConfigMapWriter { + return &ConfigMapWriter{ + client: client, + namespace: namespace, + name: name, + } +} + +type classEntry struct { + Timestamp string `json:"timestamp"` + ObserverPod string `json:"observerPod"` + Detail string `json:"detail,omitempty"` + KeyIDHash string `json:"keyIDHash,omitempty"` +} + +type configMapDataPatch struct { + Data map[string]string `json:"data"` +} + +func (w *ConfigMapWriter) Write(ctx context.Context, status HealthStatus) error { + entry := classEntry{ + Timestamp: status.Timestamp.UTC().Format(time.RFC3339), + ObserverPod: status.ObserverPod, + Detail: status.Healthz.Detail, + KeyIDHash: status.KeyIDHash, + } + entryBytes, err := json.Marshal(entry) + if err != nil { + return fmt.Errorf("marshal entry for ConfigMap %s/%s: %w", w.namespace, w.name, err) + } + + data := map[string]string{string(status.Healthz.Class): string(entryBytes)} + patchBytes, err := json.Marshal(configMapDataPatch{Data: data}) + if err != nil { + return fmt.Errorf("marshal patch for ConfigMap %s/%s: %w", w.namespace, w.name, err) + } + + cms := w.client.CoreV1().ConfigMaps(w.namespace) + _, err = cms.Patch(ctx, w.name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) + if err == nil { + return nil + } + if !apierrors.IsNotFound(err) { + return fmt.Errorf("patch ConfigMap %s/%s: %w", w.namespace, w.name, err) + } + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: w.name, Namespace: w.namespace}, + Data: data, + } + _, err = cms.Create(ctx, cm, metav1.CreateOptions{}) + if err == nil { + return nil + } + if !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("create ConfigMap %s/%s: %w", w.namespace, w.name, err) + } + + // Race: another writer created the CM between our Patch and Create. + if _, err := cms.Patch(ctx, w.name, types.MergePatchType, patchBytes, metav1.PatchOptions{}); err != nil { + return fmt.Errorf("patch ConfigMap %s/%s after create race: %w", w.namespace, w.name, err) + } + return nil +} diff --git a/pkg/operator/encryption/kms/health/writer_configmap_test.go b/pkg/operator/encryption/kms/health/writer_configmap_test.go new file mode 100644 index 0000000000..a572db5c68 --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer_configmap_test.go @@ -0,0 +1,244 @@ +package health + +import ( + "context" + "encoding/json" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +const ( + testNamespace = "kms-health-test" + testCMName = "kms-health-status" +) + +func newTestConfigMap() *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCMName, + Namespace: testNamespace, + }, + } +} + +// decodeEntry parses data. back into a classEntry. Comparing +// decoded structs avoids brittleness against JSON field ordering changes. +func decodeEntry(t *testing.T, raw string) classEntry { + t.Helper() + var e classEntry + if err := json.Unmarshal([]byte(raw), &e); err != nil { + t.Fatalf("decode classEntry %q: %v", raw, err) + } + return e +} + +func TestConfigMapWriter_writesHealthyEntryUnderOKKey(t *testing.T) { + client := fake.NewClientset(newTestConfigMap()) + w := NewConfigMapWriter(client, testNamespace, testCMName) + + status := HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + KeyIDHash: "abc123", + Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), + ObserverPod: "mock-kmsv2-provider-node", + } + if err := w.Write(context.Background(), status); err != nil { + t.Fatalf("Write: %v", err) + } + + got, err := client.CoreV1().ConfigMaps(testNamespace). + Get(context.Background(), testCMName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Get: %v", err) + } + + raw, ok := got.Data[string(HealthClassOK)] + if !ok { + t.Fatalf("data[%q] missing; data=%v", HealthClassOK, got.Data) + } + entry := decodeEntry(t, raw) + + want := classEntry{ + Timestamp: "2026-04-24T12:00:00Z", + ObserverPod: "mock-kmsv2-provider-node", + KeyIDHash: "abc123", + } + if entry != want { + t.Errorf("data[ok]: got %+v, want %+v", entry, want) + } + + // Other class keys must be absent: we never observed them. + for _, c := range []HealthClass{HealthClassNotOK, HealthClassRPCError} { + if _, present := got.Data[string(c)]; present { + t.Errorf("data[%q] present after only an OK probe; data=%v", c, got.Data) + } + } +} + +func TestConfigMapWriter_writesUnhealthyEntryUnderNotOKKey(t *testing.T) { + client := fake.NewClientset(newTestConfigMap()) + w := NewConfigMapWriter(client, testNamespace, testCMName) + + status := HealthStatus{ + Healthz: Healthz{Class: HealthClassNotOK, Detail: "boom"}, + Timestamp: time.Date(2026, 4, 24, 12, 1, 0, 0, time.UTC), + ObserverPod: "p", + } + if err := w.Write(context.Background(), status); err != nil { + t.Fatalf("Write: %v", err) + } + + got, _ := client.CoreV1().ConfigMaps(testNamespace). + Get(context.Background(), testCMName, metav1.GetOptions{}) + raw, ok := got.Data[string(HealthClassNotOK)] + if !ok { + t.Fatalf("data[%q] missing; data=%v", HealthClassNotOK, got.Data) + } + entry := decodeEntry(t, raw) + + want := classEntry{ + Timestamp: "2026-04-24T12:01:00Z", + ObserverPod: "p", + Detail: "boom", + } + if entry != want { + t.Errorf("data[not-ok]: got %+v, want %+v", entry, want) + } +} + +func TestConfigMapWriter_writesRPCErrorEntryUnderRPCErrorKey(t *testing.T) { + client := fake.NewClientset(newTestConfigMap()) + w := NewConfigMapWriter(client, testNamespace, testCMName) + + status := HealthStatus{ + Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}, + Timestamp: time.Date(2026, 4, 24, 12, 2, 0, 0, time.UTC), + ObserverPod: "p", + } + if err := w.Write(context.Background(), status); err != nil { + t.Fatalf("Write: %v", err) + } + + got, _ := client.CoreV1().ConfigMaps(testNamespace). + Get(context.Background(), testCMName, metav1.GetOptions{}) + raw, ok := got.Data[string(HealthClassRPCError)] + if !ok { + t.Fatalf("data[%q] missing; data=%v", HealthClassRPCError, got.Data) + } + entry := decodeEntry(t, raw) + + want := classEntry{ + Timestamp: "2026-04-24T12:02:00Z", + ObserverPod: "p", + Detail: "Unavailable", + } + if entry != want { + t.Errorf("data[rpc-error]: got %+v, want %+v", entry, want) + } +} + +// TestConfigMapWriter_tickPreservesOtherClasses is the load-bearing +// test for the schema's whole point: a probe of class X must not +// clobber data.Y / data.Z. If a future refactor swaps merge-patch for +// Update or for a strategic-merge-with-replace directive, this test +// goes red. +func TestConfigMapWriter_tickPreservesOtherClasses(t *testing.T) { + // Pre-populate data.ok with a stale healthy entry; tick rpc-error; + // assert data.ok survived byte-for-byte. + staleOK := `{"timestamp":"2026-04-24T11:00:00Z","observerPod":"p","keyIDHash":"old"}` + existing := newTestConfigMap() + existing.Data = map[string]string{ + string(HealthClassOK): staleOK, + } + client := fake.NewClientset(existing) + w := NewConfigMapWriter(client, testNamespace, testCMName) + + status := HealthStatus{ + Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}, + Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), + ObserverPod: "p", + } + if err := w.Write(context.Background(), status); err != nil { + t.Fatalf("Write: %v", err) + } + + got, _ := client.CoreV1().ConfigMaps(testNamespace). + Get(context.Background(), testCMName, metav1.GetOptions{}) + + if got.Data[string(HealthClassOK)] != staleOK { + t.Errorf("data[ok] mutated by rpc-error tick: got %q, want %q", + got.Data[string(HealthClassOK)], staleOK) + } + if _, ok := got.Data[string(HealthClassRPCError)]; !ok { + t.Errorf("data[rpc-error] not written; data=%v", got.Data) + } +} + +func TestConfigMapWriter_patchPreservesUnrelatedKeys(t *testing.T) { + // Same load-bearing property as tickPreservesOtherClasses, but for + // keys outside our schema entirely (e.g. an annotation-style key set + // by something else in the same CM). + existing := newTestConfigMap() + existing.Data = map[string]string{ + "unrelated-key": "set-by-something-else", + } + client := fake.NewClientset(existing) + w := NewConfigMapWriter(client, testNamespace, testCMName) + + if err := w.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), + ObserverPod: "p", + }); err != nil { + t.Fatalf("Write: %v", err) + } + + got, _ := client.CoreV1().ConfigMaps(testNamespace). + Get(context.Background(), testCMName, metav1.GetOptions{}) + if got.Data["unrelated-key"] != "set-by-something-else" { + t.Errorf("merge patch dropped unrelated key: got %q", got.Data["unrelated-key"]) + } + if _, ok := got.Data[string(HealthClassOK)]; !ok { + t.Errorf("data[ok] not written; data=%v", got.Data) + } +} + +func TestConfigMapWriter_createsConfigMapWhenMissing(t *testing.T) { + // Empty clientset, no ConfigMap to patch. Write self-heals by + // creating the CM seeded with this tick's class entry; subsequent + // ticks would extend it via merge-patch. + client := fake.NewClientset() + w := NewConfigMapWriter(client, testNamespace, testCMName) + + status := HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + KeyIDHash: "abc", + Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), + ObserverPod: "p", + } + if err := w.Write(context.Background(), status); err != nil { + t.Fatalf("Write on missing CM: %v", err) + } + + got, err := client.CoreV1().ConfigMaps(testNamespace). + Get(context.Background(), testCMName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Get: %v", err) + } + raw, ok := got.Data[string(HealthClassOK)] + if !ok { + t.Fatalf("data[%q] missing after create; data=%v", HealthClassOK, got.Data) + } + want := classEntry{ + Timestamp: "2026-04-24T12:00:00Z", + ObserverPod: "p", + KeyIDHash: "abc", + } + if entry := decodeEntry(t, raw); entry != want { + t.Errorf("data[ok]: got %+v, want %+v", entry, want) + } +} From 27214dde7926e5b18a8fe1de626fba0b8c5845f8 Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Thu, 30 Apr 2026 15:51:31 +0200 Subject: [PATCH 2/8] kms/health: add container template GenerateContainerTemplate renders a corev1.Container sidecar. KMSPluginName suffixes container name, volumeMount, kms-socket path, and configmap name so multi-plugin coexists in one pod. Drive-by: ConfigMapWriter godoc updated to match the actual patch-then-create flow. --- .../health/assets/kms-health-container.yaml | 27 +++ pkg/operator/encryption/kms/health/bindata.go | 16 ++ .../kms/health/container_template.go | 119 ++++++++++++ .../kms/health/container_template_test.go | 171 ++++++++++++++++++ .../encryption/kms/health/writer_configmap.go | 4 + 5 files changed, 337 insertions(+) create mode 100644 pkg/operator/encryption/kms/health/assets/kms-health-container.yaml create mode 100644 pkg/operator/encryption/kms/health/bindata.go create mode 100644 pkg/operator/encryption/kms/health/container_template.go create mode 100644 pkg/operator/encryption/kms/health/container_template_test.go diff --git a/pkg/operator/encryption/kms/health/assets/kms-health-container.yaml b/pkg/operator/encryption/kms/health/assets/kms-health-container.yaml new file mode 100644 index 0000000000..8bceca8b2d --- /dev/null +++ b/pkg/operator/encryption/kms/health/assets/kms-health-container.yaml @@ -0,0 +1,27 @@ +name: kms-health-monitor-{{.KMSPluginName}} +image: {{.OperatorImage}} +command: +{{- range .Command }} + - {{ . | toJson }} +{{- end }} +args: + - --kms-socket=/var/run/kmsplugin-{{.KMSPluginName}}/kms.sock + - --probe-interval={{.ProbeInterval}} + - --probe-interval-unhealthy={{.ProbeIntervalUnhealthy}} + - --probe-timeout={{.ProbeTimeout}} + - --write-timeout={{.WriteTimeout}} + - --output-mode=configmap + - --configmap-namespace={{.ConfigMapNamespace}} + - --configmap-name=kms-health-{{.KMSPluginName}} +env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name +volumeMounts: + - name: kms-plugin-socket-{{.KMSPluginName}} + mountPath: /var/run/kmsplugin-{{.KMSPluginName}} +resources: + requests: + memory: 50Mi + cpu: 5m diff --git a/pkg/operator/encryption/kms/health/bindata.go b/pkg/operator/encryption/kms/health/bindata.go new file mode 100644 index 0000000000..545d50cd77 --- /dev/null +++ b/pkg/operator/encryption/kms/health/bindata.go @@ -0,0 +1,16 @@ +package health + +import ( + "embed" +) + +//go:embed assets/* +var f embed.FS + +func mustAsset(name string) []byte { + data, err := f.ReadFile(name) + if err != nil { + panic(err) + } + return data +} diff --git a/pkg/operator/encryption/kms/health/container_template.go b/pkg/operator/encryption/kms/health/container_template.go new file mode 100644 index 0000000000..7a9256dbf7 --- /dev/null +++ b/pkg/operator/encryption/kms/health/container_template.go @@ -0,0 +1,119 @@ +package health + +import ( + "bytes" + "encoding/json" + "fmt" + "strings" + "text/template" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/validation" + "sigs.k8s.io/yaml" +) + +// ContainerOptions are the per-caller knobs for GenerateContainerTemplate. +// KMSPluginName is the cohering parameter; distinct names produce +// non-colliding sidecars so multiple plugins can be observed in one pod. +type ContainerOptions struct { + // Must be a DNS-1123 label; suffixed onto container/volume/CM names. + KMSPluginName string + + OperatorImage string + OperatorCommand []string + + ProbeInterval time.Duration + ProbeIntervalUnhealthy time.Duration + ProbeTimeout time.Duration + WriteTimeout time.Duration + + ConfigMapNamespace string +} + +// GenerateContainerTemplate renders the kms-health-monitor sidecar. +// Caller must add a matching Volume named kms-plugin-socket- +// to PodSpec.Volumes. The deprecated single-plugin +// AddKMSPluginVolumeAndMountToPodSpec helper is intentionally not used. +func GenerateContainerTemplate(opts ContainerOptions) (corev1.Container, error) { + if err := opts.validate(); err != nil { + return corev1.Container{}, err + } + + rawManifest := mustAsset("assets/kms-health-container.yaml") + + funcs := template.FuncMap{ + "toJson": func(v any) (string, error) { + b, err := json.Marshal(v) + if err != nil { + return "", err + } + return string(b), nil + }, + } + + tmpl, err := template.New("kms-health-container").Funcs(funcs).Parse(string(rawManifest)) + if err != nil { + return corev1.Container{}, fmt.Errorf("parse asset template: %w", err) + } + + values := struct { + KMSPluginName string + OperatorImage string + Command []string + ProbeInterval string + ProbeIntervalUnhealthy string + ProbeTimeout string + WriteTimeout string + ConfigMapNamespace string + }{ + KMSPluginName: opts.KMSPluginName, + OperatorImage: opts.OperatorImage, + Command: opts.OperatorCommand, + ProbeInterval: opts.ProbeInterval.String(), + ProbeIntervalUnhealthy: opts.ProbeIntervalUnhealthy.String(), + ProbeTimeout: opts.ProbeTimeout.String(), + WriteTimeout: opts.WriteTimeout.String(), + ConfigMapNamespace: opts.ConfigMapNamespace, + } + + var buf bytes.Buffer + if err := tmpl.Execute(&buf, values); err != nil { + return corev1.Container{}, fmt.Errorf("execute template: %w", err) + } + + var container corev1.Container + if err := yaml.Unmarshal(buf.Bytes(), &container); err != nil { + return corev1.Container{}, fmt.Errorf("parse rendered container: %w", err) + } + return container, nil +} + +func (o ContainerOptions) validate() error { + if errs := validation.IsDNS1123Label(o.KMSPluginName); len(errs) > 0 { + return fmt.Errorf("KMSPluginName %q is not a valid DNS-1123 label: %s", + o.KMSPluginName, strings.Join(errs, "; ")) + } + if o.OperatorImage == "" { + return fmt.Errorf("OperatorImage is required") + } + if len(o.OperatorCommand) == 0 { + return fmt.Errorf("OperatorCommand must have at least one element") + } + if o.ProbeInterval <= 0 { + return fmt.Errorf("ProbeInterval must be positive") + } + if o.ProbeIntervalUnhealthy <= 0 { + return fmt.Errorf("ProbeIntervalUnhealthy must be positive") + } + if o.ProbeTimeout <= 0 { + return fmt.Errorf("ProbeTimeout must be positive") + } + if o.WriteTimeout <= 0 { + return fmt.Errorf("WriteTimeout must be positive") + } + if o.ConfigMapNamespace == "" { + return fmt.Errorf("ConfigMapNamespace is required") + } + return nil +} diff --git a/pkg/operator/encryption/kms/health/container_template_test.go b/pkg/operator/encryption/kms/health/container_template_test.go new file mode 100644 index 0000000000..de68a11606 --- /dev/null +++ b/pkg/operator/encryption/kms/health/container_template_test.go @@ -0,0 +1,171 @@ +package health + +import ( + "strings" + "testing" + "time" + + "github.com/spf13/pflag" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + "sigs.k8s.io/yaml" +) + +const expectedContainerYAML = ` +name: kms-health-monitor-aws +image: quay.io/openshift/operator:latest +command: + - "operator" + - "kms-health-monitor" +args: + - --kms-socket=/var/run/kmsplugin-aws/kms.sock + - --probe-interval=1m0s + - --probe-interval-unhealthy=10s + - --probe-timeout=3s + - --write-timeout=5s + - --output-mode=configmap + - --configmap-namespace=openshift-kube-apiserver + - --configmap-name=kms-health-aws +env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name +volumeMounts: + - name: kms-plugin-socket-aws + mountPath: /var/run/kmsplugin-aws +resources: + requests: + memory: 50Mi + cpu: 5m +` + +func validContainerOptions() ContainerOptions { + return ContainerOptions{ + KMSPluginName: "aws", + OperatorImage: "quay.io/openshift/operator:latest", + OperatorCommand: []string{"operator", "kms-health-monitor"}, + ProbeInterval: 60 * time.Second, + ProbeIntervalUnhealthy: 10 * time.Second, + ProbeTimeout: 3 * time.Second, + WriteTimeout: 5 * time.Second, + ConfigMapNamespace: "openshift-kube-apiserver", + } +} + +func TestGenerateContainerTemplate(t *testing.T) { + got, err := GenerateContainerTemplate(validContainerOptions()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + var want corev1.Container + if err := yaml.Unmarshal([]byte(expectedContainerYAML), &want); err != nil { + t.Fatalf("parse expected: %v", err) + } + if !equality.Semantic.DeepEqual(got, want) { + t.Fatalf("rendered container does not match expected:\ngot: %+v\nwant: %+v", got, want) + } +} + +// Drift detector: rendered args must parse against cmd.go's flag set. +func TestRenderedArgsAreValidFlags(t *testing.T) { + c, err := GenerateContainerTemplate(validContainerOptions()) + if err != nil { + t.Fatalf("render: %v", err) + } + + fs := pflag.NewFlagSet("kms-health-monitor", pflag.ContinueOnError) + (&commandOptions{}).addFlags(fs) + if err := fs.Parse(c.Args); err != nil { + t.Fatalf("rendered args do not parse against cmd.go flag set: %v\nargs: %v", err, c.Args) + } +} + +// Distinct KMSPluginName must yield non-colliding container/volume/CM names. +func TestMultiplePluginsCoexist(t *testing.T) { + a, err := GenerateContainerTemplate(ContainerOptions{ + KMSPluginName: "aws", + OperatorImage: "img:1", + OperatorCommand: []string{"o"}, + ProbeInterval: 60 * time.Second, + ProbeIntervalUnhealthy: 10 * time.Second, + ProbeTimeout: 3 * time.Second, + WriteTimeout: 5 * time.Second, + ConfigMapNamespace: "ns", + }) + if err != nil { + t.Fatalf("render aws: %v", err) + } + b, err := GenerateContainerTemplate(ContainerOptions{ + KMSPluginName: "vault", + OperatorImage: "img:1", + OperatorCommand: []string{"o"}, + ProbeInterval: 60 * time.Second, + ProbeIntervalUnhealthy: 10 * time.Second, + ProbeTimeout: 3 * time.Second, + WriteTimeout: 5 * time.Second, + ConfigMapNamespace: "ns", + }) + if err != nil { + t.Fatalf("render vault: %v", err) + } + + if a.Name == b.Name { + t.Fatalf("container names collide: %q == %q", a.Name, b.Name) + } + if a.VolumeMounts[0].Name == b.VolumeMounts[0].Name { + t.Fatalf("volumeMount names collide: %q", a.VolumeMounts[0].Name) + } + if a.VolumeMounts[0].MountPath == b.VolumeMounts[0].MountPath { + t.Fatalf("mountPaths collide: %q", a.VolumeMounts[0].MountPath) + } + // Args must contain different --configmap-name and --kms-socket values. + if argsEqual(a.Args, b.Args) { + t.Fatalf("args identical for distinct plugin names") + } +} + +func argsEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func TestContainerOptionsValidate(t *testing.T) { + tests := []struct { + name string + mutate func(*ContainerOptions) + wantSub string + }{ + {"empty plugin name", func(o *ContainerOptions) { o.KMSPluginName = "" }, "KMSPluginName"}, + {"underscore in plugin name", func(o *ContainerOptions) { o.KMSPluginName = "aws_kms" }, "KMSPluginName"}, + {"uppercase in plugin name", func(o *ContainerOptions) { o.KMSPluginName = "AWS" }, "KMSPluginName"}, + {"empty image", func(o *ContainerOptions) { o.OperatorImage = "" }, "OperatorImage"}, + {"empty command", func(o *ContainerOptions) { o.OperatorCommand = nil }, "OperatorCommand"}, + {"zero probe-interval", func(o *ContainerOptions) { o.ProbeInterval = 0 }, "ProbeInterval"}, + {"zero probe-interval-unhealthy", func(o *ContainerOptions) { o.ProbeIntervalUnhealthy = 0 }, "ProbeIntervalUnhealthy"}, + {"zero probe-timeout", func(o *ContainerOptions) { o.ProbeTimeout = 0 }, "ProbeTimeout"}, + {"zero write-timeout", func(o *ContainerOptions) { o.WriteTimeout = 0 }, "WriteTimeout"}, + {"empty namespace", func(o *ContainerOptions) { o.ConfigMapNamespace = "" }, "ConfigMapNamespace"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + opts := validContainerOptions() + tt.mutate(&opts) + _, err := GenerateContainerTemplate(opts) + if err == nil { + t.Fatalf("expected error containing %q, got nil", tt.wantSub) + } + if !strings.Contains(err.Error(), tt.wantSub) { + t.Fatalf("error %q does not contain %q", err.Error(), tt.wantSub) + } + }) + } +} diff --git a/pkg/operator/encryption/kms/health/writer_configmap.go b/pkg/operator/encryption/kms/health/writer_configmap.go index f831035d29..7025192e07 100644 --- a/pkg/operator/encryption/kms/health/writer_configmap.go +++ b/pkg/operator/encryption/kms/health/writer_configmap.go @@ -24,6 +24,10 @@ import ( // data.not-ok = {"timestamp":"...","observerPod":"...","detail":"...","keyIDHash":"..."} // data.rpc-error = {"timestamp":"...","observerPod":"...","detail":"..."} // +// Self-heals on miss: if the CM is absent, Write creates it. Caller +// RBAC must therefore grant create in addition to get/update/patch on +// the named CM. +// // Concurrency contract: one ConfigMap per monitor instance. Two // monitors writing the same CM produce last-writer-wins on every key. // Callers MUST encode instance identity in the CM name (the cmd-layer From 3e971c52847ca7f19ece60535aa357884cba8d61 Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Mon, 11 May 2026 14:06:13 +0200 Subject: [PATCH 3/8] kms/health: add operator condition writer --- .../kms/health/writer_operator_condition.go | 168 ++++++++ .../health/writer_operator_condition_test.go | 399 ++++++++++++++++++ 2 files changed, 567 insertions(+) create mode 100644 pkg/operator/encryption/kms/health/writer_operator_condition.go create mode 100644 pkg/operator/encryption/kms/health/writer_operator_condition_test.go diff --git a/pkg/operator/encryption/kms/health/writer_operator_condition.go b/pkg/operator/encryption/kms/health/writer_operator_condition.go new file mode 100644 index 0000000000..ae8d269baf --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer_operator_condition.go @@ -0,0 +1,168 @@ +package health + +import ( + "context" + "fmt" + "time" + + operatorv1 "github.com/openshift/api/operator/v1" + applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" + operatorv1helpers "github.com/openshift/library-go/pkg/operator/v1helpers" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/clock" +) + +// Condition Type prefix and suffixes. Final form per writer: +// +// KMSv2HealthPlugin__Available +// KMSv2HealthPlugin__Degraded +// +// Pod names contain hyphens (kube-apiserver-master-0); they pass through +// verbatim because library-go's UnionCondition handles hyphens correctly. +const ( + conditionTypePrefix = "KMSv2HealthPlugin_" + conditionTypeAvailableSuffix = "_Available" + conditionTypeDegradedSuffix = "_Degraded" + + reasonAsExpected = "AsExpected" + reasonPluginUnhealthy = "PluginUnhealthy" + reasonProbeUnreachable = "ProbeUnreachable" + reasonHealthzNotOK = "HealthzNotOK" + reasonRPCError = "RPCError" + + // fieldManagerPrefix isolates each pod's two condition entries via SSA + // per-entry ownership (OperatorStatus.Conditions is listType=map keyed + // on Type). Two writers with different ObserverPods cannot collide. + fieldManagerPrefix = "kms-health-monitor-" +) + +// OperatorConditionWriter publishes KMSv2 plugin health to a +// *.operator.openshift.io/cluster CRD via the OperatorClient interface, so +// the same code works for KubeAPIServer, Authentication, OpenShiftAPIServer. +// +// Two condition entries are written per call: +// +// KMSv2HealthPlugin__Available // rolls up to ClusterOperator.Available +// KMSv2HealthPlugin__Degraded // rolls up to ClusterOperator.Degraded +// +// LastTransitionTime: read prior conditions and carry over the existing +// timestamp when Status is unchanged, otherwise stamp now. Mirrors +// v1helpers.SetOperatorCondition (helpers.go:58-100) for the SSA path so +// consumers don't see the field churn on every reconcile. +type OperatorConditionWriter struct { + client operatorv1helpers.OperatorClient + observerPod string + fieldManager string + clock clock.Clock +} + +func NewOperatorConditionWriter(client operatorv1helpers.OperatorClient, observerPod string) *OperatorConditionWriter { + return &OperatorConditionWriter{ + client: client, + observerPod: observerPod, + fieldManager: fieldManagerPrefix + observerPod, + clock: clock.RealClock{}, + } +} + +type conditionDecision struct { + status operatorv1.ConditionStatus + reason string +} + +// mapHealthClass implements the asymmetric mapping. Available expresses +// confirmed-good, so RPCError ("we don't know") is Unknown. Degraded +// expresses persistent-wrong, and RPCError IS wrong (we cannot reach the +// plugin), so Degraded is True. +func mapHealthClass(class HealthClass) (avail, degraded conditionDecision) { + switch class { + case HealthClassOK: + return conditionDecision{operatorv1.ConditionTrue, reasonAsExpected}, + conditionDecision{operatorv1.ConditionFalse, reasonAsExpected} + case HealthClassNotOK: + return conditionDecision{operatorv1.ConditionFalse, reasonPluginUnhealthy}, + conditionDecision{operatorv1.ConditionTrue, reasonHealthzNotOK} + case HealthClassRPCError: + return conditionDecision{operatorv1.ConditionUnknown, reasonProbeUnreachable}, + conditionDecision{operatorv1.ConditionTrue, reasonRPCError} + } + // Closed set: HealthClass values come from classifyHealthz / classifyRPCError. + // Treat any unknown class as RPCError-equivalent rather than panicking. + return conditionDecision{operatorv1.ConditionUnknown, reasonProbeUnreachable}, + conditionDecision{operatorv1.ConditionTrue, reasonRPCError} +} + +// buildMessage renders the wire format: +// +// healthy: "keyIDHash=" +// unhealthy with KeyIDHash: "keyIDHash= detail=" +// unhealthy without KeyIDHash: "detail=" +func buildMessage(status HealthStatus) string { + if status.Healthz.IsOK() { + return "keyIDHash=" + status.KeyIDHash + } + if status.KeyIDHash != "" { + return fmt.Sprintf("keyIDHash=%s detail=%s", status.KeyIDHash, status.Healthz.Detail) + } + return "detail=" + status.Healthz.Detail +} + +// resolveLastTransitionTime mirrors v1helpers.SetOperatorCondition LTT logic +// for the SSA path: brand-new condition or Status flip stamps now; +// Status-unchanged carries over the prior LastTransitionTime so consumers +// don't see churn on every reconcile. +func resolveLastTransitionTime(prior []operatorv1.OperatorCondition, conditionType string, newStatus operatorv1.ConditionStatus, now time.Time) metav1.Time { + for i := range prior { + if prior[i].Type != conditionType { + continue + } + if prior[i].Status == newStatus { + return prior[i].LastTransitionTime + } + break + } + return metav1.NewTime(now) +} + +func (w *OperatorConditionWriter) Write(ctx context.Context, status HealthStatus) error { + availType := conditionTypePrefix + w.observerPod + conditionTypeAvailableSuffix + degradedType := conditionTypePrefix + w.observerPod + conditionTypeDegradedSuffix + + avail, degraded := mapHealthClass(status.Healthz.Class) + msg := buildMessage(status) + + // Read prior state for LTT carry-over. A read failure is non-fatal: + // worst case we stamp LTT=now on a Status that didn't actually change, + // which self-corrects on the next successful tick. Skipping the publish + // would punish transient apiserver glitches harder than the contract + // asks for. + var priorConditions []operatorv1.OperatorCondition + if _, opStatus, _, err := w.client.GetOperatorState(); err == nil && opStatus != nil { + priorConditions = opStatus.Conditions + } + + now := w.clock.Now() + availLTT := resolveLastTransitionTime(priorConditions, availType, avail.status, now) + degradedLTT := resolveLastTransitionTime(priorConditions, degradedType, degraded.status, now) + + cfg := applyoperatorv1.OperatorStatus().WithConditions( + applyoperatorv1.OperatorCondition(). + WithType(availType). + WithStatus(avail.status). + WithReason(avail.reason). + WithMessage(msg). + WithLastTransitionTime(availLTT), + applyoperatorv1.OperatorCondition(). + WithType(degradedType). + WithStatus(degraded.status). + WithReason(degraded.reason). + WithMessage(msg). + WithLastTransitionTime(degradedLTT), + ) + + if err := w.client.ApplyOperatorStatus(ctx, w.fieldManager, cfg); err != nil { + return fmt.Errorf("apply operator-condition status (fieldManager=%s): %w", w.fieldManager, err) + } + return nil +} diff --git a/pkg/operator/encryption/kms/health/writer_operator_condition_test.go b/pkg/operator/encryption/kms/health/writer_operator_condition_test.go new file mode 100644 index 0000000000..19823675e6 --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer_operator_condition_test.go @@ -0,0 +1,399 @@ +package health + +import ( + "context" + "testing" + "time" + + operatorv1 "github.com/openshift/api/operator/v1" + applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" + "github.com/openshift/library-go/pkg/apiserver/jsonpatch" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + clocktesting "k8s.io/utils/clock/testing" +) + +const ( + testObserverPod = "kube-apiserver-master-0" + testFieldManager = "kms-health-monitor-kube-apiserver-master-0" + testTypeAvailable = "KMSv2HealthPlugin_kube-apiserver-master-0_Available" + testTypeDegraded = "KMSv2HealthPlugin_kube-apiserver-master-0_Degraded" +) + +// recordingOperatorClient is a focused test double for OperatorConditionWriter: +// it captures every ApplyOperatorStatus call AND merges the applied conditions +// (LastTransitionTime included) into an in-memory OperatorStatus that +// subsequent GetOperatorState reads return. v1helpers.fakeOperatorClient drops +// LastTransitionTime in its merge, which would defeat the LTT tests. +// +// Merge semantics mirror real SSA per-entry ownership: each apply upserts its +// own conditions by Type, leaving conditions written by other fieldManagers +// untouched. +type recordingOperatorClient struct { + status operatorv1.OperatorStatus + applies []recordedApply +} + +type recordedApply struct { + fieldManager string + cfg *applyoperatorv1.OperatorStatusApplyConfiguration +} + +func newRecordingOperatorClient() *recordingOperatorClient { + return &recordingOperatorClient{} +} + +func (c *recordingOperatorClient) GetOperatorState() (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { + return &operatorv1.OperatorSpec{}, c.status.DeepCopy(), "0", nil +} + +func (c *recordingOperatorClient) ApplyOperatorStatus(_ context.Context, fieldManager string, cfg *applyoperatorv1.OperatorStatusApplyConfiguration) error { + c.applies = append(c.applies, recordedApply{fieldManager: fieldManager, cfg: cfg}) + for _, ac := range cfg.Conditions { + merged := operatorv1.OperatorCondition{} + if ac.Type != nil { + merged.Type = *ac.Type + } + if ac.Status != nil { + merged.Status = *ac.Status + } + if ac.Reason != nil { + merged.Reason = *ac.Reason + } + if ac.Message != nil { + merged.Message = *ac.Message + } + if ac.LastTransitionTime != nil { + merged.LastTransitionTime = *ac.LastTransitionTime + } + c.upsertCondition(merged) + } + return nil +} + +func (c *recordingOperatorClient) upsertCondition(cond operatorv1.OperatorCondition) { + for i := range c.status.Conditions { + if c.status.Conditions[i].Type == cond.Type { + c.status.Conditions[i] = cond + return + } + } + c.status.Conditions = append(c.status.Conditions, cond) +} + +// Methods OperatorConditionWriter never calls. Panic-on-call signals if a +// future change accidentally introduces a dependency. +func (c *recordingOperatorClient) Informer() cache.SharedIndexInformer { return nil } +func (c *recordingOperatorClient) GetObjectMeta() (*metav1.ObjectMeta, error) { + return &metav1.ObjectMeta{}, nil +} +func (c *recordingOperatorClient) GetOperatorStateWithQuorum(ctx context.Context) (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { + return c.GetOperatorState() +} +func (c *recordingOperatorClient) UpdateOperatorSpec(context.Context, string, *operatorv1.OperatorSpec) (*operatorv1.OperatorSpec, string, error) { + panic("UpdateOperatorSpec not used by OperatorConditionWriter") +} +func (c *recordingOperatorClient) UpdateOperatorStatus(context.Context, string, *operatorv1.OperatorStatus) (*operatorv1.OperatorStatus, error) { + panic("UpdateOperatorStatus not used by OperatorConditionWriter") +} +func (c *recordingOperatorClient) ApplyOperatorSpec(context.Context, string, *applyoperatorv1.OperatorSpecApplyConfiguration) error { + panic("ApplyOperatorSpec not used by OperatorConditionWriter") +} +func (c *recordingOperatorClient) PatchOperatorStatus(context.Context, *jsonpatch.PatchSet) error { + panic("PatchOperatorStatus not used by OperatorConditionWriter") +} + +func findCondition(conds []operatorv1.OperatorCondition, t string) *operatorv1.OperatorCondition { + for i := range conds { + if conds[i].Type == t { + return &conds[i] + } + } + return nil +} + +// newTestWriter builds an OperatorConditionWriter with an injected fake clock +// so LTT-sensitive tests can advance time deterministically. +func newTestWriter(client *recordingOperatorClient, observerPod string, fc *clocktesting.FakeClock) *OperatorConditionWriter { + w := NewOperatorConditionWriter(client, observerPod) + w.clock = fc + return w +} + +func TestOperatorConditionWriter_healthyStatusWritesAvailableTrueAndDegradedFalse(t *testing.T) { + client := newRecordingOperatorClient() + fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + w := newTestWriter(client, testObserverPod, fc) + + err := w.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + KeyIDHash: "abc123", + ObserverPod: testObserverPod, + }) + if err != nil { + t.Fatalf("Write: %v", err) + } + + a := findCondition(client.status.Conditions, testTypeAvailable) + if a == nil { + t.Fatalf("Available condition missing; conditions=%v", client.status.Conditions) + } + if a.Status != operatorv1.ConditionTrue { + t.Errorf("Available.Status: got %q, want True", a.Status) + } + if a.Reason != reasonAsExpected { + t.Errorf("Available.Reason: got %q, want %q", a.Reason, reasonAsExpected) + } + if got, want := a.Message, "keyIDHash=abc123"; got != want { + t.Errorf("Available.Message: got %q, want %q", got, want) + } + + d := findCondition(client.status.Conditions, testTypeDegraded) + if d == nil { + t.Fatalf("Degraded condition missing; conditions=%v", client.status.Conditions) + } + if d.Status != operatorv1.ConditionFalse { + t.Errorf("Degraded.Status: got %q, want False", d.Status) + } + if d.Reason != reasonAsExpected { + t.Errorf("Degraded.Reason: got %q, want %q", d.Reason, reasonAsExpected) + } + if got, want := d.Message, "keyIDHash=abc123"; got != want { + t.Errorf("Degraded.Message: got %q, want %q", got, want) + } + + if got, want := len(client.applies), 1; got != want { + t.Fatalf("apply call count: got %d, want %d", got, want) + } + if got, want := client.applies[0].fieldManager, testFieldManager; got != want { + t.Errorf("fieldManager: got %q, want %q", got, want) + } +} + +func TestOperatorConditionWriter_notOKStatusWritesAvailableFalseAndDegradedTrue(t *testing.T) { + client := newRecordingOperatorClient() + fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + w := newTestWriter(client, testObserverPod, fc) + + err := w.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassNotOK, Detail: "kid lookup failed"}, + KeyIDHash: "deadbeef", + ObserverPod: testObserverPod, + }) + if err != nil { + t.Fatalf("Write: %v", err) + } + + a := findCondition(client.status.Conditions, testTypeAvailable) + if a == nil { + t.Fatalf("Available missing") + } + if a.Status != operatorv1.ConditionFalse { + t.Errorf("Available.Status: got %q, want False", a.Status) + } + if a.Reason != reasonPluginUnhealthy { + t.Errorf("Available.Reason: got %q, want %q", a.Reason, reasonPluginUnhealthy) + } + want := "keyIDHash=deadbeef detail=kid lookup failed" + if a.Message != want { + t.Errorf("Available.Message: got %q, want %q", a.Message, want) + } + + d := findCondition(client.status.Conditions, testTypeDegraded) + if d == nil { + t.Fatalf("Degraded missing") + } + if d.Status != operatorv1.ConditionTrue { + t.Errorf("Degraded.Status: got %q, want True", d.Status) + } + if d.Reason != reasonHealthzNotOK { + t.Errorf("Degraded.Reason: got %q, want %q", d.Reason, reasonHealthzNotOK) + } + if d.Message != want { + t.Errorf("Degraded.Message: got %q, want %q", d.Message, want) + } +} + +func TestOperatorConditionWriter_rpcErrorStatusWritesAvailableUnknownAndDegradedTrue(t *testing.T) { + client := newRecordingOperatorClient() + fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + w := newTestWriter(client, testObserverPod, fc) + + err := w.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}, + ObserverPod: testObserverPod, + }) + if err != nil { + t.Fatalf("Write: %v", err) + } + + a := findCondition(client.status.Conditions, testTypeAvailable) + if a == nil { + t.Fatalf("Available missing") + } + // Asymmetric: Available expresses confirmed-good. Lost contact is not + // "confirmed bad", so Unknown is the honest answer. + if a.Status != operatorv1.ConditionUnknown { + t.Errorf("Available.Status: got %q, want Unknown", a.Status) + } + if a.Reason != reasonProbeUnreachable { + t.Errorf("Available.Reason: got %q, want %q", a.Reason, reasonProbeUnreachable) + } + if got, want := a.Message, "detail=Unavailable"; got != want { + t.Errorf("Available.Message: got %q, want %q (no keyIDHash= prefix when KeyIDHash is empty)", got, want) + } + + d := findCondition(client.status.Conditions, testTypeDegraded) + if d == nil { + t.Fatalf("Degraded missing") + } + // Asymmetric: Degraded expresses persistent-wrong. Inability to reach + // the plugin IS wrong, so Degraded is True. + if d.Status != operatorv1.ConditionTrue { + t.Errorf("Degraded.Status: got %q, want True", d.Status) + } + if d.Reason != reasonRPCError { + t.Errorf("Degraded.Reason: got %q, want %q", d.Reason, reasonRPCError) + } + if got, want := d.Message, "detail=Unavailable"; got != want { + t.Errorf("Degraded.Message: got %q, want %q", got, want) + } +} + +// Repeated identical Write must not bump LastTransitionTime. If it did, +// every reconcile would churn the field and downstream consumers (rollout +// gates, alerts) would think transitions were happening when nothing has. +func TestOperatorConditionWriter_repeatedWriteSameStatusDoesNotBumpLastTransitionTime(t *testing.T) { + client := newRecordingOperatorClient() + t1 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) + fc := clocktesting.NewFakeClock(t1) + w := newTestWriter(client, testObserverPod, fc) + + healthy := HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + KeyIDHash: "abc123", + ObserverPod: testObserverPod, + } + if err := w.Write(context.Background(), healthy); err != nil { + t.Fatalf("first Write: %v", err) + } + + fc.Step(5 * time.Minute) + if err := w.Write(context.Background(), healthy); err != nil { + t.Fatalf("second Write: %v", err) + } + + for _, ct := range []string{testTypeAvailable, testTypeDegraded} { + c := findCondition(client.status.Conditions, ct) + if c == nil { + t.Fatalf("%s missing", ct) + } + if !c.LastTransitionTime.Time.Equal(t1) { + t.Errorf("%s.LastTransitionTime: got %v, want %v (Status unchanged, must carry over)", ct, c.LastTransitionTime.Time, t1) + } + } +} + +// Status flip must bump LastTransitionTime to the current time. +func TestOperatorConditionWriter_statusFlipBumpsLastTransitionTime(t *testing.T) { + client := newRecordingOperatorClient() + t1 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) + fc := clocktesting.NewFakeClock(t1) + w := newTestWriter(client, testObserverPod, fc) + + if err := w.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + KeyIDHash: "abc", + ObserverPod: testObserverPod, + }); err != nil { + t.Fatalf("first Write: %v", err) + } + + t2 := t1.Add(5 * time.Minute) + fc.SetTime(t2) + if err := w.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassNotOK, Detail: "boom"}, + KeyIDHash: "abc", + ObserverPod: testObserverPod, + }); err != nil { + t.Fatalf("second Write: %v", err) + } + + for _, ct := range []string{testTypeAvailable, testTypeDegraded} { + c := findCondition(client.status.Conditions, ct) + if c == nil { + t.Fatalf("%s missing", ct) + } + if !c.LastTransitionTime.Time.Equal(t2) { + t.Errorf("%s.LastTransitionTime: got %v, want %v (advanced on Status flip)", ct, c.LastTransitionTime.Time, t2) + } + } +} + +// Two writers with different ObserverPods write to the same CR without +// clobbering each other. This is the load-bearing property: per-pod +// fieldManager + per-entry SSA ownership keeps both pods' conditions +// coexisting in .status.conditions[]. +func TestOperatorConditionWriter_twoWritersDifferentObserverPodsDoNotCollide(t *testing.T) { + client := newRecordingOperatorClient() + fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + + const ( + podA = "kube-apiserver-master-0" + podB = "kube-apiserver-master-1" + ) + wa := newTestWriter(client, podA, fc) + wb := newTestWriter(client, podB, fc) + + if err := wa.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassOK}, + KeyIDHash: "a", + ObserverPod: podA, + }); err != nil { + t.Fatalf("wa.Write: %v", err) + } + if err := wb.Write(context.Background(), HealthStatus{ + Healthz: Healthz{Class: HealthClassNotOK, Detail: "bad"}, + KeyIDHash: "b", + ObserverPod: podB, + }); err != nil { + t.Fatalf("wb.Write: %v", err) + } + + wantTypes := []string{ + "KMSv2HealthPlugin_kube-apiserver-master-0_Available", + "KMSv2HealthPlugin_kube-apiserver-master-0_Degraded", + "KMSv2HealthPlugin_kube-apiserver-master-1_Available", + "KMSv2HealthPlugin_kube-apiserver-master-1_Degraded", + } + for _, ct := range wantTypes { + if findCondition(client.status.Conditions, ct) == nil { + t.Errorf("condition %q missing; conditions=%v", ct, client.status.Conditions) + } + } + if got, want := len(client.status.Conditions), 4; got != want { + t.Errorf("conditions count: got %d, want %d (no overwrites)", got, want) + } + + // Spot-check that A's OK status survived B's apply. + availA := findCondition(client.status.Conditions, "KMSv2HealthPlugin_kube-apiserver-master-0_Available") + if availA == nil || availA.Status != operatorv1.ConditionTrue { + t.Errorf("podA Available: got %v, want Status=True (writer B clobbered it)", availA) + } + availB := findCondition(client.status.Conditions, "KMSv2HealthPlugin_kube-apiserver-master-1_Available") + if availB == nil || availB.Status != operatorv1.ConditionFalse { + t.Errorf("podB Available: got %v, want Status=False (NotOK)", availB) + } + + // Per-pod fieldManager: each writer used its own. + if got, want := len(client.applies), 2; got != want { + t.Fatalf("apply count: got %d, want %d", got, want) + } + if got, want := client.applies[0].fieldManager, "kms-health-monitor-"+podA; got != want { + t.Errorf("apply[0].fieldManager: got %q, want %q", got, want) + } + if got, want := client.applies[1].fieldManager, "kms-health-monitor-"+podB; got != want { + t.Errorf("apply[1].fieldManager: got %q, want %q", got, want) + } +} From dcf239c06165925ed10d6f8b2546adc9654f1970 Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Mon, 11 May 2026 14:32:30 +0200 Subject: [PATCH 4/8] kms/health: simplify to OperatorCondition-only sidecar Reviewer feedback: too large compared to the preflight package. Cuts the package roughly in half by dropping optionality we baked in but had no immediate consumer for: - Drop ConfigMapWriter; OperatorCondition is the surviving sink. - Drop StatusWriter and Prober interfaces (single impl each). - Drop dual cadence; use wait.UntilWithContext for the loop, gaining panic recovery for free. - Drop GenerateContainerTemplate and its asset; consumers compose the sidecar however they like. - cmd.go now wires an OperatorClient against one of three target CRs (kubeapiserver, authentication, openshiftapiserver) via a flag. - Tests trimmed to happy + load-bearing edge paths (UTF-8 truncation, per-probe timeout, LTT carry-over, per-pod fieldManager isolation). Snapshot of the prior shape preserved at tag snapshot/CNTRLPLANE-3234-pre-simplify for future follow-ups. --- .../health/assets/kms-health-container.yaml | 27 -- pkg/operator/encryption/kms/health/bindata.go | 16 - pkg/operator/encryption/kms/health/cmd.go | 234 ++++------ .../encryption/kms/health/cmd_test.go | 124 +----- ...ter_operator_condition.go => condition.go} | 58 +-- .../encryption/kms/health/condition_test.go | 243 +++++++++++ .../kms/health/container_template.go | 119 ------ .../kms/health/container_template_test.go | 171 -------- pkg/operator/encryption/kms/health/doc.go | 5 - pkg/operator/encryption/kms/health/monitor.go | 78 ++-- .../encryption/kms/health/monitor_test.go | 253 ++--------- pkg/operator/encryption/kms/health/probe.go | 28 +- .../encryption/kms/health/probe_test.go | 145 ++----- pkg/operator/encryption/kms/health/types.go | 29 +- pkg/operator/encryption/kms/health/writer.go | 10 - .../encryption/kms/health/writer_configmap.go | 104 ----- .../kms/health/writer_configmap_test.go | 244 ----------- .../health/writer_operator_condition_test.go | 399 ------------------ 18 files changed, 470 insertions(+), 1817 deletions(-) delete mode 100644 pkg/operator/encryption/kms/health/assets/kms-health-container.yaml delete mode 100644 pkg/operator/encryption/kms/health/bindata.go rename pkg/operator/encryption/kms/health/{writer_operator_condition.go => condition.go} (63%) create mode 100644 pkg/operator/encryption/kms/health/condition_test.go delete mode 100644 pkg/operator/encryption/kms/health/container_template.go delete mode 100644 pkg/operator/encryption/kms/health/container_template_test.go delete mode 100644 pkg/operator/encryption/kms/health/doc.go delete mode 100644 pkg/operator/encryption/kms/health/writer.go delete mode 100644 pkg/operator/encryption/kms/health/writer_configmap.go delete mode 100644 pkg/operator/encryption/kms/health/writer_configmap_test.go delete mode 100644 pkg/operator/encryption/kms/health/writer_operator_condition_test.go diff --git a/pkg/operator/encryption/kms/health/assets/kms-health-container.yaml b/pkg/operator/encryption/kms/health/assets/kms-health-container.yaml deleted file mode 100644 index 8bceca8b2d..0000000000 --- a/pkg/operator/encryption/kms/health/assets/kms-health-container.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: kms-health-monitor-{{.KMSPluginName}} -image: {{.OperatorImage}} -command: -{{- range .Command }} - - {{ . | toJson }} -{{- end }} -args: - - --kms-socket=/var/run/kmsplugin-{{.KMSPluginName}}/kms.sock - - --probe-interval={{.ProbeInterval}} - - --probe-interval-unhealthy={{.ProbeIntervalUnhealthy}} - - --probe-timeout={{.ProbeTimeout}} - - --write-timeout={{.WriteTimeout}} - - --output-mode=configmap - - --configmap-namespace={{.ConfigMapNamespace}} - - --configmap-name=kms-health-{{.KMSPluginName}} -env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name -volumeMounts: - - name: kms-plugin-socket-{{.KMSPluginName}} - mountPath: /var/run/kmsplugin-{{.KMSPluginName}} -resources: - requests: - memory: 50Mi - cpu: 5m diff --git a/pkg/operator/encryption/kms/health/bindata.go b/pkg/operator/encryption/kms/health/bindata.go deleted file mode 100644 index 545d50cd77..0000000000 --- a/pkg/operator/encryption/kms/health/bindata.go +++ /dev/null @@ -1,16 +0,0 @@ -package health - -import ( - "embed" -) - -//go:embed assets/* -var f embed.FS - -func mustAsset(name string) []byte { - data, err := f.ReadFile(name) - if err != nil { - panic(err) - } - return data -} diff --git a/pkg/operator/encryption/kms/health/cmd.go b/pkg/operator/encryption/kms/health/cmd.go index afbb7bec51..a0ef8b2454 100644 --- a/pkg/operator/encryption/kms/health/cmd.go +++ b/pkg/operator/encryption/kms/health/cmd.go @@ -4,52 +4,67 @@ import ( "context" "fmt" "os" + "sort" + "strings" "time" "github.com/spf13/cobra" "github.com/spf13/pflag" + applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" + "github.com/openshift/library-go/pkg/operator/genericoperatorclient" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" k8senvelopekmsv2 "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2" - "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" + "k8s.io/utils/clock" ) -const ( - outputModeConfigMap = "configmap" - outputModeCondition = "condition" - - providerName = "kms-health-monitor" -) +const providerName = "kms-health-monitor" + +var supportedOperators = map[string]struct { + GVR schema.GroupVersionResource + GVK schema.GroupVersionKind +}{ + "kubeapiserver": { + GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "kubeapiservers"}, + GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "KubeAPIServer"}, + }, + "authentication": { + GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "authentications"}, + GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "Authentication"}, + }, + "openshiftapiserver": { + GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "openshiftapiservers"}, + GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "OpenShiftAPIServer"}, + }, +} type commandOptions struct { - kmsSocket string - probeInterval time.Duration - probeIntervalUnhealthy time.Duration - probeTimeout time.Duration - writeTimeout time.Duration - outputMode string - configmapNamespace string - configmapName string - observerPodName string - kubeconfig string + kmsSocket string + probeInterval time.Duration + probeTimeout time.Duration + writeTimeout time.Duration + operatorResource string + observerPodName string + kubeconfig string } -// NewCommand wires the cobra command. ctx is owned by main() so signal -// handling lives there. +// NewCommand wires the cobra command. ctx is owned by the caller so +// signal handling lives there, not here. func NewCommand(ctx context.Context) *cobra.Command { o := &commandOptions{ - kmsSocket: "/var/run/kmsplugin/kms.sock", - probeInterval: 60 * time.Second, - probeIntervalUnhealthy: 10 * time.Second, - probeTimeout: 3 * time.Second, - writeTimeout: 5 * time.Second, - outputMode: outputModeConfigMap, + kmsSocket: "/var/run/kmsplugin/kms.sock", + probeInterval: 60 * time.Second, + probeTimeout: 3 * time.Second, + writeTimeout: 5 * time.Second, } cmd := &cobra.Command{ Use: "kms-health-monitor", - Short: "Observes a co-located KMSv2 plugin and publishes a health status.", + Short: "Observes a co-located KMSv2 plugin and publishes status as an OperatorCondition.", RunE: func(cmd *cobra.Command, args []string) error { if err := o.validate(); err != nil { return err @@ -62,69 +77,15 @@ func NewCommand(ctx context.Context) *cobra.Command { } func (o *commandOptions) addFlags(fs *pflag.FlagSet) { - fs.StringVar( - &o.kmsSocket, - "kms-socket", - o.kmsSocket, - "filesystem path to the KMSv2 plugin UDS", - ) - fs.DurationVar( - &o.probeInterval, - "probe-interval", - o.probeInterval, - "probe cadence while the plugin is healthy", - ) - fs.DurationVar( - &o.probeIntervalUnhealthy, - "probe-interval-unhealthy", - o.probeIntervalUnhealthy, - "probe cadence after an unhealthy result until recovery", - ) - fs.DurationVar( - &o.probeTimeout, - "probe-timeout", - o.probeTimeout, - "deadline for each Status RPC", - ) - fs.DurationVar( - &o.writeTimeout, - "write-timeout", - o.writeTimeout, - "deadline for each status write (e.g. ConfigMap Update); should fit inside --probe-interval-unhealthy to preserve cadence under apiserver slowness", - ) - fs.StringVar( - &o.outputMode, - "output-mode", - o.outputMode, - "status sink: configmap (condition is reserved for the OpenShift track)", - ) - fs.StringVar( - &o.configmapNamespace, - "configmap-namespace", - "", - "namespace of the status ConfigMap (required when output-mode=configmap; "+ - "caller must have RBAC to patch ConfigMaps in this namespace)", - ) - fs.StringVar( - &o.configmapName, - "configmap-name", - "", - "name of the status ConfigMap; defaults to \"kms-health-${POD_NAME}\". "+ - "MUST be unique per monitor instance: concurrent writers on one CM "+ - "produce last-writer-wins flap on every key", - ) - fs.StringVar( - &o.observerPodName, - "observer-pod-name", - os.Getenv("POD_NAME"), - "pod name recorded in the status; defaults to $POD_NAME", - ) - fs.StringVar( - &o.kubeconfig, - "kubeconfig", - "", - "path to a kubeconfig; empty uses in-cluster config", - ) + fs.StringVar(&o.kmsSocket, "kms-socket", o.kmsSocket, "filesystem path to the KMSv2 plugin UDS") + fs.DurationVar(&o.probeInterval, "probe-interval", o.probeInterval, "cadence between probes") + fs.DurationVar(&o.probeTimeout, "probe-timeout", o.probeTimeout, "deadline for each Status RPC") + fs.DurationVar(&o.writeTimeout, "write-timeout", o.writeTimeout, "deadline for each condition update; should fit inside --probe-interval") + fs.StringVar(&o.operatorResource, "operator-resource", o.operatorResource, + "target operator CRD: "+strings.Join(supportedOperatorKeys(), ", ")) + fs.StringVar(&o.observerPodName, "observer-pod-name", os.Getenv("POD_NAME"), + "pod name recorded in the condition; defaults to $POD_NAME") + fs.StringVar(&o.kubeconfig, "kubeconfig", "", "path to a kubeconfig; empty uses in-cluster config") } func (o *commandOptions) validate() error { @@ -134,48 +95,18 @@ func (o *commandOptions) validate() error { if o.probeInterval <= 0 { return fmt.Errorf("--probe-interval must be positive") } - if o.probeIntervalUnhealthy <= 0 { - return fmt.Errorf("--probe-interval-unhealthy must be positive") - } if o.probeTimeout <= 0 { return fmt.Errorf("--probe-timeout must be positive") } if o.writeTimeout <= 0 { return fmt.Errorf("--write-timeout must be positive") } - // $POD_NAME defaulting happens at flag-registration time in addFlags; - // by here observerPodName is the flag, env, or genuinely empty. if o.observerPodName == "" { - return fmt.Errorf( - "--observer-pod-name is required (or set $POD_NAME)", - ) + return fmt.Errorf("--observer-pod-name is required (or set $POD_NAME)") } - switch o.outputMode { - case outputModeConfigMap: - if o.configmapNamespace == "" { - return fmt.Errorf( - "--configmap-namespace is required when --output-mode=%s", - outputModeConfigMap, - ) - } - if o.configmapName == "" { - // Wrap pod identity rather than using it bare. Advertises - // ownership and avoids colliding with a same-namespaced CM - // that some other component happens to name after the pod. - o.configmapName = "kms-health-" + o.observerPodName - } - case outputModeCondition: - return fmt.Errorf( - "--output-mode=%s is reserved for the OpenShift track and not implemented", - outputModeCondition, - ) - default: - return fmt.Errorf( - "--output-mode must be %q or %q (got %q)", - outputModeConfigMap, - outputModeCondition, - o.outputMode, - ) + if _, ok := supportedOperators[o.operatorResource]; !ok { + return fmt.Errorf("--operator-resource must be one of %s (got %q)", + strings.Join(supportedOperatorKeys(), ", "), o.operatorResource) } return nil } @@ -185,42 +116,33 @@ func (o *commandOptions) run(ctx context.Context) error { if err != nil { return fmt.Errorf("build rest config: %w", err) } - client, err := kubernetes.NewForConfig(cfg) + + target := supportedOperators[o.operatorResource] + operatorClient, _, err := genericoperatorclient.NewClusterScopedOperatorClient( + clock.RealClock{}, cfg, target.GVR, target.GVK, + emptyOperatorSpec, emptyOperatorStatus, + ) if err != nil { - return fmt.Errorf("build kubernetes client: %w", err) + return fmt.Errorf("build operator client for %s: %w", o.operatorResource, err) } + writer := NewOperatorConditionWriter(operatorClient, o.observerPodName) - writer := NewConfigMapWriter(client, o.configmapNamespace, o.configmapName) - - // WaitForReady(true) is set inside the kmsv2 client, so Dial returns - // immediately even if the plugin isn't listening yet. + // kmsv2.NewGRPCService sets WaitForReady(true), so dial returns + // immediately even if the plugin socket isn't listening yet. endpoint := "unix://" + o.kmsSocket - service, err := k8senvelopekmsv2.NewGRPCService( - ctx, - endpoint, - providerName, - o.probeTimeout, - ) + service, err := k8senvelopekmsv2.NewGRPCService(ctx, endpoint, providerName, o.probeTimeout) if err != nil { return fmt.Errorf("dial KMS plugin at %q: %w", endpoint, err) } probe := NewProbe(service, 0) - monitor := NewMonitor( - probe, - writer, - o.observerPodName, - o.probeInterval, - o.probeIntervalUnhealthy, - o.writeTimeout, - ) + monitor := NewMonitor(probe, writer, o.observerPodName, o.probeInterval, o.writeTimeout) klog.InfoS("kms-health-monitor starting", "socket", o.kmsSocket, - "configmap", o.configmapNamespace+"/"+o.configmapName, + "operatorResource", o.operatorResource, "observerPod", o.observerPodName, - "healthyInterval", o.probeInterval, - "unhealthyInterval", o.probeIntervalUnhealthy, + "interval", o.probeInterval, "probeTimeout", o.probeTimeout, "writeTimeout", o.writeTimeout, ) @@ -236,3 +158,23 @@ func buildRESTConfig(kubeconfig string) (*rest.Config, error) { } return rest.InClusterConfig() } + +func supportedOperatorKeys() []string { + keys := make([]string, 0, len(supportedOperators)) + for k := range supportedOperators { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +// Empty extractors are sufficient: OperatorConditionWriter reads prior +// conditions via GetOperatorState (which does not use these) and writes +// via SSA with its own fieldManager. +func emptyOperatorSpec(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorSpecApplyConfiguration, error) { + return applyoperatorv1.OperatorSpec(), nil +} + +func emptyOperatorStatus(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorStatusApplyConfiguration, error) { + return applyoperatorv1.OperatorStatus(), nil +} diff --git a/pkg/operator/encryption/kms/health/cmd_test.go b/pkg/operator/encryption/kms/health/cmd_test.go index 712a4bdd0c..080a8ef481 100644 --- a/pkg/operator/encryption/kms/health/cmd_test.go +++ b/pkg/operator/encryption/kms/health/cmd_test.go @@ -8,19 +8,14 @@ import ( "github.com/spf13/pflag" ) -// validOpts is the minimal fully-valid commandOptions. Every test case -// below mutates a copy of this to isolate one validation rule at a time. func validOpts() commandOptions { return commandOptions{ - kmsSocket: "/var/run/kms.sock", - probeInterval: 60 * time.Second, - probeIntervalUnhealthy: 10 * time.Second, - probeTimeout: 3 * time.Second, - writeTimeout: 5 * time.Second, - outputMode: outputModeConfigMap, - configmapNamespace: "kms-health-test", - configmapName: "kms-health-master-0", - observerPodName: "master-0", + kmsSocket: "/var/run/kms.sock", + probeInterval: 60 * time.Second, + probeTimeout: 3 * time.Second, + writeTimeout: 5 * time.Second, + operatorResource: "kubeapiserver", + observerPodName: "master-0", } } @@ -35,14 +30,13 @@ func TestValidate_rejectsMissingOrZeroFields(t *testing.T) { cases := []struct { name string mut func(*commandOptions) - wants string // substring expected in the error + wants string }{ {"empty kmsSocket", func(o *commandOptions) { o.kmsSocket = "" }, "--kms-socket"}, {"zero probeInterval", func(o *commandOptions) { o.probeInterval = 0 }, "--probe-interval"}, - {"zero probeIntervalUnhealthy", func(o *commandOptions) { o.probeIntervalUnhealthy = 0 }, "--probe-interval-unhealthy"}, {"zero probeTimeout", func(o *commandOptions) { o.probeTimeout = 0 }, "--probe-timeout"}, {"zero writeTimeout", func(o *commandOptions) { o.writeTimeout = 0 }, "--write-timeout"}, - {"empty configmapNamespace", func(o *commandOptions) { o.configmapNamespace = "" }, "--configmap-namespace"}, + {"empty observerPodName", func(o *commandOptions) { o.observerPodName = "" }, "--observer-pod-name"}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { @@ -59,34 +53,34 @@ func TestValidate_rejectsMissingOrZeroFields(t *testing.T) { } } -func TestValidate_outputModeEnum(t *testing.T) { +func TestValidate_operatorResourceEnum(t *testing.T) { cases := []struct { - name string - mode string - wantErr bool - wantSubstr string + name string + resource string + wantErr bool }{ - {"configmap accepted", outputModeConfigMap, false, ""}, - {"condition rejected as reserved", outputModeCondition, true, "reserved for the OpenShift track"}, - {"unknown rejected with both options listed", "bogus", true, `must be "configmap" or "condition"`}, - {"empty rejected with both options listed", "", true, `must be "configmap" or "condition"`}, + {"kubeapiserver accepted", "kubeapiserver", false}, + {"authentication accepted", "authentication", false}, + {"openshiftapiserver accepted", "openshiftapiserver", false}, + {"unknown rejected", "bogus", true}, + {"empty rejected", "", true}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { o := validOpts() - o.outputMode = tc.mode + o.operatorResource = tc.resource err := o.validate() if tc.wantErr { if err == nil { - t.Fatalf("validate: want error containing %q, got nil", tc.wantSubstr) + t.Fatalf("validate(%q): want error, got nil", tc.resource) } - if !strings.Contains(err.Error(), tc.wantSubstr) { - t.Errorf("validate: got %q, want substring %q", err.Error(), tc.wantSubstr) + if !strings.Contains(err.Error(), "--operator-resource") { + t.Errorf("validate(%q): got %q, want substring --operator-resource", tc.resource, err.Error()) } return } if err != nil { - t.Errorf("validate: got %v, want nil", err) + t.Errorf("validate(%q): got %v, want nil", tc.resource, err) } }) } @@ -104,77 +98,3 @@ func TestAddFlags_observerPodNameDefaultsFromPodNameEnv(t *testing.T) { t.Errorf("observerPodName: got %q, want %q (env-derived flag default)", o.observerPodName, "from-env-0") } } - -func TestValidate_observerPodNameRequiredWhenEmpty(t *testing.T) { - o := validOpts() - o.observerPodName = "" - - err := o.validate() - if err == nil { - t.Fatal("validate: want error, got nil") - } - if !strings.Contains(err.Error(), "--observer-pod-name") { - t.Errorf("validate: got %q, want substring --observer-pod-name", err.Error()) - } -} - -func TestValidate_configmapNameDefaultsFromObserverPodName(t *testing.T) { - o := validOpts() - o.configmapName = "" // force the wrapped default - - if err := o.validate(); err != nil { - t.Fatalf("validate: %v", err) - } - want := "kms-health-master-0" - if o.configmapName != want { - t.Errorf("configmapName: got %q, want %q (wrapped default from observerPodName)", o.configmapName, want) - } -} - -func TestAddFlagsAndValidate_configmapNameDefaultChainsThroughEnv(t *testing.T) { - // Verifies the full chain (env → observerPodName → configmapName) - // across the two layers it now spans: addFlags reads $POD_NAME at - // flag-registration time, validate derives configmapName from the - // resolved observerPodName. - t.Setenv("POD_NAME", "kube-apiserver-3") - // Field defaults that addFlags wires to non-empty values - // (kmsSocket, probeInterval, etc.) come from this struct literal — - // addFlags references them as the flag default. Fields that addFlags - // defaults to "" (configmapNamespace, configmapName, observerPodName) - // have to be supplied via flag args, since addFlags ignores the - // struct value for those. - o := commandOptions{ - kmsSocket: "/var/run/kms.sock", - probeInterval: 60 * time.Second, - probeIntervalUnhealthy: 10 * time.Second, - probeTimeout: 3 * time.Second, - writeTimeout: 5 * time.Second, - outputMode: outputModeConfigMap, - } - fs := pflag.NewFlagSet("test", pflag.ContinueOnError) - o.addFlags(fs) - if err := fs.Parse([]string{"--configmap-namespace=kms-health-test"}); err != nil { - t.Fatalf("parse: %v", err) - } - if err := o.validate(); err != nil { - t.Fatalf("validate: %v", err) - } - if o.observerPodName != "kube-apiserver-3" { - t.Errorf("observerPodName: got %q, want kube-apiserver-3", o.observerPodName) - } - if o.configmapName != "kms-health-kube-apiserver-3" { - t.Errorf("configmapName: got %q, want kms-health-kube-apiserver-3", o.configmapName) - } -} - -func TestValidate_explicitConfigmapNameNotOverridden(t *testing.T) { - o := validOpts() - o.configmapName = "explicit-cm" - - if err := o.validate(); err != nil { - t.Fatalf("validate: %v", err) - } - if o.configmapName != "explicit-cm" { - t.Errorf("configmapName: got %q, want explicit-cm (default must not override caller-set value)", o.configmapName) - } -} diff --git a/pkg/operator/encryption/kms/health/writer_operator_condition.go b/pkg/operator/encryption/kms/health/condition.go similarity index 63% rename from pkg/operator/encryption/kms/health/writer_operator_condition.go rename to pkg/operator/encryption/kms/health/condition.go index ae8d269baf..746f636cde 100644 --- a/pkg/operator/encryption/kms/health/writer_operator_condition.go +++ b/pkg/operator/encryption/kms/health/condition.go @@ -13,13 +13,9 @@ import ( "k8s.io/utils/clock" ) -// Condition Type prefix and suffixes. Final form per writer: -// -// KMSv2HealthPlugin__Available -// KMSv2HealthPlugin__Degraded -// -// Pod names contain hyphens (kube-apiserver-master-0); they pass through -// verbatim because library-go's UnionCondition handles hyphens correctly. +// Per-pod condition Types: KMSv2HealthPlugin__Available and +// KMSv2HealthPlugin__Degraded. Pod-name hyphens pass through +// verbatim; library-go UnionCondition handles them. const ( conditionTypePrefix = "KMSv2HealthPlugin_" conditionTypeAvailableSuffix = "_Available" @@ -32,24 +28,10 @@ const ( reasonRPCError = "RPCError" // fieldManagerPrefix isolates each pod's two condition entries via SSA - // per-entry ownership (OperatorStatus.Conditions is listType=map keyed - // on Type). Two writers with different ObserverPods cannot collide. + // per-entry ownership (Conditions is listType=map keyed on Type). fieldManagerPrefix = "kms-health-monitor-" ) -// OperatorConditionWriter publishes KMSv2 plugin health to a -// *.operator.openshift.io/cluster CRD via the OperatorClient interface, so -// the same code works for KubeAPIServer, Authentication, OpenShiftAPIServer. -// -// Two condition entries are written per call: -// -// KMSv2HealthPlugin__Available // rolls up to ClusterOperator.Available -// KMSv2HealthPlugin__Degraded // rolls up to ClusterOperator.Degraded -// -// LastTransitionTime: read prior conditions and carry over the existing -// timestamp when Status is unchanged, otherwise stamp now. Mirrors -// v1helpers.SetOperatorCondition (helpers.go:58-100) for the SSA path so -// consumers don't see the field churn on every reconcile. type OperatorConditionWriter struct { client operatorv1helpers.OperatorClient observerPod string @@ -71,10 +53,9 @@ type conditionDecision struct { reason string } -// mapHealthClass implements the asymmetric mapping. Available expresses -// confirmed-good, so RPCError ("we don't know") is Unknown. Degraded -// expresses persistent-wrong, and RPCError IS wrong (we cannot reach the -// plugin), so Degraded is True. +// Asymmetric: Available expresses confirmed-good, so RPCError ("we don't +// know") maps to Unknown; Degraded expresses persistent-wrong, and an +// unreachable plugin IS wrong, so Degraded is True. func mapHealthClass(class HealthClass) (avail, degraded conditionDecision) { switch class { case HealthClassOK: @@ -87,17 +68,13 @@ func mapHealthClass(class HealthClass) (avail, degraded conditionDecision) { return conditionDecision{operatorv1.ConditionUnknown, reasonProbeUnreachable}, conditionDecision{operatorv1.ConditionTrue, reasonRPCError} } - // Closed set: HealthClass values come from classifyHealthz / classifyRPCError. - // Treat any unknown class as RPCError-equivalent rather than panicking. + // Defensive default for an unknown class. HealthClass is a closed set, + // so this is unreachable today, but a future class would otherwise + // crash a sidecar. return conditionDecision{operatorv1.ConditionUnknown, reasonProbeUnreachable}, conditionDecision{operatorv1.ConditionTrue, reasonRPCError} } -// buildMessage renders the wire format: -// -// healthy: "keyIDHash=" -// unhealthy with KeyIDHash: "keyIDHash= detail=" -// unhealthy without KeyIDHash: "detail=" func buildMessage(status HealthStatus) string { if status.Healthz.IsOK() { return "keyIDHash=" + status.KeyIDHash @@ -108,10 +85,9 @@ func buildMessage(status HealthStatus) string { return "detail=" + status.Healthz.Detail } -// resolveLastTransitionTime mirrors v1helpers.SetOperatorCondition LTT logic -// for the SSA path: brand-new condition or Status flip stamps now; -// Status-unchanged carries over the prior LastTransitionTime so consumers -// don't see churn on every reconcile. +// Carry over the prior LastTransitionTime when Status is unchanged so +// consumers don't see field churn on every reconcile. Mirrors +// v1helpers.SetOperatorCondition for the SSA path. func resolveLastTransitionTime(prior []operatorv1.OperatorCondition, conditionType string, newStatus operatorv1.ConditionStatus, now time.Time) metav1.Time { for i := range prior { if prior[i].Type != conditionType { @@ -132,11 +108,9 @@ func (w *OperatorConditionWriter) Write(ctx context.Context, status HealthStatus avail, degraded := mapHealthClass(status.Healthz.Class) msg := buildMessage(status) - // Read prior state for LTT carry-over. A read failure is non-fatal: - // worst case we stamp LTT=now on a Status that didn't actually change, - // which self-corrects on the next successful tick. Skipping the publish - // would punish transient apiserver glitches harder than the contract - // asks for. + // Read failure is non-fatal: worst case LTT=now on an unchanged Status, + // self-correcting next tick. Skipping the publish would punish transient + // apiserver glitches harder than the contract asks for. var priorConditions []operatorv1.OperatorCondition if _, opStatus, _, err := w.client.GetOperatorState(); err == nil && opStatus != nil { priorConditions = opStatus.Conditions diff --git a/pkg/operator/encryption/kms/health/condition_test.go b/pkg/operator/encryption/kms/health/condition_test.go new file mode 100644 index 0000000000..c0698d5ac6 --- /dev/null +++ b/pkg/operator/encryption/kms/health/condition_test.go @@ -0,0 +1,243 @@ +package health + +import ( + "context" + "testing" + "time" + + operatorv1 "github.com/openshift/api/operator/v1" + applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" + "github.com/openshift/library-go/pkg/apiserver/jsonpatch" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" + clocktesting "k8s.io/utils/clock/testing" +) + +const ( + testObserverPod = "kube-apiserver-master-0" + testFieldManager = "kms-health-monitor-kube-apiserver-master-0" + testTypeAvailable = "KMSv2HealthPlugin_kube-apiserver-master-0_Available" + testTypeDegraded = "KMSv2HealthPlugin_kube-apiserver-master-0_Degraded" +) + +// recordingOperatorClient merges applied conditions (LastTransitionTime +// included) into the in-memory status so GetOperatorState sees prior +// state. v1helpers.fakeOperatorClient drops LTT in its merge, which +// would defeat the LTT carry-over tests. +type recordingOperatorClient struct { + status operatorv1.OperatorStatus + applies []recordedApply +} + +type recordedApply struct { + fieldManager string + cfg *applyoperatorv1.OperatorStatusApplyConfiguration +} + +func (c *recordingOperatorClient) GetOperatorState() (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { + return &operatorv1.OperatorSpec{}, c.status.DeepCopy(), "0", nil +} + +func (c *recordingOperatorClient) ApplyOperatorStatus(_ context.Context, fieldManager string, cfg *applyoperatorv1.OperatorStatusApplyConfiguration) error { + c.applies = append(c.applies, recordedApply{fieldManager: fieldManager, cfg: cfg}) + for _, ac := range cfg.Conditions { + merged := operatorv1.OperatorCondition{} + if ac.Type != nil { + merged.Type = *ac.Type + } + if ac.Status != nil { + merged.Status = *ac.Status + } + if ac.Reason != nil { + merged.Reason = *ac.Reason + } + if ac.Message != nil { + merged.Message = *ac.Message + } + if ac.LastTransitionTime != nil { + merged.LastTransitionTime = *ac.LastTransitionTime + } + c.upsertCondition(merged) + } + return nil +} + +func (c *recordingOperatorClient) upsertCondition(cond operatorv1.OperatorCondition) { + for i := range c.status.Conditions { + if c.status.Conditions[i].Type == cond.Type { + c.status.Conditions[i] = cond + return + } + } + c.status.Conditions = append(c.status.Conditions, cond) +} + +func (c *recordingOperatorClient) Informer() cache.SharedIndexInformer { return nil } +func (c *recordingOperatorClient) GetObjectMeta() (*metav1.ObjectMeta, error) { + return &metav1.ObjectMeta{}, nil +} +func (c *recordingOperatorClient) GetOperatorStateWithQuorum(ctx context.Context) (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { + return c.GetOperatorState() +} +func (c *recordingOperatorClient) UpdateOperatorSpec(context.Context, string, *operatorv1.OperatorSpec) (*operatorv1.OperatorSpec, string, error) { + panic("not used") +} +func (c *recordingOperatorClient) UpdateOperatorStatus(context.Context, string, *operatorv1.OperatorStatus) (*operatorv1.OperatorStatus, error) { + panic("not used") +} +func (c *recordingOperatorClient) ApplyOperatorSpec(context.Context, string, *applyoperatorv1.OperatorSpecApplyConfiguration) error { + panic("not used") +} +func (c *recordingOperatorClient) PatchOperatorStatus(context.Context, *jsonpatch.PatchSet) error { + panic("not used") +} + +func findCondition(conds []operatorv1.OperatorCondition, t string) *operatorv1.OperatorCondition { + for i := range conds { + if conds[i].Type == t { + return &conds[i] + } + } + return nil +} + +func newTestWriter(client *recordingOperatorClient, observerPod string, fc *clocktesting.FakeClock) *OperatorConditionWriter { + w := NewOperatorConditionWriter(client, observerPod) + w.clock = fc + return w +} + +func TestOperatorConditionWriter_classMapping(t *testing.T) { + cases := []struct { + name string + status HealthStatus + availStatus operatorv1.ConditionStatus + availReason string + degrStatus operatorv1.ConditionStatus + degrReason string + wantMessage string + }{ + { + name: "OK", + status: HealthStatus{Healthz: Healthz{Class: HealthClassOK}, KeyIDHash: "abc123"}, + availStatus: operatorv1.ConditionTrue, availReason: reasonAsExpected, + degrStatus: operatorv1.ConditionFalse, degrReason: reasonAsExpected, + wantMessage: "keyIDHash=abc123", + }, + { + name: "NotOK", + status: HealthStatus{Healthz: Healthz{Class: HealthClassNotOK, Detail: "kid lookup failed"}, KeyIDHash: "deadbeef"}, + availStatus: operatorv1.ConditionFalse, availReason: reasonPluginUnhealthy, + degrStatus: operatorv1.ConditionTrue, degrReason: reasonHealthzNotOK, + wantMessage: "keyIDHash=deadbeef detail=kid lookup failed", + }, + { + // Asymmetric: Available expresses confirmed-good (Unknown when we + // don't know), Degraded expresses persistent-wrong (True since + // unreachable IS wrong). + name: "RPCError", + status: HealthStatus{Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}}, + availStatus: operatorv1.ConditionUnknown, availReason: reasonProbeUnreachable, + degrStatus: operatorv1.ConditionTrue, degrReason: reasonRPCError, + wantMessage: "detail=Unavailable", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + client := &recordingOperatorClient{} + fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + w := newTestWriter(client, testObserverPod, fc) + + tc.status.ObserverPod = testObserverPod + if err := w.Write(context.Background(), tc.status); err != nil { + t.Fatalf("Write: %v", err) + } + + a := findCondition(client.status.Conditions, testTypeAvailable) + if a == nil { + t.Fatalf("Available missing") + } + if a.Status != tc.availStatus || a.Reason != tc.availReason || a.Message != tc.wantMessage { + t.Errorf("Available: got %+v, want status=%s reason=%s message=%q", a, tc.availStatus, tc.availReason, tc.wantMessage) + } + d := findCondition(client.status.Conditions, testTypeDegraded) + if d == nil { + t.Fatalf("Degraded missing") + } + if d.Status != tc.degrStatus || d.Reason != tc.degrReason || d.Message != tc.wantMessage { + t.Errorf("Degraded: got %+v, want status=%s reason=%s message=%q", d, tc.degrStatus, tc.degrReason, tc.wantMessage) + } + if client.applies[0].fieldManager != testFieldManager { + t.Errorf("fieldManager: got %q, want %q", client.applies[0].fieldManager, testFieldManager) + } + }) + } +} + +// Status-unchanged reconciles must carry over LTT; otherwise downstream +// rollout gates and alerts would think a transition happened on every tick. +func TestOperatorConditionWriter_repeatedSameStatusCarriesOverLTT(t *testing.T) { + client := &recordingOperatorClient{} + t1 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) + fc := clocktesting.NewFakeClock(t1) + w := newTestWriter(client, testObserverPod, fc) + + healthy := HealthStatus{Healthz: Healthz{Class: HealthClassOK}, KeyIDHash: "abc123", ObserverPod: testObserverPod} + if err := w.Write(context.Background(), healthy); err != nil { + t.Fatalf("first Write: %v", err) + } + fc.Step(5 * time.Minute) + if err := w.Write(context.Background(), healthy); err != nil { + t.Fatalf("second Write: %v", err) + } + + for _, ct := range []string{testTypeAvailable, testTypeDegraded} { + c := findCondition(client.status.Conditions, ct) + if c == nil { + t.Fatalf("%s missing", ct) + } + if !c.LastTransitionTime.Time.Equal(t1) { + t.Errorf("%s.LastTransitionTime: got %v, want %v (unchanged status must carry over)", ct, c.LastTransitionTime.Time, t1) + } + } +} + +// Per-pod fieldManager + per-entry SSA ownership: two writers with +// different ObserverPods coexist in the same CR without clobbering. +func TestOperatorConditionWriter_twoObserverPodsCoexist(t *testing.T) { + client := &recordingOperatorClient{} + fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + + const ( + podA = "kube-apiserver-master-0" + podB = "kube-apiserver-master-1" + ) + wa := newTestWriter(client, podA, fc) + wb := newTestWriter(client, podB, fc) + + if err := wa.Write(context.Background(), HealthStatus{Healthz: Healthz{Class: HealthClassOK}, KeyIDHash: "a", ObserverPod: podA}); err != nil { + t.Fatalf("wa.Write: %v", err) + } + if err := wb.Write(context.Background(), HealthStatus{Healthz: Healthz{Class: HealthClassNotOK, Detail: "bad"}, KeyIDHash: "b", ObserverPod: podB}); err != nil { + t.Fatalf("wb.Write: %v", err) + } + + if got, want := len(client.status.Conditions), 4; got != want { + t.Errorf("conditions count: got %d, want %d", got, want) + } + availA := findCondition(client.status.Conditions, "KMSv2HealthPlugin_"+podA+"_Available") + if availA == nil || availA.Status != operatorv1.ConditionTrue { + t.Errorf("podA Available: got %v, want Status=True", availA) + } + availB := findCondition(client.status.Conditions, "KMSv2HealthPlugin_"+podB+"_Available") + if availB == nil || availB.Status != operatorv1.ConditionFalse { + t.Errorf("podB Available: got %v, want Status=False", availB) + } + if client.applies[0].fieldManager != "kms-health-monitor-"+podA { + t.Errorf("apply[0].fieldManager: got %q, want kms-health-monitor-%s", client.applies[0].fieldManager, podA) + } + if client.applies[1].fieldManager != "kms-health-monitor-"+podB { + t.Errorf("apply[1].fieldManager: got %q, want kms-health-monitor-%s", client.applies[1].fieldManager, podB) + } +} diff --git a/pkg/operator/encryption/kms/health/container_template.go b/pkg/operator/encryption/kms/health/container_template.go deleted file mode 100644 index 7a9256dbf7..0000000000 --- a/pkg/operator/encryption/kms/health/container_template.go +++ /dev/null @@ -1,119 +0,0 @@ -package health - -import ( - "bytes" - "encoding/json" - "fmt" - "strings" - "text/template" - "time" - - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/validation" - "sigs.k8s.io/yaml" -) - -// ContainerOptions are the per-caller knobs for GenerateContainerTemplate. -// KMSPluginName is the cohering parameter; distinct names produce -// non-colliding sidecars so multiple plugins can be observed in one pod. -type ContainerOptions struct { - // Must be a DNS-1123 label; suffixed onto container/volume/CM names. - KMSPluginName string - - OperatorImage string - OperatorCommand []string - - ProbeInterval time.Duration - ProbeIntervalUnhealthy time.Duration - ProbeTimeout time.Duration - WriteTimeout time.Duration - - ConfigMapNamespace string -} - -// GenerateContainerTemplate renders the kms-health-monitor sidecar. -// Caller must add a matching Volume named kms-plugin-socket- -// to PodSpec.Volumes. The deprecated single-plugin -// AddKMSPluginVolumeAndMountToPodSpec helper is intentionally not used. -func GenerateContainerTemplate(opts ContainerOptions) (corev1.Container, error) { - if err := opts.validate(); err != nil { - return corev1.Container{}, err - } - - rawManifest := mustAsset("assets/kms-health-container.yaml") - - funcs := template.FuncMap{ - "toJson": func(v any) (string, error) { - b, err := json.Marshal(v) - if err != nil { - return "", err - } - return string(b), nil - }, - } - - tmpl, err := template.New("kms-health-container").Funcs(funcs).Parse(string(rawManifest)) - if err != nil { - return corev1.Container{}, fmt.Errorf("parse asset template: %w", err) - } - - values := struct { - KMSPluginName string - OperatorImage string - Command []string - ProbeInterval string - ProbeIntervalUnhealthy string - ProbeTimeout string - WriteTimeout string - ConfigMapNamespace string - }{ - KMSPluginName: opts.KMSPluginName, - OperatorImage: opts.OperatorImage, - Command: opts.OperatorCommand, - ProbeInterval: opts.ProbeInterval.String(), - ProbeIntervalUnhealthy: opts.ProbeIntervalUnhealthy.String(), - ProbeTimeout: opts.ProbeTimeout.String(), - WriteTimeout: opts.WriteTimeout.String(), - ConfigMapNamespace: opts.ConfigMapNamespace, - } - - var buf bytes.Buffer - if err := tmpl.Execute(&buf, values); err != nil { - return corev1.Container{}, fmt.Errorf("execute template: %w", err) - } - - var container corev1.Container - if err := yaml.Unmarshal(buf.Bytes(), &container); err != nil { - return corev1.Container{}, fmt.Errorf("parse rendered container: %w", err) - } - return container, nil -} - -func (o ContainerOptions) validate() error { - if errs := validation.IsDNS1123Label(o.KMSPluginName); len(errs) > 0 { - return fmt.Errorf("KMSPluginName %q is not a valid DNS-1123 label: %s", - o.KMSPluginName, strings.Join(errs, "; ")) - } - if o.OperatorImage == "" { - return fmt.Errorf("OperatorImage is required") - } - if len(o.OperatorCommand) == 0 { - return fmt.Errorf("OperatorCommand must have at least one element") - } - if o.ProbeInterval <= 0 { - return fmt.Errorf("ProbeInterval must be positive") - } - if o.ProbeIntervalUnhealthy <= 0 { - return fmt.Errorf("ProbeIntervalUnhealthy must be positive") - } - if o.ProbeTimeout <= 0 { - return fmt.Errorf("ProbeTimeout must be positive") - } - if o.WriteTimeout <= 0 { - return fmt.Errorf("WriteTimeout must be positive") - } - if o.ConfigMapNamespace == "" { - return fmt.Errorf("ConfigMapNamespace is required") - } - return nil -} diff --git a/pkg/operator/encryption/kms/health/container_template_test.go b/pkg/operator/encryption/kms/health/container_template_test.go deleted file mode 100644 index de68a11606..0000000000 --- a/pkg/operator/encryption/kms/health/container_template_test.go +++ /dev/null @@ -1,171 +0,0 @@ -package health - -import ( - "strings" - "testing" - "time" - - "github.com/spf13/pflag" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - "sigs.k8s.io/yaml" -) - -const expectedContainerYAML = ` -name: kms-health-monitor-aws -image: quay.io/openshift/operator:latest -command: - - "operator" - - "kms-health-monitor" -args: - - --kms-socket=/var/run/kmsplugin-aws/kms.sock - - --probe-interval=1m0s - - --probe-interval-unhealthy=10s - - --probe-timeout=3s - - --write-timeout=5s - - --output-mode=configmap - - --configmap-namespace=openshift-kube-apiserver - - --configmap-name=kms-health-aws -env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name -volumeMounts: - - name: kms-plugin-socket-aws - mountPath: /var/run/kmsplugin-aws -resources: - requests: - memory: 50Mi - cpu: 5m -` - -func validContainerOptions() ContainerOptions { - return ContainerOptions{ - KMSPluginName: "aws", - OperatorImage: "quay.io/openshift/operator:latest", - OperatorCommand: []string{"operator", "kms-health-monitor"}, - ProbeInterval: 60 * time.Second, - ProbeIntervalUnhealthy: 10 * time.Second, - ProbeTimeout: 3 * time.Second, - WriteTimeout: 5 * time.Second, - ConfigMapNamespace: "openshift-kube-apiserver", - } -} - -func TestGenerateContainerTemplate(t *testing.T) { - got, err := GenerateContainerTemplate(validContainerOptions()) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - var want corev1.Container - if err := yaml.Unmarshal([]byte(expectedContainerYAML), &want); err != nil { - t.Fatalf("parse expected: %v", err) - } - if !equality.Semantic.DeepEqual(got, want) { - t.Fatalf("rendered container does not match expected:\ngot: %+v\nwant: %+v", got, want) - } -} - -// Drift detector: rendered args must parse against cmd.go's flag set. -func TestRenderedArgsAreValidFlags(t *testing.T) { - c, err := GenerateContainerTemplate(validContainerOptions()) - if err != nil { - t.Fatalf("render: %v", err) - } - - fs := pflag.NewFlagSet("kms-health-monitor", pflag.ContinueOnError) - (&commandOptions{}).addFlags(fs) - if err := fs.Parse(c.Args); err != nil { - t.Fatalf("rendered args do not parse against cmd.go flag set: %v\nargs: %v", err, c.Args) - } -} - -// Distinct KMSPluginName must yield non-colliding container/volume/CM names. -func TestMultiplePluginsCoexist(t *testing.T) { - a, err := GenerateContainerTemplate(ContainerOptions{ - KMSPluginName: "aws", - OperatorImage: "img:1", - OperatorCommand: []string{"o"}, - ProbeInterval: 60 * time.Second, - ProbeIntervalUnhealthy: 10 * time.Second, - ProbeTimeout: 3 * time.Second, - WriteTimeout: 5 * time.Second, - ConfigMapNamespace: "ns", - }) - if err != nil { - t.Fatalf("render aws: %v", err) - } - b, err := GenerateContainerTemplate(ContainerOptions{ - KMSPluginName: "vault", - OperatorImage: "img:1", - OperatorCommand: []string{"o"}, - ProbeInterval: 60 * time.Second, - ProbeIntervalUnhealthy: 10 * time.Second, - ProbeTimeout: 3 * time.Second, - WriteTimeout: 5 * time.Second, - ConfigMapNamespace: "ns", - }) - if err != nil { - t.Fatalf("render vault: %v", err) - } - - if a.Name == b.Name { - t.Fatalf("container names collide: %q == %q", a.Name, b.Name) - } - if a.VolumeMounts[0].Name == b.VolumeMounts[0].Name { - t.Fatalf("volumeMount names collide: %q", a.VolumeMounts[0].Name) - } - if a.VolumeMounts[0].MountPath == b.VolumeMounts[0].MountPath { - t.Fatalf("mountPaths collide: %q", a.VolumeMounts[0].MountPath) - } - // Args must contain different --configmap-name and --kms-socket values. - if argsEqual(a.Args, b.Args) { - t.Fatalf("args identical for distinct plugin names") - } -} - -func argsEqual(a, b []string) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if a[i] != b[i] { - return false - } - } - return true -} - -func TestContainerOptionsValidate(t *testing.T) { - tests := []struct { - name string - mutate func(*ContainerOptions) - wantSub string - }{ - {"empty plugin name", func(o *ContainerOptions) { o.KMSPluginName = "" }, "KMSPluginName"}, - {"underscore in plugin name", func(o *ContainerOptions) { o.KMSPluginName = "aws_kms" }, "KMSPluginName"}, - {"uppercase in plugin name", func(o *ContainerOptions) { o.KMSPluginName = "AWS" }, "KMSPluginName"}, - {"empty image", func(o *ContainerOptions) { o.OperatorImage = "" }, "OperatorImage"}, - {"empty command", func(o *ContainerOptions) { o.OperatorCommand = nil }, "OperatorCommand"}, - {"zero probe-interval", func(o *ContainerOptions) { o.ProbeInterval = 0 }, "ProbeInterval"}, - {"zero probe-interval-unhealthy", func(o *ContainerOptions) { o.ProbeIntervalUnhealthy = 0 }, "ProbeIntervalUnhealthy"}, - {"zero probe-timeout", func(o *ContainerOptions) { o.ProbeTimeout = 0 }, "ProbeTimeout"}, - {"zero write-timeout", func(o *ContainerOptions) { o.WriteTimeout = 0 }, "WriteTimeout"}, - {"empty namespace", func(o *ContainerOptions) { o.ConfigMapNamespace = "" }, "ConfigMapNamespace"}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - opts := validContainerOptions() - tt.mutate(&opts) - _, err := GenerateContainerTemplate(opts) - if err == nil { - t.Fatalf("expected error containing %q, got nil", tt.wantSub) - } - if !strings.Contains(err.Error(), tt.wantSub) { - t.Fatalf("error %q does not contain %q", err.Error(), tt.wantSub) - } - }) - } -} diff --git a/pkg/operator/encryption/kms/health/doc.go b/pkg/operator/encryption/kms/health/doc.go deleted file mode 100644 index 7b24795f2d..0000000000 --- a/pkg/operator/encryption/kms/health/doc.go +++ /dev/null @@ -1,5 +0,0 @@ -// Package health probes a co-located KMSv2 plugin on a cadence and -// publishes a classified, timestamped HealthStatus through a StatusWriter. -// Used by operators and condition reporters that need plugin health -// without dialing the socket themselves. -package health diff --git a/pkg/operator/encryption/kms/health/monitor.go b/pkg/operator/encryption/kms/health/monitor.go index 5d16ee1314..4ad3c418d3 100644 --- a/pkg/operator/encryption/kms/health/monitor.go +++ b/pkg/operator/encryption/kms/health/monitor.go @@ -4,75 +4,47 @@ import ( "context" "time" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" - "k8s.io/utils/clock" ) -type Prober interface { - Probe(ctx context.Context) HealthStatus -} - -// Monitor never exits on probe or write failures; the next tick is the -// retry, and ctx cancellation is the only exit path. Skipped writes -// (apiserver Forbidden, NotFound, transient network failure) leave the -// published status one tick stale, up to one healthyInterval; the next -// tick republishes. type Monitor struct { - probe Prober - writer StatusWriter - observerPod string - healthyInterval time.Duration - unhealthyInterval time.Duration - // writeTimeout bounds each writer.Write call so apiserver slowness - // cannot stall the probe loop. probeTimeout + writeTimeout stack - // per tick. Keep their sum below unhealthyInterval, otherwise a - // stuck Write delays the next tick past the unhealthy cadence and - // the "tighten on unhealthy" mechanism collapses. Must be > 0. + probe *Probe + writer *OperatorConditionWriter + observerPod string + interval time.Duration writeTimeout time.Duration - clock clock.Clock } func NewMonitor( - prober Prober, - writer StatusWriter, + probe *Probe, + writer *OperatorConditionWriter, observerPod string, - healthyInterval, unhealthyInterval, writeTimeout time.Duration, + interval, writeTimeout time.Duration, ) *Monitor { return &Monitor{ - probe: prober, - writer: writer, - observerPod: observerPod, - healthyInterval: healthyInterval, - unhealthyInterval: unhealthyInterval, - writeTimeout: writeTimeout, - clock: clock.RealClock{}, + probe: probe, + writer: writer, + observerPod: observerPod, + interval: interval, + writeTimeout: writeTimeout, } } -// Run blocks until ctx is cancelled. +// Run blocks until ctx is cancelled. wait.UntilWithContext recovers panics +// inside tick so a misbehaving plugin or apiserver flake cannot crash the +// sidecar. func (m *Monitor) Run(ctx context.Context) { - for { - status := m.probe.Probe(ctx) - status.ObserverPod = m.observerPod - - // Cancel inline rather than via defer: defers in unbounded loops - // accumulate until the function returns. - writeCtx, writeCancel := context.WithTimeout(ctx, m.writeTimeout) - err := m.writer.Write(writeCtx, status) - writeCancel() - if err != nil { - klog.ErrorS(err, "kms-health: writer failed; continuing") - } + wait.UntilWithContext(ctx, m.tick, m.interval) +} - interval := m.healthyInterval - if !status.Healthz.IsOK() { - interval = m.unhealthyInterval - } +func (m *Monitor) tick(ctx context.Context) { + status := m.probe.Probe(ctx) + status.ObserverPod = m.observerPod - select { - case <-ctx.Done(): - return - case <-m.clock.After(interval): - } + writeCtx, cancel := context.WithTimeout(ctx, m.writeTimeout) + defer cancel() + if err := m.writer.Write(writeCtx, status); err != nil { + klog.ErrorS(err, "kms-health: writer failed; continuing") } } diff --git a/pkg/operator/encryption/kms/health/monitor_test.go b/pkg/operator/encryption/kms/health/monitor_test.go index 439266a0cd..30913e9257 100644 --- a/pkg/operator/encryption/kms/health/monitor_test.go +++ b/pkg/operator/encryption/kms/health/monitor_test.go @@ -2,201 +2,50 @@ package health import ( "context" - "runtime" - "sync" + "errors" "testing" "time" - testclock "k8s.io/utils/clock/testing" -) - -// recordingWriter buffers every HealthStatus it receives and lets the -// test block on each write via a channel. Bounded buffer is fine for -// unit-test scope; if a Monitor change starts writing way more than -// expected the test will deadlock — that's a useful failure. -type recordingWriter struct { - writes chan HealthStatus -} + operatorv1 "github.com/openshift/api/operator/v1" + "github.com/openshift/library-go/pkg/operator/v1helpers" -func newRecordingWriter() *recordingWriter { - return &recordingWriter{writes: make(chan HealthStatus, 32)} -} + kmsservice "k8s.io/kms/pkg/service" +) -func (w *recordingWriter) Write(ctx context.Context, status HealthStatus) error { - w.writes <- status - return nil +type stubKMSService struct { + resp *kmsservice.StatusResponse + err error } -// scriptedProber hands out pre-arranged HealthStatus values in order. -// Each Probe call consumes one. Designed to block if the Monitor asks -// for more than the test scripted — surfaces unintended extra probes. -type scriptedProber struct { - mu sync.Mutex - results []HealthStatus - calls int +func (s *stubKMSService) Status(ctx context.Context) (*kmsservice.StatusResponse, error) { + return s.resp, s.err } - -func (s *scriptedProber) Probe(ctx context.Context) HealthStatus { - s.mu.Lock() - defer s.mu.Unlock() - if s.calls >= len(s.results) { - // Return something sane rather than panicking mid-test; the - // writer buffer will catch unexpected writes. - s.calls++ - return HealthStatus{Healthz: Healthz{Class: HealthClassOK}, Timestamp: time.Now()} - } - r := s.results[s.calls] - s.calls++ - return r +func (s *stubKMSService) Encrypt(ctx context.Context, uid string, plaintext []byte) (*kmsservice.EncryptResponse, error) { + return nil, errors.New("not used") } - -// waitForWaiters polls until the FakeClock has a registered waiter, so -// the next Step() actually triggers the Monitor's next-tick timer -// rather than being lost to the void. Bounded so misbehaving Monitors -// fail the test instead of hanging. -func waitForWaiters(t *testing.T, clk *testclock.FakeClock) { - t.Helper() - deadline := time.Now().Add(2 * time.Second) - for !clk.HasWaiters() { - if time.Now().After(deadline) { - t.Fatal("Monitor never registered a clock waiter — not scheduling next tick") - } - time.Sleep(time.Millisecond) - } +func (s *stubKMSService) Decrypt(ctx context.Context, uid string, req *kmsservice.DecryptRequest) ([]byte, error) { + return nil, errors.New("not used") } -func TestMonitor_transitionsHealthyUnhealthyHealthy(t *testing.T) { - t0 := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) - clk := testclock.NewFakeClock(t0) - prober := &scriptedProber{ - results: []HealthStatus{ - {Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0}, - {Healthz: Healthz{Class: HealthClassNotOK, Detail: "boom"}, Timestamp: t0.Add(60 * time.Second)}, - {Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0.Add(70 * time.Second)}, - }, - } - writer := newRecordingWriter() - - m := &Monitor{ - probe: prober, - writer: writer, - observerPod: "test-pod-1", - healthyInterval: 60 * time.Second, - unhealthyInterval: 10 * time.Second, - writeTimeout: 5 * time.Second, - clock: clk, - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - done := make(chan struct{}) - go func() { - m.Run(ctx) - close(done) - }() - - // Tick 1: healthy. Probed immediately, no clock step needed first. - got1 := <-writer.writes - if !got1.Healthz.IsOK() { - t.Errorf("tick 1 Healthz: got %q, want IsOK", got1.Healthz) - } - if got1.ObserverPod != "test-pod-1" { - t.Errorf("tick 1 ObserverPod: got %q, want test-pod-1", got1.ObserverPod) - } - - // Should now be waiting for healthyInterval. - waitForWaiters(t, clk) - clk.Step(60 * time.Second) - - // Tick 2: unhealthy. Monitor must now switch to unhealthyInterval. - got2 := <-writer.writes - wantTick2 := Healthz{Class: HealthClassNotOK, Detail: "boom"} - if got2.Healthz != wantTick2 { - t.Errorf("tick 2 Healthz: got %q, want %q", got2.Healthz, wantTick2) - } - - // Should be waiting for unhealthyInterval (10s), not 60s. Verify by - // stepping only 10s and seeing the next probe fire. - waitForWaiters(t, clk) - clk.Step(10 * time.Second) - - // Tick 3: healthy again. - got3 := <-writer.writes - if !got3.Healthz.IsOK() { - t.Errorf("tick 3 Healthz: got %q, want IsOK", got3.Healthz) - } - - cancel() - <-done - - // Timestamps must be strictly advancing across the three recorded writes. - if !got2.Timestamp.After(got1.Timestamp) { - t.Errorf("timestamps not advancing: t1=%v t2=%v", got1.Timestamp, got2.Timestamp) - } - if !got3.Timestamp.After(got2.Timestamp) { - t.Errorf("timestamps not advancing: t2=%v t3=%v", got2.Timestamp, got3.Timestamp) - } +func newTestMonitor(interval time.Duration) (*Monitor, v1helpers.OperatorClient) { + svc := &stubKMSService{resp: &kmsservice.StatusResponse{Healthz: "ok", KeyID: "k1"}} + client := v1helpers.NewFakeOperatorClient(&operatorv1.OperatorSpec{}, &operatorv1.OperatorStatus{}, nil) + writer := NewOperatorConditionWriter(client, "test-pod") + return NewMonitor(NewProbe(svc, time.Second), writer, "test-pod", interval, time.Second), client } -func TestMonitor_stopsOnContextCancel(t *testing.T) { - t0 := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) - clk := testclock.NewFakeClock(t0) - prober := &scriptedProber{ - results: []HealthStatus{{Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0}}, - } - writer := newRecordingWriter() +func TestMonitor_tickProbesAndWrites(t *testing.T) { + m, client := newTestMonitor(time.Second) + m.tick(context.Background()) - m := &Monitor{ - probe: prober, - writer: writer, - observerPod: "p", - healthyInterval: 60 * time.Second, - unhealthyInterval: 10 * time.Second, - writeTimeout: 5 * time.Second, - clock: clk, - } - - ctx, cancel := context.WithCancel(context.Background()) - done := make(chan struct{}) - go func() { - m.Run(ctx) - close(done) - }() - - <-writer.writes // first probe happened - waitForWaiters(t, clk) - cancel() - - select { - case <-done: - case <-time.After(2 * time.Second): - t.Fatal("Monitor did not stop within 2s of context cancel") + _, status, _, _ := client.GetOperatorState() + if len(status.Conditions) == 0 { + t.Fatal("no conditions written after tick") } } -// runMonitorCycle starts a Monitor, waits for the first write + clock -// waiter, then cancels and waits for Run to return. Shared by the leak -// test so its iteration body stays focused on the leak assertion. -func runMonitorCycle(t *testing.T) { - t.Helper() - t0 := time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) - clk := testclock.NewFakeClock(t0) - prober := &scriptedProber{ - results: []HealthStatus{ - {Healthz: Healthz{Class: HealthClassOK}, Timestamp: t0}, - }, - } - writer := newRecordingWriter() - - m := &Monitor{ - probe: prober, - writer: writer, - observerPod: "p", - healthyInterval: 60 * time.Second, - unhealthyInterval: 10 * time.Second, - writeTimeout: 5 * time.Second, - clock: clk, - } +func TestMonitor_runStopsOnContextCancel(t *testing.T) { + m, _ := newTestMonitor(10 * time.Millisecond) ctx, cancel := context.WithCancel(context.Background()) done := make(chan struct{}) @@ -205,57 +54,11 @@ func runMonitorCycle(t *testing.T) { close(done) }() - <-writer.writes - waitForWaiters(t, clk) + time.Sleep(50 * time.Millisecond) cancel() - select { case <-done: case <-time.After(2 * time.Second): t.Fatal("Monitor did not stop within 2s of context cancel") } } - -// TestMonitor_noGoroutineLeak verifies Run does not leak goroutines -// after ctx cancellation. Plan §14.2 calls out goroutine leakage as a -// real-class-of-bug for the gRPC dial path; the same risk applies to -// the loop that owns the long-lived select on ctx.Done() and -// clock.After. -// -// The test runs many cycles after a warm-up snapshot. A single-shot -// check would miss per-iteration accumulation (e.g. an orphaned -// goroutine spawned inside Run that exits on the *next* tick rather -// than on ctx.Done()). -func TestMonitor_noGoroutineLeak(t *testing.T) { - // Warm up before snapshotting: first run pulls in lazy klog buffers, - // fake-clock test internals, etc. Snapshotting beforehand would - // surface a one-time bump as a leak. - runMonitorCycle(t) - - runtime.GC() - baseline := runtime.NumGoroutine() - - const iterations = 20 - for range iterations { - runMonitorCycle(t) - } - - // Cleanup is scheduler-dependent; poll up to a deadline rather than - // using a fixed sleep that flakes on a loaded CI runner. - runtime.GC() - deadline := time.Now().Add(2 * time.Second) - for runtime.NumGoroutine() > baseline && time.Now().Before(deadline) { - time.Sleep(20 * time.Millisecond) - runtime.GC() - } - - if got := runtime.NumGoroutine(); got > baseline { - // Dump live stacks so a future regression is diagnosable from CI - // output alone. Without this, "leaked N goroutines" gives no clue - // to the leaking site. - buf := make([]byte, 1<<16) - n := runtime.Stack(buf, true) - t.Errorf("goroutine leak after %d cycles: baseline=%d, got=%d\nstacks:\n%s", - iterations, baseline, got, buf[:n]) - } -} diff --git a/pkg/operator/encryption/kms/health/probe.go b/pkg/operator/encryption/kms/health/probe.go index 962741dbbb..b7f963d9f7 100644 --- a/pkg/operator/encryption/kms/health/probe.go +++ b/pkg/operator/encryption/kms/health/probe.go @@ -11,21 +11,21 @@ import ( kmsservice "k8s.io/kms/pkg/service" ) -// healthzMaxBodyLen caps the plugin's Healthz body. Without it a -// misbehaving plugin could push multi-MB strings into our ConfigMap on -// every tick. +// healthzMaxBodyLen caps the plugin's Healthz body so a misbehaving +// plugin cannot push multi-MB strings into the OperatorCondition.Message +// on every tick. const healthzMaxBodyLen = 200 -// Probe never returns an error; failures are classified into -// HealthStatus.Healthz so the Monitor loop stays flat. +// Probe never returns an error: failures classify into HealthStatus.Healthz +// so the Monitor loop stays flat. type Probe struct { service kmsservice.Service timeout time.Duration now func() time.Time } -// NewProbe wraps the per-probe Status RPC with timeout if positive; 0 -// relies solely on ctx (or the kmsv2 client's own deadline). +// NewProbe applies the timeout only when positive; 0 relies on ctx or +// the kmsv2 client's own deadline. func NewProbe(service kmsservice.Service, timeout time.Duration) *Probe { return &Probe{ service: service, @@ -34,7 +34,6 @@ func NewProbe(service kmsservice.Service, timeout time.Duration) *Probe { } } -// Probe blocks until Status returns or ctx / the per-probe timeout fires. func (p *Probe) Probe(ctx context.Context) HealthStatus { if p.timeout > 0 { var cancel context.CancelFunc @@ -56,10 +55,9 @@ func (p *Probe) Probe(ctx context.Context) HealthStatus { } } -// classifyHealthz truncates overlong bodies on a UTF-8-safe boundary: -// the proto permits free-form text, ConfigMap.Data must be valid UTF-8, -// and naive byte-slicing at healthzMaxBodyLen could split a multi-byte -// rune. ToValidUTF8 drops any partial sequence at the cut. +// classifyHealthz truncates on a UTF-8-safe boundary: the proto allows +// arbitrary bytes, but Condition.Message must be valid UTF-8; naive +// byte-slicing could split a multi-byte rune. func classifyHealthz(body string) Healthz { if body == string(HealthClassOK) { return Healthz{Class: HealthClassOK} @@ -70,9 +68,9 @@ func classifyHealthz(body string) Healthz { return Healthz{Class: HealthClassNotOK, Detail: body} } -// classifyRPCError uses status.Code, which returns codes.Unknown for -// non-gRPC errors and the real code for gRPC status errors. Covers -// transport failures and deadline expirations without extra branching. +// status.Code returns codes.Unknown for non-gRPC errors and the real code +// for gRPC status errors, covering transport failures and deadlines without +// extra branching. func classifyRPCError(err error) Healthz { return Healthz{ Class: HealthClassRPCError, diff --git a/pkg/operator/encryption/kms/health/probe_test.go b/pkg/operator/encryption/kms/health/probe_test.go index 7966c9f56e..856ba20187 100644 --- a/pkg/operator/encryption/kms/health/probe_test.go +++ b/pkg/operator/encryption/kms/health/probe_test.go @@ -15,14 +15,6 @@ import ( kmsservice "k8s.io/kms/pkg/service" ) -// testTimeout is a generous per-probe timeout for fast-fake tests that -// should never hit it. Tests that DO exercise the deadline path use a -// short timeout locally. -const testTimeout = 10 * time.Second - -// fakeService injects scripted Status responses. Same shape as -// preflight's fake (checker_test.go:14-30) — only StatusFn is exercised -// here because the Probe never calls Encrypt/Decrypt. type fakeService struct { StatusFn func(ctx context.Context) (*kmsservice.StatusResponse, error) } @@ -30,11 +22,9 @@ type fakeService struct { func (f *fakeService) Status(ctx context.Context) (*kmsservice.StatusResponse, error) { return f.StatusFn(ctx) } - func (f *fakeService) Encrypt(ctx context.Context, uid string, data []byte) (*kmsservice.EncryptResponse, error) { panic("Encrypt not expected") } - func (f *fakeService) Decrypt(ctx context.Context, uid string, req *kmsservice.DecryptRequest) ([]byte, error) { panic("Decrypt not expected") } @@ -47,40 +37,29 @@ func sha256Hex(s string) string { func TestProbe_healthy(t *testing.T) { fake := &fakeService{ StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{ - Healthz: "ok", - Version: "v2", - KeyID: "test-key-1", - }, nil + return &kmsservice.StatusResponse{Healthz: "ok", Version: "v2", KeyID: "test-key-1"}, nil }, } - - probe := NewProbe(fake, testTimeout) - got := probe.Probe(context.Background()) + got := NewProbe(fake, time.Second).Probe(context.Background()) if !got.Healthz.IsOK() { t.Errorf("Healthz: got %q, want IsOK", got.Healthz) } - wantHash := sha256Hex("test-key-1") - if got.KeyIDHash != wantHash { - t.Errorf("KeyIDHash: got %q, want %q", got.KeyIDHash, wantHash) + if got.KeyIDHash != sha256Hex("test-key-1") { + t.Errorf("KeyIDHash mismatch") } if got.Timestamp.IsZero() { t.Error("Timestamp: expected non-zero") } } -func TestProbe_notOk_classifiesAsUnhealthy(t *testing.T) { +func TestProbe_notOK(t *testing.T) { fake := &fakeService{ StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{ - Healthz: "not-ready", - KeyID: "test-key-1", - }, nil + return &kmsservice.StatusResponse{Healthz: "not-ready", KeyID: "test-key-1"}, nil }, } - - got := NewProbe(fake, testTimeout).Probe(context.Background()) + got := NewProbe(fake, time.Second).Probe(context.Background()) want := Healthz{Class: HealthClassNotOK, Detail: "not-ready"} if got.Healthz != want { @@ -88,45 +67,18 @@ func TestProbe_notOk_classifiesAsUnhealthy(t *testing.T) { } } -func TestProbe_notOk_truncatesTo200Chars(t *testing.T) { - // 260-char ASCII body — exercise the byte-length cap. - longBody := strings.Repeat("x", 260) - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{ - Healthz: longBody, - KeyID: "test-key-1", - }, nil - }, - } - - got := NewProbe(fake, testTimeout).Probe(context.Background()) - - if got.Healthz.Class != HealthClassNotOK { - t.Errorf("Healthz class: got %q, want %q", got.Healthz.Class, HealthClassNotOK) - } - if len(got.Healthz.Detail) != healthzMaxBodyLen { - t.Errorf("Healthz detail length: got %d, want %d", len(got.Healthz.Detail), healthzMaxBodyLen) - } -} - -func TestProbe_notOk_truncatesSafelyAtMultiByteBoundary(t *testing.T) { - // 198 ASCII bytes + thumbs-up emoji (4 bytes) + tail. Naive byte- - // slicing at healthzMaxBodyLen=200 would land between bytes 2 and 3 - // of the emoji, producing invalid UTF-8 — apiserver rejects - // ConfigMap Updates with invalid UTF-8 in cm.Data values, which - // would freeze the published status indefinitely. +// Naive byte-slicing at healthzMaxBodyLen would split a multi-byte rune; +// apiserver rejects Condition.Message containing invalid UTF-8 and would +// freeze the published status. The 198 + 4-byte emoji + tail boundary +// targets that exact slice point. +func TestProbe_notOK_truncatesSafelyAtMultiByteBoundary(t *testing.T) { body := strings.Repeat("x", 198) + "👍" + "tail" fake := &fakeService{ StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{ - Healthz: body, - KeyID: "test-key-1", - }, nil + return &kmsservice.StatusResponse{Healthz: body, KeyID: "test-key-1"}, nil }, } - - got := NewProbe(fake, testTimeout).Probe(context.Background()) + got := NewProbe(fake, time.Second).Probe(context.Background()) if !utf8.ValidString(got.Healthz.Detail) { t.Errorf("Healthz detail is not valid UTF-8: %q", got.Healthz.Detail) @@ -137,30 +89,20 @@ func TestProbe_notOk_truncatesSafelyAtMultiByteBoundary(t *testing.T) { } func TestProbe_respectsPerProbeTimeout(t *testing.T) { - // Fake blocks until ctx fires; Probe's own WithTimeout should cut it - // off after probeTimeout. The fake wraps ctx.Err() exactly the way - // gRPC's transport does in production (status.FromContextError), so - // classification sees DeadlineExceeded. fake := &fakeService{ StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { <-ctx.Done() return nil, status.FromContextError(ctx.Err()).Err() }, } - - const probeTimeout = 20 * time.Millisecond - probe := NewProbe(fake, probeTimeout) + probe := NewProbe(fake, 20*time.Millisecond) start := time.Now() - // Parent context has NO deadline — if Probe doesn't apply its own - // timeout, this test hangs and we'd notice via go test -timeout. got := probe.Probe(context.Background()) elapsed := time.Since(start) - // Budget: probeTimeout + generous slack for CI. If we see >1s we're - // honoring the parent ctx instead of our own timeout. if elapsed > 500*time.Millisecond { - t.Errorf("probe elapsed %v — per-probe timeout not applied", elapsed) + t.Errorf("probe elapsed %v: per-probe timeout not applied", elapsed) } want := Healthz{Class: HealthClassRPCError, Detail: "DeadlineExceeded"} if got.Healthz != want { @@ -168,52 +110,19 @@ func TestProbe_respectsPerProbeTimeout(t *testing.T) { } } -func TestProbe_rpcError(t *testing.T) { - // The real k8senvelopekmsv2 client returns status errors from gRPC - // calls; generic (non-status) errors are treated by grpc/status as code - // Unknown. See vendor/google.golang.org/grpc/status/status.go. - scenarios := []struct { - name string - fakeErr error - wantCode string - }{ - { - name: "generic non-gRPC error classifies as Unknown", - fakeErr: errors.New("connection refused"), - wantCode: "Unknown", - }, - { - name: "gRPC Unavailable propagates through classification", - fakeErr: status.Error(codes.Unavailable, "server gone"), - wantCode: "Unavailable", - }, - { - name: "gRPC DeadlineExceeded propagates through classification", - fakeErr: status.Error(codes.DeadlineExceeded, "too slow"), - wantCode: "DeadlineExceeded", +func TestProbe_rpcError_classifiesNonGRPCAsUnknown(t *testing.T) { + fake := &fakeService{ + StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { + return nil, errors.New("connection refused") }, } + got := NewProbe(fake, time.Second).Probe(context.Background()) - for _, tc := range scenarios { - t.Run(tc.name, func(t *testing.T) { - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return nil, tc.fakeErr - }, - } - - got := NewProbe(fake, testTimeout).Probe(context.Background()) - - want := Healthz{Class: HealthClassRPCError, Detail: tc.wantCode} - if got.Healthz != want { - t.Errorf("Healthz: got %q, want %q", got.Healthz, want) - } - if got.KeyIDHash != "" { - t.Errorf("KeyIDHash: got %q, want empty (unreachable plugin)", got.KeyIDHash) - } - if got.Timestamp.IsZero() { - t.Error("Timestamp: expected non-zero even on RPC error") - } - }) + want := Healthz{Class: HealthClassRPCError, Detail: codes.Unknown.String()} + if got.Healthz != want { + t.Errorf("Healthz: got %q, want %q", got.Healthz, want) + } + if got.KeyIDHash != "" { + t.Errorf("KeyIDHash: got %q, want empty on unreachable plugin", got.KeyIDHash) } } diff --git a/pkg/operator/encryption/kms/health/types.go b/pkg/operator/encryption/kms/health/types.go index 0825f6595a..1db7e93735 100644 --- a/pkg/operator/encryption/kms/health/types.go +++ b/pkg/operator/encryption/kms/health/types.go @@ -2,7 +2,6 @@ package health import "time" -// HealthClass is the closed-set classifier for Healthz. type HealthClass string const ( @@ -16,34 +15,22 @@ type Healthz struct { Detail string } -// IsOK is the canonical health predicate. Prefer this over comparing -// Class directly so consumers don't depend on the internal shape. +// IsOK is the canonical predicate; prefer it over comparing Class so +// consumers don't depend on the internal classification shape. func (h Healthz) IsOK() bool { return h.Class == HealthClassOK } -// String renders the wire format: "ok" or "unhealthy::". -func (h Healthz) String() string { - if h.Class == HealthClassOK { - return string(HealthClassOK) - } - return "unhealthy:" + string(h.Class) + ":" + h.Detail -} - type HealthStatus struct { Healthz Healthz - // KeyIDHash is the sha256 hex of the KeyID the plugin returned, or - // empty if the probe could not reach Status. Hashing avoids leaking - // key material; consumers can still diff hashes across instances to - // detect rotation skew. + // KeyIDHash is the sha256 hex of the plugin's KeyID, empty when Status + // couldn't be reached. Hashing avoids leaking key material; consumers + // can still diff hashes across instances to detect rotation skew. KeyIDHash string - // Timestamp: RFC3339 on the wire. Timestamp time.Time - // ObserverPod is stable across deployment stages. ConfigMapWriter - // does not strictly need it (one CM per monitor, identity is in the - // CM name); the OpenShift CRD-condition writer does (one CR - // aggregates conditions from N pods, and identity must travel with - // each entry). + // ObserverPod identifies this monitor instance; one OperatorCondition + // CR aggregates entries from N pods, so identity must travel with + // each condition entry. ObserverPod string } diff --git a/pkg/operator/encryption/kms/health/writer.go b/pkg/operator/encryption/kms/health/writer.go deleted file mode 100644 index a14b45a5fe..0000000000 --- a/pkg/operator/encryption/kms/health/writer.go +++ /dev/null @@ -1,10 +0,0 @@ -package health - -import "context" - -// StatusWriter ships each HealthStatus produced by the Monitor. Write errors -// should be logged and tolerated by the Monitor. A missed publish is strictly -// less bad than crashing the sidecar. -type StatusWriter interface { - Write(ctx context.Context, status HealthStatus) error -} diff --git a/pkg/operator/encryption/kms/health/writer_configmap.go b/pkg/operator/encryption/kms/health/writer_configmap.go deleted file mode 100644 index 7025192e07..0000000000 --- a/pkg/operator/encryption/kms/health/writer_configmap.go +++ /dev/null @@ -1,104 +0,0 @@ -package health - -import ( - "context" - "encoding/json" - "fmt" - "time" - - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" -) - -// ConfigMapWriter uses merge-patch so a tick of class X updates data.X -// and leaves the other classes' keys alone. Consumers determine the -// "current" class by max data..timestamp; there is no separate -// pointer key. -// -// On the wire (data is map[string]string; values shown decoded): -// -// data.ok = {"timestamp":"...","observerPod":"...","keyIDHash":"..."} -// data.not-ok = {"timestamp":"...","observerPod":"...","detail":"...","keyIDHash":"..."} -// data.rpc-error = {"timestamp":"...","observerPod":"...","detail":"..."} -// -// Self-heals on miss: if the CM is absent, Write creates it. Caller -// RBAC must therefore grant create in addition to get/update/patch on -// the named CM. -// -// Concurrency contract: one ConfigMap per monitor instance. Two -// monitors writing the same CM produce last-writer-wins on every key. -// Callers MUST encode instance identity in the CM name (the cmd-layer -// default is "kms-health-${POD_NAME}"). -type ConfigMapWriter struct { - client kubernetes.Interface - namespace string - name string -} - -func NewConfigMapWriter(client kubernetes.Interface, namespace, name string) *ConfigMapWriter { - return &ConfigMapWriter{ - client: client, - namespace: namespace, - name: name, - } -} - -type classEntry struct { - Timestamp string `json:"timestamp"` - ObserverPod string `json:"observerPod"` - Detail string `json:"detail,omitempty"` - KeyIDHash string `json:"keyIDHash,omitempty"` -} - -type configMapDataPatch struct { - Data map[string]string `json:"data"` -} - -func (w *ConfigMapWriter) Write(ctx context.Context, status HealthStatus) error { - entry := classEntry{ - Timestamp: status.Timestamp.UTC().Format(time.RFC3339), - ObserverPod: status.ObserverPod, - Detail: status.Healthz.Detail, - KeyIDHash: status.KeyIDHash, - } - entryBytes, err := json.Marshal(entry) - if err != nil { - return fmt.Errorf("marshal entry for ConfigMap %s/%s: %w", w.namespace, w.name, err) - } - - data := map[string]string{string(status.Healthz.Class): string(entryBytes)} - patchBytes, err := json.Marshal(configMapDataPatch{Data: data}) - if err != nil { - return fmt.Errorf("marshal patch for ConfigMap %s/%s: %w", w.namespace, w.name, err) - } - - cms := w.client.CoreV1().ConfigMaps(w.namespace) - _, err = cms.Patch(ctx, w.name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) - if err == nil { - return nil - } - if !apierrors.IsNotFound(err) { - return fmt.Errorf("patch ConfigMap %s/%s: %w", w.namespace, w.name, err) - } - - cm := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: w.name, Namespace: w.namespace}, - Data: data, - } - _, err = cms.Create(ctx, cm, metav1.CreateOptions{}) - if err == nil { - return nil - } - if !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("create ConfigMap %s/%s: %w", w.namespace, w.name, err) - } - - // Race: another writer created the CM between our Patch and Create. - if _, err := cms.Patch(ctx, w.name, types.MergePatchType, patchBytes, metav1.PatchOptions{}); err != nil { - return fmt.Errorf("patch ConfigMap %s/%s after create race: %w", w.namespace, w.name, err) - } - return nil -} diff --git a/pkg/operator/encryption/kms/health/writer_configmap_test.go b/pkg/operator/encryption/kms/health/writer_configmap_test.go deleted file mode 100644 index a572db5c68..0000000000 --- a/pkg/operator/encryption/kms/health/writer_configmap_test.go +++ /dev/null @@ -1,244 +0,0 @@ -package health - -import ( - "context" - "encoding/json" - "testing" - "time" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" -) - -const ( - testNamespace = "kms-health-test" - testCMName = "kms-health-status" -) - -func newTestConfigMap() *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: testCMName, - Namespace: testNamespace, - }, - } -} - -// decodeEntry parses data. back into a classEntry. Comparing -// decoded structs avoids brittleness against JSON field ordering changes. -func decodeEntry(t *testing.T, raw string) classEntry { - t.Helper() - var e classEntry - if err := json.Unmarshal([]byte(raw), &e); err != nil { - t.Fatalf("decode classEntry %q: %v", raw, err) - } - return e -} - -func TestConfigMapWriter_writesHealthyEntryUnderOKKey(t *testing.T) { - client := fake.NewClientset(newTestConfigMap()) - w := NewConfigMapWriter(client, testNamespace, testCMName) - - status := HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - KeyIDHash: "abc123", - Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), - ObserverPod: "mock-kmsv2-provider-node", - } - if err := w.Write(context.Background(), status); err != nil { - t.Fatalf("Write: %v", err) - } - - got, err := client.CoreV1().ConfigMaps(testNamespace). - Get(context.Background(), testCMName, metav1.GetOptions{}) - if err != nil { - t.Fatalf("Get: %v", err) - } - - raw, ok := got.Data[string(HealthClassOK)] - if !ok { - t.Fatalf("data[%q] missing; data=%v", HealthClassOK, got.Data) - } - entry := decodeEntry(t, raw) - - want := classEntry{ - Timestamp: "2026-04-24T12:00:00Z", - ObserverPod: "mock-kmsv2-provider-node", - KeyIDHash: "abc123", - } - if entry != want { - t.Errorf("data[ok]: got %+v, want %+v", entry, want) - } - - // Other class keys must be absent: we never observed them. - for _, c := range []HealthClass{HealthClassNotOK, HealthClassRPCError} { - if _, present := got.Data[string(c)]; present { - t.Errorf("data[%q] present after only an OK probe; data=%v", c, got.Data) - } - } -} - -func TestConfigMapWriter_writesUnhealthyEntryUnderNotOKKey(t *testing.T) { - client := fake.NewClientset(newTestConfigMap()) - w := NewConfigMapWriter(client, testNamespace, testCMName) - - status := HealthStatus{ - Healthz: Healthz{Class: HealthClassNotOK, Detail: "boom"}, - Timestamp: time.Date(2026, 4, 24, 12, 1, 0, 0, time.UTC), - ObserverPod: "p", - } - if err := w.Write(context.Background(), status); err != nil { - t.Fatalf("Write: %v", err) - } - - got, _ := client.CoreV1().ConfigMaps(testNamespace). - Get(context.Background(), testCMName, metav1.GetOptions{}) - raw, ok := got.Data[string(HealthClassNotOK)] - if !ok { - t.Fatalf("data[%q] missing; data=%v", HealthClassNotOK, got.Data) - } - entry := decodeEntry(t, raw) - - want := classEntry{ - Timestamp: "2026-04-24T12:01:00Z", - ObserverPod: "p", - Detail: "boom", - } - if entry != want { - t.Errorf("data[not-ok]: got %+v, want %+v", entry, want) - } -} - -func TestConfigMapWriter_writesRPCErrorEntryUnderRPCErrorKey(t *testing.T) { - client := fake.NewClientset(newTestConfigMap()) - w := NewConfigMapWriter(client, testNamespace, testCMName) - - status := HealthStatus{ - Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}, - Timestamp: time.Date(2026, 4, 24, 12, 2, 0, 0, time.UTC), - ObserverPod: "p", - } - if err := w.Write(context.Background(), status); err != nil { - t.Fatalf("Write: %v", err) - } - - got, _ := client.CoreV1().ConfigMaps(testNamespace). - Get(context.Background(), testCMName, metav1.GetOptions{}) - raw, ok := got.Data[string(HealthClassRPCError)] - if !ok { - t.Fatalf("data[%q] missing; data=%v", HealthClassRPCError, got.Data) - } - entry := decodeEntry(t, raw) - - want := classEntry{ - Timestamp: "2026-04-24T12:02:00Z", - ObserverPod: "p", - Detail: "Unavailable", - } - if entry != want { - t.Errorf("data[rpc-error]: got %+v, want %+v", entry, want) - } -} - -// TestConfigMapWriter_tickPreservesOtherClasses is the load-bearing -// test for the schema's whole point: a probe of class X must not -// clobber data.Y / data.Z. If a future refactor swaps merge-patch for -// Update or for a strategic-merge-with-replace directive, this test -// goes red. -func TestConfigMapWriter_tickPreservesOtherClasses(t *testing.T) { - // Pre-populate data.ok with a stale healthy entry; tick rpc-error; - // assert data.ok survived byte-for-byte. - staleOK := `{"timestamp":"2026-04-24T11:00:00Z","observerPod":"p","keyIDHash":"old"}` - existing := newTestConfigMap() - existing.Data = map[string]string{ - string(HealthClassOK): staleOK, - } - client := fake.NewClientset(existing) - w := NewConfigMapWriter(client, testNamespace, testCMName) - - status := HealthStatus{ - Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}, - Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), - ObserverPod: "p", - } - if err := w.Write(context.Background(), status); err != nil { - t.Fatalf("Write: %v", err) - } - - got, _ := client.CoreV1().ConfigMaps(testNamespace). - Get(context.Background(), testCMName, metav1.GetOptions{}) - - if got.Data[string(HealthClassOK)] != staleOK { - t.Errorf("data[ok] mutated by rpc-error tick: got %q, want %q", - got.Data[string(HealthClassOK)], staleOK) - } - if _, ok := got.Data[string(HealthClassRPCError)]; !ok { - t.Errorf("data[rpc-error] not written; data=%v", got.Data) - } -} - -func TestConfigMapWriter_patchPreservesUnrelatedKeys(t *testing.T) { - // Same load-bearing property as tickPreservesOtherClasses, but for - // keys outside our schema entirely (e.g. an annotation-style key set - // by something else in the same CM). - existing := newTestConfigMap() - existing.Data = map[string]string{ - "unrelated-key": "set-by-something-else", - } - client := fake.NewClientset(existing) - w := NewConfigMapWriter(client, testNamespace, testCMName) - - if err := w.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), - ObserverPod: "p", - }); err != nil { - t.Fatalf("Write: %v", err) - } - - got, _ := client.CoreV1().ConfigMaps(testNamespace). - Get(context.Background(), testCMName, metav1.GetOptions{}) - if got.Data["unrelated-key"] != "set-by-something-else" { - t.Errorf("merge patch dropped unrelated key: got %q", got.Data["unrelated-key"]) - } - if _, ok := got.Data[string(HealthClassOK)]; !ok { - t.Errorf("data[ok] not written; data=%v", got.Data) - } -} - -func TestConfigMapWriter_createsConfigMapWhenMissing(t *testing.T) { - // Empty clientset, no ConfigMap to patch. Write self-heals by - // creating the CM seeded with this tick's class entry; subsequent - // ticks would extend it via merge-patch. - client := fake.NewClientset() - w := NewConfigMapWriter(client, testNamespace, testCMName) - - status := HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - KeyIDHash: "abc", - Timestamp: time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC), - ObserverPod: "p", - } - if err := w.Write(context.Background(), status); err != nil { - t.Fatalf("Write on missing CM: %v", err) - } - - got, err := client.CoreV1().ConfigMaps(testNamespace). - Get(context.Background(), testCMName, metav1.GetOptions{}) - if err != nil { - t.Fatalf("Get: %v", err) - } - raw, ok := got.Data[string(HealthClassOK)] - if !ok { - t.Fatalf("data[%q] missing after create; data=%v", HealthClassOK, got.Data) - } - want := classEntry{ - Timestamp: "2026-04-24T12:00:00Z", - ObserverPod: "p", - KeyIDHash: "abc", - } - if entry := decodeEntry(t, raw); entry != want { - t.Errorf("data[ok]: got %+v, want %+v", entry, want) - } -} diff --git a/pkg/operator/encryption/kms/health/writer_operator_condition_test.go b/pkg/operator/encryption/kms/health/writer_operator_condition_test.go deleted file mode 100644 index 19823675e6..0000000000 --- a/pkg/operator/encryption/kms/health/writer_operator_condition_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package health - -import ( - "context" - "testing" - "time" - - operatorv1 "github.com/openshift/api/operator/v1" - applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" - "github.com/openshift/library-go/pkg/apiserver/jsonpatch" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/cache" - clocktesting "k8s.io/utils/clock/testing" -) - -const ( - testObserverPod = "kube-apiserver-master-0" - testFieldManager = "kms-health-monitor-kube-apiserver-master-0" - testTypeAvailable = "KMSv2HealthPlugin_kube-apiserver-master-0_Available" - testTypeDegraded = "KMSv2HealthPlugin_kube-apiserver-master-0_Degraded" -) - -// recordingOperatorClient is a focused test double for OperatorConditionWriter: -// it captures every ApplyOperatorStatus call AND merges the applied conditions -// (LastTransitionTime included) into an in-memory OperatorStatus that -// subsequent GetOperatorState reads return. v1helpers.fakeOperatorClient drops -// LastTransitionTime in its merge, which would defeat the LTT tests. -// -// Merge semantics mirror real SSA per-entry ownership: each apply upserts its -// own conditions by Type, leaving conditions written by other fieldManagers -// untouched. -type recordingOperatorClient struct { - status operatorv1.OperatorStatus - applies []recordedApply -} - -type recordedApply struct { - fieldManager string - cfg *applyoperatorv1.OperatorStatusApplyConfiguration -} - -func newRecordingOperatorClient() *recordingOperatorClient { - return &recordingOperatorClient{} -} - -func (c *recordingOperatorClient) GetOperatorState() (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { - return &operatorv1.OperatorSpec{}, c.status.DeepCopy(), "0", nil -} - -func (c *recordingOperatorClient) ApplyOperatorStatus(_ context.Context, fieldManager string, cfg *applyoperatorv1.OperatorStatusApplyConfiguration) error { - c.applies = append(c.applies, recordedApply{fieldManager: fieldManager, cfg: cfg}) - for _, ac := range cfg.Conditions { - merged := operatorv1.OperatorCondition{} - if ac.Type != nil { - merged.Type = *ac.Type - } - if ac.Status != nil { - merged.Status = *ac.Status - } - if ac.Reason != nil { - merged.Reason = *ac.Reason - } - if ac.Message != nil { - merged.Message = *ac.Message - } - if ac.LastTransitionTime != nil { - merged.LastTransitionTime = *ac.LastTransitionTime - } - c.upsertCondition(merged) - } - return nil -} - -func (c *recordingOperatorClient) upsertCondition(cond operatorv1.OperatorCondition) { - for i := range c.status.Conditions { - if c.status.Conditions[i].Type == cond.Type { - c.status.Conditions[i] = cond - return - } - } - c.status.Conditions = append(c.status.Conditions, cond) -} - -// Methods OperatorConditionWriter never calls. Panic-on-call signals if a -// future change accidentally introduces a dependency. -func (c *recordingOperatorClient) Informer() cache.SharedIndexInformer { return nil } -func (c *recordingOperatorClient) GetObjectMeta() (*metav1.ObjectMeta, error) { - return &metav1.ObjectMeta{}, nil -} -func (c *recordingOperatorClient) GetOperatorStateWithQuorum(ctx context.Context) (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { - return c.GetOperatorState() -} -func (c *recordingOperatorClient) UpdateOperatorSpec(context.Context, string, *operatorv1.OperatorSpec) (*operatorv1.OperatorSpec, string, error) { - panic("UpdateOperatorSpec not used by OperatorConditionWriter") -} -func (c *recordingOperatorClient) UpdateOperatorStatus(context.Context, string, *operatorv1.OperatorStatus) (*operatorv1.OperatorStatus, error) { - panic("UpdateOperatorStatus not used by OperatorConditionWriter") -} -func (c *recordingOperatorClient) ApplyOperatorSpec(context.Context, string, *applyoperatorv1.OperatorSpecApplyConfiguration) error { - panic("ApplyOperatorSpec not used by OperatorConditionWriter") -} -func (c *recordingOperatorClient) PatchOperatorStatus(context.Context, *jsonpatch.PatchSet) error { - panic("PatchOperatorStatus not used by OperatorConditionWriter") -} - -func findCondition(conds []operatorv1.OperatorCondition, t string) *operatorv1.OperatorCondition { - for i := range conds { - if conds[i].Type == t { - return &conds[i] - } - } - return nil -} - -// newTestWriter builds an OperatorConditionWriter with an injected fake clock -// so LTT-sensitive tests can advance time deterministically. -func newTestWriter(client *recordingOperatorClient, observerPod string, fc *clocktesting.FakeClock) *OperatorConditionWriter { - w := NewOperatorConditionWriter(client, observerPod) - w.clock = fc - return w -} - -func TestOperatorConditionWriter_healthyStatusWritesAvailableTrueAndDegradedFalse(t *testing.T) { - client := newRecordingOperatorClient() - fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) - w := newTestWriter(client, testObserverPod, fc) - - err := w.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - KeyIDHash: "abc123", - ObserverPod: testObserverPod, - }) - if err != nil { - t.Fatalf("Write: %v", err) - } - - a := findCondition(client.status.Conditions, testTypeAvailable) - if a == nil { - t.Fatalf("Available condition missing; conditions=%v", client.status.Conditions) - } - if a.Status != operatorv1.ConditionTrue { - t.Errorf("Available.Status: got %q, want True", a.Status) - } - if a.Reason != reasonAsExpected { - t.Errorf("Available.Reason: got %q, want %q", a.Reason, reasonAsExpected) - } - if got, want := a.Message, "keyIDHash=abc123"; got != want { - t.Errorf("Available.Message: got %q, want %q", got, want) - } - - d := findCondition(client.status.Conditions, testTypeDegraded) - if d == nil { - t.Fatalf("Degraded condition missing; conditions=%v", client.status.Conditions) - } - if d.Status != operatorv1.ConditionFalse { - t.Errorf("Degraded.Status: got %q, want False", d.Status) - } - if d.Reason != reasonAsExpected { - t.Errorf("Degraded.Reason: got %q, want %q", d.Reason, reasonAsExpected) - } - if got, want := d.Message, "keyIDHash=abc123"; got != want { - t.Errorf("Degraded.Message: got %q, want %q", got, want) - } - - if got, want := len(client.applies), 1; got != want { - t.Fatalf("apply call count: got %d, want %d", got, want) - } - if got, want := client.applies[0].fieldManager, testFieldManager; got != want { - t.Errorf("fieldManager: got %q, want %q", got, want) - } -} - -func TestOperatorConditionWriter_notOKStatusWritesAvailableFalseAndDegradedTrue(t *testing.T) { - client := newRecordingOperatorClient() - fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) - w := newTestWriter(client, testObserverPod, fc) - - err := w.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassNotOK, Detail: "kid lookup failed"}, - KeyIDHash: "deadbeef", - ObserverPod: testObserverPod, - }) - if err != nil { - t.Fatalf("Write: %v", err) - } - - a := findCondition(client.status.Conditions, testTypeAvailable) - if a == nil { - t.Fatalf("Available missing") - } - if a.Status != operatorv1.ConditionFalse { - t.Errorf("Available.Status: got %q, want False", a.Status) - } - if a.Reason != reasonPluginUnhealthy { - t.Errorf("Available.Reason: got %q, want %q", a.Reason, reasonPluginUnhealthy) - } - want := "keyIDHash=deadbeef detail=kid lookup failed" - if a.Message != want { - t.Errorf("Available.Message: got %q, want %q", a.Message, want) - } - - d := findCondition(client.status.Conditions, testTypeDegraded) - if d == nil { - t.Fatalf("Degraded missing") - } - if d.Status != operatorv1.ConditionTrue { - t.Errorf("Degraded.Status: got %q, want True", d.Status) - } - if d.Reason != reasonHealthzNotOK { - t.Errorf("Degraded.Reason: got %q, want %q", d.Reason, reasonHealthzNotOK) - } - if d.Message != want { - t.Errorf("Degraded.Message: got %q, want %q", d.Message, want) - } -} - -func TestOperatorConditionWriter_rpcErrorStatusWritesAvailableUnknownAndDegradedTrue(t *testing.T) { - client := newRecordingOperatorClient() - fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) - w := newTestWriter(client, testObserverPod, fc) - - err := w.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}, - ObserverPod: testObserverPod, - }) - if err != nil { - t.Fatalf("Write: %v", err) - } - - a := findCondition(client.status.Conditions, testTypeAvailable) - if a == nil { - t.Fatalf("Available missing") - } - // Asymmetric: Available expresses confirmed-good. Lost contact is not - // "confirmed bad", so Unknown is the honest answer. - if a.Status != operatorv1.ConditionUnknown { - t.Errorf("Available.Status: got %q, want Unknown", a.Status) - } - if a.Reason != reasonProbeUnreachable { - t.Errorf("Available.Reason: got %q, want %q", a.Reason, reasonProbeUnreachable) - } - if got, want := a.Message, "detail=Unavailable"; got != want { - t.Errorf("Available.Message: got %q, want %q (no keyIDHash= prefix when KeyIDHash is empty)", got, want) - } - - d := findCondition(client.status.Conditions, testTypeDegraded) - if d == nil { - t.Fatalf("Degraded missing") - } - // Asymmetric: Degraded expresses persistent-wrong. Inability to reach - // the plugin IS wrong, so Degraded is True. - if d.Status != operatorv1.ConditionTrue { - t.Errorf("Degraded.Status: got %q, want True", d.Status) - } - if d.Reason != reasonRPCError { - t.Errorf("Degraded.Reason: got %q, want %q", d.Reason, reasonRPCError) - } - if got, want := d.Message, "detail=Unavailable"; got != want { - t.Errorf("Degraded.Message: got %q, want %q", got, want) - } -} - -// Repeated identical Write must not bump LastTransitionTime. If it did, -// every reconcile would churn the field and downstream consumers (rollout -// gates, alerts) would think transitions were happening when nothing has. -func TestOperatorConditionWriter_repeatedWriteSameStatusDoesNotBumpLastTransitionTime(t *testing.T) { - client := newRecordingOperatorClient() - t1 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) - fc := clocktesting.NewFakeClock(t1) - w := newTestWriter(client, testObserverPod, fc) - - healthy := HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - KeyIDHash: "abc123", - ObserverPod: testObserverPod, - } - if err := w.Write(context.Background(), healthy); err != nil { - t.Fatalf("first Write: %v", err) - } - - fc.Step(5 * time.Minute) - if err := w.Write(context.Background(), healthy); err != nil { - t.Fatalf("second Write: %v", err) - } - - for _, ct := range []string{testTypeAvailable, testTypeDegraded} { - c := findCondition(client.status.Conditions, ct) - if c == nil { - t.Fatalf("%s missing", ct) - } - if !c.LastTransitionTime.Time.Equal(t1) { - t.Errorf("%s.LastTransitionTime: got %v, want %v (Status unchanged, must carry over)", ct, c.LastTransitionTime.Time, t1) - } - } -} - -// Status flip must bump LastTransitionTime to the current time. -func TestOperatorConditionWriter_statusFlipBumpsLastTransitionTime(t *testing.T) { - client := newRecordingOperatorClient() - t1 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) - fc := clocktesting.NewFakeClock(t1) - w := newTestWriter(client, testObserverPod, fc) - - if err := w.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - KeyIDHash: "abc", - ObserverPod: testObserverPod, - }); err != nil { - t.Fatalf("first Write: %v", err) - } - - t2 := t1.Add(5 * time.Minute) - fc.SetTime(t2) - if err := w.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassNotOK, Detail: "boom"}, - KeyIDHash: "abc", - ObserverPod: testObserverPod, - }); err != nil { - t.Fatalf("second Write: %v", err) - } - - for _, ct := range []string{testTypeAvailable, testTypeDegraded} { - c := findCondition(client.status.Conditions, ct) - if c == nil { - t.Fatalf("%s missing", ct) - } - if !c.LastTransitionTime.Time.Equal(t2) { - t.Errorf("%s.LastTransitionTime: got %v, want %v (advanced on Status flip)", ct, c.LastTransitionTime.Time, t2) - } - } -} - -// Two writers with different ObserverPods write to the same CR without -// clobbering each other. This is the load-bearing property: per-pod -// fieldManager + per-entry SSA ownership keeps both pods' conditions -// coexisting in .status.conditions[]. -func TestOperatorConditionWriter_twoWritersDifferentObserverPodsDoNotCollide(t *testing.T) { - client := newRecordingOperatorClient() - fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) - - const ( - podA = "kube-apiserver-master-0" - podB = "kube-apiserver-master-1" - ) - wa := newTestWriter(client, podA, fc) - wb := newTestWriter(client, podB, fc) - - if err := wa.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassOK}, - KeyIDHash: "a", - ObserverPod: podA, - }); err != nil { - t.Fatalf("wa.Write: %v", err) - } - if err := wb.Write(context.Background(), HealthStatus{ - Healthz: Healthz{Class: HealthClassNotOK, Detail: "bad"}, - KeyIDHash: "b", - ObserverPod: podB, - }); err != nil { - t.Fatalf("wb.Write: %v", err) - } - - wantTypes := []string{ - "KMSv2HealthPlugin_kube-apiserver-master-0_Available", - "KMSv2HealthPlugin_kube-apiserver-master-0_Degraded", - "KMSv2HealthPlugin_kube-apiserver-master-1_Available", - "KMSv2HealthPlugin_kube-apiserver-master-1_Degraded", - } - for _, ct := range wantTypes { - if findCondition(client.status.Conditions, ct) == nil { - t.Errorf("condition %q missing; conditions=%v", ct, client.status.Conditions) - } - } - if got, want := len(client.status.Conditions), 4; got != want { - t.Errorf("conditions count: got %d, want %d (no overwrites)", got, want) - } - - // Spot-check that A's OK status survived B's apply. - availA := findCondition(client.status.Conditions, "KMSv2HealthPlugin_kube-apiserver-master-0_Available") - if availA == nil || availA.Status != operatorv1.ConditionTrue { - t.Errorf("podA Available: got %v, want Status=True (writer B clobbered it)", availA) - } - availB := findCondition(client.status.Conditions, "KMSv2HealthPlugin_kube-apiserver-master-1_Available") - if availB == nil || availB.Status != operatorv1.ConditionFalse { - t.Errorf("podB Available: got %v, want Status=False (NotOK)", availB) - } - - // Per-pod fieldManager: each writer used its own. - if got, want := len(client.applies), 2; got != want { - t.Fatalf("apply count: got %d, want %d", got, want) - } - if got, want := client.applies[0].fieldManager, "kms-health-monitor-"+podA; got != want { - t.Errorf("apply[0].fieldManager: got %q, want %q", got, want) - } - if got, want := client.applies[1].fieldManager, "kms-health-monitor-"+podB; got != want { - t.Errorf("apply[1].fieldManager: got %q, want %q", got, want) - } -} From e33de33e99a926b95e8e64cb621f23bf73e043da Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Mon, 11 May 2026 14:34:24 +0200 Subject: [PATCH 5/8] kms/health: drop cmd flag-validation tests --- .../encryption/kms/health/cmd_test.go | 100 ------------------ 1 file changed, 100 deletions(-) delete mode 100644 pkg/operator/encryption/kms/health/cmd_test.go diff --git a/pkg/operator/encryption/kms/health/cmd_test.go b/pkg/operator/encryption/kms/health/cmd_test.go deleted file mode 100644 index 080a8ef481..0000000000 --- a/pkg/operator/encryption/kms/health/cmd_test.go +++ /dev/null @@ -1,100 +0,0 @@ -package health - -import ( - "strings" - "testing" - "time" - - "github.com/spf13/pflag" -) - -func validOpts() commandOptions { - return commandOptions{ - kmsSocket: "/var/run/kms.sock", - probeInterval: 60 * time.Second, - probeTimeout: 3 * time.Second, - writeTimeout: 5 * time.Second, - operatorResource: "kubeapiserver", - observerPodName: "master-0", - } -} - -func TestValidate_acceptsValidOptions(t *testing.T) { - o := validOpts() - if err := o.validate(); err != nil { - t.Fatalf("validate(valid): %v", err) - } -} - -func TestValidate_rejectsMissingOrZeroFields(t *testing.T) { - cases := []struct { - name string - mut func(*commandOptions) - wants string - }{ - {"empty kmsSocket", func(o *commandOptions) { o.kmsSocket = "" }, "--kms-socket"}, - {"zero probeInterval", func(o *commandOptions) { o.probeInterval = 0 }, "--probe-interval"}, - {"zero probeTimeout", func(o *commandOptions) { o.probeTimeout = 0 }, "--probe-timeout"}, - {"zero writeTimeout", func(o *commandOptions) { o.writeTimeout = 0 }, "--write-timeout"}, - {"empty observerPodName", func(o *commandOptions) { o.observerPodName = "" }, "--observer-pod-name"}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - o := validOpts() - tc.mut(&o) - err := o.validate() - if err == nil { - t.Fatalf("validate: want error containing %q, got nil", tc.wants) - } - if !strings.Contains(err.Error(), tc.wants) { - t.Errorf("validate: got %q, want substring %q", err.Error(), tc.wants) - } - }) - } -} - -func TestValidate_operatorResourceEnum(t *testing.T) { - cases := []struct { - name string - resource string - wantErr bool - }{ - {"kubeapiserver accepted", "kubeapiserver", false}, - {"authentication accepted", "authentication", false}, - {"openshiftapiserver accepted", "openshiftapiserver", false}, - {"unknown rejected", "bogus", true}, - {"empty rejected", "", true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - o := validOpts() - o.operatorResource = tc.resource - err := o.validate() - if tc.wantErr { - if err == nil { - t.Fatalf("validate(%q): want error, got nil", tc.resource) - } - if !strings.Contains(err.Error(), "--operator-resource") { - t.Errorf("validate(%q): got %q, want substring --operator-resource", tc.resource, err.Error()) - } - return - } - if err != nil { - t.Errorf("validate(%q): got %v, want nil", tc.resource, err) - } - }) - } -} - -func TestAddFlags_observerPodNameDefaultsFromPodNameEnv(t *testing.T) { - t.Setenv("POD_NAME", "from-env-0") - o := commandOptions{} - fs := pflag.NewFlagSet("test", pflag.ContinueOnError) - o.addFlags(fs) - if err := fs.Parse(nil); err != nil { - t.Fatalf("parse: %v", err) - } - if o.observerPodName != "from-env-0" { - t.Errorf("observerPodName: got %q, want %q (env-derived flag default)", o.observerPodName, "from-env-0") - } -} From 24159d6975fd1551bb617c43493fa51ddb3f8c7d Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Tue, 26 May 2026 12:49:46 +0200 Subject: [PATCH 6/8] wip: sync to other device --- pkg/operator/encryption/kms/health/checker.go | 48 ++++ pkg/operator/encryption/kms/health/cmd.go | 138 +++------- .../encryption/kms/health/condition.go | 142 ---------- .../encryption/kms/health/condition_test.go | 243 ------------------ pkg/operator/encryption/kms/health/monitor.go | 50 ---- .../encryption/kms/health/monitor_test.go | 64 ----- pkg/operator/encryption/kms/health/probe.go | 87 ------- .../encryption/kms/health/probe_test.go | 128 --------- pkg/operator/encryption/kms/health/types.go | 58 +++-- pkg/operator/encryption/kms/health/writer.go | 35 +++ 10 files changed, 151 insertions(+), 842 deletions(-) create mode 100644 pkg/operator/encryption/kms/health/checker.go delete mode 100644 pkg/operator/encryption/kms/health/condition.go delete mode 100644 pkg/operator/encryption/kms/health/condition_test.go delete mode 100644 pkg/operator/encryption/kms/health/monitor.go delete mode 100644 pkg/operator/encryption/kms/health/monitor_test.go delete mode 100644 pkg/operator/encryption/kms/health/probe.go delete mode 100644 pkg/operator/encryption/kms/health/probe_test.go create mode 100644 pkg/operator/encryption/kms/health/writer.go diff --git a/pkg/operator/encryption/kms/health/checker.go b/pkg/operator/encryption/kms/health/checker.go new file mode 100644 index 0000000000..5f3a165b8d --- /dev/null +++ b/pkg/operator/encryption/kms/health/checker.go @@ -0,0 +1,48 @@ +package health + +import ( + "context" + "fmt" + "time" + + k8senvelopekmsv2 "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2" + kmsservice "k8s.io/kms/pkg/service" +) + +type Checker struct { + services []kmsservice.Service + now func() time.Time +} + +func NewChecker(ctx context.Context, udsPaths []string, timeout time.Duration) (*Checker, error) { + c := Checker{ + services: make([]kmsservice.Service, 0, len(udsPaths)), + now: time.Now, + } + + for _, socket := range udsPaths { + service, err := k8senvelopekmsv2.NewGRPCService( + ctx, + "unix://"+socket, + providerName, + timeout, + ) + if err != nil { + return nil, fmt.Errorf("dial KMS plugin at %q: %w", socket, err) + } + + c.services = append(c.services, service) + } + + return &c, nil +} + +func (c *Checker) CheckStatus(ctx context.Context) error { + for _, service := range c.services { + _, err := service.Status(ctx) + if err != nil { + return fmt.Errorf("check kms plugin status: %w", err) + } + } + return nil +} diff --git a/pkg/operator/encryption/kms/health/cmd.go b/pkg/operator/encryption/kms/health/cmd.go index a0ef8b2454..261926869d 100644 --- a/pkg/operator/encryption/kms/health/cmd.go +++ b/pkg/operator/encryption/kms/health/cmd.go @@ -3,64 +3,34 @@ package health import ( "context" "fmt" - "os" - "sort" "strings" "time" "github.com/spf13/cobra" "github.com/spf13/pflag" - applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" - "github.com/openshift/library-go/pkg/operator/genericoperatorclient" - - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - k8senvelopekmsv2 "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" - "k8s.io/utils/clock" ) const providerName = "kms-health-monitor" -var supportedOperators = map[string]struct { - GVR schema.GroupVersionResource - GVK schema.GroupVersionKind -}{ - "kubeapiserver": { - GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "kubeapiservers"}, - GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "KubeAPIServer"}, - }, - "authentication": { - GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "authentications"}, - GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "Authentication"}, - }, - "openshiftapiserver": { - GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "openshiftapiservers"}, - GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "OpenShiftAPIServer"}, - }, -} - type commandOptions struct { - kmsSocket string - probeInterval time.Duration - probeTimeout time.Duration - writeTimeout time.Duration - operatorResource string - observerPodName string - kubeconfig string + kmsSockets []string + readInterval time.Duration + readTimeout time.Duration + writeTimeout time.Duration + targetOperator string + nodeName string + kubeconfig string } -// NewCommand wires the cobra command. ctx is owned by the caller so -// signal handling lives there, not here. func NewCommand(ctx context.Context) *cobra.Command { o := &commandOptions{ - kmsSocket: "/var/run/kmsplugin/kms.sock", - probeInterval: 60 * time.Second, - probeTimeout: 3 * time.Second, - writeTimeout: 5 * time.Second, + readInterval: 30 * time.Second, + readTimeout: 5 * time.Second, + writeTimeout: 10 * time.Second, } cmd := &cobra.Command{ Use: "kms-health-monitor", @@ -77,36 +47,34 @@ func NewCommand(ctx context.Context) *cobra.Command { } func (o *commandOptions) addFlags(fs *pflag.FlagSet) { - fs.StringVar(&o.kmsSocket, "kms-socket", o.kmsSocket, "filesystem path to the KMSv2 plugin UDS") - fs.DurationVar(&o.probeInterval, "probe-interval", o.probeInterval, "cadence between probes") - fs.DurationVar(&o.probeTimeout, "probe-timeout", o.probeTimeout, "deadline for each Status RPC") - fs.DurationVar(&o.writeTimeout, "write-timeout", o.writeTimeout, "deadline for each condition update; should fit inside --probe-interval") - fs.StringVar(&o.operatorResource, "operator-resource", o.operatorResource, - "target operator CRD: "+strings.Join(supportedOperatorKeys(), ", ")) - fs.StringVar(&o.observerPodName, "observer-pod-name", os.Getenv("POD_NAME"), - "pod name recorded in the condition; defaults to $POD_NAME") + fs.StringSliceVar(&o.kmsSockets, "kms-sockets", nil, "filesystem paths to the KMSv2 plugin UDS") + fs.DurationVar(&o.readInterval, "read-interval", o.readInterval, "cadence between checks") + fs.DurationVar(&o.readTimeout, "read-timeout", o.readTimeout, "deadline for each Status RPC") + fs.DurationVar(&o.writeTimeout, "write-timeout", o.writeTimeout, "deadline for each condition update; should fit inside --read-interval") + fs.StringVar(&o.targetOperator, "target-operator", "", "target operator CRD: "+strings.Join(supportedOperatorKeys(), ", ")) + fs.StringVar(&o.nodeName, "node-name", "", "node name recorded in the condition used to help to identify the origin") fs.StringVar(&o.kubeconfig, "kubeconfig", "", "path to a kubeconfig; empty uses in-cluster config") } func (o *commandOptions) validate() error { - if o.kmsSocket == "" { - return fmt.Errorf("--kms-socket is required") + if len(o.kmsSockets) == 0 { + return fmt.Errorf("--kms-sockets is required, at least one") } - if o.probeInterval <= 0 { - return fmt.Errorf("--probe-interval must be positive") + if o.readInterval <= 0 { + return fmt.Errorf("--read-interval must be positive") } - if o.probeTimeout <= 0 { - return fmt.Errorf("--probe-timeout must be positive") + if o.readTimeout <= 0 { + return fmt.Errorf("--read-timeout must be positive") } if o.writeTimeout <= 0 { return fmt.Errorf("--write-timeout must be positive") } - if o.observerPodName == "" { - return fmt.Errorf("--observer-pod-name is required (or set $POD_NAME)") + if o.nodeName == "" { + return fmt.Errorf("--node-name is required") } - if _, ok := supportedOperators[o.operatorResource]; !ok { - return fmt.Errorf("--operator-resource must be one of %s (got %q)", - strings.Join(supportedOperatorKeys(), ", "), o.operatorResource) + if _, ok := supportedOperators[TargetOperator(o.targetOperator)]; !ok { + return fmt.Errorf("--target-operator must be one of %s (got %q)", + strings.Join(supportedOperatorKeys(), ", "), o.targetOperator) } return nil } @@ -116,39 +84,25 @@ func (o *commandOptions) run(ctx context.Context) error { if err != nil { return fmt.Errorf("build rest config: %w", err) } - - target := supportedOperators[o.operatorResource] - operatorClient, _, err := genericoperatorclient.NewClusterScopedOperatorClient( - clock.RealClock{}, cfg, target.GVR, target.GVK, - emptyOperatorSpec, emptyOperatorStatus, - ) + _, err = buildWriter(cfg, TargetOperator(o.targetOperator)) if err != nil { - return fmt.Errorf("build operator client for %s: %w", o.operatorResource, err) + return fmt.Errorf("create new writer: %w", err) } - writer := NewOperatorConditionWriter(operatorClient, o.observerPodName) - // kmsv2.NewGRPCService sets WaitForReady(true), so dial returns - // immediately even if the plugin socket isn't listening yet. - endpoint := "unix://" + o.kmsSocket - service, err := k8senvelopekmsv2.NewGRPCService(ctx, endpoint, providerName, o.probeTimeout) + _, err = NewChecker(ctx, o.kmsSockets, o.readTimeout) if err != nil { - return fmt.Errorf("dial KMS plugin at %q: %w", endpoint, err) + return fmt.Errorf("create new checker: %w", err) } - probe := NewProbe(service, 0) - - monitor := NewMonitor(probe, writer, o.observerPodName, o.probeInterval, o.writeTimeout) klog.InfoS("kms-health-monitor starting", - "socket", o.kmsSocket, - "operatorResource", o.operatorResource, - "observerPod", o.observerPodName, - "interval", o.probeInterval, - "probeTimeout", o.probeTimeout, + "socket", o.kmsSockets, + "targetOperator", o.targetOperator, + "observerNode", o.nodeName, + "interval", o.readInterval, + "readTimeout", o.readTimeout, "writeTimeout", o.writeTimeout, ) - monitor.Run(ctx) - klog.Info("kms-health-monitor stopping") return nil } @@ -158,23 +112,3 @@ func buildRESTConfig(kubeconfig string) (*rest.Config, error) { } return rest.InClusterConfig() } - -func supportedOperatorKeys() []string { - keys := make([]string, 0, len(supportedOperators)) - for k := range supportedOperators { - keys = append(keys, k) - } - sort.Strings(keys) - return keys -} - -// Empty extractors are sufficient: OperatorConditionWriter reads prior -// conditions via GetOperatorState (which does not use these) and writes -// via SSA with its own fieldManager. -func emptyOperatorSpec(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorSpecApplyConfiguration, error) { - return applyoperatorv1.OperatorSpec(), nil -} - -func emptyOperatorStatus(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorStatusApplyConfiguration, error) { - return applyoperatorv1.OperatorStatus(), nil -} diff --git a/pkg/operator/encryption/kms/health/condition.go b/pkg/operator/encryption/kms/health/condition.go deleted file mode 100644 index 746f636cde..0000000000 --- a/pkg/operator/encryption/kms/health/condition.go +++ /dev/null @@ -1,142 +0,0 @@ -package health - -import ( - "context" - "fmt" - "time" - - operatorv1 "github.com/openshift/api/operator/v1" - applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" - operatorv1helpers "github.com/openshift/library-go/pkg/operator/v1helpers" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/utils/clock" -) - -// Per-pod condition Types: KMSv2HealthPlugin__Available and -// KMSv2HealthPlugin__Degraded. Pod-name hyphens pass through -// verbatim; library-go UnionCondition handles them. -const ( - conditionTypePrefix = "KMSv2HealthPlugin_" - conditionTypeAvailableSuffix = "_Available" - conditionTypeDegradedSuffix = "_Degraded" - - reasonAsExpected = "AsExpected" - reasonPluginUnhealthy = "PluginUnhealthy" - reasonProbeUnreachable = "ProbeUnreachable" - reasonHealthzNotOK = "HealthzNotOK" - reasonRPCError = "RPCError" - - // fieldManagerPrefix isolates each pod's two condition entries via SSA - // per-entry ownership (Conditions is listType=map keyed on Type). - fieldManagerPrefix = "kms-health-monitor-" -) - -type OperatorConditionWriter struct { - client operatorv1helpers.OperatorClient - observerPod string - fieldManager string - clock clock.Clock -} - -func NewOperatorConditionWriter(client operatorv1helpers.OperatorClient, observerPod string) *OperatorConditionWriter { - return &OperatorConditionWriter{ - client: client, - observerPod: observerPod, - fieldManager: fieldManagerPrefix + observerPod, - clock: clock.RealClock{}, - } -} - -type conditionDecision struct { - status operatorv1.ConditionStatus - reason string -} - -// Asymmetric: Available expresses confirmed-good, so RPCError ("we don't -// know") maps to Unknown; Degraded expresses persistent-wrong, and an -// unreachable plugin IS wrong, so Degraded is True. -func mapHealthClass(class HealthClass) (avail, degraded conditionDecision) { - switch class { - case HealthClassOK: - return conditionDecision{operatorv1.ConditionTrue, reasonAsExpected}, - conditionDecision{operatorv1.ConditionFalse, reasonAsExpected} - case HealthClassNotOK: - return conditionDecision{operatorv1.ConditionFalse, reasonPluginUnhealthy}, - conditionDecision{operatorv1.ConditionTrue, reasonHealthzNotOK} - case HealthClassRPCError: - return conditionDecision{operatorv1.ConditionUnknown, reasonProbeUnreachable}, - conditionDecision{operatorv1.ConditionTrue, reasonRPCError} - } - // Defensive default for an unknown class. HealthClass is a closed set, - // so this is unreachable today, but a future class would otherwise - // crash a sidecar. - return conditionDecision{operatorv1.ConditionUnknown, reasonProbeUnreachable}, - conditionDecision{operatorv1.ConditionTrue, reasonRPCError} -} - -func buildMessage(status HealthStatus) string { - if status.Healthz.IsOK() { - return "keyIDHash=" + status.KeyIDHash - } - if status.KeyIDHash != "" { - return fmt.Sprintf("keyIDHash=%s detail=%s", status.KeyIDHash, status.Healthz.Detail) - } - return "detail=" + status.Healthz.Detail -} - -// Carry over the prior LastTransitionTime when Status is unchanged so -// consumers don't see field churn on every reconcile. Mirrors -// v1helpers.SetOperatorCondition for the SSA path. -func resolveLastTransitionTime(prior []operatorv1.OperatorCondition, conditionType string, newStatus operatorv1.ConditionStatus, now time.Time) metav1.Time { - for i := range prior { - if prior[i].Type != conditionType { - continue - } - if prior[i].Status == newStatus { - return prior[i].LastTransitionTime - } - break - } - return metav1.NewTime(now) -} - -func (w *OperatorConditionWriter) Write(ctx context.Context, status HealthStatus) error { - availType := conditionTypePrefix + w.observerPod + conditionTypeAvailableSuffix - degradedType := conditionTypePrefix + w.observerPod + conditionTypeDegradedSuffix - - avail, degraded := mapHealthClass(status.Healthz.Class) - msg := buildMessage(status) - - // Read failure is non-fatal: worst case LTT=now on an unchanged Status, - // self-correcting next tick. Skipping the publish would punish transient - // apiserver glitches harder than the contract asks for. - var priorConditions []operatorv1.OperatorCondition - if _, opStatus, _, err := w.client.GetOperatorState(); err == nil && opStatus != nil { - priorConditions = opStatus.Conditions - } - - now := w.clock.Now() - availLTT := resolveLastTransitionTime(priorConditions, availType, avail.status, now) - degradedLTT := resolveLastTransitionTime(priorConditions, degradedType, degraded.status, now) - - cfg := applyoperatorv1.OperatorStatus().WithConditions( - applyoperatorv1.OperatorCondition(). - WithType(availType). - WithStatus(avail.status). - WithReason(avail.reason). - WithMessage(msg). - WithLastTransitionTime(availLTT), - applyoperatorv1.OperatorCondition(). - WithType(degradedType). - WithStatus(degraded.status). - WithReason(degraded.reason). - WithMessage(msg). - WithLastTransitionTime(degradedLTT), - ) - - if err := w.client.ApplyOperatorStatus(ctx, w.fieldManager, cfg); err != nil { - return fmt.Errorf("apply operator-condition status (fieldManager=%s): %w", w.fieldManager, err) - } - return nil -} diff --git a/pkg/operator/encryption/kms/health/condition_test.go b/pkg/operator/encryption/kms/health/condition_test.go deleted file mode 100644 index c0698d5ac6..0000000000 --- a/pkg/operator/encryption/kms/health/condition_test.go +++ /dev/null @@ -1,243 +0,0 @@ -package health - -import ( - "context" - "testing" - "time" - - operatorv1 "github.com/openshift/api/operator/v1" - applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" - "github.com/openshift/library-go/pkg/apiserver/jsonpatch" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/cache" - clocktesting "k8s.io/utils/clock/testing" -) - -const ( - testObserverPod = "kube-apiserver-master-0" - testFieldManager = "kms-health-monitor-kube-apiserver-master-0" - testTypeAvailable = "KMSv2HealthPlugin_kube-apiserver-master-0_Available" - testTypeDegraded = "KMSv2HealthPlugin_kube-apiserver-master-0_Degraded" -) - -// recordingOperatorClient merges applied conditions (LastTransitionTime -// included) into the in-memory status so GetOperatorState sees prior -// state. v1helpers.fakeOperatorClient drops LTT in its merge, which -// would defeat the LTT carry-over tests. -type recordingOperatorClient struct { - status operatorv1.OperatorStatus - applies []recordedApply -} - -type recordedApply struct { - fieldManager string - cfg *applyoperatorv1.OperatorStatusApplyConfiguration -} - -func (c *recordingOperatorClient) GetOperatorState() (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { - return &operatorv1.OperatorSpec{}, c.status.DeepCopy(), "0", nil -} - -func (c *recordingOperatorClient) ApplyOperatorStatus(_ context.Context, fieldManager string, cfg *applyoperatorv1.OperatorStatusApplyConfiguration) error { - c.applies = append(c.applies, recordedApply{fieldManager: fieldManager, cfg: cfg}) - for _, ac := range cfg.Conditions { - merged := operatorv1.OperatorCondition{} - if ac.Type != nil { - merged.Type = *ac.Type - } - if ac.Status != nil { - merged.Status = *ac.Status - } - if ac.Reason != nil { - merged.Reason = *ac.Reason - } - if ac.Message != nil { - merged.Message = *ac.Message - } - if ac.LastTransitionTime != nil { - merged.LastTransitionTime = *ac.LastTransitionTime - } - c.upsertCondition(merged) - } - return nil -} - -func (c *recordingOperatorClient) upsertCondition(cond operatorv1.OperatorCondition) { - for i := range c.status.Conditions { - if c.status.Conditions[i].Type == cond.Type { - c.status.Conditions[i] = cond - return - } - } - c.status.Conditions = append(c.status.Conditions, cond) -} - -func (c *recordingOperatorClient) Informer() cache.SharedIndexInformer { return nil } -func (c *recordingOperatorClient) GetObjectMeta() (*metav1.ObjectMeta, error) { - return &metav1.ObjectMeta{}, nil -} -func (c *recordingOperatorClient) GetOperatorStateWithQuorum(ctx context.Context) (*operatorv1.OperatorSpec, *operatorv1.OperatorStatus, string, error) { - return c.GetOperatorState() -} -func (c *recordingOperatorClient) UpdateOperatorSpec(context.Context, string, *operatorv1.OperatorSpec) (*operatorv1.OperatorSpec, string, error) { - panic("not used") -} -func (c *recordingOperatorClient) UpdateOperatorStatus(context.Context, string, *operatorv1.OperatorStatus) (*operatorv1.OperatorStatus, error) { - panic("not used") -} -func (c *recordingOperatorClient) ApplyOperatorSpec(context.Context, string, *applyoperatorv1.OperatorSpecApplyConfiguration) error { - panic("not used") -} -func (c *recordingOperatorClient) PatchOperatorStatus(context.Context, *jsonpatch.PatchSet) error { - panic("not used") -} - -func findCondition(conds []operatorv1.OperatorCondition, t string) *operatorv1.OperatorCondition { - for i := range conds { - if conds[i].Type == t { - return &conds[i] - } - } - return nil -} - -func newTestWriter(client *recordingOperatorClient, observerPod string, fc *clocktesting.FakeClock) *OperatorConditionWriter { - w := NewOperatorConditionWriter(client, observerPod) - w.clock = fc - return w -} - -func TestOperatorConditionWriter_classMapping(t *testing.T) { - cases := []struct { - name string - status HealthStatus - availStatus operatorv1.ConditionStatus - availReason string - degrStatus operatorv1.ConditionStatus - degrReason string - wantMessage string - }{ - { - name: "OK", - status: HealthStatus{Healthz: Healthz{Class: HealthClassOK}, KeyIDHash: "abc123"}, - availStatus: operatorv1.ConditionTrue, availReason: reasonAsExpected, - degrStatus: operatorv1.ConditionFalse, degrReason: reasonAsExpected, - wantMessage: "keyIDHash=abc123", - }, - { - name: "NotOK", - status: HealthStatus{Healthz: Healthz{Class: HealthClassNotOK, Detail: "kid lookup failed"}, KeyIDHash: "deadbeef"}, - availStatus: operatorv1.ConditionFalse, availReason: reasonPluginUnhealthy, - degrStatus: operatorv1.ConditionTrue, degrReason: reasonHealthzNotOK, - wantMessage: "keyIDHash=deadbeef detail=kid lookup failed", - }, - { - // Asymmetric: Available expresses confirmed-good (Unknown when we - // don't know), Degraded expresses persistent-wrong (True since - // unreachable IS wrong). - name: "RPCError", - status: HealthStatus{Healthz: Healthz{Class: HealthClassRPCError, Detail: "Unavailable"}}, - availStatus: operatorv1.ConditionUnknown, availReason: reasonProbeUnreachable, - degrStatus: operatorv1.ConditionTrue, degrReason: reasonRPCError, - wantMessage: "detail=Unavailable", - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - client := &recordingOperatorClient{} - fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) - w := newTestWriter(client, testObserverPod, fc) - - tc.status.ObserverPod = testObserverPod - if err := w.Write(context.Background(), tc.status); err != nil { - t.Fatalf("Write: %v", err) - } - - a := findCondition(client.status.Conditions, testTypeAvailable) - if a == nil { - t.Fatalf("Available missing") - } - if a.Status != tc.availStatus || a.Reason != tc.availReason || a.Message != tc.wantMessage { - t.Errorf("Available: got %+v, want status=%s reason=%s message=%q", a, tc.availStatus, tc.availReason, tc.wantMessage) - } - d := findCondition(client.status.Conditions, testTypeDegraded) - if d == nil { - t.Fatalf("Degraded missing") - } - if d.Status != tc.degrStatus || d.Reason != tc.degrReason || d.Message != tc.wantMessage { - t.Errorf("Degraded: got %+v, want status=%s reason=%s message=%q", d, tc.degrStatus, tc.degrReason, tc.wantMessage) - } - if client.applies[0].fieldManager != testFieldManager { - t.Errorf("fieldManager: got %q, want %q", client.applies[0].fieldManager, testFieldManager) - } - }) - } -} - -// Status-unchanged reconciles must carry over LTT; otherwise downstream -// rollout gates and alerts would think a transition happened on every tick. -func TestOperatorConditionWriter_repeatedSameStatusCarriesOverLTT(t *testing.T) { - client := &recordingOperatorClient{} - t1 := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC) - fc := clocktesting.NewFakeClock(t1) - w := newTestWriter(client, testObserverPod, fc) - - healthy := HealthStatus{Healthz: Healthz{Class: HealthClassOK}, KeyIDHash: "abc123", ObserverPod: testObserverPod} - if err := w.Write(context.Background(), healthy); err != nil { - t.Fatalf("first Write: %v", err) - } - fc.Step(5 * time.Minute) - if err := w.Write(context.Background(), healthy); err != nil { - t.Fatalf("second Write: %v", err) - } - - for _, ct := range []string{testTypeAvailable, testTypeDegraded} { - c := findCondition(client.status.Conditions, ct) - if c == nil { - t.Fatalf("%s missing", ct) - } - if !c.LastTransitionTime.Time.Equal(t1) { - t.Errorf("%s.LastTransitionTime: got %v, want %v (unchanged status must carry over)", ct, c.LastTransitionTime.Time, t1) - } - } -} - -// Per-pod fieldManager + per-entry SSA ownership: two writers with -// different ObserverPods coexist in the same CR without clobbering. -func TestOperatorConditionWriter_twoObserverPodsCoexist(t *testing.T) { - client := &recordingOperatorClient{} - fc := clocktesting.NewFakeClock(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) - - const ( - podA = "kube-apiserver-master-0" - podB = "kube-apiserver-master-1" - ) - wa := newTestWriter(client, podA, fc) - wb := newTestWriter(client, podB, fc) - - if err := wa.Write(context.Background(), HealthStatus{Healthz: Healthz{Class: HealthClassOK}, KeyIDHash: "a", ObserverPod: podA}); err != nil { - t.Fatalf("wa.Write: %v", err) - } - if err := wb.Write(context.Background(), HealthStatus{Healthz: Healthz{Class: HealthClassNotOK, Detail: "bad"}, KeyIDHash: "b", ObserverPod: podB}); err != nil { - t.Fatalf("wb.Write: %v", err) - } - - if got, want := len(client.status.Conditions), 4; got != want { - t.Errorf("conditions count: got %d, want %d", got, want) - } - availA := findCondition(client.status.Conditions, "KMSv2HealthPlugin_"+podA+"_Available") - if availA == nil || availA.Status != operatorv1.ConditionTrue { - t.Errorf("podA Available: got %v, want Status=True", availA) - } - availB := findCondition(client.status.Conditions, "KMSv2HealthPlugin_"+podB+"_Available") - if availB == nil || availB.Status != operatorv1.ConditionFalse { - t.Errorf("podB Available: got %v, want Status=False", availB) - } - if client.applies[0].fieldManager != "kms-health-monitor-"+podA { - t.Errorf("apply[0].fieldManager: got %q, want kms-health-monitor-%s", client.applies[0].fieldManager, podA) - } - if client.applies[1].fieldManager != "kms-health-monitor-"+podB { - t.Errorf("apply[1].fieldManager: got %q, want kms-health-monitor-%s", client.applies[1].fieldManager, podB) - } -} diff --git a/pkg/operator/encryption/kms/health/monitor.go b/pkg/operator/encryption/kms/health/monitor.go deleted file mode 100644 index 4ad3c418d3..0000000000 --- a/pkg/operator/encryption/kms/health/monitor.go +++ /dev/null @@ -1,50 +0,0 @@ -package health - -import ( - "context" - "time" - - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/klog/v2" -) - -type Monitor struct { - probe *Probe - writer *OperatorConditionWriter - observerPod string - interval time.Duration - writeTimeout time.Duration -} - -func NewMonitor( - probe *Probe, - writer *OperatorConditionWriter, - observerPod string, - interval, writeTimeout time.Duration, -) *Monitor { - return &Monitor{ - probe: probe, - writer: writer, - observerPod: observerPod, - interval: interval, - writeTimeout: writeTimeout, - } -} - -// Run blocks until ctx is cancelled. wait.UntilWithContext recovers panics -// inside tick so a misbehaving plugin or apiserver flake cannot crash the -// sidecar. -func (m *Monitor) Run(ctx context.Context) { - wait.UntilWithContext(ctx, m.tick, m.interval) -} - -func (m *Monitor) tick(ctx context.Context) { - status := m.probe.Probe(ctx) - status.ObserverPod = m.observerPod - - writeCtx, cancel := context.WithTimeout(ctx, m.writeTimeout) - defer cancel() - if err := m.writer.Write(writeCtx, status); err != nil { - klog.ErrorS(err, "kms-health: writer failed; continuing") - } -} diff --git a/pkg/operator/encryption/kms/health/monitor_test.go b/pkg/operator/encryption/kms/health/monitor_test.go deleted file mode 100644 index 30913e9257..0000000000 --- a/pkg/operator/encryption/kms/health/monitor_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package health - -import ( - "context" - "errors" - "testing" - "time" - - operatorv1 "github.com/openshift/api/operator/v1" - "github.com/openshift/library-go/pkg/operator/v1helpers" - - kmsservice "k8s.io/kms/pkg/service" -) - -type stubKMSService struct { - resp *kmsservice.StatusResponse - err error -} - -func (s *stubKMSService) Status(ctx context.Context) (*kmsservice.StatusResponse, error) { - return s.resp, s.err -} -func (s *stubKMSService) Encrypt(ctx context.Context, uid string, plaintext []byte) (*kmsservice.EncryptResponse, error) { - return nil, errors.New("not used") -} -func (s *stubKMSService) Decrypt(ctx context.Context, uid string, req *kmsservice.DecryptRequest) ([]byte, error) { - return nil, errors.New("not used") -} - -func newTestMonitor(interval time.Duration) (*Monitor, v1helpers.OperatorClient) { - svc := &stubKMSService{resp: &kmsservice.StatusResponse{Healthz: "ok", KeyID: "k1"}} - client := v1helpers.NewFakeOperatorClient(&operatorv1.OperatorSpec{}, &operatorv1.OperatorStatus{}, nil) - writer := NewOperatorConditionWriter(client, "test-pod") - return NewMonitor(NewProbe(svc, time.Second), writer, "test-pod", interval, time.Second), client -} - -func TestMonitor_tickProbesAndWrites(t *testing.T) { - m, client := newTestMonitor(time.Second) - m.tick(context.Background()) - - _, status, _, _ := client.GetOperatorState() - if len(status.Conditions) == 0 { - t.Fatal("no conditions written after tick") - } -} - -func TestMonitor_runStopsOnContextCancel(t *testing.T) { - m, _ := newTestMonitor(10 * time.Millisecond) - - ctx, cancel := context.WithCancel(context.Background()) - done := make(chan struct{}) - go func() { - m.Run(ctx) - close(done) - }() - - time.Sleep(50 * time.Millisecond) - cancel() - select { - case <-done: - case <-time.After(2 * time.Second): - t.Fatal("Monitor did not stop within 2s of context cancel") - } -} diff --git a/pkg/operator/encryption/kms/health/probe.go b/pkg/operator/encryption/kms/health/probe.go deleted file mode 100644 index b7f963d9f7..0000000000 --- a/pkg/operator/encryption/kms/health/probe.go +++ /dev/null @@ -1,87 +0,0 @@ -package health - -import ( - "context" - "crypto/sha256" - "encoding/hex" - "strings" - "time" - - "google.golang.org/grpc/status" - kmsservice "k8s.io/kms/pkg/service" -) - -// healthzMaxBodyLen caps the plugin's Healthz body so a misbehaving -// plugin cannot push multi-MB strings into the OperatorCondition.Message -// on every tick. -const healthzMaxBodyLen = 200 - -// Probe never returns an error: failures classify into HealthStatus.Healthz -// so the Monitor loop stays flat. -type Probe struct { - service kmsservice.Service - timeout time.Duration - now func() time.Time -} - -// NewProbe applies the timeout only when positive; 0 relies on ctx or -// the kmsv2 client's own deadline. -func NewProbe(service kmsservice.Service, timeout time.Duration) *Probe { - return &Probe{ - service: service, - timeout: timeout, - now: time.Now, - } -} - -func (p *Probe) Probe(ctx context.Context) HealthStatus { - if p.timeout > 0 { - var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, p.timeout) - defer cancel() - } - resp, err := p.service.Status(ctx) - timestamp := p.now() - if err != nil { - return HealthStatus{ - Healthz: classifyRPCError(err), - Timestamp: timestamp, - } - } - return HealthStatus{ - Healthz: classifyHealthz(resp.Healthz), - KeyIDHash: hashKeyID(resp.KeyID), - Timestamp: timestamp, - } -} - -// classifyHealthz truncates on a UTF-8-safe boundary: the proto allows -// arbitrary bytes, but Condition.Message must be valid UTF-8; naive -// byte-slicing could split a multi-byte rune. -func classifyHealthz(body string) Healthz { - if body == string(HealthClassOK) { - return Healthz{Class: HealthClassOK} - } - if len(body) > healthzMaxBodyLen { - body = strings.ToValidUTF8(body[:healthzMaxBodyLen], "") - } - return Healthz{Class: HealthClassNotOK, Detail: body} -} - -// status.Code returns codes.Unknown for non-gRPC errors and the real code -// for gRPC status errors, covering transport failures and deadlines without -// extra branching. -func classifyRPCError(err error) Healthz { - return Healthz{ - Class: HealthClassRPCError, - Detail: status.Code(err).String(), - } -} - -func hashKeyID(keyID string) string { - if keyID == "" { - return "" - } - sum := sha256.Sum256([]byte(keyID)) - return hex.EncodeToString(sum[:]) -} diff --git a/pkg/operator/encryption/kms/health/probe_test.go b/pkg/operator/encryption/kms/health/probe_test.go deleted file mode 100644 index 856ba20187..0000000000 --- a/pkg/operator/encryption/kms/health/probe_test.go +++ /dev/null @@ -1,128 +0,0 @@ -package health - -import ( - "context" - "crypto/sha256" - "encoding/hex" - "errors" - "strings" - "testing" - "time" - "unicode/utf8" - - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - kmsservice "k8s.io/kms/pkg/service" -) - -type fakeService struct { - StatusFn func(ctx context.Context) (*kmsservice.StatusResponse, error) -} - -func (f *fakeService) Status(ctx context.Context) (*kmsservice.StatusResponse, error) { - return f.StatusFn(ctx) -} -func (f *fakeService) Encrypt(ctx context.Context, uid string, data []byte) (*kmsservice.EncryptResponse, error) { - panic("Encrypt not expected") -} -func (f *fakeService) Decrypt(ctx context.Context, uid string, req *kmsservice.DecryptRequest) ([]byte, error) { - panic("Decrypt not expected") -} - -func sha256Hex(s string) string { - sum := sha256.Sum256([]byte(s)) - return hex.EncodeToString(sum[:]) -} - -func TestProbe_healthy(t *testing.T) { - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{Healthz: "ok", Version: "v2", KeyID: "test-key-1"}, nil - }, - } - got := NewProbe(fake, time.Second).Probe(context.Background()) - - if !got.Healthz.IsOK() { - t.Errorf("Healthz: got %q, want IsOK", got.Healthz) - } - if got.KeyIDHash != sha256Hex("test-key-1") { - t.Errorf("KeyIDHash mismatch") - } - if got.Timestamp.IsZero() { - t.Error("Timestamp: expected non-zero") - } -} - -func TestProbe_notOK(t *testing.T) { - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{Healthz: "not-ready", KeyID: "test-key-1"}, nil - }, - } - got := NewProbe(fake, time.Second).Probe(context.Background()) - - want := Healthz{Class: HealthClassNotOK, Detail: "not-ready"} - if got.Healthz != want { - t.Errorf("Healthz: got %q, want %q", got.Healthz, want) - } -} - -// Naive byte-slicing at healthzMaxBodyLen would split a multi-byte rune; -// apiserver rejects Condition.Message containing invalid UTF-8 and would -// freeze the published status. The 198 + 4-byte emoji + tail boundary -// targets that exact slice point. -func TestProbe_notOK_truncatesSafelyAtMultiByteBoundary(t *testing.T) { - body := strings.Repeat("x", 198) + "👍" + "tail" - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return &kmsservice.StatusResponse{Healthz: body, KeyID: "test-key-1"}, nil - }, - } - got := NewProbe(fake, time.Second).Probe(context.Background()) - - if !utf8.ValidString(got.Healthz.Detail) { - t.Errorf("Healthz detail is not valid UTF-8: %q", got.Healthz.Detail) - } - if len(got.Healthz.Detail) > healthzMaxBodyLen { - t.Errorf("Healthz detail too long: got %d bytes, want <= %d", len(got.Healthz.Detail), healthzMaxBodyLen) - } -} - -func TestProbe_respectsPerProbeTimeout(t *testing.T) { - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - <-ctx.Done() - return nil, status.FromContextError(ctx.Err()).Err() - }, - } - probe := NewProbe(fake, 20*time.Millisecond) - - start := time.Now() - got := probe.Probe(context.Background()) - elapsed := time.Since(start) - - if elapsed > 500*time.Millisecond { - t.Errorf("probe elapsed %v: per-probe timeout not applied", elapsed) - } - want := Healthz{Class: HealthClassRPCError, Detail: "DeadlineExceeded"} - if got.Healthz != want { - t.Errorf("Healthz: got %q, want %q", got.Healthz, want) - } -} - -func TestProbe_rpcError_classifiesNonGRPCAsUnknown(t *testing.T) { - fake := &fakeService{ - StatusFn: func(ctx context.Context) (*kmsservice.StatusResponse, error) { - return nil, errors.New("connection refused") - }, - } - got := NewProbe(fake, time.Second).Probe(context.Background()) - - want := Healthz{Class: HealthClassRPCError, Detail: codes.Unknown.String()} - if got.Healthz != want { - t.Errorf("Healthz: got %q, want %q", got.Healthz, want) - } - if got.KeyIDHash != "" { - t.Errorf("KeyIDHash: got %q, want empty on unreachable plugin", got.KeyIDHash) - } -} diff --git a/pkg/operator/encryption/kms/health/types.go b/pkg/operator/encryption/kms/health/types.go index 1db7e93735..dfad4fa577 100644 --- a/pkg/operator/encryption/kms/health/types.go +++ b/pkg/operator/encryption/kms/health/types.go @@ -1,36 +1,42 @@ package health -import "time" +import ( + "sort" -type HealthClass string + "k8s.io/apimachinery/pkg/runtime/schema" +) + +type TargetOperator string const ( - HealthClassOK HealthClass = "ok" - HealthClassNotOK HealthClass = "not-ok" - HealthClassRPCError HealthClass = "rpc-error" + kubeAPIServer TargetOperator = "kube-apiserver" + openshiftAPIServer TargetOperator = "openshift-apiserver" + authAPIServer TargetOperator = "auth-apiserver" ) -type Healthz struct { - Class HealthClass - Detail string +var supportedOperators = map[TargetOperator]struct { + GVR schema.GroupVersionResource + GVK schema.GroupVersionKind +}{ + kubeAPIServer: { + GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "kubeapiservers"}, + GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "KubeAPIServer"}, + }, + authAPIServer: { + GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "authentications"}, + GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "Authentication"}, + }, + openshiftAPIServer: { + GVR: schema.GroupVersionResource{Group: "operator.openshift.io", Version: "v1", Resource: "openshiftapiservers"}, + GVK: schema.GroupVersionKind{Group: "operator.openshift.io", Version: "v1", Kind: "OpenShiftAPIServer"}, + }, } -// IsOK is the canonical predicate; prefer it over comparing Class so -// consumers don't depend on the internal classification shape. -func (h Healthz) IsOK() bool { return h.Class == HealthClassOK } - -type HealthStatus struct { - Healthz Healthz - - // KeyIDHash is the sha256 hex of the plugin's KeyID, empty when Status - // couldn't be reached. Hashing avoids leaking key material; consumers - // can still diff hashes across instances to detect rotation skew. - KeyIDHash string - - Timestamp time.Time - - // ObserverPod identifies this monitor instance; one OperatorCondition - // CR aggregates entries from N pods, so identity must travel with - // each condition entry. - ObserverPod string +func supportedOperatorKeys() []string { + keys := make([]string, 0, len(supportedOperators)) + for k := range supportedOperators { + keys = append(keys, string(k)) + } + sort.Strings(keys) + return keys } diff --git a/pkg/operator/encryption/kms/health/writer.go b/pkg/operator/encryption/kms/health/writer.go new file mode 100644 index 0000000000..dd05411fd3 --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer.go @@ -0,0 +1,35 @@ +package health + +import ( + "fmt" + + applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" + "github.com/openshift/library-go/pkg/operator/genericoperatorclient" + "github.com/openshift/library-go/pkg/operator/v1helpers" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/client-go/rest" + "k8s.io/utils/clock" +) + +func buildWriter(cfg *rest.Config, targetOperator TargetOperator) (v1helpers.OperatorClientWithFinalizers, error) { + target := supportedOperators[TargetOperator(targetOperator)] + operatorClient, _, err := genericoperatorclient.NewClusterScopedOperatorClient( + clock.RealClock{}, cfg, target.GVR, target.GVK, + emptyOperatorSpec, emptyOperatorStatus, + ) + if err != nil { + return nil, fmt.Errorf("build operator client for %s: %w", targetOperator, err) + } + + // TODO create writer + + return operatorClient, nil +} + +func emptyOperatorSpec(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorSpecApplyConfiguration, error) { + return applyoperatorv1.OperatorSpec(), nil +} + +func emptyOperatorStatus(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorStatusApplyConfiguration, error) { + return applyoperatorv1.OperatorStatus(), nil +} From 8d9b06a0fead1042bfcf131bda1357a143374b3d Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Tue, 26 May 2026 18:42:35 +0200 Subject: [PATCH 7/8] kms/health: add unit tests for checker and writer --- pkg/operator/encryption/kms/health/checker.go | 60 ++++++++--- .../encryption/kms/health/checker_test.go | 68 +++++++++++++ pkg/operator/encryption/kms/health/cmd.go | 34 ++++--- pkg/operator/encryption/kms/health/types.go | 11 +++ pkg/operator/encryption/kms/health/writer.go | 36 ++++++- .../encryption/kms/health/writer_test.go | 99 +++++++++++++++++++ 6 files changed, 281 insertions(+), 27 deletions(-) create mode 100644 pkg/operator/encryption/kms/health/checker_test.go create mode 100644 pkg/operator/encryption/kms/health/writer_test.go diff --git a/pkg/operator/encryption/kms/health/checker.go b/pkg/operator/encryption/kms/health/checker.go index 5f3a165b8d..2694858f43 100644 --- a/pkg/operator/encryption/kms/health/checker.go +++ b/pkg/operator/encryption/kms/health/checker.go @@ -3,21 +3,28 @@ package health import ( "context" "fmt" + "path/filepath" + "strings" "time" k8senvelopekmsv2 "k8s.io/apiserver/pkg/storage/value/encrypt/envelope/kmsv2" kmsservice "k8s.io/kms/pkg/service" ) +type plugin struct { + keyID string + service kmsservice.Service +} + type Checker struct { - services []kmsservice.Service - now func() time.Time + plugins []plugin + now func() time.Time } func NewChecker(ctx context.Context, udsPaths []string, timeout time.Duration) (*Checker, error) { c := Checker{ - services: make([]kmsservice.Service, 0, len(udsPaths)), - now: time.Now, + plugins: make([]plugin, 0, len(udsPaths)), + now: time.Now, } for _, socket := range udsPaths { @@ -31,18 +38,49 @@ func NewChecker(ctx context.Context, udsPaths []string, timeout time.Duration) ( return nil, fmt.Errorf("dial KMS plugin at %q: %w", socket, err) } - c.services = append(c.services, service) + c.plugins = append(c.plugins, plugin{ + keyID: keyIDFromSocket(socket), + service: service, + }) } return &c, nil } -func (c *Checker) CheckStatus(ctx context.Context) error { - for _, service := range c.services { - _, err := service.Status(ctx) - if err != nil { - return fmt.Errorf("check kms plugin status: %w", err) +func keyIDFromSocket(path string) string { + name := strings.TrimSuffix(filepath.Base(path), filepath.Ext(path)) + if i := strings.LastIndex(name, "-"); i >= 0 { + return name[i+1:] + } + return name +} + +// CheckStatus checks the KMS plugin health via UDS. It never reports an error, +// it encodes the error into a Condition. +func (c *Checker) CheckStatus(ctx context.Context) []PluginHealthCondition { + conditions := make([]PluginHealthCondition, 0, len(c.plugins)) + + // Safe to parallelise: each plugin probes an independent socket / has a unique index in slice. + for _, p := range c.plugins { + cond := PluginHealthCondition{ + KeyID: p.keyID, + LastChecked: c.now(), } + + resp, err := p.service.Status(ctx) + switch { + case err != nil: + cond.Status = "error" + cond.Detail = err.Error() + case resp.Healthz == "ok": + cond.Status = "healthy" + cond.KEKID = resp.KeyID + default: + cond.Status = "unhealthy" + cond.Detail = resp.Healthz + } + + conditions = append(conditions, cond) } - return nil + return conditions } diff --git a/pkg/operator/encryption/kms/health/checker_test.go b/pkg/operator/encryption/kms/health/checker_test.go new file mode 100644 index 0000000000..d02a191eb6 --- /dev/null +++ b/pkg/operator/encryption/kms/health/checker_test.go @@ -0,0 +1,68 @@ +package health + +import ( + "context" + "fmt" + "reflect" + "testing" + "time" + + kmsservice "k8s.io/kms/pkg/service" +) + +type fakeService struct { + resp *kmsservice.StatusResponse + err error +} + +func (f *fakeService) Status(context.Context) (*kmsservice.StatusResponse, error) { + return f.resp, f.err +} +func (f *fakeService) Encrypt(context.Context, string, []byte) (*kmsservice.EncryptResponse, error) { + return nil, nil +} +func (f *fakeService) Decrypt(context.Context, string, *kmsservice.DecryptRequest) ([]byte, error) { + return nil, nil +} + +func TestChecker_CheckStatus(t *testing.T) { + fixed := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + c := &Checker{ + plugins: []plugin{ + {keyID: "1", service: &fakeService{resp: &kmsservice.StatusResponse{Healthz: "ok", KeyID: "kek-abc"}}}, + {keyID: "2", service: &fakeService{err: fmt.Errorf("connection refused")}}, + {keyID: "3", service: &fakeService{resp: &kmsservice.StatusResponse{Healthz: "degraded"}}}, + }, + now: func() time.Time { return fixed }, + } + + got := c.CheckStatus(context.Background()) + want := []PluginHealthCondition{ + {KeyID: "1", KEKID: "kek-abc", Status: "healthy", LastChecked: fixed}, + {KeyID: "2", Status: "error", Detail: "connection refused", LastChecked: fixed}, + {KeyID: "3", Status: "unhealthy", Detail: "degraded", LastChecked: fixed}, + } + if !reflect.DeepEqual(got, want) { + t.Errorf("CheckStatus():\n got: %+v\n want: %+v", got, want) + } +} + +func Test_keyIDFromSocket(t *testing.T) { + tests := []struct { + path string + want string + }{ + {"/var/run/kmsplugin/kms-1.sock", "1"}, + {"/var/run/kmsplugin/kms-2.sock", "2"}, + {"kms-42.sock", "42"}, + {"plugin.sock", "plugin"}, + {"/tmp/my-custom-provider.sock", "provider"}, + } + for _, tt := range tests { + t.Run(tt.path, func(t *testing.T) { + if got := keyIDFromSocket(tt.path); got != tt.want { + t.Errorf("keyIDFromSocket(%q) = %q, want %q", tt.path, got, tt.want) + } + }) + } +} diff --git a/pkg/operator/encryption/kms/health/cmd.go b/pkg/operator/encryption/kms/health/cmd.go index 261926869d..5939567955 100644 --- a/pkg/operator/encryption/kms/health/cmd.go +++ b/pkg/operator/encryption/kms/health/cmd.go @@ -9,6 +9,7 @@ import ( "github.com/spf13/cobra" "github.com/spf13/pflag" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" @@ -80,28 +81,39 @@ func (o *commandOptions) validate() error { } func (o *commandOptions) run(ctx context.Context) error { + klog.InfoS("kms-health-monitor starting", + "socket", o.kmsSockets, + "targetOperator", o.targetOperator, + "observerNode", o.nodeName, + "interval", o.readInterval, + "readTimeout", o.readTimeout, + "writeTimeout", o.writeTimeout, + ) + cfg, err := buildRESTConfig(o.kubeconfig) if err != nil { return fmt.Errorf("build rest config: %w", err) } - _, err = buildWriter(cfg, TargetOperator(o.targetOperator)) + writer, err := buildWriter(cfg, TargetOperator(o.targetOperator), o.nodeName) if err != nil { return fmt.Errorf("create new writer: %w", err) } - - _, err = NewChecker(ctx, o.kmsSockets, o.readTimeout) + checker, err := NewChecker(ctx, o.kmsSockets, o.readTimeout) if err != nil { return fmt.Errorf("create new checker: %w", err) } - klog.InfoS("kms-health-monitor starting", - "socket", o.kmsSockets, - "targetOperator", o.targetOperator, - "observerNode", o.nodeName, - "interval", o.readInterval, - "readTimeout", o.readTimeout, - "writeTimeout", o.writeTimeout, - ) + wait.JitterUntilWithContext(ctx, func(ctx context.Context) { + probeCtx, cancel := context.WithTimeout(ctx, o.readTimeout) + defer cancel() + conditions := checker.CheckStatus(probeCtx) + + writeCtx, writeCancel := context.WithTimeout(ctx, o.writeTimeout) + defer writeCancel() + if err := writer.Apply(writeCtx, conditions); err != nil { + klog.ErrorS(err, "apply operator status") + } + }, o.readInterval, 0.1, false) return nil } diff --git a/pkg/operator/encryption/kms/health/types.go b/pkg/operator/encryption/kms/health/types.go index dfad4fa577..75e8ae04ae 100644 --- a/pkg/operator/encryption/kms/health/types.go +++ b/pkg/operator/encryption/kms/health/types.go @@ -2,6 +2,7 @@ package health import ( "sort" + "time" "k8s.io/apimachinery/pkg/runtime/schema" ) @@ -32,6 +33,16 @@ var supportedOperators = map[TargetOperator]struct { }, } +type PluginHealthCondition struct { + // KeyID is the sequential key identifier assigned by the key controller. + KeyID string `json:"keyID"` + // KEKID is the ID of the key used by the KMS provider for encryption/decryption. + KEKID string `json:"kekID,omitempty"` + Status string `json:"status"` + LastChecked time.Time `json:"lastChecked"` + Detail string `json:"detail,omitempty"` +} + func supportedOperatorKeys() []string { keys := make([]string, 0, len(supportedOperators)) for k := range supportedOperators { diff --git a/pkg/operator/encryption/kms/health/writer.go b/pkg/operator/encryption/kms/health/writer.go index dd05411fd3..a18ae6e969 100644 --- a/pkg/operator/encryption/kms/health/writer.go +++ b/pkg/operator/encryption/kms/health/writer.go @@ -1,18 +1,46 @@ package health import ( + "context" + "encoding/json" "fmt" + operatorv1 "github.com/openshift/api/operator/v1" applyoperatorv1 "github.com/openshift/client-go/operator/applyconfigurations/operator/v1" "github.com/openshift/library-go/pkg/operator/genericoperatorclient" "github.com/openshift/library-go/pkg/operator/v1helpers" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/rest" "k8s.io/utils/clock" ) -func buildWriter(cfg *rest.Config, targetOperator TargetOperator) (v1helpers.OperatorClientWithFinalizers, error) { - target := supportedOperators[TargetOperator(targetOperator)] +type Writer struct { + operatorClient v1helpers.OperatorClientWithFinalizers + nodeName string +} + +func (w *Writer) Apply(ctx context.Context, conditions []PluginHealthCondition) error { + msg, err := json.Marshal(conditions) + if err != nil { + return fmt.Errorf("marshal conditions: %w", err) + } + + // Hardcoded to avoid StatusSyncer side-effects; will be rewritten after API change in operator CR status. + cond := applyoperatorv1.OperatorCondition(). + WithType("KMSHealthReporter_" + w.nodeName). + WithStatus(operatorv1.ConditionTrue). + WithReason("AsExpected"). + WithMessage(string(msg)). + WithLastTransitionTime(metav1.Now()) + + status := applyoperatorv1.OperatorStatus().WithConditions(cond) + fieldManager := "kms-health-monitor-" + w.nodeName + return w.operatorClient.ApplyOperatorStatus(ctx, fieldManager, status) +} + +func buildWriter(cfg *rest.Config, targetOperator TargetOperator, nodeName string) (*Writer, error) { + target := supportedOperators[targetOperator] operatorClient, _, err := genericoperatorclient.NewClusterScopedOperatorClient( clock.RealClock{}, cfg, target.GVR, target.GVK, emptyOperatorSpec, emptyOperatorStatus, @@ -21,9 +49,7 @@ func buildWriter(cfg *rest.Config, targetOperator TargetOperator) (v1helpers.Ope return nil, fmt.Errorf("build operator client for %s: %w", targetOperator, err) } - // TODO create writer - - return operatorClient, nil + return &Writer{operatorClient: operatorClient, nodeName: nodeName}, nil } func emptyOperatorSpec(_ *unstructured.Unstructured, _ string) (*applyoperatorv1.OperatorSpecApplyConfiguration, error) { diff --git a/pkg/operator/encryption/kms/health/writer_test.go b/pkg/operator/encryption/kms/health/writer_test.go new file mode 100644 index 0000000000..476233b7f6 --- /dev/null +++ b/pkg/operator/encryption/kms/health/writer_test.go @@ -0,0 +1,99 @@ +package health + +import ( + "context" + "encoding/json" + "testing" + "time" + + operatorv1 "github.com/openshift/api/operator/v1" + "github.com/openshift/library-go/pkg/operator/v1helpers" +) + +func TestWriter_Apply(t *testing.T) { + tests := []struct { + name string + nodeName string + conditions []PluginHealthCondition + wantType string + wantMsg string + }{ + { + name: "single healthy plugin", + nodeName: "master-0", + conditions: []PluginHealthCondition{ + {KeyID: "1", KEKID: "key-abc", Status: "healthy", LastChecked: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)}, + }, + wantType: "KMSHealthReporter_master-0", + wantMsg: `[{"keyID":"1","kekID":"key-abc","status":"healthy","lastChecked":"2025-01-01T00:00:00Z"}]`, + }, + { + name: "mixed healthy and error", + nodeName: "master-1", + conditions: []PluginHealthCondition{ + {KeyID: "1", KEKID: "key-abc", Status: "healthy", LastChecked: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)}, + {KeyID: "2", Status: "error", Detail: "connection refused", LastChecked: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)}, + }, + wantType: "KMSHealthReporter_master-1", + wantMsg: `[{"keyID":"1","kekID":"key-abc","status":"healthy","lastChecked":"2025-01-01T00:00:00Z"},{"keyID":"2","status":"error","lastChecked":"2025-01-01T00:00:00Z","detail":"connection refused"}]`, + }, + { + name: "unhealthy plugin", + nodeName: "master-2", + conditions: []PluginHealthCondition{ + {KeyID: "1", Status: "unhealthy", Detail: "not ready", LastChecked: time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)}, + }, + wantType: "KMSHealthReporter_master-2", + wantMsg: `[{"keyID":"1","status":"unhealthy","lastChecked":"2025-01-01T00:00:00Z","detail":"not ready"}]`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient := v1helpers.NewFakeOperatorClient( + &operatorv1.OperatorSpec{}, + &operatorv1.OperatorStatus{}, + nil, + ) + w := &Writer{operatorClient: fakeClient, nodeName: tt.nodeName} + + if err := w.Apply(context.Background(), tt.conditions); err != nil { + t.Fatalf("Apply() error: %v", err) + } + + _, status, _, err := fakeClient.GetOperatorState() + if err != nil { + t.Fatalf("GetOperatorState() error: %v", err) + } + + if len(status.Conditions) != 1 { + t.Fatalf("expected 1 condition, got %d", len(status.Conditions)) + } + + cond := status.Conditions[0] + if cond.Type != tt.wantType { + t.Errorf("condition type = %q, want %q", cond.Type, tt.wantType) + } + if cond.Status != operatorv1.ConditionTrue { + t.Errorf("condition status = %q, want %q", cond.Status, operatorv1.ConditionTrue) + } + if cond.Reason != "AsExpected" { + t.Errorf("condition reason = %q, want %q", cond.Reason, "AsExpected") + } + + // Compare as parsed JSON to ignore key ordering differences. + var gotParsed, wantParsed any + if err := json.Unmarshal([]byte(cond.Message), &gotParsed); err != nil { + t.Fatalf("parse condition message: %v", err) + } + if err := json.Unmarshal([]byte(tt.wantMsg), &wantParsed); err != nil { + t.Fatalf("parse want message: %v", err) + } + gotJSON, _ := json.Marshal(gotParsed) + wantJSON, _ := json.Marshal(wantParsed) + if string(gotJSON) != string(wantJSON) { + t.Errorf("condition message:\n got: %s\n want: %s", gotJSON, wantJSON) + } + }) + } +} From 4ae153013fe0203dc47a52a8d2d5720c4d1db0b4 Mon Sep 17 00:00:00 2001 From: Krzysztof Ostrowski Date: Fri, 29 May 2026 12:31:54 +0200 Subject: [PATCH 8/8] kms/health: address review feedback on validation and writer - reject empty/whitespace --kms-sockets entries instead of failing later at dial time - drop manual WithLastTransitionTime; ApplyOperatorStatus already canonicalizes it via SetApplyConditionsLastTransitionTime --- pkg/operator/encryption/kms/health/cmd.go | 5 +++++ pkg/operator/encryption/kms/health/writer.go | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pkg/operator/encryption/kms/health/cmd.go b/pkg/operator/encryption/kms/health/cmd.go index 5939567955..90beebe5dc 100644 --- a/pkg/operator/encryption/kms/health/cmd.go +++ b/pkg/operator/encryption/kms/health/cmd.go @@ -61,6 +61,11 @@ func (o *commandOptions) validate() error { if len(o.kmsSockets) == 0 { return fmt.Errorf("--kms-sockets is required, at least one") } + for _, s := range o.kmsSockets { + if strings.TrimSpace(s) == "" { + return fmt.Errorf("--kms-sockets cannot contain empty entries") + } + } if o.readInterval <= 0 { return fmt.Errorf("--read-interval must be positive") } diff --git a/pkg/operator/encryption/kms/health/writer.go b/pkg/operator/encryption/kms/health/writer.go index a18ae6e969..b824940a5c 100644 --- a/pkg/operator/encryption/kms/health/writer.go +++ b/pkg/operator/encryption/kms/health/writer.go @@ -10,7 +10,6 @@ import ( "github.com/openshift/library-go/pkg/operator/genericoperatorclient" "github.com/openshift/library-go/pkg/operator/v1helpers" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/rest" "k8s.io/utils/clock" ) @@ -31,8 +30,7 @@ func (w *Writer) Apply(ctx context.Context, conditions []PluginHealthCondition) WithType("KMSHealthReporter_" + w.nodeName). WithStatus(operatorv1.ConditionTrue). WithReason("AsExpected"). - WithMessage(string(msg)). - WithLastTransitionTime(metav1.Now()) + WithMessage(string(msg)) status := applyoperatorv1.OperatorStatus().WithConditions(cond) fieldManager := "kms-health-monitor-" + w.nodeName