From fd86181360d2128a7658847b2baa03721b2889b5 Mon Sep 17 00:00:00 2001 From: Eduardo Barth Date: Tue, 19 May 2026 15:57:13 -0500 Subject: [PATCH] PLT-2119: Shared tag-agnostic PSL with tag-preserving SLI queries - One PrometheusServiceLevel per app/target (`{app}-{target}-servicelevels`), upserted by ResourceSyncer once per reconcile from the leading releasable revision. Single writer; no per-incarnation churn. - SLI queries use `sum by (tag) (rate(...))` so Sloth's recording rules and burn-rate alerts preserve the `tag` label. Per-revision rollback continues to match by `sample.Metric["tag"]` without changes to prometheus/api.go. - Legacy per-tag PSLs are scrubbed by cleanupLegacyServiceLevels on each Releasing/Canarying reconcile (self-limiting). - DeleteServiceLevels wired into Deleting/Failing for retirement cleanup of the shared PSL. Validated against Sloth: SLI wrapping is `(error)/(total)` with no extra aggregation; burn-rate alerts use `max(...) without (sloth_window)` which preserves `tag` through to alert series. Co-Authored-By: Claude Opus 4.7 (1M context) --- controllers/incarnation.go | 29 ++++ controllers/mock_deployment.go | 28 ++++ controllers/plan/sloConfig.go | 19 +++ controllers/plan/syncServiceLevels.go | 75 +++++++++ controllers/plan/syncServiceLevels_test.go | 167 +++++++++++++++++++++ controllers/state.go | 15 ++ controllers/state_test.go | 30 ++++ controllers/syncer.go | 29 ++++ 8 files changed, 392 insertions(+) create mode 100644 controllers/plan/syncServiceLevels.go create mode 100644 controllers/plan/syncServiceLevels_test.go diff --git a/controllers/incarnation.go b/controllers/incarnation.go index 0bebc754..6ef9a0d7 100644 --- a/controllers/incarnation.go +++ b/controllers/incarnation.go @@ -897,6 +897,35 @@ func (i *Incarnation) isEventDriven() bool { return i.revision.Spec.EventDriven } +// cleanupLegacyServiceLevels removes any legacy per-tag PSL that may exist for this +// revision. Self-limiting: once the legacy PSL is gone, the selector list returns +// zero items and this is a single cheap List call. Safe to leave in the state +// machine indefinitely — the selector requires LabelTag so it cannot match the +// shared PSL maintained by ResourceSyncer.syncServiceLevels. +func (i *Incarnation) cleanupLegacyServiceLevels(ctx context.Context) error { + if i.picchuConfig.ServiceLevelsNamespace == "" { + return nil + } + return i.controller.applyPlan(ctx, "Cleanup Legacy Service Levels", &rmplan.DeleteTaggedServiceLevels{ + App: i.appName(), + Target: i.targetName(), + Namespace: i.picchuConfig.ServiceLevelsNamespace, + Tag: i.tag, + }) +} + +// deleteServiceLevels removes the shared PSL for this app/target when retiring. +func (i *Incarnation) deleteServiceLevels(ctx context.Context) error { + if i.picchuConfig.ServiceLevelsNamespace == "" { + return nil + } + return i.controller.applyPlan(ctx, "Delete Service Levels", &rmplan.DeleteServiceLevels{ + App: i.appName(), + Target: i.targetName(), + Namespace: i.picchuConfig.ServiceLevelsNamespace, + }) +} + // IncarnationCollection helps us collect and select appropriate incarnations type IncarnationCollection struct { // Incarnations key'd on revision.spec.app.tag diff --git a/controllers/mock_deployment.go b/controllers/mock_deployment.go index ab25bb4d..1022fb9e 100644 --- a/controllers/mock_deployment.go +++ b/controllers/mock_deployment.go @@ -41,6 +41,20 @@ func (m *MockDeployment) EXPECT() *MockDeploymentMockRecorder { return m.recorder } +// cleanupLegacyServiceLevels mocks base method. +func (m *MockDeployment) cleanupLegacyServiceLevels(arg0 context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "cleanupLegacyServiceLevels", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// cleanupLegacyServiceLevels indicates an expected call of cleanupLegacyServiceLevels. +func (mr *MockDeploymentMockRecorder) cleanupLegacyServiceLevels(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "cleanupLegacyServiceLevels", reflect.TypeOf((*MockDeployment)(nil).cleanupLegacyServiceLevels), arg0) +} + // currentPercent mocks base method. func (m *MockDeployment) currentPercent() uint32 { m.ctrl.T.Helper() @@ -83,6 +97,20 @@ func (mr *MockDeploymentMockRecorder) deleteCanaryRules(arg0 any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteCanaryRules", reflect.TypeOf((*MockDeployment)(nil).deleteCanaryRules), arg0) } +// deleteServiceLevels mocks base method. +func (m *MockDeployment) deleteServiceLevels(arg0 context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "deleteServiceLevels", arg0) + ret0, _ := ret[0].(error) + return ret0 +} + +// deleteServiceLevels indicates an expected call of deleteServiceLevels. +func (mr *MockDeploymentMockRecorder) deleteServiceLevels(arg0 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "deleteServiceLevels", reflect.TypeOf((*MockDeployment)(nil).deleteServiceLevels), arg0) +} + // deleteTaggedServiceLevels mocks base method. func (m *MockDeployment) deleteTaggedServiceLevels(arg0 context.Context) error { m.ctrl.T.Helper() diff --git a/controllers/plan/sloConfig.go b/controllers/plan/sloConfig.go index 3dfbb680..df8ade94 100644 --- a/controllers/plan/sloConfig.go +++ b/controllers/plan/sloConfig.go @@ -108,3 +108,22 @@ func (s *SLOConfig) serviceLevelTaggedTotalQueryGRPC() string { func (s *SLOConfig) serviceLevelTaggedErrorQueryGRPC() string { return fmt.Sprintf("sum by (grpc_method) (rate(%s{%s=\"%s\"}[{{.window}}]))", s.errorQuery(), s.SLO.ServiceLevelIndicator.TagKey, s.Tag) } + +// sliSource returns SLI queries that preserve the tag dimension via `sum by (tag)`. +// Sloth wraps these as (error)/(total) with no additional aggregation, so the +// resulting recording rule retains `tag`. Sloth's `max(...) without (sloth_window)` +// burn-rate alert template preserves `tag` through to the alert series, allowing +// picchu's IsRevisionTriggered to match by sample.Metric["tag"]. +func (s *SLOConfig) sliSource() *slov1alpha1.SLIEvents { + return &slov1alpha1.SLIEvents{ + ErrorQuery: fmt.Sprintf("sum by (tag) (rate(%s[{{.window}}]))", s.errorQuery()), + TotalQuery: fmt.Sprintf("sum by (tag) (rate(%s[{{.window}}]))", s.totalQuery()), + } +} + +func (s *SLOConfig) sliSourceGRPC() *slov1alpha1.SLIEvents { + return &slov1alpha1.SLIEvents{ + ErrorQuery: fmt.Sprintf("sum by (tag, grpc_method) (rate(%s[{{.window}}]))", s.errorQuery()), + TotalQuery: fmt.Sprintf("sum by (tag, grpc_method) (rate(%s[{{.window}}]))", s.totalQuery()), + } +} diff --git a/controllers/plan/syncServiceLevels.go b/controllers/plan/syncServiceLevels.go new file mode 100644 index 00000000..5b203fa7 --- /dev/null +++ b/controllers/plan/syncServiceLevels.go @@ -0,0 +1,75 @@ +package plan + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + slov1alpha1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" + picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1" + "go.medium.engineering/picchu/plan" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// SyncServiceLevels creates a single tag-agnostic PrometheusServiceLevel per app/target. +// SLI queries aggregate by tag so Sloth recording rules and burn-rate alerts preserve +// per-revision granularity without per-deploy PSL churn. +type SyncServiceLevels struct { + App string + Target string + Namespace string + Labels map[string]string + ServiceLevelObjectiveLabels picchuv1alpha1.ServiceLevelObjectiveLabels + ServiceLevelObjectives []*picchuv1alpha1.SlothServiceLevelObjective +} + +func (p *SyncServiceLevels) Apply(ctx context.Context, cli client.Client, cluster *picchuv1alpha1.Cluster, log logr.Logger) error { + sl, err := p.serviceLevel(log) + if err != nil { + return err + } + if sl == nil { + return nil + } + return plan.CreateOrUpdate(ctx, log, cli, sl) +} + +func (p *SyncServiceLevels) serviceLevel(log logr.Logger) (*slov1alpha1.PrometheusServiceLevel, error) { + var slos []slov1alpha1.SLO + for i := range p.ServiceLevelObjectives { + if p.ServiceLevelObjectives[i].Enabled { + config := SLOConfig{ + SLO: p.ServiceLevelObjectives[i], + App: p.App, + Name: sanitizeName(p.ServiceLevelObjectives[i].Name), + Labels: p.ServiceLevelObjectiveLabels, + } + slo := config.serviceLevelObjective(log) + if _, ok := p.ServiceLevelObjectives[i].ServiceLevelObjectiveLabels.ServiceLevelLabels["is_grpc"]; ok { + slo.SLI.Events = config.sliSourceGRPC() + } else { + slo.SLI.Events = config.sliSource() + } + slos = append(slos, *slo) + } + } + if len(slos) == 0 { + return nil, nil + } + return &slov1alpha1.PrometheusServiceLevel{ + ObjectMeta: metav1.ObjectMeta{ + Name: p.serviceLevelName(), + Namespace: p.Namespace, + Labels: p.Labels, + }, + Spec: slov1alpha1.PrometheusServiceLevelSpec{ + Service: p.App, + SLOs: slos, + }, + }, nil +} + +func (p *SyncServiceLevels) serviceLevelName() string { + return fmt.Sprintf("%s-%s-servicelevels", p.App, p.Target) +} diff --git a/controllers/plan/syncServiceLevels_test.go b/controllers/plan/syncServiceLevels_test.go new file mode 100644 index 00000000..9f832f04 --- /dev/null +++ b/controllers/plan/syncServiceLevels_test.go @@ -0,0 +1,167 @@ +package plan + +import ( + "context" + "testing" + + picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1" + "go.medium.engineering/picchu/mocks" + common "go.medium.engineering/picchu/plan/test" + "go.medium.engineering/picchu/test" + "sigs.k8s.io/controller-runtime/pkg/client" + + slov1alpha1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1" + "github.com/stretchr/testify/assert" + "go.uber.org/mock/gomock" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +var ( + slsharedplan = &SyncServiceLevels{ + App: "test-app", + Target: "production", + Namespace: "testnamespace", + Labels: map[string]string{ + picchuv1alpha1.LabelApp: "test-app", + picchuv1alpha1.LabelTarget: "production", + }, + ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{ + ServiceLevelLabels: map[string]string{ + "severity": "test", + }, + }, + ServiceLevelObjectives: []*picchuv1alpha1.SlothServiceLevelObjective{ + { + Enabled: true, + Name: "test-app-availability", + Description: "test desc", + Objective: "99.999", + ServiceLevelIndicator: picchuv1alpha1.ServiceLevelIndicator{ + Canary: picchuv1alpha1.SLICanaryConfig{ + Enabled: true, + AllowancePercent: 1, + FailAfter: "1m", + }, + TagKey: "tag", + AlertAfter: "1m", + ErrorQuery: "sum(rate(test_metric{job=\"test\"}[2m])) by (tag)", + TotalQuery: "sum(rate(test_metric2{job=\"test\"}[2m])) by (tag)", + }, + ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{ + ServiceLevelLabels: map[string]string{ + "team": "test", + }, + }, + }, + { + Enabled: true, + Name: "test-app-availability-GRPC", + Description: "test desc", + Objective: "99.999", + ServiceLevelIndicator: picchuv1alpha1.ServiceLevelIndicator{ + Canary: picchuv1alpha1.SLICanaryConfig{ + Enabled: true, + AllowancePercent: 1, + FailAfter: "1m", + }, + TagKey: "tag", + AlertAfter: "1m", + ErrorQuery: "sum(rate(test_metric{job=\"test\"}[2m])) by (tag)", + TotalQuery: "sum(rate(test_metric2{job=\"test\"}[2m])) by (tag)", + }, + ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{ + ServiceLevelLabels: map[string]string{ + "team": "test", + "is_grpc": "true", + }, + }, + }, + }, + } + + slsharedexpected = &slov1alpha1.PrometheusServiceLevelList{ + Items: []slov1alpha1.PrometheusServiceLevel{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-app-production-servicelevels", + Namespace: "testnamespace", + Labels: map[string]string{ + picchuv1alpha1.LabelApp: "test-app", + picchuv1alpha1.LabelTarget: "production", + }, + }, + Spec: slov1alpha1.PrometheusServiceLevelSpec{ + Service: "test-app", + SLOs: []slov1alpha1.SLO{ + { + Name: "test_app_availability", + Objective: 99.999, + Description: "test desc", + Labels: map[string]string{ + "severity": "test", + "team": "test", + }, + SLI: slov1alpha1.SLI{ + Events: &slov1alpha1.SLIEvents{ + ErrorQuery: "sum by (tag) (rate(test_app:test_app_availability:errors[{{.window}}]))", + TotalQuery: "sum by (tag) (rate(test_app:test_app_availability:total[{{.window}}]))", + }, + }, + }, + { + Name: "test_app_availability_grpc", + Objective: 99.999, + Description: "test desc", + Labels: map[string]string{ + "severity": "test", + "team": "test", + "is_grpc": "true", + }, + SLI: slov1alpha1.SLI{ + Events: &slov1alpha1.SLIEvents{ + ErrorQuery: "sum by (tag, grpc_method) (rate(test_app:test_app_availability_grpc:errors[{{.window}}]))", + TotalQuery: "sum by (tag, grpc_method) (rate(test_app:test_app_availability_grpc:total[{{.window}}]))", + }, + }, + }, + }, + }, + }, + }, + } +) + +func TestSharedServiceLevels(t *testing.T) { + log := test.MustNewLogger() + ctrl := gomock.NewController(t) + m := mocks.NewMockClient(ctrl) + defer ctrl.Finish() + + tests := []client.ObjectKey{ + {Name: "test-app-production-servicelevels", Namespace: "testnamespace"}, + } + ctx := context.TODO() + + for i := range tests { + m. + EXPECT(). + Get(ctx, mocks.ObjectKey(tests[i]), gomock.Any()). + Return(common.NotFoundError). + Times(1) + } + + for i := range slsharedexpected.Items { + for _, obj := range []runtime.Object{ + &slsharedexpected.Items[i], + } { + m. + EXPECT(). + Create(ctx, common.K8sEqual(obj)). + Return(nil). + AnyTimes() + } + } + + assert.NoError(t, slsharedplan.Apply(ctx, m, cluster, log), "Shouldn't return error.") +} diff --git a/controllers/state.go b/controllers/state.go index 2de75b5a..c2793cfc 100644 --- a/controllers/state.go +++ b/controllers/state.go @@ -74,6 +74,8 @@ type Deployment interface { deleteCanaryRules(context.Context) error syncTaggedServiceLevels(context.Context) error deleteTaggedServiceLevels(context.Context) error + cleanupLegacyServiceLevels(context.Context) error + deleteServiceLevels(context.Context) error hasRevision() bool schedulePermitsRelease() bool markedAsFailed() bool @@ -351,6 +353,9 @@ func Releasing(ctx context.Context, deployment Deployment, lastUpdated *time.Tim if err := deployment.syncTaggedServiceLevels(ctx); err != nil { return releasing, err } + if err := deployment.cleanupLegacyServiceLevels(ctx); err != nil { + return releasing, err + } if deployment.peakPercent() >= 100 { return released, nil } @@ -412,6 +417,10 @@ func Deleting(ctx context.Context, deployment Deployment, lastUpdated *time.Time return deleting, err } + if err := deployment.deleteServiceLevels(ctx); err != nil { + return deleting, err + } + if deployment.currentPercent() <= 0 { return deleted, deployment.del(ctx) } @@ -445,6 +454,9 @@ func Failing(ctx context.Context, deployment Deployment, lastUpdated *time.Time) if err := deployment.deleteTaggedServiceLevels(ctx); err != nil { return failing, err } + if err := deployment.deleteServiceLevels(ctx); err != nil { + return failing, err + } if deployment.currentPercent() <= 0 { return failed, deployment.retire(ctx) } @@ -474,6 +486,9 @@ func Canarying(ctx context.Context, deployment Deployment, lastUpdated *time.Tim if err := deployment.syncTaggedServiceLevels(ctx); err != nil { return canarying, err } + if err := deployment.cleanupLegacyServiceLevels(ctx); err != nil { + return canarying, err + } if err := deployment.sync(ctx); err != nil { return canarying, err diff --git a/controllers/state_test.go b/controllers/state_test.go index c0884988..4709f808 100644 --- a/controllers/state_test.go +++ b/controllers/state_test.go @@ -794,6 +794,8 @@ type responses struct { deleteCanaryRules error syncTaggedServiceLevels error deleteTaggedServiceLevels error + cleanupLegacyServiceLevels error + deleteServiceLevels error isTimingOut bool isExpired bool } @@ -862,6 +864,16 @@ func createMockDeployment(ctrl *gomock.Controller, r responses) *MockDeployment isExpired(). Return(r.isExpired). AnyTimes() + m. + EXPECT(). + cleanupLegacyServiceLevels(gomock.Any()). + Return(r.cleanupLegacyServiceLevels). + AnyTimes() + m. + EXPECT(). + deleteServiceLevels(gomock.Any()). + Return(r.deleteServiceLevels). + AnyTimes() return m } @@ -927,3 +939,21 @@ func expectDeleteTaggedServiceLevels(mock *MockDeployment) *MockDeployment { Times(1) return mock } + +func expectCleanupLegacyServiceLevels(mock *MockDeployment) *MockDeployment { + mock. + EXPECT(). + cleanupLegacyServiceLevels(gomock.Any()). + Return(nil). + Times(1) + return mock +} + +func expectDeleteServiceLevels(mock *MockDeployment) *MockDeployment { + mock. + EXPECT(). + deleteServiceLevels(gomock.Any()). + Return(nil). + Times(1) + return mock +} diff --git a/controllers/syncer.go b/controllers/syncer.go index 51654adf..19f3a6aa 100644 --- a/controllers/syncer.go +++ b/controllers/syncer.go @@ -77,6 +77,9 @@ func (r *ResourceSyncer) sync(ctx context.Context) (rs []picchuv1alpha1.ReleaseM if err = r.syncSLORules(ctx); err != nil { return } + if err = r.syncServiceLevels(ctx); err != nil { + return + } if err = r.garbageCollection(ctx); err != nil { return @@ -506,6 +509,32 @@ func (r *ResourceSyncer) syncSLORules(ctx context.Context) error { return nil } +func (r *ResourceSyncer) syncServiceLevels(ctx context.Context) error { + if r.picchuConfig.ServiceLevelsNamespace == "" { + return nil + } + slos, labels := r.prepareServiceLevelObjectives() + if len(slos) == 0 { + return nil + } + if err := r.applyPlan(ctx, "Ensure Service Levels Namespace", &rmplan.EnsureNamespace{ + Name: r.picchuConfig.ServiceLevelsNamespace, + }); err != nil { + return err + } + return r.applyPlan(ctx, "Sync Service Levels", &rmplan.SyncServiceLevels{ + App: r.instance.Spec.App, + Target: r.instance.Spec.Target, + Namespace: r.picchuConfig.ServiceLevelsNamespace, + Labels: map[string]string{ + picchuv1alpha1.LabelApp: r.instance.Spec.App, + picchuv1alpha1.LabelTarget: r.instance.Spec.Target, + }, + ServiceLevelObjectiveLabels: labels, + ServiceLevelObjectives: slos, + }) +} + func (r *ResourceSyncer) garbageCollection(ctx context.Context) error { return markGarbage(ctx, r.log, r.deliveryClient, r.incarnations.sorted()) }