Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions controllers/incarnation.go
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,35 @@ func (i *Incarnation) isEventDriven() bool {
return i.revision.Spec.EventDriven
}

// cleanupLegacyServiceLevels applies a DeleteTaggedServiceLevels plan scoped to
// this incarnation's app, target and tag, removing any legacy per-tag PSL that
// may still exist for the revision.
//
// It is a no-op when no service-levels namespace is configured. The call is
// meant to be self-limiting and safe to keep in the state machine indefinitely:
// once the legacy PSL is gone the plan is expected to find nothing to delete,
// and because it selects on the tag it should not match the shared PSL
// maintained by ResourceSyncer.syncServiceLevels.
func (i *Incarnation) cleanupLegacyServiceLevels(ctx context.Context) error {
	namespace := i.picchuConfig.ServiceLevelsNamespace
	if namespace == "" {
		return nil
	}

	legacy := &rmplan.DeleteTaggedServiceLevels{
		App:       i.appName(),
		Target:    i.targetName(),
		Namespace: namespace,
		Tag:       i.tag,
	}
	return i.controller.applyPlan(ctx, "Cleanup Legacy Service Levels", legacy)
}

// deleteServiceLevels applies a DeleteServiceLevels plan for this incarnation's
// app and target, removing the shared PSL when retiring. It is a no-op when no
// service-levels namespace is configured.
//
// NOTE(review): the plan is keyed by app/target only (no tag), yet this method
// is reached from per-revision states. Confirm that dropping the shared PSL
// while other revisions of the same app/target are still live is intended, or
// that the syncer recreates it.
func (i *Incarnation) deleteServiceLevels(ctx context.Context) error {
	namespace := i.picchuConfig.ServiceLevelsNamespace
	if namespace == "" {
		return nil
	}

	shared := &rmplan.DeleteServiceLevels{
		App:       i.appName(),
		Target:    i.targetName(),
		Namespace: namespace,
	}
	return i.controller.applyPlan(ctx, "Delete Service Levels", shared)
}

// IncarnationCollection helps us collect and select appropriate incarnations
type IncarnationCollection struct {
// Incarnations key'd on revision.spec.app.tag
Expand Down
28 changes: 28 additions & 0 deletions controllers/mock_deployment.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions controllers/plan/sloConfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,22 @@ func (s *SLOConfig) serviceLevelTaggedTotalQueryGRPC() string {
// serviceLevelTaggedErrorQueryGRPC builds the per-revision gRPC error query:
// the rate of the errorQuery() series over the Sloth {{.window}} placeholder,
// filtered to series whose TagKey label equals s.Tag and summed by grpc_method.
func (s *SLOConfig) serviceLevelTaggedErrorQueryGRPC() string {
	const queryFormat = "sum by (grpc_method) (rate(%s{%s=\"%s\"}[{{.window}}]))"

	series := s.errorQuery()
	tagKey := s.SLO.ServiceLevelIndicator.TagKey
	return fmt.Sprintf(queryFormat, series, tagKey, s.Tag)
}

// sliSource returns the tag-preserving SLI event queries for the shared PSL.
// Both queries take the rate of the error/total series over the Sloth
// {{.window}} placeholder and aggregate with `sum by (tag)`.
//
// Rationale: Sloth wraps these as (error)/(total) with no further aggregation,
// so the resulting recording rule keeps `tag`, and Sloth's
// `max(...) without (sloth_window)` burn-rate alert template carries `tag`
// through to the alert series. That lets picchu's IsRevisionTriggered match on
// sample.Metric["tag"].
//
// NOTE(review): the grouping label is hard-coded to `tag`, whereas the tagged
// queries in this file use ServiceLevelIndicator.TagKey. Confirm that TagKey
// is always "tag" for SLOs routed through here.
func (s *SLOConfig) sliSource() *slov1alpha1.SLIEvents {
	const queryFormat = "sum by (tag) (rate(%s[{{.window}}]))"

	events := new(slov1alpha1.SLIEvents)
	events.ErrorQuery = fmt.Sprintf(queryFormat, s.errorQuery())
	events.TotalQuery = fmt.Sprintf(queryFormat, s.totalQuery())
	return events
}

// sliSourceGRPC is the gRPC counterpart of the tag-preserving SLI queries: the
// error and total rates over the Sloth {{.window}} placeholder are summed by
// both `tag` and `grpc_method`, so each method keeps its own per-revision
// series.
//
// NOTE(review): `tag` is hard-coded here rather than taken from
// ServiceLevelIndicator.TagKey. Confirm that is intended.
func (s *SLOConfig) sliSourceGRPC() *slov1alpha1.SLIEvents {
	const queryFormat = "sum by (tag, grpc_method) (rate(%s[{{.window}}]))"

	events := new(slov1alpha1.SLIEvents)
	events.ErrorQuery = fmt.Sprintf(queryFormat, s.errorQuery())
	events.TotalQuery = fmt.Sprintf(queryFormat, s.totalQuery())
	return events
}
75 changes: 75 additions & 0 deletions controllers/plan/syncServiceLevels.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package plan

import (
"context"
"fmt"

"github.com/go-logr/logr"
slov1alpha1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1"
picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1"
"go.medium.engineering/picchu/plan"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// SyncServiceLevels creates a single tag-agnostic PrometheusServiceLevel per app/target.
// SLI queries aggregate by tag so Sloth recording rules and burn-rate alerts preserve
// per-revision granularity without per-deploy PSL churn.
//
// The rendered object is named "<App>-<Target>-servicelevels" and is only
// produced when at least one objective is enabled.
type SyncServiceLevels struct {
// App is the application name. It becomes the PSL's Spec.Service, is passed to
// each SLO config, and forms the first part of the object name.
App string
// Target is the release target; it forms the second part of the object name.
Target string
// Namespace is the namespace the PSL is written to.
Namespace string
// Labels are set verbatim as the PSL's object labels.
Labels map[string]string
// ServiceLevelObjectiveLabels are handed to every SLO config as its Labels.
ServiceLevelObjectiveLabels picchuv1alpha1.ServiceLevelObjectiveLabels
// ServiceLevelObjectives are the candidate objectives; only entries with
// Enabled set are rendered into the PSL.
ServiceLevelObjectives []*picchuv1alpha1.SlothServiceLevelObjective
}

// Apply renders the shared PrometheusServiceLevel and creates or updates it
// through cli. When rendering yields nothing (no enabled objectives) Apply
// returns nil without calling the client, so an already-existing PSL is left
// untouched. The cluster argument is not used by this plan.
func (p *SyncServiceLevels) Apply(ctx context.Context, cli client.Client, cluster *picchuv1alpha1.Cluster, log logr.Logger) error {
	rendered, err := p.serviceLevel(log)
	switch {
	case err != nil:
		return err
	case rendered == nil:
		return nil
	}
	return plan.CreateOrUpdate(ctx, log, cli, rendered)
}

// serviceLevel renders the shared PrometheusServiceLevel for this app/target.
//
// Every enabled objective becomes one Sloth SLO built by SLOConfig, with its
// SLI events overridden by the tag-preserving queries: the gRPC variant when
// the objective carries an "is_grpc" service-level label (the value is not
// inspected), the plain variant otherwise. Nil and disabled objectives are
// skipped.
//
// It returns (nil, nil) when no objective is enabled so that callers can treat
// "nothing to sync" as a no-op. The error result is currently always nil.
func (p *SyncServiceLevels) serviceLevel(log logr.Logger) (*slov1alpha1.PrometheusServiceLevel, error) {
	slos := make([]slov1alpha1.SLO, 0, len(p.ServiceLevelObjectives))
	for _, objective := range p.ServiceLevelObjectives {
		// The slice holds pointers; a nil entry would panic on the Enabled
		// dereference, so treat it like a disabled objective.
		if objective == nil || !objective.Enabled {
			continue
		}

		config := SLOConfig{
			SLO:    objective,
			App:    p.App,
			Name:   sanitizeName(objective.Name),
			Labels: p.ServiceLevelObjectiveLabels,
		}
		slo := config.serviceLevelObjective(log)
		if _, isGRPC := objective.ServiceLevelObjectiveLabels.ServiceLevelLabels["is_grpc"]; isGRPC {
			slo.SLI.Events = config.sliSourceGRPC()
		} else {
			slo.SLI.Events = config.sliSource()
		}
		slos = append(slos, *slo)
	}

	if len(slos) == 0 {
		return nil, nil
	}

	return &slov1alpha1.PrometheusServiceLevel{
		ObjectMeta: metav1.ObjectMeta{
			Name:      p.serviceLevelName(),
			Namespace: p.Namespace,
			Labels:    p.Labels,
		},
		Spec: slov1alpha1.PrometheusServiceLevelSpec{
			Service: p.App,
			SLOs:    slos,
		},
	}, nil
}

// serviceLevelName returns the name of the shared PSL object:
// "<App>-<Target>-servicelevels".
func (p *SyncServiceLevels) serviceLevelName() string {
	const nameFormat = "%s-%s-servicelevels"
	return fmt.Sprintf(nameFormat, p.App, p.Target)
}
167 changes: 167 additions & 0 deletions controllers/plan/syncServiceLevels_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package plan

import (
"context"
"testing"

picchuv1alpha1 "go.medium.engineering/picchu/api/v1alpha1"
"go.medium.engineering/picchu/mocks"
common "go.medium.engineering/picchu/plan/test"
"go.medium.engineering/picchu/test"
"sigs.k8s.io/controller-runtime/pkg/client"

slov1alpha1 "github.com/slok/sloth/pkg/kubernetes/api/sloth/v1"
"github.com/stretchr/testify/assert"
"go.uber.org/mock/gomock"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
)

// Fixtures for TestSharedServiceLevels.
var (
// slsharedplan is the plan under test: app "test-app", target "production",
// with two enabled objectives that share the same indicator settings. The
// second objective additionally carries the "is_grpc" label.
slsharedplan = &SyncServiceLevels{
App: "test-app",
Target: "production",
Namespace: "testnamespace",
Labels: map[string]string{
picchuv1alpha1.LabelApp: "test-app",
picchuv1alpha1.LabelTarget: "production",
},
ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{
ServiceLevelLabels: map[string]string{
"severity": "test",
},
},
ServiceLevelObjectives: []*picchuv1alpha1.SlothServiceLevelObjective{
// Plain objective (no is_grpc label).
{
Enabled: true,
Name: "test-app-availability",
Description: "test desc",
Objective: "99.999",
ServiceLevelIndicator: picchuv1alpha1.ServiceLevelIndicator{
Canary: picchuv1alpha1.SLICanaryConfig{
Enabled: true,
AllowancePercent: 1,
FailAfter: "1m",
},
TagKey: "tag",
AlertAfter: "1m",
ErrorQuery: "sum(rate(test_metric{job=\"test\"}[2m])) by (tag)",
TotalQuery: "sum(rate(test_metric2{job=\"test\"}[2m])) by (tag)",
},
ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{
ServiceLevelLabels: map[string]string{
"team": "test",
},
},
},
// gRPC objective: identical apart from its name and the is_grpc label.
{
Enabled: true,
Name: "test-app-availability-GRPC",
Description: "test desc",
Objective: "99.999",
ServiceLevelIndicator: picchuv1alpha1.ServiceLevelIndicator{
Canary: picchuv1alpha1.SLICanaryConfig{
Enabled: true,
AllowancePercent: 1,
FailAfter: "1m",
},
TagKey: "tag",
AlertAfter: "1m",
ErrorQuery: "sum(rate(test_metric{job=\"test\"}[2m])) by (tag)",
TotalQuery: "sum(rate(test_metric2{job=\"test\"}[2m])) by (tag)",
},
ServiceLevelObjectiveLabels: picchuv1alpha1.ServiceLevelObjectiveLabels{
ServiceLevelLabels: map[string]string{
"team": "test",
"is_grpc": "true",
},
},
},
},
}

// slsharedexpected is the single PSL the plan is expected to create. Compared
// with the input, the SLO names are in their sanitized form, each SLO carries
// both the plan-level and the per-objective labels, and the SLI events are the
// tag-aggregated queries (additionally grouped by grpc_method for the gRPC SLO).
slsharedexpected = &slov1alpha1.PrometheusServiceLevelList{
Items: []slov1alpha1.PrometheusServiceLevel{
{
ObjectMeta: metav1.ObjectMeta{
Name: "test-app-production-servicelevels",
Namespace: "testnamespace",
Labels: map[string]string{
picchuv1alpha1.LabelApp: "test-app",
picchuv1alpha1.LabelTarget: "production",
},
},
Spec: slov1alpha1.PrometheusServiceLevelSpec{
Service: "test-app",
SLOs: []slov1alpha1.SLO{
{
Name: "test_app_availability",
Objective: 99.999,
Description: "test desc",
Labels: map[string]string{
"severity": "test",
"team": "test",
},
SLI: slov1alpha1.SLI{
Events: &slov1alpha1.SLIEvents{
ErrorQuery: "sum by (tag) (rate(test_app:test_app_availability:errors[{{.window}}]))",
TotalQuery: "sum by (tag) (rate(test_app:test_app_availability:total[{{.window}}]))",
},
},
},
{
Name: "test_app_availability_grpc",
Objective: 99.999,
Description: "test desc",
Labels: map[string]string{
"severity": "test",
"team": "test",
"is_grpc": "true",
},
SLI: slov1alpha1.SLI{
Events: &slov1alpha1.SLIEvents{
ErrorQuery: "sum by (tag, grpc_method) (rate(test_app:test_app_availability_grpc:errors[{{.window}}]))",
TotalQuery: "sum by (tag, grpc_method) (rate(test_app:test_app_availability_grpc:total[{{.window}}]))",
},
},
},
},
},
},
},
}
)

// TestSharedServiceLevels applies slsharedplan against a mock client that
// reports the shared PSL as missing and checks that the plan creates exactly
// the object described by slsharedexpected.
func TestSharedServiceLevels(t *testing.T) {
	log := test.MustNewLogger()
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()
	m := mocks.NewMockClient(ctrl)
	ctx := context.TODO()

	// The lookup for the shared PSL must happen once and is answered with
	// NotFound so that the create path is exercised.
	key := client.ObjectKey{Name: "test-app-production-servicelevels", Namespace: "testnamespace"}
	m.
		EXPECT().
		Get(ctx, mocks.ObjectKey(key), gomock.Any()).
		Return(common.NotFoundError).
		Times(1)

	for i := range slsharedexpected.Items {
		var expected runtime.Object = &slsharedexpected.Items[i]
		// Times(1), not AnyTimes: with AnyTimes the test would still pass if
		// Apply never created the object at all.
		m.
			EXPECT().
			Create(ctx, common.K8sEqual(expected)).
			Return(nil).
			Times(1)
	}

	assert.NoError(t, slsharedplan.Apply(ctx, m, cluster, log), "Shouldn't return error.")
}
15 changes: 15 additions & 0 deletions controllers/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ type Deployment interface {
deleteCanaryRules(context.Context) error
syncTaggedServiceLevels(context.Context) error
deleteTaggedServiceLevels(context.Context) error
cleanupLegacyServiceLevels(context.Context) error
deleteServiceLevels(context.Context) error
hasRevision() bool
schedulePermitsRelease() bool
markedAsFailed() bool
Expand Down Expand Up @@ -351,6 +353,9 @@ func Releasing(ctx context.Context, deployment Deployment, lastUpdated *time.Tim
if err := deployment.syncTaggedServiceLevels(ctx); err != nil {
return releasing, err
}
if err := deployment.cleanupLegacyServiceLevels(ctx); err != nil {
return releasing, err
}
if deployment.peakPercent() >= 100 {
return released, nil
}
Expand Down Expand Up @@ -412,6 +417,10 @@ func Deleting(ctx context.Context, deployment Deployment, lastUpdated *time.Time
return deleting, err
}

if err := deployment.deleteServiceLevels(ctx); err != nil {
return deleting, err
}

if deployment.currentPercent() <= 0 {
return deleted, deployment.del(ctx)
}
Expand Down Expand Up @@ -445,6 +454,9 @@ func Failing(ctx context.Context, deployment Deployment, lastUpdated *time.Time)
if err := deployment.deleteTaggedServiceLevels(ctx); err != nil {
return failing, err
}
if err := deployment.deleteServiceLevels(ctx); err != nil {
return failing, err
}
if deployment.currentPercent() <= 0 {
return failed, deployment.retire(ctx)
}
Expand Down Expand Up @@ -474,6 +486,9 @@ func Canarying(ctx context.Context, deployment Deployment, lastUpdated *time.Tim
if err := deployment.syncTaggedServiceLevels(ctx); err != nil {
return canarying, err
}
if err := deployment.cleanupLegacyServiceLevels(ctx); err != nil {
return canarying, err
}

if err := deployment.sync(ctx); err != nil {
return canarying, err
Expand Down
Loading