From 6721bdc8d4f313f1d9f66a77f58acbdf19e5e4fc Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Tue, 23 Jun 2026 12:17:01 -0500 Subject: [PATCH 01/12] allow capacity http errors to requeue Signed-off-by: Britania Rodriguez Reyes --- .../azure/compute/vmsizerecommenderclient.go | 24 +++--- .../compute/vmsizerecommenderclient_test.go | 80 +++++++++++++++---- pkg/clients/httputil/httputil.go | 49 ++++++++++++ pkg/clients/httputil/httputil_test.go | 76 ++++++++++++++++++ pkg/scheduler/framework/framework.go | 11 +++ 5 files changed, 211 insertions(+), 29 deletions(-) create mode 100644 pkg/clients/httputil/httputil_test.go diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient.go b/pkg/clients/azure/compute/vmsizerecommenderclient.go index 0c9db0904..d08db0b53 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient.go @@ -16,7 +16,6 @@ import ( "os" "time" - "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/gofrs/uuid" "google.golang.org/protobuf/encoding/protojson" "k8s.io/klog/v2" @@ -69,10 +68,12 @@ func NewAttributeBasedVMSizeRecommenderClient( } // GenerateAttributeBasedRecommendations generates VM size recommendations based on attributes. +// Transient HTTP errors (429, 500, 502, 503, 504) are wrapped with ErrExpectedBehavior so that +// the scheduler can requeue the placement for later retry. func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommendations( ctx context.Context, req *computev1.GenerateAttributeBasedRecommendationsRequest, -) (response *computev1.GenerateAttributeBasedRecommendationsResponse, err error) { +) (*computev1.GenerateAttributeBasedRecommendationsResponse, error) { if req == nil { return nil, controller.NewUnexpectedBehaviorError(errors.New("request cannot be nil")) } @@ -116,17 +117,11 @@ func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommenda // Execute the request startTime := time.Now() klog.V(2).InfoS("Generating VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID) - defer func() { - latency := time.Since(startTime).Milliseconds() - if err != nil { - klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latency", latency) - } else { - klog.V(2).InfoS("Generated VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latency", latency) - } - }() resp, err := c.httpClient.Do(httpReq) + latency := time.Since(startTime).Milliseconds() if err != nil { + klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return nil, fmt.Errorf("failed to execute request: %w", err) } defer resp.Body.Close() @@ -134,22 +129,27 @@ func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommenda // Read response body respBody, err := io.ReadAll(resp.Body) if err != nil { + klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return nil, fmt.Errorf("failed to read response body: %w", err) } // Check status code if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return nil, fmt.Errorf("request failed with status %d: %w", resp.StatusCode, runtime.NewResponseError(resp)) + httpErr := httputil.NewHTTPError(resp) + klog.ErrorS(httpErr, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) + return nil, httpErr } // Unmarshal response using protojson for proper proto3 support - response = &computev1.GenerateAttributeBasedRecommendationsResponse{} + response := &computev1.GenerateAttributeBasedRecommendationsResponse{} unmarshaler := protojson.UnmarshalOptions{ DiscardUnknown: true, } if err := unmarshaler.Unmarshal(respBody, response); err != nil { + klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return nil, fmt.Errorf("failed to unmarshal response: %w", err) } + klog.V(2).InfoS("Generated VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return response, nil } diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go index bfb36c60a..6f6373342 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go @@ -15,6 +15,7 @@ import ( "google.golang.org/protobuf/proto" computev1 "go.goms.io/fleet/apis/protos/azure/compute/v1" + "go.goms.io/fleet/pkg/clients/httputil" "go.goms.io/fleet/test/utils/azure/compute" ) @@ -81,13 +82,14 @@ func TestNewAttributeBasedVMSizeRecommenderClient(t *testing.T) { func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { tests := []struct { - name string - request *computev1.GenerateAttributeBasedRecommendationsRequest - mockStatusCode int - mockResponse string - wantResponse *computev1.GenerateAttributeBasedRecommendationsResponse - wantErr bool - wantErrMsg string + name string + request *computev1.GenerateAttributeBasedRecommendationsRequest + mockStatusCode int + mockResponse string + wantResponse *computev1.GenerateAttributeBasedRecommendationsResponse + wantErr bool + wantErrMsg string + wantIsTransient bool // true if error should be a transient HTTPError }{ { name: "successful request with regular priority profile", @@ -169,7 +171,7 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { wantErrMsg: "either regular priority profile or spot priority profile must be provided", }, { - name: "HTTP 400 error", + name: "HTTP 400 error is not transient", request: &computev1.GenerateAttributeBasedRecommendationsRequest{ SubscriptionId: "sub-123", Location: "eastus", @@ -177,13 +179,14 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { TargetCapacity: 5, }, }, - mockStatusCode: http.StatusBadRequest, - mockResponse: `{"error":"invalid request"}`, - wantErr: true, - wantErrMsg: "request failed with status 400", + mockStatusCode: http.StatusBadRequest, + mockResponse: `{"error":"invalid request"}`, + wantErr: true, + wantErrMsg: "request failed with status 400: POST", + wantIsTransient: false, // 400 is NOT transient }, { - name: "HTTP 500 error", + name: "HTTP 500 error is transient", request: &computev1.GenerateAttributeBasedRecommendationsRequest{ SubscriptionId: "sub-123", Location: "eastus", @@ -191,10 +194,41 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { TargetCapacity: 5, }, }, - mockStatusCode: http.StatusInternalServerError, - mockResponse: `{"error":"internal server error"}`, - wantErr: true, - wantErrMsg: "request failed with status 500", + mockStatusCode: http.StatusInternalServerError, + mockResponse: `{"error":"internal server error"}`, + wantErr: true, + wantErrMsg: "request failed with status 500: POST", + wantIsTransient: true, // 500 IS transient + }, + { + name: "HTTP 503 error is transient", + request: &computev1.GenerateAttributeBasedRecommendationsRequest{ + SubscriptionId: "sub-123", + Location: "eastus", + RegularPriorityProfile: &computev1.RegularPriorityProfile{ + TargetCapacity: 5, + }, + }, + mockStatusCode: http.StatusServiceUnavailable, + mockResponse: `{"error":"service unavailable"}`, + wantErr: true, + wantErrMsg: "request failed with status 503: POST", + wantIsTransient: true, // 503 IS transient + }, + { + name: "HTTP 429 error is transient", + request: &computev1.GenerateAttributeBasedRecommendationsRequest{ + SubscriptionId: "sub-123", + Location: "eastus", + RegularPriorityProfile: &computev1.RegularPriorityProfile{ + TargetCapacity: 5, + }, + }, + mockStatusCode: http.StatusTooManyRequests, + mockResponse: `{"error":"too many requests"}`, + wantErr: true, + wantErrMsg: "request failed with status 429: POST", + wantIsTransient: true, // 429 IS transient }, { name: "invalid JSON response", @@ -240,6 +274,18 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { return } + // Check if error is a transient HTTPError. + if tt.wantErr && tt.wantIsTransient { + if !httputil.IsTransientHTTPError(err) { + t.Errorf("GenerateAttributeBasedRecommendations() error = %v, want transient HTTPError", err) + } + } + if tt.wantErr && !tt.wantIsTransient && tt.mockStatusCode >= 400 && tt.mockStatusCode < 500 { + if httputil.IsTransientHTTPError(err) { + t.Errorf("GenerateAttributeBasedRecommendations() error = %v, should NOT be transient HTTPError for 4xx errors", err) + } + } + // Compare response. if !proto.Equal(tt.wantResponse, got) { t.Errorf("GenerateAttributeBasedRecommendations() = %+v, want %+v", got, tt.wantResponse) diff --git a/pkg/clients/httputil/httputil.go b/pkg/clients/httputil/httputil.go index 7ccbe84f6..6f7d75831 100644 --- a/pkg/clients/httputil/httputil.go +++ b/pkg/clients/httputil/httputil.go @@ -7,6 +7,8 @@ Licensed under the MIT license. package httputil import ( + "errors" + "fmt" "net/http" "time" @@ -41,3 +43,50 @@ var ( // DefaultClientForAzure is the default HTTP client to access Azure services. DefaultClientForAzure = &http.Client{Timeout: HTTPTimeoutAzure} ) + +// transientHTTPStatusCodes defines HTTP status codes that indicate transient errors +// which may succeed on retry. +var transientHTTPStatusCodes = map[int]bool{ + http.StatusTooManyRequests: true, // 429 - Rate limiting + http.StatusInternalServerError: true, // 500 - Server error + http.StatusBadGateway: true, // 502 - Bad gateway + http.StatusServiceUnavailable: true, // 503 - Service unavailable + http.StatusGatewayTimeout: true, // 504 - Gateway timeout +} + +// HTTPError represents an HTTP error with a status code that can be checked +// for transient error conditions. +type HTTPError struct { + StatusCode int + Method string + URL string +} + +// Error implements the error interface for HTTPError. +func (e *HTTPError) Error() string { + return fmt.Sprintf("request failed with status %d: %s %s", e.StatusCode, e.Method, e.URL) +} + +// IsTransient returns true if this error represents a transient HTTP error +// that may succeed on retry. +func (e *HTTPError) IsTransient() bool { + return transientHTTPStatusCodes[e.StatusCode] +} + +// NewHTTPError creates a new HTTPError from an HTTP response. +func NewHTTPError(resp *http.Response) *HTTPError { + return &HTTPError{ + StatusCode: resp.StatusCode, + Method: resp.Request.Method, + URL: resp.Request.URL.String(), + } +} + +// IsTransientHTTPError checks if the given error is or wraps a transient HTTP error. +func IsTransientHTTPError(err error) bool { + var httpErr *HTTPError + if errors.As(err, &httpErr) { + return httpErr.IsTransient() + } + return false +} diff --git a/pkg/clients/httputil/httputil_test.go b/pkg/clients/httputil/httputil_test.go new file mode 100644 index 000000000..6de618041 --- /dev/null +++ b/pkg/clients/httputil/httputil_test.go @@ -0,0 +1,76 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. +*/ + +package httputil + +import ( + "fmt" + "net/http" + "net/url" + "testing" +) + +func TestHTTPError(t *testing.T) { + resp := &http.Response{ + StatusCode: http.StatusServiceUnavailable, + Request: &http.Request{ + Method: http.MethodPost, + URL: &url.URL{ + Scheme: "https", + Host: "example.com", + Path: "/api/v1/resource", + }, + }, + } + + httpErr := NewHTTPError(resp) + + // Test fields are populated correctly. + if httpErr.StatusCode != http.StatusServiceUnavailable { + t.Errorf("HTTPError.StatusCode = %d, want %d", httpErr.StatusCode, http.StatusServiceUnavailable) + } + if httpErr.Method != http.MethodPost { + t.Errorf("HTTPError.Method = %q, want %q", httpErr.Method, http.MethodPost) + } + if httpErr.URL != "https://example.com/api/v1/resource" { + t.Errorf("HTTPError.URL = %q, want %q", httpErr.URL, "https://example.com/api/v1/resource") + } + + // Test Error() method. + wantErrMsg := "request failed with status 503: POST https://example.com/api/v1/resource" + if got := httpErr.Error(); got != wantErrMsg { + t.Errorf("HTTPError.Error() = %q, want %q", got, wantErrMsg) + } +} + +func TestIsTransientHTTPError(t *testing.T) { + tests := []struct { + name string + err error + want bool + }{ + {"nil error", nil, false}, + {"non-HTTP error", fmt.Errorf("some error"), false}, + {"400 Bad Request", &HTTPError{StatusCode: http.StatusBadRequest}, false}, + {"401 Unauthorized", &HTTPError{StatusCode: http.StatusUnauthorized}, false}, + {"403 Forbidden", &HTTPError{StatusCode: http.StatusForbidden}, false}, + {"404 Not Found", &HTTPError{StatusCode: http.StatusNotFound}, false}, + {"429 Too Many Requests", &HTTPError{StatusCode: http.StatusTooManyRequests}, true}, + {"500 Internal Server Error", &HTTPError{StatusCode: http.StatusInternalServerError}, true}, + {"502 Bad Gateway", &HTTPError{StatusCode: http.StatusBadGateway}, true}, + {"503 Service Unavailable", &HTTPError{StatusCode: http.StatusServiceUnavailable}, true}, + {"504 Gateway Timeout", &HTTPError{StatusCode: http.StatusGatewayTimeout}, true}, + {"wrapped transient error", fmt.Errorf("outer: %w", &HTTPError{StatusCode: http.StatusServiceUnavailable}), true}, + {"wrapped non-transient error", fmt.Errorf("outer: %w", &HTTPError{StatusCode: http.StatusBadRequest}), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := IsTransientHTTPError(tt.err); got != tt.want { + t.Errorf("IsTransientHTTPError() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/scheduler/framework/framework.go b/pkg/scheduler/framework/framework.go index d0018214e..cc318facf 100644 --- a/pkg/scheduler/framework/framework.go +++ b/pkg/scheduler/framework/framework.go @@ -40,6 +40,7 @@ import ( clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/pkg/clients/httputil" "go.goms.io/fleet/pkg/scheduler/clustereligibilitychecker" "go.goms.io/fleet/pkg/scheduler/queue" "go.goms.io/fleet/pkg/utils/annotations" @@ -539,6 +540,11 @@ func (f *framework) runAllPluginsForPickAllPlacementType( passed, filtered, err := f.runFilterPlugins(ctx, state, policy, clusters) if err != nil { klog.ErrorS(err, "Failed to run filter plugins", "policySnapshot", policyRef) + // If the error is a transient HTTP error (e.g., 503 from external service), + // return it as-is so the scheduler can requeue. Otherwise, wrap it as unexpected behavior. + if httputil.IsTransientHTTPError(err) { + return nil, nil, err + } return nil, nil, controller.NewUnexpectedBehaviorError(err) } @@ -1159,6 +1165,11 @@ func (f *framework) runAllPluginsForPickNPlacementType( passed, filtered, err := f.runFilterPlugins(ctx, state, policy, clusters) if err != nil { klog.ErrorS(err, "Failed to run filter plugins", "policySnapshot", policyRef) + // If the error is a transient HTTP error (e.g., 503 from external service), + // return it as-is so the scheduler can requeue. Otherwise, wrap it as unexpected behavior. + if httputil.IsTransientHTTPError(err) { + return nil, nil, err + } return nil, nil, controller.NewUnexpectedBehaviorError(err) } From 101c7c07218ad55686240426c40ab1208d26f1bb Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Tue, 23 Jun 2026 12:36:16 -0500 Subject: [PATCH 02/12] restore defer logging Signed-off-by: Britania Rodriguez Reyes --- .../azure/compute/vmsizerecommenderclient.go | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient.go b/pkg/clients/azure/compute/vmsizerecommenderclient.go index d08db0b53..0e2ad8d39 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient.go @@ -68,12 +68,10 @@ func NewAttributeBasedVMSizeRecommenderClient( } // GenerateAttributeBasedRecommendations generates VM size recommendations based on attributes. -// Transient HTTP errors (429, 500, 502, 503, 504) are wrapped with ErrExpectedBehavior so that -// the scheduler can requeue the placement for later retry. func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommendations( ctx context.Context, req *computev1.GenerateAttributeBasedRecommendationsRequest, -) (*computev1.GenerateAttributeBasedRecommendationsResponse, error) { +) (response *computev1.GenerateAttributeBasedRecommendationsResponse, err error) { if req == nil { return nil, controller.NewUnexpectedBehaviorError(errors.New("request cannot be nil")) } @@ -117,11 +115,17 @@ func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommenda // Execute the request startTime := time.Now() klog.V(2).InfoS("Generating VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID) + defer func() { + latency := time.Since(startTime).Milliseconds() + if err != nil { + klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) + } else { + klog.V(2).InfoS("Generated VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) + } + }() resp, err := c.httpClient.Do(httpReq) - latency := time.Since(startTime).Milliseconds() if err != nil { - klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return nil, fmt.Errorf("failed to execute request: %w", err) } defer resp.Body.Close() @@ -129,27 +133,22 @@ func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommenda // Read response body respBody, err := io.ReadAll(resp.Body) if err != nil { - klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return nil, fmt.Errorf("failed to read response body: %w", err) } // Check status code if resp.StatusCode < 200 || resp.StatusCode >= 300 { - httpErr := httputil.NewHTTPError(resp) - klog.ErrorS(httpErr, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) - return nil, httpErr + return nil, httputil.NewHTTPError(resp) } // Unmarshal response using protojson for proper proto3 support - response := &computev1.GenerateAttributeBasedRecommendationsResponse{} + response = &computev1.GenerateAttributeBasedRecommendationsResponse{} unmarshaler := protojson.UnmarshalOptions{ DiscardUnknown: true, } - if err := unmarshaler.Unmarshal(respBody, response); err != nil { - klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) + if err = unmarshaler.Unmarshal(respBody, response); err != nil { return nil, fmt.Errorf("failed to unmarshal response: %w", err) } - klog.V(2).InfoS("Generated VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) return response, nil } From 9b1bdec5f168363e30205eb4a36acf63d6bde8f2 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Tue, 23 Jun 2026 12:56:52 -0500 Subject: [PATCH 03/12] restore Signed-off-by: Britania Rodriguez Reyes --- pkg/clients/azure/compute/vmsizerecommenderclient.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient.go b/pkg/clients/azure/compute/vmsizerecommenderclient.go index 0e2ad8d39..240e5419e 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient.go @@ -118,9 +118,9 @@ func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommenda defer func() { latency := time.Since(startTime).Milliseconds() if err != nil { - klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) + klog.ErrorS(err, "Failed to generate VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latency", latency) } else { - klog.V(2).InfoS("Generated VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latencyMs", latency) + klog.V(2).InfoS("Generated VM size recommendations", "subscriptionID", req.SubscriptionId, "location", req.Location, "clientRequestID", clientRequestID, "latency", latency) } }() From fb2aa472837389227fbc9a7d32b2904de0560741 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Wed, 24 Jun 2026 18:43:23 -0500 Subject: [PATCH 04/12] error chain preservation so that IsTransientHTTPError() can properly detect transient HTTP error Signed-off-by: Britania Rodriguez Reyes --- logs.txt | 0 pkg/scheduler/framework/status.go | 4 +- pkg/scheduler/framework/status_test.go | 77 ++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 logs.txt diff --git a/logs.txt b/logs.txt new file mode 100644 index 000000000..e69de29bb diff --git a/pkg/scheduler/framework/status.go b/pkg/scheduler/framework/status.go index 2b0a67a1f..d5013830d 100644 --- a/pkg/scheduler/framework/status.go +++ b/pkg/scheduler/framework/status.go @@ -140,13 +140,13 @@ func (s *Status) String() string { return strings.Join(desc, ", ") } -// AsError returns a status as an error; it returns nil if the status is of the internalError code. +// AsError returns a status as an error; it returns nil if the status is not of the internalError code. func (s *Status) AsError() error { if !s.IsInteralError() { return nil } - return fmt.Errorf("plugin %s returned an error %s", s.sourcePlugin, s.String()) + return fmt.Errorf("plugin %s returned an error %w", s.sourcePlugin, s.err) } // NewNonErrorStatus returns a Status with a non-error status code. diff --git a/pkg/scheduler/framework/status_test.go b/pkg/scheduler/framework/status_test.go index 896bcba3d..26d5cb9bf 100644 --- a/pkg/scheduler/framework/status_test.go +++ b/pkg/scheduler/framework/status_test.go @@ -17,6 +17,7 @@ limitations under the License. package framework import ( + "errors" "fmt" "strings" "testing" @@ -157,3 +158,79 @@ func TestNilStatusMethods(t *testing.T) { t.Fatalf("String() = %s, want %s", status.String(), wantDesc) } } + +// customError is a custom error type for testing error chain preservation. +type customError struct { + code int +} + +func (e *customError) Error() string { + return fmt.Sprintf("custom error with code %d", e.code) +} + +func TestAsError(t *testing.T) { + testCases := []struct { + name string + status *Status + wantNil bool + originalErr error + }{ + { + name: "nil status returns nil", + status: nil, + wantNil: true, + }, + { + name: "success status returns nil", + status: NewNonErrorStatus(Success, dummyPlugin, "reason1"), + wantNil: true, + }, + { + name: "unschedulable status returns nil", + status: NewNonErrorStatus(ClusterUnschedulable, dummyPlugin, "reason1"), + wantNil: true, + }, + { + name: "internal error preserves error chain", + status: FromError(&customError{code: 503}, dummyPlugin, "reason1", "reason2"), + wantNil: false, + originalErr: &customError{code: 503}, + }, + { + name: "internal error with wrapped error preserves full chain", + status: FromError(fmt.Errorf("outer error: %w", &customError{code: 429}), dummyPlugin, "rate limited"), + wantNil: false, + originalErr: &customError{code: 429}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + err := tc.status.AsError() + + if tc.wantNil { + if err != nil { + t.Fatalf("AsError() = %v, want nil", err) + } + return + } + + if err == nil { + t.Fatalf("AsError() = nil, want non-nil error") + } + + // Verify error message contains plugin name + if !strings.Contains(err.Error(), dummyPlugin) { + t.Errorf("AsError().Error() = %q, want it to contain plugin name %q", err.Error(), dummyPlugin) + } + + // Verify the error chain is preserved using errors.As + var customErr *customError + if !errors.As(err, &customErr) { + t.Errorf("AsError() error chain broken: errors.As() could not find *customError in chain") + } else if customErr.code != tc.originalErr.(*customError).code { + t.Errorf("AsError() error chain: customError.code = %d, want %d", customErr.code, tc.originalErr.(*customError).code) + } + }) + } +} From 1bda7580add78ec3181ba3d00b1f108604d3a072 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Wed, 24 Jun 2026 18:50:59 -0500 Subject: [PATCH 05/12] fix linter error Signed-off-by: Britania Rodriguez Reyes --- pkg/scheduler/framework/status_test.go | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/scheduler/framework/status_test.go b/pkg/scheduler/framework/status_test.go index 26d5cb9bf..de52abe57 100644 --- a/pkg/scheduler/framework/status_test.go +++ b/pkg/scheduler/framework/status_test.go @@ -170,10 +170,10 @@ func (e *customError) Error() string { func TestAsError(t *testing.T) { testCases := []struct { - name string - status *Status - wantNil bool - originalErr error + name string + status *Status + wantNil bool + wantCode int }{ { name: "nil status returns nil", @@ -191,16 +191,16 @@ func TestAsError(t *testing.T) { wantNil: true, }, { - name: "internal error preserves error chain", - status: FromError(&customError{code: 503}, dummyPlugin, "reason1", "reason2"), - wantNil: false, - originalErr: &customError{code: 503}, + name: "internal error preserves error chain", + status: FromError(&customError{code: 503}, dummyPlugin, "reason1", "reason2"), + wantNil: false, + wantCode: 503, }, { - name: "internal error with wrapped error preserves full chain", - status: FromError(fmt.Errorf("outer error: %w", &customError{code: 429}), dummyPlugin, "rate limited"), - wantNil: false, - originalErr: &customError{code: 429}, + name: "internal error with wrapped error preserves full chain", + status: FromError(fmt.Errorf("outer error: %w", &customError{code: 429}), dummyPlugin, "rate limited"), + wantNil: false, + wantCode: 429, }, } @@ -228,8 +228,8 @@ func TestAsError(t *testing.T) { var customErr *customError if !errors.As(err, &customErr) { t.Errorf("AsError() error chain broken: errors.As() could not find *customError in chain") - } else if customErr.code != tc.originalErr.(*customError).code { - t.Errorf("AsError() error chain: customError.code = %d, want %d", customErr.code, tc.originalErr.(*customError).code) + } else if customErr.code != tc.wantCode { + t.Errorf("AsError() error chain: customError.code = %d, want %d", customErr.code, tc.wantCode) } }) } From dad343ea29c005ead21df9d743427369e90a48a8 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:17:31 -0500 Subject: [PATCH 06/12] Delete logs.txt --- logs.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 logs.txt diff --git a/logs.txt b/logs.txt deleted file mode 100644 index e69de29bb..000000000 From 34f75c902a1f86c70b4fb994475966f9f2e90452 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 25 Jun 2026 12:07:55 -0500 Subject: [PATCH 07/12] update eror handling to use retryable interface Signed-off-by: Britania Rodriguez Reyes --- go.mod | 2 +- .../compute/vmsizerecommenderclient_test.go | 13 +- pkg/clients/httputil/httputil.go | 20 +-- pkg/clients/httputil/httputil_test.go | 13 +- pkg/propertychecker/azure/checker.go | 22 ++- pkg/scheduler/framework/framework.go | 18 ++- pkg/utils/errors/errors.go | 54 +++++++ pkg/utils/errors/errors_test.go | 151 ++++++++++++++++++ 8 files changed, 253 insertions(+), 40 deletions(-) diff --git a/go.mod b/go.mod index a3c6c2acb..37c69a3ef 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,8 @@ require ( github.com/Azure/karpenter-provider-azure v1.5.1 github.com/crossplane/crossplane-runtime/v2 v2.1.0 github.com/evanphx/json-patch/v5 v5.9.11 - github.com/gofrs/uuid v4.4.0+incompatible github.com/go-logr/logr v1.4.3 + github.com/gofrs/uuid v4.4.0+incompatible github.com/google/go-cmp v0.7.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 github.com/onsi/ginkgo/v2 v2.23.4 diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go index 6f6373342..e651bd26e 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go @@ -7,6 +7,7 @@ package compute import ( "context" + "errors" "net/http" "strings" "testing" @@ -274,15 +275,17 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { return } - // Check if error is a transient HTTPError. + // Check if error is a transient HTTPError by checking IsRetryable. if tt.wantErr && tt.wantIsTransient { - if !httputil.IsTransientHTTPError(err) { - t.Errorf("GenerateAttributeBasedRecommendations() error = %v, want transient HTTPError", err) + var httpErr *httputil.HTTPError + if !errors.As(err, &httpErr) || !httpErr.IsRetryable() { + t.Errorf("GenerateAttributeBasedRecommendations() error = %v, want retryable HTTPError", err) } } if tt.wantErr && !tt.wantIsTransient && tt.mockStatusCode >= 400 && tt.mockStatusCode < 500 { - if httputil.IsTransientHTTPError(err) { - t.Errorf("GenerateAttributeBasedRecommendations() error = %v, should NOT be transient HTTPError for 4xx errors", err) + var httpErr *httputil.HTTPError + if errors.As(err, &httpErr) && httpErr.IsRetryable() { + t.Errorf("GenerateAttributeBasedRecommendations() error = %v, should NOT be retryable HTTPError for 4xx errors", err) } } diff --git a/pkg/clients/httputil/httputil.go b/pkg/clients/httputil/httputil.go index 6f7d75831..c4d8be4f2 100644 --- a/pkg/clients/httputil/httputil.go +++ b/pkg/clients/httputil/httputil.go @@ -7,7 +7,6 @@ Licensed under the MIT license. package httputil import ( - "errors" "fmt" "net/http" "time" @@ -55,7 +54,9 @@ var transientHTTPStatusCodes = map[int]bool{ } // HTTPError represents an HTTP error with a status code that can be checked -// for transient error conditions. +// for transient error conditions. HTTPError implements the RetryableError interface +// from pkg/utils/errors, allowing control loops to make retry decisions based on +// HTTP status codes without format-specific inspection. type HTTPError struct { StatusCode int Method string @@ -67,9 +68,9 @@ func (e *HTTPError) Error() string { return fmt.Sprintf("request failed with status %d: %s %s", e.StatusCode, e.Method, e.URL) } -// IsTransient returns true if this error represents a transient HTTP error -// that may succeed on retry. -func (e *HTTPError) IsTransient() bool { +// IsRetryable implements the RetryableError interface. +// It returns true for transient HTTP errors (429, 5xx) that may succeed on retry. +func (e *HTTPError) IsRetryable() bool { return transientHTTPStatusCodes[e.StatusCode] } @@ -81,12 +82,3 @@ func NewHTTPError(resp *http.Response) *HTTPError { URL: resp.Request.URL.String(), } } - -// IsTransientHTTPError checks if the given error is or wraps a transient HTTP error. -func IsTransientHTTPError(err error) bool { - var httpErr *HTTPError - if errors.As(err, &httpErr) { - return httpErr.IsTransient() - } - return false -} diff --git a/pkg/clients/httputil/httputil_test.go b/pkg/clients/httputil/httputil_test.go index 6de618041..047d5ac85 100644 --- a/pkg/clients/httputil/httputil_test.go +++ b/pkg/clients/httputil/httputil_test.go @@ -6,7 +6,6 @@ Licensed under the MIT license. package httputil import ( - "fmt" "net/http" "net/url" "testing" @@ -45,14 +44,12 @@ func TestHTTPError(t *testing.T) { } } -func TestIsTransientHTTPError(t *testing.T) { +func TestHTTPErrorIsRetryable(t *testing.T) { tests := []struct { name string - err error + err *HTTPError want bool }{ - {"nil error", nil, false}, - {"non-HTTP error", fmt.Errorf("some error"), false}, {"400 Bad Request", &HTTPError{StatusCode: http.StatusBadRequest}, false}, {"401 Unauthorized", &HTTPError{StatusCode: http.StatusUnauthorized}, false}, {"403 Forbidden", &HTTPError{StatusCode: http.StatusForbidden}, false}, @@ -62,14 +59,12 @@ func TestIsTransientHTTPError(t *testing.T) { {"502 Bad Gateway", &HTTPError{StatusCode: http.StatusBadGateway}, true}, {"503 Service Unavailable", &HTTPError{StatusCode: http.StatusServiceUnavailable}, true}, {"504 Gateway Timeout", &HTTPError{StatusCode: http.StatusGatewayTimeout}, true}, - {"wrapped transient error", fmt.Errorf("outer: %w", &HTTPError{StatusCode: http.StatusServiceUnavailable}), true}, - {"wrapped non-transient error", fmt.Errorf("outer: %w", &HTTPError{StatusCode: http.StatusBadRequest}), false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := IsTransientHTTPError(tt.err); got != tt.want { - t.Errorf("IsTransientHTTPError() = %v, want %v", got, tt.want) + if got := tt.err.IsRetryable(); got != tt.want { + t.Errorf("HTTPError.IsRetryable() = %v, want %v", got, tt.want) } }) } diff --git a/pkg/propertychecker/azure/checker.go b/pkg/propertychecker/azure/checker.go index c031beee6..3fc624fdc 100644 --- a/pkg/propertychecker/azure/checker.go +++ b/pkg/propertychecker/azure/checker.go @@ -18,6 +18,7 @@ import ( placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" computev1 "go.goms.io/fleet/apis/protos/azure/compute/v1" "go.goms.io/fleet/pkg/clients/azure/compute" + fleetErrors "go.goms.io/fleet/pkg/utils/errors" "go.goms.io/fleet/pkg/utils/labels" ) @@ -58,6 +59,10 @@ func NewPropertyChecker(vmSizeRecommenderClient compute.AttributeBasedVMSizeReco // // The cluster must have both Azure location and subscription ID labels configured. // Returns true if the SKU capacity requirement can be met, false otherwise. +// +// Errors returned implement the RetryableError interface to indicate whether the operation +// can be retried. Configuration errors (missing labels, invalid capacity) are non-retryable, +// while Azure API errors preserve the retryability of the underlying HTTP error. func (s *PropertyChecker) CheckIfMeetSKUCapacityRequirement( cluster *clusterv1beta1.MemberCluster, req placementv1beta1.PropertySelectorRequirement, @@ -65,18 +70,24 @@ func (s *PropertyChecker) CheckIfMeetSKUCapacityRequirement( ) (bool, error) { location, err := labels.ExtractLabelFromMemberCluster(cluster, labels.AzureLocationLabel) if err != nil { - return false, fmt.Errorf("failed to extract Azure location label from cluster %s: %w", cluster.Name, err) + // Missing label is a configuration error; not retryable. + return false, fleetErrors.NewUserError(err, + fmt.Sprintf("failed to extract Azure location label from cluster %s", cluster.Name)) } subID, err := labels.ExtractLabelFromMemberCluster(cluster, labels.AzureSubscriptionIDLabel) if err != nil { - return false, fmt.Errorf("failed to extract Azure subscription ID label from cluster %s: %w", cluster.Name, err) + // Missing label is a configuration error; not retryable. + return false, fleetErrors.NewUserError(err, + fmt.Sprintf("failed to extract Azure subscription ID label from cluster %s", cluster.Name)) } // Extract capacity requirements from the property selector requirement. capacity, err := extractCapacityRequirements(req) if err != nil { - return false, fmt.Errorf("failed to extract capacity requirements from property selector requirement: %w", err) + // Invalid capacity specification is a user error; not retryable. + return false, fleetErrors.NewUserError(err, + "failed to extract capacity requirements from property selector requirement") } // Request VM size recommendations to validate SKU availability and capacity. @@ -101,7 +112,10 @@ func (s *PropertyChecker) CheckIfMeetSKUCapacityRequirement( respObj, err := s.vmSizeRecommenderClient.GenerateAttributeBasedRecommendations(context.Background(), request) if err != nil { - return false, fmt.Errorf("failed to generate VM size recommendations from Azure: %w", err) + // Wrap the error with context. The underlying HTTPError (if present) implements + // RetryableError, so fleetErrors.IsRetryable() will detect it in the error chain. + return false, fleetErrors.Wraps(err, + fmt.Sprintf("failed to generate VM size recommendations from Azure for SKU %s in cluster %s", sku, cluster.Name)) } // This check is a defense mechanism; vmSizeRecommenderClient should return a VM size recommendation diff --git a/pkg/scheduler/framework/framework.go b/pkg/scheduler/framework/framework.go index cc318facf..c562b7850 100644 --- a/pkg/scheduler/framework/framework.go +++ b/pkg/scheduler/framework/framework.go @@ -40,12 +40,12 @@ import ( clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" - "go.goms.io/fleet/pkg/clients/httputil" "go.goms.io/fleet/pkg/scheduler/clustereligibilitychecker" "go.goms.io/fleet/pkg/scheduler/queue" "go.goms.io/fleet/pkg/utils/annotations" "go.goms.io/fleet/pkg/utils/condition" "go.goms.io/fleet/pkg/utils/controller" + fleetErrors "go.goms.io/fleet/pkg/utils/errors" "go.goms.io/fleet/pkg/utils/parallelizer" ) @@ -540,9 +540,11 @@ func (f *framework) runAllPluginsForPickAllPlacementType( passed, filtered, err := f.runFilterPlugins(ctx, state, policy, clusters) if err != nil { klog.ErrorS(err, "Failed to run filter plugins", "policySnapshot", policyRef) - // If the error is a transient HTTP error (e.g., 503 from external service), - // return it as-is so the scheduler can requeue. Otherwise, wrap it as unexpected behavior. - if httputil.IsTransientHTTPError(err) { + // Check if the error is retryable using structured error semantics. + // If the error (or any error in its chain) implements RetryableError and indicates + // it's retryable, return it as-is so the scheduler can requeue. + // Otherwise, wrap it as unexpected behavior. + if retryable, found := fleetErrors.IsRetryable(err); found && retryable { return nil, nil, err } return nil, nil, controller.NewUnexpectedBehaviorError(err) @@ -1165,9 +1167,11 @@ func (f *framework) runAllPluginsForPickNPlacementType( passed, filtered, err := f.runFilterPlugins(ctx, state, policy, clusters) if err != nil { klog.ErrorS(err, "Failed to run filter plugins", "policySnapshot", policyRef) - // If the error is a transient HTTP error (e.g., 503 from external service), - // return it as-is so the scheduler can requeue. Otherwise, wrap it as unexpected behavior. - if httputil.IsTransientHTTPError(err) { + // Check if the error is retryable using structured error semantics. + // If the error (or any error in its chain) implements RetryableError and indicates + // it's retryable, return it as-is so the scheduler can requeue. + // Otherwise, wrap it as unexpected behavior. + if retryable, found := fleetErrors.IsRetryable(err); found && retryable { return nil, nil, err } return nil, nil, controller.NewUnexpectedBehaviorError(err) diff --git a/pkg/utils/errors/errors.go b/pkg/utils/errors/errors.go index 3f04059e5..573ce6d01 100644 --- a/pkg/utils/errors/errors.go +++ b/pkg/utils/errors/errors.go @@ -54,7 +54,38 @@ const ( ErrCategoryUncategorized ErrCategory = "uncategorized" ) +// RetryableError is an interface that errors can implement to indicate whether the +// operation that caused the error can be retried. This allows the control loop to +// make retry decisions based on error semantics rather than inspecting error formats. +type RetryableError interface { + error + // IsRetryable returns true if the error is transient and the operation may succeed + // on retry. Returns false if the error is permanent and retrying would not help. + IsRetryable() bool +} + +// IsRetryable checks if the given error (or any error in its chain) indicates that the +// operation can be retried. It traverses the error chain using errors.As to find any +// error that implements RetryableError interface. +// +// Returns: +// - (true, true) if a RetryableError is found and IsRetryable() returns true +// - (false, true) if a RetryableError is found and IsRetryable() returns false +// - (false, false) if no RetryableError is found in the chain +func IsRetryable(err error) (retryable bool, found bool) { + if err == nil { + return false, false + } + + var retryableErr RetryableError + if errors.As(err, &retryableErr) { + return retryableErr.IsRetryable(), true + } + return false, false +} + var _ error = &Error{} +var _ RetryableError = &Error{} type Error struct { // category is the category of the error. @@ -77,6 +108,29 @@ func (e *Error) categoryWithDefault() ErrCategory { return e.category } +// IsRetryable implements the RetryableError interface. +// It determines retryability based on the error category: +// - ErrCategoryTransient: retryable (will self-resolve) +// - ErrCategoryAPIServer: retryable (API server issues are often transient) +// - ErrCategoryUnexpected: not retryable (unknown state, cannot recover) +// - ErrCategoryUser: not retryable (requires user action to fix) +// - ErrCategoryUncategorized: retryable (default to retry when unknown) +func (e *Error) IsRetryable() bool { + switch e.category { + case ErrCategoryTransient: + return true + case ErrCategoryAPIServer: + return true + case ErrCategoryUnexpected: + return false + case ErrCategoryUser: + return false + default: + // ErrCategoryUncategorized or unknown: default to retryable. + return true + } +} + // Error implements the error interface. // // Note the output intentionally does not include the additional attributes, so as to keep the cardinality diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go index d715640fa..fdf0e672a 100644 --- a/pkg/utils/errors/errors_test.go +++ b/pkg/utils/errors/errors_test.go @@ -578,3 +578,154 @@ func readFromBuffer(t *testing.T, buf *bytes.Buffer) string { outputStr := string(output) return outputStr } + +// mockRetryableError is a test helper that implements the RetryableError interface. +type mockRetryableError struct { + retryable bool +} + +func (e *mockRetryableError) Error() string { + return "mock retryable error" +} + +func (e *mockRetryableError) IsRetryable() bool { + return e.retryable +} + +func TestIsRetryableFunction(t *testing.T) { + testCases := []struct { + name string + err error + wantRetryable bool + wantFound bool + }{ + { + name: "nil error", + err: nil, + wantRetryable: false, + wantFound: false, + }, + { + name: "plain error (no RetryableError in chain)", + err: fmt.Errorf("plain error"), + wantRetryable: false, + wantFound: false, + }, + { + name: "mock retryable error (retryable=true)", + err: &mockRetryableError{retryable: true}, + wantRetryable: true, + wantFound: true, + }, + { + name: "mock retryable error (retryable=false)", + err: &mockRetryableError{retryable: false}, + wantRetryable: false, + wantFound: true, + }, + { + name: "wrapped mock retryable error", + err: fmt.Errorf("wrapped: %w", &mockRetryableError{retryable: true}), + wantRetryable: true, + wantFound: true, + }, + { + name: "Error with ErrCategoryTransient (retryable)", + err: &Error{category: ErrCategoryTransient}, + wantRetryable: true, + wantFound: true, + }, + { + name: "Error with ErrCategoryUser (not retryable)", + err: &Error{category: ErrCategoryUser}, + wantRetryable: false, + wantFound: true, + }, + { + name: "Error with ErrCategoryUnexpected (not retryable)", + err: &Error{category: ErrCategoryUnexpected}, + wantRetryable: false, + wantFound: true, + }, + { + name: "Error with ErrCategoryAPIServer (retryable)", + err: &Error{category: ErrCategoryAPIServer}, + wantRetryable: true, + wantFound: true, + }, + { + name: "Error with ErrCategoryUncategorized (retryable by default)", + err: &Error{category: ErrCategoryUncategorized}, + wantRetryable: true, + wantFound: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotRetryable, gotFound := IsRetryable(tc.err) + if gotRetryable != tc.wantRetryable { + t.Errorf("IsRetryable() retryable = %v, want %v", gotRetryable, tc.wantRetryable) + } + if gotFound != tc.wantFound { + t.Errorf("IsRetryable() found = %v, want %v", gotFound, tc.wantFound) + } + }) + } +} + +func TestErrorIsRetryable(t *testing.T) { + testCases := []struct { + name string + err *Error + wantRetryable bool + }{ + { + name: "ErrCategoryTransient is retryable", + err: &Error{category: ErrCategoryTransient}, + wantRetryable: true, + }, + { + name: "ErrCategoryAPIServer is retryable", + err: &Error{category: ErrCategoryAPIServer}, + wantRetryable: true, + }, + { + name: "ErrCategoryUnexpected is not retryable", + err: &Error{category: ErrCategoryUnexpected}, + wantRetryable: false, + }, + { + name: "ErrCategoryUser is not retryable", + err: &Error{category: ErrCategoryUser}, + wantRetryable: false, + }, + { + name: "ErrCategoryUncategorized is retryable by default", + err: &Error{category: ErrCategoryUncategorized}, + wantRetryable: true, + }, + { + name: "empty category defaults to retryable", + err: &Error{}, + wantRetryable: true, + }, + { + name: "Error with wrapped error uses its own category", + err: &Error{ + category: ErrCategoryUser, + wrapped: fmt.Errorf("plain error"), + }, + wantRetryable: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + gotRetryable := tc.err.IsRetryable() + if gotRetryable != tc.wantRetryable { + t.Errorf("IsRetryable() = %v, want %v", gotRetryable, tc.wantRetryable) + } + }) + } +} From bba83232de196530f7a488a369fb9df9c73faacc Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 25 Jun 2026 12:25:34 -0500 Subject: [PATCH 08/12] remove http error struct and use existing with retryable interface Signed-off-by: Britania Rodriguez Reyes --- .../azure/compute/vmsizerecommenderclient.go | 10 ++- .../compute/vmsizerecommenderclient_test.go | 17 +++-- pkg/clients/httputil/httputil.go | 33 ++-------- pkg/clients/httputil/httputil_test.go | 64 +++++-------------- pkg/propertychecker/azure/checker.go | 5 +- 5 files changed, 38 insertions(+), 91 deletions(-) diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient.go b/pkg/clients/azure/compute/vmsizerecommenderclient.go index 240e5419e..5c6fb2822 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient.go @@ -23,6 +23,7 @@ import ( computev1 "go.goms.io/fleet/apis/protos/azure/compute/v1" "go.goms.io/fleet/pkg/clients/httputil" "go.goms.io/fleet/pkg/utils/controller" + fleetErrors "go.goms.io/fleet/pkg/utils/errors" ) const ( @@ -136,9 +137,14 @@ func (c *AttributeBasedVMSizeRecommenderClient) GenerateAttributeBasedRecommenda return nil, fmt.Errorf("failed to read response body: %w", err) } - // Check status code + // Check status code - categorize based on transient vs non-transient errors if resp.StatusCode < 200 || resp.StatusCode >= 300 { - return nil, httputil.NewHTTPError(resp) + desc := fmt.Sprintf("request failed with status %d: %s %s", resp.StatusCode, httpReq.Method, url) + if httputil.IsTransientStatusCode(resp.StatusCode) { + return nil, fleetErrors.NewTransientError(nil, desc) + } + // Non-transient errors (4xx client errors) should not be retried. + return nil, fleetErrors.NewUnexpectedError(nil, desc) } // Unmarshal response using protojson for proper proto3 support diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go index e651bd26e..0a2bfbd64 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go @@ -7,7 +7,6 @@ package compute import ( "context" - "errors" "net/http" "strings" "testing" @@ -16,7 +15,7 @@ import ( "google.golang.org/protobuf/proto" computev1 "go.goms.io/fleet/apis/protos/azure/compute/v1" - "go.goms.io/fleet/pkg/clients/httputil" + fleetErrors "go.goms.io/fleet/pkg/utils/errors" "go.goms.io/fleet/test/utils/azure/compute" ) @@ -275,17 +274,17 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { return } - // Check if error is a transient HTTPError by checking IsRetryable. + // Check if error is retryable using fleetErrors.IsRetryable. if tt.wantErr && tt.wantIsTransient { - var httpErr *httputil.HTTPError - if !errors.As(err, &httpErr) || !httpErr.IsRetryable() { - t.Errorf("GenerateAttributeBasedRecommendations() error = %v, want retryable HTTPError", err) + retryable, found := fleetErrors.IsRetryable(err) + if !found || !retryable { + t.Errorf("GenerateAttributeBasedRecommendations() error = %v, want retryable error", err) } } if tt.wantErr && !tt.wantIsTransient && tt.mockStatusCode >= 400 && tt.mockStatusCode < 500 { - var httpErr *httputil.HTTPError - if errors.As(err, &httpErr) && httpErr.IsRetryable() { - t.Errorf("GenerateAttributeBasedRecommendations() error = %v, should NOT be retryable HTTPError for 4xx errors", err) + retryable, found := fleetErrors.IsRetryable(err) + if found && retryable { + t.Errorf("GenerateAttributeBasedRecommendations() error = %v, should NOT be retryable for 4xx errors", err) } } diff --git a/pkg/clients/httputil/httputil.go b/pkg/clients/httputil/httputil.go index c4d8be4f2..3af8faed4 100644 --- a/pkg/clients/httputil/httputil.go +++ b/pkg/clients/httputil/httputil.go @@ -7,7 +7,6 @@ Licensed under the MIT license. package httputil import ( - "fmt" "net/http" "time" @@ -53,32 +52,8 @@ var transientHTTPStatusCodes = map[int]bool{ http.StatusGatewayTimeout: true, // 504 - Gateway timeout } -// HTTPError represents an HTTP error with a status code that can be checked -// for transient error conditions. HTTPError implements the RetryableError interface -// from pkg/utils/errors, allowing control loops to make retry decisions based on -// HTTP status codes without format-specific inspection. -type HTTPError struct { - StatusCode int - Method string - URL string -} - -// Error implements the error interface for HTTPError. -func (e *HTTPError) Error() string { - return fmt.Sprintf("request failed with status %d: %s %s", e.StatusCode, e.Method, e.URL) -} - -// IsRetryable implements the RetryableError interface. -// It returns true for transient HTTP errors (429, 5xx) that may succeed on retry. -func (e *HTTPError) IsRetryable() bool { - return transientHTTPStatusCodes[e.StatusCode] -} - -// NewHTTPError creates a new HTTPError from an HTTP response. -func NewHTTPError(resp *http.Response) *HTTPError { - return &HTTPError{ - StatusCode: resp.StatusCode, - Method: resp.Request.Method, - URL: resp.Request.URL.String(), - } +// IsTransientStatusCode returns true if the HTTP status code indicates a transient +// error that may succeed on retry (429, 5xx). +func IsTransientStatusCode(statusCode int) bool { + return transientHTTPStatusCodes[statusCode] } diff --git a/pkg/clients/httputil/httputil_test.go b/pkg/clients/httputil/httputil_test.go index 047d5ac85..aaf6affcf 100644 --- a/pkg/clients/httputil/httputil_test.go +++ b/pkg/clients/httputil/httputil_test.go @@ -7,64 +7,30 @@ package httputil import ( "net/http" - "net/url" "testing" ) -func TestHTTPError(t *testing.T) { - resp := &http.Response{ - StatusCode: http.StatusServiceUnavailable, - Request: &http.Request{ - Method: http.MethodPost, - URL: &url.URL{ - Scheme: "https", - Host: "example.com", - Path: "/api/v1/resource", - }, - }, - } - - httpErr := NewHTTPError(resp) - - // Test fields are populated correctly. - if httpErr.StatusCode != http.StatusServiceUnavailable { - t.Errorf("HTTPError.StatusCode = %d, want %d", httpErr.StatusCode, http.StatusServiceUnavailable) - } - if httpErr.Method != http.MethodPost { - t.Errorf("HTTPError.Method = %q, want %q", httpErr.Method, http.MethodPost) - } - if httpErr.URL != "https://example.com/api/v1/resource" { - t.Errorf("HTTPError.URL = %q, want %q", httpErr.URL, "https://example.com/api/v1/resource") - } - - // Test Error() method. - wantErrMsg := "request failed with status 503: POST https://example.com/api/v1/resource" - if got := httpErr.Error(); got != wantErrMsg { - t.Errorf("HTTPError.Error() = %q, want %q", got, wantErrMsg) - } -} - -func TestHTTPErrorIsRetryable(t *testing.T) { +func TestIsTransientStatusCode(t *testing.T) { tests := []struct { - name string - err *HTTPError - want bool + name string + statusCode int + want bool }{ - {"400 Bad Request", &HTTPError{StatusCode: http.StatusBadRequest}, false}, - {"401 Unauthorized", &HTTPError{StatusCode: http.StatusUnauthorized}, false}, - {"403 Forbidden", &HTTPError{StatusCode: http.StatusForbidden}, false}, - {"404 Not Found", &HTTPError{StatusCode: http.StatusNotFound}, false}, - {"429 Too Many Requests", &HTTPError{StatusCode: http.StatusTooManyRequests}, true}, - {"500 Internal Server Error", &HTTPError{StatusCode: http.StatusInternalServerError}, true}, - {"502 Bad Gateway", &HTTPError{StatusCode: http.StatusBadGateway}, true}, - {"503 Service Unavailable", &HTTPError{StatusCode: http.StatusServiceUnavailable}, true}, - {"504 Gateway Timeout", &HTTPError{StatusCode: http.StatusGatewayTimeout}, true}, + {"400 Bad Request", http.StatusBadRequest, false}, + {"401 Unauthorized", http.StatusUnauthorized, false}, + {"403 Forbidden", http.StatusForbidden, false}, + {"404 Not Found", http.StatusNotFound, false}, + {"429 Too Many Requests", http.StatusTooManyRequests, true}, + {"500 Internal Server Error", http.StatusInternalServerError, true}, + {"502 Bad Gateway", http.StatusBadGateway, true}, + {"503 Service Unavailable", http.StatusServiceUnavailable, true}, + {"504 Gateway Timeout", http.StatusGatewayTimeout, true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := tt.err.IsRetryable(); got != tt.want { - t.Errorf("HTTPError.IsRetryable() = %v, want %v", got, tt.want) + if got := IsTransientStatusCode(tt.statusCode); got != tt.want { + t.Errorf("IsTransientStatusCode(%d) = %v, want %v", tt.statusCode, got, tt.want) } }) } diff --git a/pkg/propertychecker/azure/checker.go b/pkg/propertychecker/azure/checker.go index 3fc624fdc..3aad55c3b 100644 --- a/pkg/propertychecker/azure/checker.go +++ b/pkg/propertychecker/azure/checker.go @@ -112,8 +112,9 @@ func (s *PropertyChecker) CheckIfMeetSKUCapacityRequirement( respObj, err := s.vmSizeRecommenderClient.GenerateAttributeBasedRecommendations(context.Background(), request) if err != nil { - // Wrap the error with context. The underlying HTTPError (if present) implements - // RetryableError, so fleetErrors.IsRetryable() will detect it in the error chain. + // Wrap the error with context. The underlying error already has the appropriate + // category set (transient for 429/5xx, API server error for other failures), + // so fleetErrors.IsRetryable() will detect it in the error chain. return false, fleetErrors.Wraps(err, fmt.Sprintf("failed to generate VM size recommendations from Azure for SKU %s in cluster %s", sku, cluster.Name)) } From 875d0f660ce15698826e8bb9d855a3a049b11df8 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 25 Jun 2026 12:38:44 -0500 Subject: [PATCH 09/12] add failing scheduling event Signed-off-by: Britania Rodriguez Reyes --- pkg/scheduler/framework/framework.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/scheduler/framework/framework.go b/pkg/scheduler/framework/framework.go index c562b7850..50601b566 100644 --- a/pkg/scheduler/framework/framework.go +++ b/pkg/scheduler/framework/framework.go @@ -27,6 +27,7 @@ import ( "time" "golang.org/x/sync/errgroup" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -58,6 +59,10 @@ const ( // NotFullyScheduledReason is the reason string of placement condition when the placement policy cannot be fully satisfied. NotFullyScheduledReason = "SchedulingPolicyUnfulfilled" + // Event reasons for scheduling errors. + // SchedulingErrorReason is used when the scheduler encounters an error during scheduling. + SchedulingErrorReason = "SchedulingError" + fullyScheduledMessage = "found all cluster needed as specified by the scheduling policy, found %d cluster(s)" notFullyScheduledMessage = "could not find all clusters needed as specified by the scheduling policy, found %d cluster(s) instead" @@ -540,6 +545,9 @@ func (f *framework) runAllPluginsForPickAllPlacementType( passed, filtered, err := f.runFilterPlugins(ctx, state, policy, clusters) if err != nil { klog.ErrorS(err, "Failed to run filter plugins", "policySnapshot", policyRef) + // Emit an event to inform the user about the scheduling error. + f.eventRecorder.Event(policy, corev1.EventTypeWarning, SchedulingErrorReason, + fmt.Sprintf("Failed to run filter plugins: %v", err)) // Check if the error is retryable using structured error semantics. // If the error (or any error in its chain) implements RetryableError and indicates // it's retryable, return it as-is so the scheduler can requeue. @@ -1167,6 +1175,9 @@ func (f *framework) runAllPluginsForPickNPlacementType( passed, filtered, err := f.runFilterPlugins(ctx, state, policy, clusters) if err != nil { klog.ErrorS(err, "Failed to run filter plugins", "policySnapshot", policyRef) + // Emit an event to inform the user about the scheduling error. + f.eventRecorder.Event(policy, corev1.EventTypeWarning, SchedulingErrorReason, + fmt.Sprintf("Failed to run filter plugins: %v", err)) // Check if the error is retryable using structured error semantics. // If the error (or any error in its chain) implements RetryableError and indicates // it's retryable, return it as-is so the scheduler can requeue. From f872549d1a8aa1f48e921422689e66df10764c84 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 25 Jun 2026 13:51:24 -0500 Subject: [PATCH 10/12] fix UT failure Signed-off-by: Britania Rodriguez Reyes --- pkg/scheduler/framework/framework_test.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/scheduler/framework/framework_test.go b/pkg/scheduler/framework/framework_test.go index c7124916c..a8cfd6fd0 100644 --- a/pkg/scheduler/framework/framework_test.go +++ b/pkg/scheduler/framework/framework_test.go @@ -36,6 +36,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -1323,8 +1324,9 @@ func TestRunAllPluginsForPickAllPlacementType(t *testing.T) { profile.WithFilterPlugin(p) } f := &framework{ - profile: profile, - parallelizer: parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), + profile: profile, + parallelizer: parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), + eventRecorder: record.NewFakeRecorder(10), } ctx := context.Background() @@ -6253,8 +6255,9 @@ func TestRunAllPluginsForPickNPlacementType(t *testing.T) { } f := &framework{ - profile: profile, - parallelizer: parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), + profile: profile, + parallelizer: parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), + eventRecorder: record.NewFakeRecorder(10), } ctx := context.Background() From 06563b50cbee43296e4979b883897412bded5e0c Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 25 Jun 2026 18:42:41 -0500 Subject: [PATCH 11/12] update naming of error with retry Signed-off-by: Britania Rodriguez Reyes --- pkg/utils/errors/errors.go | 28 ++++++++++++++-------------- pkg/utils/errors/errors_test.go | 24 ++++++++++++------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pkg/utils/errors/errors.go b/pkg/utils/errors/errors.go index 573ce6d01..16236df86 100644 --- a/pkg/utils/errors/errors.go +++ b/pkg/utils/errors/errors.go @@ -54,38 +54,38 @@ const ( ErrCategoryUncategorized ErrCategory = "uncategorized" ) -// RetryableError is an interface that errors can implement to indicate whether the +// ErrorWithRetryPolicy is an interface that errors can implement to indicate whether the // operation that caused the error can be retried. This allows the control loop to // make retry decisions based on error semantics rather than inspecting error formats. -type RetryableError interface { +type ErrorWithRetryPolicy interface { error // IsRetryable returns true if the error is transient and the operation may succeed // on retry. Returns false if the error is permanent and retrying would not help. IsRetryable() bool } -// IsRetryable checks if the given error (or any error in its chain) indicates that the -// operation can be retried. It traverses the error chain using errors.As to find any -// error that implements RetryableError interface. +// IsRetryable checks if the given error (or any error in its chain) has a retry policy +// configured. It traverses the error chain using errors.As to find any error that +// implements ErrorWithRetryPolicy interface. // // Returns: -// - (true, true) if a RetryableError is found and IsRetryable() returns true -// - (false, true) if a RetryableError is found and IsRetryable() returns false -// - (false, false) if no RetryableError is found in the chain -func IsRetryable(err error) (retryable bool, found bool) { +// - (true, true) if an ErrorWithRetryPolicy is found and IsRetryable() returns true +// - (false, true) if an ErrorWithRetryPolicy is found and IsRetryable() returns false +// - (false, false) if no ErrorWithRetryPolicy is found in the chain (no retry policy configured) +func IsRetryable(err error) (isRetryable bool, hasRetryPolicy bool) { if err == nil { return false, false } - var retryableErr RetryableError - if errors.As(err, &retryableErr) { - return retryableErr.IsRetryable(), true + var errWithPolicy ErrorWithRetryPolicy + if errors.As(err, &errWithPolicy) { + return errWithPolicy.IsRetryable(), true } return false, false } var _ error = &Error{} -var _ RetryableError = &Error{} +var _ ErrorWithRetryPolicy = &Error{} type Error struct { // category is the category of the error. @@ -108,7 +108,7 @@ func (e *Error) categoryWithDefault() ErrCategory { return e.category } -// IsRetryable implements the RetryableError interface. +// IsRetryable implements the ErrorWithRetryPolicy interface. // It determines retryability based on the error category: // - ErrCategoryTransient: retryable (will self-resolve) // - ErrCategoryAPIServer: retryable (API server issues are often transient) diff --git a/pkg/utils/errors/errors_test.go b/pkg/utils/errors/errors_test.go index fdf0e672a..277cbc2e3 100644 --- a/pkg/utils/errors/errors_test.go +++ b/pkg/utils/errors/errors_test.go @@ -579,16 +579,16 @@ func readFromBuffer(t *testing.T, buf *bytes.Buffer) string { return outputStr } -// mockRetryableError is a test helper that implements the RetryableError interface. -type mockRetryableError struct { +// mockErrorWithRetryPolicy is a test helper that implements the ErrorWithRetryPolicy interface. +type mockErrorWithRetryPolicy struct { retryable bool } -func (e *mockRetryableError) Error() string { - return "mock retryable error" +func (e *mockErrorWithRetryPolicy) Error() string { + return "mock error with retry policy" } -func (e *mockRetryableError) IsRetryable() bool { +func (e *mockErrorWithRetryPolicy) IsRetryable() bool { return e.retryable } @@ -606,26 +606,26 @@ func TestIsRetryableFunction(t *testing.T) { wantFound: false, }, { - name: "plain error (no RetryableError in chain)", + name: "plain error (no retry policy in chain)", err: fmt.Errorf("plain error"), wantRetryable: false, wantFound: false, }, { - name: "mock retryable error (retryable=true)", - err: &mockRetryableError{retryable: true}, + name: "error with retry policy (retryable=true)", + err: &mockErrorWithRetryPolicy{retryable: true}, wantRetryable: true, wantFound: true, }, { - name: "mock retryable error (retryable=false)", - err: &mockRetryableError{retryable: false}, + name: "error with retry policy (retryable=false)", + err: &mockErrorWithRetryPolicy{retryable: false}, wantRetryable: false, wantFound: true, }, { - name: "wrapped mock retryable error", - err: fmt.Errorf("wrapped: %w", &mockRetryableError{retryable: true}), + name: "wrapped error with retry policy", + err: fmt.Errorf("wrapped: %w", &mockErrorWithRetryPolicy{retryable: true}), wantRetryable: true, wantFound: true, }, From ff4cb01c98b6d225de9cddc1264fa6e23edf47b3 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 25 Jun 2026 18:46:23 -0500 Subject: [PATCH 12/12] update error with retry naming in all places Signed-off-by: Britania Rodriguez Reyes --- .../azure/compute/vmsizerecommenderclient_test.go | 10 +++++----- pkg/scheduler/framework/framework.go | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go index 0a2bfbd64..00933938f 100644 --- a/pkg/clients/azure/compute/vmsizerecommenderclient_test.go +++ b/pkg/clients/azure/compute/vmsizerecommenderclient_test.go @@ -274,16 +274,16 @@ func TestClient_GenerateAttributeBasedRecommendations(t *testing.T) { return } - // Check if error is retryable using fleetErrors.IsRetryable. + // Check if error has retry policy using fleetErrors.IsRetryable. if tt.wantErr && tt.wantIsTransient { - retryable, found := fleetErrors.IsRetryable(err) - if !found || !retryable { + isRetryable, hasRetryPolicy := fleetErrors.IsRetryable(err) + if !hasRetryPolicy || !isRetryable { t.Errorf("GenerateAttributeBasedRecommendations() error = %v, want retryable error", err) } } if tt.wantErr && !tt.wantIsTransient && tt.mockStatusCode >= 400 && tt.mockStatusCode < 500 { - retryable, found := fleetErrors.IsRetryable(err) - if found && retryable { + isRetryable, hasRetryPolicy := fleetErrors.IsRetryable(err) + if hasRetryPolicy && isRetryable { t.Errorf("GenerateAttributeBasedRecommendations() error = %v, should NOT be retryable for 4xx errors", err) } } diff --git a/pkg/scheduler/framework/framework.go b/pkg/scheduler/framework/framework.go index 50601b566..58484b44e 100644 --- a/pkg/scheduler/framework/framework.go +++ b/pkg/scheduler/framework/framework.go @@ -548,11 +548,11 @@ func (f *framework) runAllPluginsForPickAllPlacementType( // Emit an event to inform the user about the scheduling error. f.eventRecorder.Event(policy, corev1.EventTypeWarning, SchedulingErrorReason, fmt.Sprintf("Failed to run filter plugins: %v", err)) - // Check if the error is retryable using structured error semantics. - // If the error (or any error in its chain) implements RetryableError and indicates + // Check if the error has a retry policy configured. + // If the error (or any error in its chain) implements ErrorWithRetryPolicy and indicates // it's retryable, return it as-is so the scheduler can requeue. // Otherwise, wrap it as unexpected behavior. - if retryable, found := fleetErrors.IsRetryable(err); found && retryable { + if isRetryable, hasRetryPolicy := fleetErrors.IsRetryable(err); hasRetryPolicy && isRetryable { return nil, nil, err } return nil, nil, controller.NewUnexpectedBehaviorError(err) @@ -1178,11 +1178,11 @@ func (f *framework) runAllPluginsForPickNPlacementType( // Emit an event to inform the user about the scheduling error. f.eventRecorder.Event(policy, corev1.EventTypeWarning, SchedulingErrorReason, fmt.Sprintf("Failed to run filter plugins: %v", err)) - // Check if the error is retryable using structured error semantics. - // If the error (or any error in its chain) implements RetryableError and indicates + // Check if the error has a retry policy configured. + // If the error (or any error in its chain) implements ErrorWithRetryPolicy and indicates // it's retryable, return it as-is so the scheduler can requeue. // Otherwise, wrap it as unexpected behavior. - if retryable, found := fleetErrors.IsRetryable(err); found && retryable { + if isRetryable, hasRetryPolicy := fleetErrors.IsRetryable(err); hasRetryPolicy && isRetryable { return nil, nil, err } return nil, nil, controller.NewUnexpectedBehaviorError(err)