From 7fe6fdd972efa5bb1bf8a1900ffbcf3160f871d6 Mon Sep 17 00:00:00 2001 From: Yu Yi Date: Tue, 9 Jun 2026 19:06:59 -0400 Subject: [PATCH] =?UTF-8?q?loop:=20graduated=20guardrails=20=E2=80=94=20id?= =?UTF-8?q?entical-call=20and=20consecutive-mistake=20detectors=20(closes?= =?UTF-8?q?=20#344)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two detectors watch every tool round, on by default with opt-out via RunRequest.Guardrails (zero value = defaults, mirroring RetryPolicy): - Identical-call: a single-call round whose name+canonicalized-args hash matches the previous round's extends a streak. At 3 the loop injects a corrective user message ("it will keep producing the same result — change arguments or approach"); at 5 the run ends with the typed ErrRepeatedToolCalls. Any change of arguments resets. - Consecutive mistakes: rounds where every tool result IsError. At 3 a corrective message ("re-read the error messages before acting"); at 6 the run ends with ErrTooManyMistakes. Any success resets. Injected messages are marked glue/guardrail in metadata; EventGuardrail reports kind/count/action so UIs and the goal loop can surface them. Graduated nudge-then-halt policy per Gemini CLI, with Cline's thresholds (docs/coding-harness-roadmap.md P2.8). The "no tool used" reprompt from the roadmap is deliberately omitted: glue turns legitimately end without tool calls, and the narrated-stall case is covered by AutoContinue (#343). TestRunMaxTurnsDefaultIs32 now disables guardrails — it scripts 50 identical calls to test the turn budget, which the repeat detector would correctly halt first. Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 15 +++ loop/guardrails.go | 195 +++++++++++++++++++++++++++++++++++++ loop/guardrails_test.go | 209 ++++++++++++++++++++++++++++++++++++++++ loop/run.go | 21 ++++ loop/run_test.go | 3 + 5 files changed, 443 insertions(+) create mode 100644 loop/guardrails.go create mode 100644 loop/guardrails_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index bdfc852..b1df2b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,21 @@ and [`agents/peggy/CHANGELOG.md`](agents/peggy/CHANGELOG.md). ## Unreleased +- **Loop guardrails (`loop`).** Two graduated detectors now watch every + tool round, on by default (`RunRequest.Guardrails`, zero value = + defaults; `Disabled` opts out): repeating the **same tool call with + identical arguments** draws a corrective user message at 3 + consecutive occurrences and ends the run with the typed + `ErrRepeatedToolCalls` at 5; **consecutive all-error tool rounds** + draw a corrective message at 3 and end the run with + `ErrTooManyMistakes` at 6. Streaks reset on any change of arguments + or any successful result; injected messages are marked + `glue/guardrail` in metadata and `EventGuardrail` reports + kind/count/action. These are the failure shapes that waste tokens + fastest on open-weight models — and unattended `glue goal` runs + ([docs/coding-harness-roadmap.md](docs/coding-harness-roadmap.md) + P2.8). Closes #344. + - **Next-speaker stall recovery (`loop`, `glue`, `cmd/glue`).** Gemini's classic stall — narrating "I will now update the file." and stopping without calling a tool — is now recovered: with the new opt-in diff --git a/loop/guardrails.go b/loop/guardrails.go new file mode 100644 index 0000000..792fab0 --- /dev/null +++ b/loop/guardrails.go @@ -0,0 +1,195 @@ +package loop + +import ( + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "strings" +) + +// Guardrails detect two pathological loop shapes that waste tokens +// fastest on open-weight models: repeating the same tool call with +// identical arguments, and burning turn after turn on nothing but +// failing tool calls. The response is graduated (Gemini CLI's policy): +// first inject a corrective user message, only halt when the model +// ignores it. + +// GuardrailPolicy bounds the loop guardrails. The zero value enables +// them with defaults; set Disabled to turn them off. +type GuardrailPolicy struct { + // Disabled turns the guardrails off entirely. + Disabled bool + + // RepeatNudge is the number of consecutive identical tool calls + // (same name, same canonical arguments) that triggers a corrective + // user message. Zero means DefaultRepeatNudge. + RepeatNudge int + + // RepeatHalt is the number of consecutive identical calls that + // ends the run with [ErrRepeatedToolCalls]. Zero means + // DefaultRepeatHalt. + RepeatHalt int + + // MistakeNudge is the number of consecutive all-error tool rounds + // that triggers a corrective user message. Zero means + // DefaultMistakeNudge. + MistakeNudge int + + // MistakeHalt is the number of consecutive all-error tool rounds + // that ends the run with [ErrTooManyMistakes]. Zero means + // DefaultMistakeHalt. + MistakeHalt int +} + +// Guardrail defaults, matching the thresholds Cline and Gemini CLI +// converged on. +const ( + DefaultRepeatNudge = 3 + DefaultRepeatHalt = 5 + DefaultMistakeNudge = 3 + DefaultMistakeHalt = 6 +) + +// EventGuardrail is emitted when a guardrail injects a corrective +// message or halts the run. Event.Error carries the reason; +// Event.Metadata carries "kind" ("repeat" or "mistakes"), "count", and +// "action" ("nudge" or "halt"). +const EventGuardrail EventType = "guardrail" + +// ErrRepeatedToolCalls ends a run after the model kept issuing the +// same tool call with identical arguments despite a corrective nudge. +var ErrRepeatedToolCalls = errors.New("loop: model repeated the same tool call past the guardrail limit") + +// ErrTooManyMistakes ends a run after every tool call failed for +// several consecutive rounds despite a corrective nudge. +var ErrTooManyMistakes = errors.New("loop: too many consecutive failed tool rounds") + +const repeatNudgeMessage = "You have repeated the exact same tool call several times. It will keep producing the same result. Step back, reconsider the approach, and either change the arguments or use a different tool." + +const mistakeNudgeMessage = "Your last several tool calls all failed. Stop and re-read the error messages carefully before acting again — they describe what to fix. If the same approach keeps failing, try a different one." + +func (p GuardrailPolicy) withDefaults() GuardrailPolicy { + if p.RepeatNudge <= 0 { + p.RepeatNudge = DefaultRepeatNudge + } + if p.RepeatHalt <= 0 { + p.RepeatHalt = DefaultRepeatHalt + } + if p.MistakeNudge <= 0 { + p.MistakeNudge = DefaultMistakeNudge + } + if p.MistakeHalt <= 0 { + p.MistakeHalt = DefaultMistakeHalt + } + return p +} + +type guardAction int + +const ( + guardOK guardAction = iota + guardNudge + guardHalt +) + +type guardVerdict struct { + action guardAction + kind string // "repeat" | "mistakes" + count int + message string // nudge text + err error // halt error +} + +// guardState tracks consecutive identical calls and consecutive +// all-error tool rounds across the turns of one Run. +type guardState struct { + policy GuardrailPolicy + + lastCallKey string + repeatCount int + repeatNudged bool + + mistakeRounds int + mistakeNudged bool +} + +func newGuardState(policy GuardrailPolicy) *guardState { + return &guardState{policy: policy.withDefaults()} +} + +// observe inspects one tool round (the calls of an assistant turn and +// their results) and returns the action to take. +func (g *guardState) observe(calls []ToolCall, results []Message) guardVerdict { + if g.policy.Disabled { + return guardVerdict{action: guardOK} + } + + // Identical-call tracking: a round consisting of exactly one call + // whose name+arguments hash matches the previous round's extends + // the streak; anything else resets it. + if len(calls) == 1 { + key := toolCallKey(calls[0]) + if key == g.lastCallKey { + g.repeatCount++ + } else { + g.lastCallKey = key + g.repeatCount = 1 + g.repeatNudged = false + } + } else { + g.lastCallKey = "" + g.repeatCount = 0 + g.repeatNudged = false + } + + // Mistake tracking: a round where every result is an error extends + // the streak; any success resets it. + allErrors := len(results) > 0 + for _, m := range results { + if !m.IsError { + allErrors = false + break + } + } + if allErrors { + g.mistakeRounds++ + } else { + g.mistakeRounds = 0 + g.mistakeNudged = false + } + + if g.repeatCount >= g.policy.RepeatHalt { + return guardVerdict{action: guardHalt, kind: "repeat", count: g.repeatCount, err: fmt.Errorf("%w (%d identical calls to %s)", ErrRepeatedToolCalls, g.repeatCount, calls[0].Name)} + } + if g.mistakeRounds >= g.policy.MistakeHalt { + return guardVerdict{action: guardHalt, kind: "mistakes", count: g.mistakeRounds, err: fmt.Errorf("%w (%d consecutive failed rounds)", ErrTooManyMistakes, g.mistakeRounds)} + } + if g.repeatCount >= g.policy.RepeatNudge && !g.repeatNudged { + g.repeatNudged = true + return guardVerdict{action: guardNudge, kind: "repeat", count: g.repeatCount, message: repeatNudgeMessage} + } + if g.mistakeRounds >= g.policy.MistakeNudge && !g.mistakeNudged { + g.mistakeNudged = true + return guardVerdict{action: guardNudge, kind: "mistakes", count: g.mistakeRounds, message: mistakeNudgeMessage} + } + return guardVerdict{action: guardOK} +} + +// toolCallKey canonicalizes a call for identity comparison: name plus +// a hash of the raw arguments with whitespace collapsed. +func toolCallKey(call ToolCall) string { + args := strings.Join(strings.Fields(string(call.Arguments)), "") + sum := sha256.Sum256([]byte(call.Name + "\x00" + args)) + return hex.EncodeToString(sum[:8]) +} + +// guardrailUserMessage is the injected corrective message, marked in +// metadata so stores and UIs can identify it. +func guardrailUserMessage(text, kind string) Message { + return Message{ + Role: MessageRoleUser, + Content: []ContentPart{{Type: ContentTypeText, Text: text}}, + Metadata: map[string]any{"glue/guardrail": kind}, + } +} diff --git a/loop/guardrails_test.go b/loop/guardrails_test.go new file mode 100644 index 0000000..fe70f33 --- /dev/null +++ b/loop/guardrails_test.go @@ -0,0 +1,209 @@ +package loop + +import ( + "context" + "encoding/json" + "errors" + "testing" +) + +// guardTool is a scriptable tool whose executor returns errors or +// successes per call. +func guardTool(name string, results []bool) Tool { + i := 0 + return Tool{ + ToolSpec: ToolSpec{Name: name}, + Execute: func(_ context.Context, _ ToolCall) (ToolResult, error) { + ok := true + if i < len(results) { + ok = results[i] + } + i++ + if ok { + return ToolResult{Content: []ContentPart{{Type: ContentTypeText, Text: "fine"}}}, nil + } + return ToolResult{IsError: true, Content: []ContentPart{{Type: ContentTypeText, Text: "boom"}}}, nil + }, + } +} + +// callTurn scripts an assistant turn making one tool call. +func callTurn(id, name, args string) []ProviderEvent { + return []ProviderEvent{ + {Type: ProviderEventToolCall, ToolCall: &ToolCall{ID: id, Name: name, Arguments: json.RawMessage(args)}}, + {Type: ProviderEventDone}, + } +} + +func TestGuardrailRepeatNudgeThenHalt(t *testing.T) { + t.Parallel() + // The model repeats the exact same call every turn, ignoring the nudge. + turns := [][]ProviderEvent{} + for i := 0; i < 6; i++ { + turns = append(turns, callTurn("c", "probe", `{"x":1}`)) + } + provider := &fakeProvider{turns: turns} + var events []Event + _, err := Run(context.Background(), RunRequest{ + Provider: provider, + Tools: []Tool{guardTool("probe", nil)}, + Messages: []Message{{Role: MessageRoleUser, Content: []ContentPart{{Type: ContentTypeText, Text: "go"}}}}, + Emit: func(e Event) { + if e.Type == EventGuardrail { + events = append(events, e) + } + }, + }) + if !errors.Is(err, ErrRepeatedToolCalls) { + t.Fatalf("err = %v, want ErrRepeatedToolCalls", err) + } + if provider.calls != 5 { + t.Fatalf("provider calls = %d, want 5 (halt at the 5th identical call)", provider.calls) + } + if len(events) != 2 { + t.Fatalf("guardrail events = %d, want 2 (nudge then halt)", len(events)) + } + if events[0].Metadata["action"] != "nudge" || events[1].Metadata["action"] != "halt" { + t.Fatalf("event actions = %v / %v", events[0].Metadata["action"], events[1].Metadata["action"]) + } +} + +func TestGuardrailRepeatResetsOnDifferentArgs(t *testing.T) { + t.Parallel() + turns := [][]ProviderEvent{ + callTurn("a", "probe", `{"x":1}`), + callTurn("b", "probe", `{"x":2}`), + callTurn("c", "probe", `{"x":3}`), + callTurn("d", "probe", `{"x":4}`), + {{Type: ProviderEventTextDelta, Delta: "done"}, {Type: ProviderEventDone}}, + } + provider := &fakeProvider{turns: turns} + res, err := Run(context.Background(), RunRequest{ + Provider: provider, + Tools: []Tool{guardTool("probe", nil)}, + Messages: []Message{{Role: MessageRoleUser, Content: []ContentPart{{Type: ContentTypeText, Text: "go"}}}}, + }) + if err != nil { + t.Fatalf("Run: %v", err) + } + for _, m := range res.NewMessages { + if m.Metadata["glue/guardrail"] != nil { + t.Fatalf("unexpected guardrail message: %#v", m) + } + } +} + +func TestGuardrailMistakesNudgeThenHalt(t *testing.T) { + t.Parallel() + // Different args each turn (no repeat trigger), but every result + // fails. Nudge after 3 failed rounds, halt after 6. + turns := [][]ProviderEvent{} + for i := 0; i < 7; i++ { + turns = append(turns, callTurn("c", "probe", `{"attempt":`+string(rune('0'+i))+`}`)) + } + provider := &fakeProvider{turns: turns} + fails := make([]bool, 10) // all false = all errors + var nudges, halts int + _, err := Run(context.Background(), RunRequest{ + Provider: provider, + Tools: []Tool{guardTool("probe", fails)}, + Messages: []Message{{Role: MessageRoleUser, Content: []ContentPart{{Type: ContentTypeText, Text: "go"}}}}, + Emit: func(e Event) { + if e.Type == EventGuardrail { + switch e.Metadata["action"] { + case "nudge": + nudges++ + case "halt": + halts++ + } + } + }, + }) + if !errors.Is(err, ErrTooManyMistakes) { + t.Fatalf("err = %v, want ErrTooManyMistakes", err) + } + if provider.calls != 6 { + t.Fatalf("provider calls = %d, want 6", provider.calls) + } + if nudges != 1 || halts != 1 { + t.Fatalf("nudges = %d halts = %d, want 1/1", nudges, halts) + } +} + +func TestGuardrailMistakesResetOnSuccess(t *testing.T) { + t.Parallel() + turns := [][]ProviderEvent{ + callTurn("a", "probe", `{"x":1}`), + callTurn("b", "probe", `{"x":2}`), + callTurn("c", "probe", `{"x":3}`), // success resets the streak + callTurn("d", "probe", `{"x":4}`), + {{Type: ProviderEventTextDelta, Delta: "done"}, {Type: ProviderEventDone}}, + } + provider := &fakeProvider{turns: turns} + res, err := Run(context.Background(), RunRequest{ + Provider: provider, + Tools: []Tool{guardTool("probe", []bool{false, false, true, false})}, + Messages: []Message{{Role: MessageRoleUser, Content: []ContentPart{{Type: ContentTypeText, Text: "go"}}}}, + }) + if err != nil { + t.Fatalf("Run: %v", err) + } + for _, m := range res.NewMessages { + if m.Metadata["glue/guardrail"] != nil { + t.Fatalf("streak should have reset, got nudge: %#v", m) + } + } +} + +func TestGuardrailDisabled(t *testing.T) { + t.Parallel() + turns := [][]ProviderEvent{} + for i := 0; i < 7; i++ { + turns = append(turns, callTurn("c", "probe", `{"x":1}`)) + } + turns = append(turns, []ProviderEvent{{Type: ProviderEventTextDelta, Delta: "done"}, {Type: ProviderEventDone}}) + provider := &fakeProvider{turns: turns} + _, err := Run(context.Background(), RunRequest{ + Provider: provider, + Tools: []Tool{guardTool("probe", nil)}, + Guardrails: GuardrailPolicy{Disabled: true}, + Messages: []Message{{Role: MessageRoleUser, Content: []ContentPart{{Type: ContentTypeText, Text: "go"}}}}, + }) + if err != nil { + t.Fatalf("Run: %v", err) + } + if provider.calls != 8 { + t.Fatalf("provider calls = %d, want 8 (no guardrails)", provider.calls) + } +} + +func TestGuardrailNudgeMessageShape(t *testing.T) { + t.Parallel() + turns := [][]ProviderEvent{ + callTurn("a", "probe", `{"x":1}`), + callTurn("b", "probe", `{"x":1}`), + callTurn("c", "probe", `{"x":1}`), + {{Type: ProviderEventTextDelta, Delta: "ok I'll stop"}, {Type: ProviderEventDone}}, + } + provider := &fakeProvider{turns: turns} + res, err := Run(context.Background(), RunRequest{ + Provider: provider, + Tools: []Tool{guardTool("probe", nil)}, + Messages: []Message{{Role: MessageRoleUser, Content: []ContentPart{{Type: ContentTypeText, Text: "go"}}}}, + }) + if err != nil { + t.Fatalf("Run: %v", err) + } + var nudge *Message + for i := range res.NewMessages { + if res.NewMessages[i].Metadata["glue/guardrail"] == "repeat" { + nudge = &res.NewMessages[i] + } + } + if nudge == nil { + t.Fatalf("no guardrail nudge in transcript: %#v", res.NewMessages) + } + if nudge.Role != MessageRoleUser { + t.Fatalf("nudge role = %s", nudge.Role) + } +} diff --git a/loop/run.go b/loop/run.go index b19c661..8a9303b 100644 --- a/loop/run.go +++ b/loop/run.go @@ -49,6 +49,11 @@ type RunRequest struct { // message (at most twice per run) instead of ending the turn. // Recovers the classic Gemini narrate-then-stop stall. AutoContinue bool + + // Guardrails bounds the repeated-call and consecutive-mistake + // detectors. The zero value enables them with defaults; set + // Guardrails.Disabled to turn them off. + Guardrails GuardrailPolicy } // RunResult is returned by [Run]. Messages is the full transcript including @@ -105,6 +110,7 @@ func Run(ctx context.Context, req RunRequest) (RunResult, error) { lastAssistantMsg := -1 lastAssistantNew := -1 autoContinues := 0 + guard := newGuardState(req.Guardrails) for turn := 0; turn < maxTurns; turn++ { if err := ctx.Err(); err != nil { return fail(err) @@ -147,6 +153,21 @@ func Run(ctx context.Context, req RunRequest) (RunResult, error) { messages = append(messages, toolMessages...) newMessages = append(newMessages, toolMessages...) + switch verdict := guard.observe(toolCalls, toolMessages); verdict.action { + case guardNudge: + nudge := guardrailUserMessage(verdict.message, verdict.kind) + messages = append(messages, nudge) + newMessages = append(newMessages, nudge) + emit(Event{Type: EventGuardrail, Error: verdict.message, Metadata: map[string]any{ + "kind": verdict.kind, "count": verdict.count, "action": "nudge", + }}) + case guardHalt: + emit(Event{Type: EventGuardrail, Error: verdict.err.Error(), Metadata: map[string]any{ + "kind": verdict.kind, "count": verdict.count, "action": "halt", + }}) + return fail(verdict.err) + } + emit(Event{Type: EventTurnEnd, Message: messagePtr(assistant)}) } diff --git a/loop/run_test.go b/loop/run_test.go index c907485..badde1d 100644 --- a/loop/run_test.go +++ b/loop/run_test.go @@ -271,6 +271,9 @@ func TestRunMaxTurnsDefaultIs32(t *testing.T) { _, err := Run(context.Background(), RunRequest{ Provider: &fakeProvider{turns: turns}, Tools: []Tool{noop}, + // This test exercises the turn budget; the identical-call + // guardrail would halt these scripted repeats first. + Guardrails: GuardrailPolicy{Disabled: true}, }) if err == nil || !strings.Contains(err.Error(), "maximum turns exceeded (32)") { t.Fatalf("err = %v, want default max turns 32", err)