From ee9c5be6efb7c0e6a5c726881652bca87d2a02d5 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Sat, 27 Jun 2026 22:17:05 -0400 Subject: [PATCH] ci(fleet): retry heavy lane on suite failure Wrap the dispatch-suite dispatch+recover+watch cycle in a bounded outer retry loop gated by a new retry-attempts input (default 1, no retry). On a non-success suite run, a recovery miss, or a watch timeout, it re-dispatches a fresh run up to the configured total, logging each attempt; only the final attempt's failure reds the step. This is safe because every scenario-suite run begins by resetting and seeding dev, so a re-dispatch starts clean. Set retry-attempts: 2 on the 4env heavy lane only, the most race-prone surface, leaving every other lane at the historical single-dispatch default. Signed-off-by: Joshua Temple --- .github/actions/dispatch-suite/action.yaml | 250 +++++++++++++-------- .github/workflows/fleet-e2e.yaml | 7 + 2 files changed, 161 insertions(+), 96 deletions(-) diff --git a/.github/actions/dispatch-suite/action.yaml b/.github/actions/dispatch-suite/action.yaml index 93975e9..730c649 100644 --- a/.github/actions/dispatch-suite/action.yaml +++ b/.github/actions/dispatch-suite/action.yaml @@ -34,6 +34,21 @@ inputs: GitHub secondary rate limits when many suites are watched concurrently. required: false default: '60' + retry-attempts: + description: >- + Total dispatch attempts for this suite. Default 1 keeps the historical + behavior (a single dispatch, no retry). Set to 2 or more on the heaviest, + most race-prone lane so a whole-suite failure re-dispatches a fresh run. + This is safe because every scenario-suite run begins with its own + "reset and seed dev" step, so a re-dispatch starts from a clean slate. + Any non-success outcome (failed run, run never visible, watch timeout) + consumes a retry; a successful run returns immediately. 
+ required: false + default: '1' + retry-backoff: + description: 'Seconds to sleep between a failed attempt and the next dispatch' + required: false + default: '30' runs: using: 'composite' @@ -57,115 +72,158 @@ runs: RECOVER_ATTEMPTS: ${{ inputs.recover-attempts }} RECOVER_INTERVAL: ${{ inputs.recover-interval }} WATCH_INTERVAL: ${{ inputs.watch-interval }} + RETRY_ATTEMPTS: ${{ inputs.retry-attempts }} + RETRY_BACKOFF: ${{ inputs.retry-backoff }} run: | set -euo pipefail - # Capture a UTC timestamp BEFORE dispatching so the recovery filter only - # matches runs this action created, not pre-existing ones. - DISPATCH_TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) - echo "Dispatching $TARGET_WORKFLOW in $TARGET_REPO @ $TARGET_REF (since $DISPATCH_TS)" - - # NOTE: do NOT pass -f cascade_version=... here. The suites do not define - # that input yet, so an extra input would error with "unexpected inputs". - # The version under test is computed and logged by the orchestrator but - # is inert until the suites accept the input. - gh workflow run "$TARGET_WORKFLOW" \ - --repo "$TARGET_REPO" \ - --ref "$TARGET_REF" - - # Recover the run id. Cross-repo dispatch is async; the run may not be - # listable immediately, so poll with a bounded retry. - RUN_ID="" - for attempt in $(seq 1 "$RECOVER_ATTEMPTS"); do - RUN_ID=$(gh run list \ + # One full dispatch -> recover -> watch cycle. Returns 0 only when the + # recovered run concludes success, and non-zero on a failed dispatch, a + # failed/cancelled run, a recovery miss, or a watch timeout. It never calls + # `exit`, so the outer retry loop stays in control of the step's final status. + attempt_suite() { + # Capture a UTC timestamp BEFORE dispatching so the recovery filter only + # matches runs this action created, not pre-existing ones. + local DISPATCH_TS + DISPATCH_TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) + echo "Dispatching $TARGET_WORKFLOW in $TARGET_REPO @ $TARGET_REF (since $DISPATCH_TS)" + + # NOTE: do NOT pass -f cascade_version=... here. 
The suites do not define + # that input yet, so an extra input would error with "unexpected inputs". + # The version under test is computed and logged by the orchestrator but + # is inert until the suites accept the input. + gh workflow run "$TARGET_WORKFLOW" \ --repo "$TARGET_REPO" \ - --workflow "$TARGET_WORKFLOW" \ - --event workflow_dispatch \ - --created ">=$DISPATCH_TS" \ - --limit 20 \ - --json databaseId,status,conclusion,createdAt \ - --jq 'sort_by(.createdAt) | reverse | .[0].databaseId // empty') - if [ -n "$RUN_ID" ]; then - echo "Recovered run id $RUN_ID on attempt $attempt" - break - fi - echo "Run not visible yet (attempt $attempt/$RECOVER_ATTEMPTS); sleeping ${RECOVER_INTERVAL}s" - sleep "$RECOVER_INTERVAL" - done + --ref "$TARGET_REF" || return 1 # set -e is ignored inside 'if attempt_suite' + + # Recover the run id. Cross-repo dispatch is async; the run may not be + # listable immediately, so poll with a bounded retry. + local RUN_ID="" recover_attempt + for recover_attempt in $(seq 1 "$RECOVER_ATTEMPTS"); do + RUN_ID=$(gh run list \ + --repo "$TARGET_REPO" \ + --workflow "$TARGET_WORKFLOW" \ + --event workflow_dispatch \ + --created ">=$DISPATCH_TS" \ + --limit 20 \ + --json databaseId,status,conclusion,createdAt \ + --jq 'sort_by(.createdAt) | reverse | .[0].databaseId // empty') + if [ -n "$RUN_ID" ]; then + echo "Recovered run id $RUN_ID on attempt $recover_attempt" + break + fi + echo "Run not visible yet (attempt $recover_attempt/$RECOVER_ATTEMPTS); sleeping ${RECOVER_INTERVAL}s" + sleep "$RECOVER_INTERVAL" + done - if [ -z "$RUN_ID" ]; then - echo "::error::Could not recover a $TARGET_WORKFLOW run in $TARGET_REPO after dispatch" - exit 1 - fi + if [ -z "$RUN_ID" ]; then + echo "::error::Could not recover a $TARGET_WORKFLOW run in $TARGET_REPO after dispatch" + return 1 + fi - RUN_URL="https://github.com/$TARGET_REPO/actions/runs/$RUN_ID" - echo "Watching $RUN_URL" - { - echo "- **$TARGET_REPO**: [run $RUN_ID]($RUN_URL)" - } >> "$GITHUB_STEP_SUMMARY" - - # Block on the recovered run's conclusion 
via a bounded poll loop. - # gh run watch --exit-status fails the whole step on a transient - # 401/403/5xx mid-poll, even when the watched run ultimately succeeds. - # This loop retries on transient API errors while still failing closed on - # real run failures and on timeout. - # - # MAX_ATTEMPTS * WATCH_INTERVAL = wall-clock cap (~180 min (3h) at 60s each, headroom for queue+run under full-fleet load, well under GitHub's 6h job cap). - # The 4env suite runs the full lifecycle plus a chained multi-env hotfix - # and the Step 13 conflict probe, each a serial cascade run, so the cap - # must clear its 60-minute internal job timeout with headroom. - MAX_ATTEMPTS=180 - CONSEC_ERRORS=0 - MAX_CONSEC_ERRORS=5 - attempt=0 - - while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do - attempt=$((attempt + 1)) - - view_output=$(gh run view "$RUN_ID" \ - --repo "$TARGET_REPO" \ - --json status,conclusion 2>&1) - exit_code=$? - - if [ "$exit_code" -ne 0 ]; then - # Decide whether this looks transient (auth blip, rate-limit, 5xx). - if echo "$view_output" | grep -qiE \ - 'HTTP (401|403|5[0-9]{2})|bad credentials|rate.?limit|temporary|connection|timed? ?out|network'; then - echo "Transient API error on attempt $attempt, retrying in ${WATCH_INTERVAL}s..." - CONSEC_ERRORS=$((CONSEC_ERRORS + 1)) - else - echo "gh run view failed (attempt $attempt): $view_output" - CONSEC_ERRORS=$((CONSEC_ERRORS + 1)) + local RUN_URL="https://github.com/$TARGET_REPO/actions/runs/$RUN_ID" + echo "Watching $RUN_URL" + { + echo "- **$TARGET_REPO**: [run $RUN_ID]($RUN_URL)" + } >> "$GITHUB_STEP_SUMMARY" + + # Block on the recovered run's conclusion via a bounded poll loop. + # gh run watch --exit-status fails the whole step on a transient + # 401/403/5xx mid-poll, even when the watched run ultimately succeeds. + # This loop retries on transient API errors while still failing closed on + # real run failures and on timeout. 
+ # + # MAX_ATTEMPTS * WATCH_INTERVAL = wall-clock cap (~180 min (3h) at 60s each, headroom for queue+run under full-fleet load, well under GitHub's 6h job cap). + # The 4env suite runs the full lifecycle plus a chained multi-env hotfix + # and the Step 13 conflict probe, each a serial cascade run, so the cap + # must clear its 60-minute internal job timeout with headroom. + local MAX_ATTEMPTS=180 + local CONSEC_ERRORS=0 + local MAX_CONSEC_ERRORS=5 + local attempt=0 + local view_output exit_code status conclusion + + while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do + attempt=$((attempt + 1)) + + view_output=$(gh run view "$RUN_ID" \ + --repo "$TARGET_REPO" \ + --json status,conclusion 2>&1) && exit_code=0 || exit_code=$? + + if [ "$exit_code" -ne 0 ]; then + # Decide whether this looks transient (auth blip, rate-limit, 5xx). + if echo "$view_output" | grep -qiE \ + 'HTTP (401|403|5[0-9]{2})|bad credentials|rate.?limit|temporary|connection|timed? ?out|network'; then + echo "Transient API error on attempt $attempt, retrying in ${WATCH_INTERVAL}s..." + CONSEC_ERRORS=$((CONSEC_ERRORS + 1)) + else + echo "gh run view failed (attempt $attempt): $view_output" + CONSEC_ERRORS=$((CONSEC_ERRORS + 1)) + fi + + if [ "$CONSEC_ERRORS" -ge "$MAX_CONSEC_ERRORS" ]; then + echo "::error::$MAX_CONSEC_ERRORS consecutive gh run view failures - failing closed" + return 1 + fi + + sleep "$WATCH_INTERVAL" + continue fi - if [ "$CONSEC_ERRORS" -ge "$MAX_CONSEC_ERRORS" ]; then - echo "::error::$MAX_CONSEC_ERRORS consecutive gh run view failures - failing closed" - exit 1 - fi + # Reset consecutive-error counter on a clean response. + CONSEC_ERRORS=0 - sleep "$WATCH_INTERVAL" - continue - fi + status=$(echo "$view_output" | jq -r '.status // empty') + conclusion=$(echo "$view_output" | jq -r '.conclusion // empty') - # Reset consecutive-error counter on a clean response. 
- CONSEC_ERRORS=0 + if [ "$status" = "completed" ]; then + if [ "$conclusion" = "success" ]; then + echo "Run $RUN_ID in $TARGET_REPO completed successfully" + return 0 + else + echo "::error::Run $RUN_ID in $TARGET_REPO completed with conclusion: $conclusion" + return 1 + fi + fi - status=$(echo "$view_output" | jq -r '.status // empty') - conclusion=$(echo "$view_output" | jq -r '.conclusion // empty') + echo "Run $RUN_ID status=$status (attempt $attempt/$MAX_ATTEMPTS), waiting ${WATCH_INTERVAL}s..." + sleep "$WATCH_INTERVAL" + done + + echo "::error::Timed out waiting for run $RUN_ID in $TARGET_REPO after $MAX_ATTEMPTS attempts" + return 1 + } + + # Outer retry loop. A green first attempt costs nothing; only a + # non-success suite run consumes a retry. Each attempt re-dispatches a + # fresh run, which is safe because the suite self-resets on its first + # step. The default RETRY_ATTEMPTS=1 means no retry, preserving the + # historical single-dispatch behavior on every lane that does not opt in. + total_attempts="$RETRY_ATTEMPTS" + if ! [ "$total_attempts" -ge 1 ] 2>/dev/null; then + echo "::warning::retry-attempts '$RETRY_ATTEMPTS' is not a positive integer; defaulting to 1" + total_attempts=1 + fi - if [ "$status" = "completed" ]; then - if [ "$conclusion" = "success" ]; then - echo "Run $RUN_ID in $TARGET_REPO completed successfully" - exit 0 - else - echo "::error::Run $RUN_ID in $TARGET_REPO completed with conclusion: $conclusion" - exit 1 - fi + suite_attempt=1 + while [ "$suite_attempt" -le "$total_attempts" ]; do + echo "::group::Suite attempt $suite_attempt of $total_attempts for $TARGET_REPO" + if attempt_suite; then + echo "::endgroup::" + echo "Suite for $TARGET_REPO passed on attempt $suite_attempt of $total_attempts" + exit 0 fi - - echo "Run $RUN_ID status=$status (attempt $attempt/$MAX_ATTEMPTS), waiting ${WATCH_INTERVAL}s..." 
- sleep "$WATCH_INTERVAL" + echo "::endgroup::" + + if [ "$suite_attempt" -lt "$total_attempts" ]; then + echo "::warning::Suite attempt $suite_attempt of $total_attempts for $TARGET_REPO did not succeed; re-dispatching after ${RETRY_BACKOFF}s" + { + echo "- _retry_: **$TARGET_REPO** attempt $suite_attempt of $total_attempts failed; re-dispatching" + } >> "$GITHUB_STEP_SUMMARY" + sleep "$RETRY_BACKOFF" + fi + suite_attempt=$((suite_attempt + 1)) done - echo "::error::Timed out waiting for run $RUN_ID in $TARGET_REPO after $MAX_ATTEMPTS attempts" + echo "::error::Suite for $TARGET_REPO failed after $total_attempts attempt(s)" exit 1 diff --git a/.github/workflows/fleet-e2e.yaml b/.github/workflows/fleet-e2e.yaml index ce47d9d..0b08eea 100644 --- a/.github/workflows/fleet-e2e.yaml +++ b/.github/workflows/fleet-e2e.yaml @@ -584,6 +584,13 @@ jobs: with: repo: ${{ env.FLEET_OWNER }}/cascade-example-4env token: ${{ secrets.CASCADE_STATE_TOKEN }} + # The heavy lane is the single most race-prone surface (full lifecycle + # plus chained multi-env hotfix, conflict, rollback, and merge_queue + # against one live repo). Allow one whole-suite re-dispatch so a live + # GitHub eventual-consistency blip does not red the lane. Safe because + # each suite run self-resets on its first step. Other lanes keep the + # default (no retry) so a retry stays a scoped concession here. + retry-attempts: '2' # Lane 3: the light remainder, capped at two repos in flight. Sequenced AFTER # the heavy lane (via needs) so 4env and this capped matrix never overlap;