From 13d4c05c71f3202b94fb4aa17f8ab46ae3598567 Mon Sep 17 00:00:00 2001
From: konard <drakonard@gmail.com>
Date: Sun, 14 Jun 2026 21:59:32 +0000
Subject: [PATCH 1/4] Initial commit with task details

Adding .gitkeep for PR creation (default mode).
This file will be removed when the task is complete.

Issue: https://github.com/link-foundation/box/issues/104
---
 .gitkeep | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitkeep

diff --git a/.gitkeep b/.gitkeep
new file mode 100644
index 0000000..ccc5b10
--- /dev/null
+++ b/.gitkeep
@@ -0,0 +1 @@
+# .gitkeep file auto-generated at 2026-06-14T21:59:32.354Z for PR creation at branch issue-104-8849a1f71a19 for issue https://github.com/link-foundation/box/issues/104
\ No newline at end of file

From 5b9944eda8113bde31b50893a6a5e0de5916382d Mon Sep 17 00:00:00 2001
From: konard <drakonard@gmail.com>
Date: Sun, 14 Jun 2026 22:12:47 +0000
Subject: [PATCH 2/4] dind-box: warn when the nested daemon runs on the vfs
 storage driver (issue #104)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Landing on the `vfs` storage driver was silent: a single `log` line named the
driver, but nothing flagged that vfs performs NO copy-on-write. vfs stores every
image layer as a full, independent copy, so a multi-GB image's on-disk footprint
becomes the SUM of all cumulative layer sizes (many times the image size), and
`docker pull`/`docker run` can fail with `failed to register layer: no space
left on device` on a disk far larger than the image — with no breadcrumb
pointing at the driver. Downstream this overflowed a disk with a >30 GB image
(link-assistant/hive-mind#1914).

The active driver ends up being vfs when it is either pinned explicitly via
`DIND_STORAGE_DRIVER=vfs` (legitimate for overlay-on-overlay compatibility) or
reached as the last-resort auto-detect fallback. This is observability, not a
default change — vfs stays the safe fallback. `start_dockerd` now calls
`warn_if_vfs_storage_driver` right after the daemon becomes ready, emitting one
actionable warning whenever the active driver is vfs: it explains the
copy-on-write/disk implication and names the `DIND_STORAGE_DRIVER=fuse-overlayfs`
remediation (copy-on-write, works overlay-on-overlay, already shipped in the
image). The remediation adapts to whether `/dev/fuse` is present, pointing at
`--privileged` / `--device /dev/fuse` first when it is missing. The
`DIND_STORAGE_DRIVER` doc comment now spells out the vfs disk amplification too.

Covered by a new unit test (experiments/test-issue104-vfs-warning.sh) and a new
assertion in the CI-run tests/dind/example-storage-driver-vfs.sh; documented in
docs/dind/USAGE.md. Adds a patch changeset.
---
 .../issue-104-vfs-storage-driver-warning.md   | 29 ++++++
 .gitkeep                                      |  1 -
 docs/dind/USAGE.md                            | 23 +++++
 experiments/test-issue104-vfs-warning.sh      | 89 +++++++++++++++++++
 tests/dind/example-storage-driver-vfs.sh      | 13 +++
 ubuntu/24.04/dind/dind-entrypoint.sh          | 44 ++++++++-
 6 files changed, 197 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/issue-104-vfs-storage-driver-warning.md
 delete mode 100644 .gitkeep
 create mode 100755 experiments/test-issue104-vfs-warning.sh

diff --git a/.changeset/issue-104-vfs-storage-driver-warning.md b/.changeset/issue-104-vfs-storage-driver-warning.md
new file mode 100644
index 0000000..4d541b6
--- /dev/null
+++ b/.changeset/issue-104-vfs-storage-driver-warning.md
@@ -0,0 +1,29 @@
+---
+bump: patch
+---
+
+dind-box: warn when the nested daemon runs on the `vfs` storage driver
+(issue #104).
+
+When the inner dockerd ends up on `vfs` — either pinned explicitly via
+`DIND_STORAGE_DRIVER=vfs` (e.g. for overlay-on-overlay compatibility) or reached
+as the last-resort auto-detect fallback — large images could fail to pull/run
+with a cryptic `failed to register layer: no space left on device` and **no
+hint** that the storage driver was the cause. `vfs` performs no copy-on-write: it
+stores every image layer as a full, independent copy, so a multi-GB image's
+on-disk footprint becomes the *sum* of all cumulative layer sizes (many times the
+image size), and a >30 GB image can overflow a disk with far more than 30 GB free
+(`link-assistant/hive-mind#1914`).
+
+This is observability, not a default change — `vfs` stays the safe fallback. The
+entrypoint now emits a single, actionable warning right after the daemon becomes
+ready whenever the active driver is `vfs`, explaining the copy-on-write/disk
+implication and naming the `DIND_STORAGE_DRIVER=fuse-overlayfs` remediation
+(copy-on-write, works overlay-on-overlay, already shipped in the image). The
+remediation line adapts to whether `/dev/fuse` is present, so when it is missing
+it points at `--privileged` / `--device /dev/fuse` first. The
+`DIND_STORAGE_DRIVER` doc comment now spells out the `vfs` disk amplification too.
+
+Covered by a new unit test (`experiments/test-issue104-vfs-warning.sh`) and a new
+assertion in the CI-run `tests/dind/example-storage-driver-vfs.sh`; documented in
+`docs/dind/USAGE.md`.
diff --git a/.gitkeep b/.gitkeep
deleted file mode 100644
index ccc5b10..0000000
--- a/.gitkeep
+++ /dev/null
@@ -1 +0,0 @@
-# .gitkeep file auto-generated at 2026-06-14T21:59:32.354Z for PR creation at branch issue-104-8849a1f71a19 for issue https://github.com/link-foundation/box/issues/104
\ No newline at end of file
diff --git a/docs/dind/USAGE.md b/docs/dind/USAGE.md
index c9f4557..80aaa5b 100644
--- a/docs/dind/USAGE.md
+++ b/docs/dind/USAGE.md
@@ -352,6 +352,29 @@ docker run -d --privileged \
   konard/box-dind sleep infinity
 ```
 
+Because that trade-off is easy to hit by accident, the entrypoint emits a
+one-time warning whenever the **active** driver ends up being `vfs` — whether
+pinned explicitly or reached as the last-resort fallback (issue #104). `vfs`
+stores every image layer as a full, independent copy, so a multi-GB image's
+on-disk footprint becomes the *sum* of all cumulative layer sizes — many times
+the image size — and `docker pull`/`docker run` can fail with `failed to register
+layer: no space left on device` on a disk far larger than the image. The warning
+makes that failure traceable instead of looking like a generic "out of disk".
+
+If your host supports it, prefer `DIND_STORAGE_DRIVER=fuse-overlayfs`: it is
+copy-on-write **and** works overlay-on-overlay (the compatibility reason `vfs` is
+sometimes chosen), is already shipped in the image, and needs `/dev/fuse`
+(provided by `--privileged`). The warning's remediation line adapts to whether
+`/dev/fuse` is present, so when it is missing it tells you to add `--privileged`
+or `--device /dev/fuse` before switching.
+
+```bash
+docker run -d --privileged \
+  -e DIND_STORAGE_DRIVER=fuse-overlayfs \
+  --name box-dind-cow \
+  konard/box-dind sleep infinity
+```
+
 CI verifies the forced `vfs` path:
 
 ```bash
diff --git a/experiments/test-issue104-vfs-warning.sh b/experiments/test-issue104-vfs-warning.sh
new file mode 100755
index 0000000..806803b
--- /dev/null
+++ b/experiments/test-issue104-vfs-warning.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# Isolated unit test for the issue #104 vfs storage-driver warning in
+# dind-entrypoint.sh.
+#
+# Landing on the `vfs` storage driver used to be silent (only a `log` line named
+# the driver), so an operator hitting `failed to register layer: no space left on
+# device` had no breadcrumb pointing at the copy-on-write-less driver. The
+# entrypoint now emits a one-time `warn` whenever the *active* driver is `vfs`,
+# with a remediation hint whose wording depends on whether the fuse-overlayfs
+# device node is available.
+#
+# Building the full box-dind image requires overlay-backed nested Docker, which
+# this sandbox cannot provide, so — exactly like preload-unit-test.sh — we source
+# the real entrypoint (DIND_ENTRYPOINT_SOURCE_ONLY=1 returns before the
+# startup/handoff flow) to get `warn_if_vfs_storage_driver` verbatim and drive it
+# directly. The /dev/fuse probe is pointed at a temp path via DIND_FUSE_DEVICE so
+# both remediation branches are exercised deterministically without a real device
+# node.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ENTRYPOINT="$SCRIPT_DIR/../ubuntu/24.04/dind/dind-entrypoint.sh"
+
+WORK="$(mktemp -d)"
+trap 'rm -rf "$WORK"' EXIT
+
+# Source the real entrypoint for its functions only.
+# shellcheck disable=SC1090
+DIND_ENTRYPOINT_SOURCE_ONLY=1 . "$ENTRYPOINT"
+
+pass=0; fail=0
+check() { # check <description> <condition-cmd...>
+  desc="$1"; shift
+  if "$@"; then echo "  PASS: $desc"; pass=$((pass+1)); else echo "  FAIL: $desc"; fail=$((fail+1)); fi
+}
+
+ERR="$WORK/err.log"
+PRESENT_FUSE="$WORK/fuse-present"   # an existing path: stands in for /dev/fuse
+MISSING_FUSE="$WORK/fuse-missing"   # a path that does not exist
+: > "$PRESENT_FUSE"
+rm -f "$MISSING_FUSE"
+
+echo "== Case 1: vfs driver emits the copy-on-write warning =="
+: > "$ERR"
+DIND_FUSE_DEVICE="$PRESENT_FUSE" warn_if_vfs_storage_driver vfs 2>"$ERR"
+check "warns the driver is vfs"                     grep -q "'vfs' storage driver" "$ERR"
+check "calls out NO copy-on-write"                  grep -q "NO copy-on-write" "$ERR"
+check "names the disk failure mode"                 grep -q "no space left on device" "$ERR"
+check "names the fuse-overlayfs remediation"        grep -q "DIND_STORAGE_DRIVER=fuse-overlayfs" "$ERR"
+
+echo "== Case 2: overlay2 driver stays silent =="
+: > "$ERR"
+DIND_FUSE_DEVICE="$PRESENT_FUSE" warn_if_vfs_storage_driver overlay2 2>"$ERR"
+check "no warning for overlay2" bash -c '! test -s "$1"' _ "$ERR"
+
+echo "== Case 3: fuse-overlayfs driver stays silent =="
+: > "$ERR"
+DIND_FUSE_DEVICE="$PRESENT_FUSE" warn_if_vfs_storage_driver fuse-overlayfs 2>"$ERR"
+check "no warning for fuse-overlayfs" bash -c '! test -s "$1"' _ "$ERR"
+
+echo "== Case 4: empty/unknown driver stays silent =="
+: > "$ERR"
+DIND_FUSE_DEVICE="$PRESENT_FUSE" warn_if_vfs_storage_driver "" 2>"$ERR"
+check "no warning for empty driver" bash -c '! test -s "$1"' _ "$ERR"
+
+echo "== Case 5: /dev/fuse present -> 'set fuse-overlayfs' remediation =="
+: > "$ERR"
+DIND_FUSE_DEVICE="$PRESENT_FUSE" warn_if_vfs_storage_driver vfs 2>"$ERR"
+check "remediation says the device is present"   grep -q "is present" "$ERR"
+check "remediation does NOT claim it is missing" bash -c '! grep -q "is missing" "$1"' _ "$ERR"
+
+echo "== Case 6: /dev/fuse missing -> explains why fuse-overlayfs is unavailable =="
+: > "$ERR"
+DIND_FUSE_DEVICE="$MISSING_FUSE" warn_if_vfs_storage_driver vfs 2>"$ERR"
+check "remediation explains the device is missing"   grep -q "is missing" "$ERR"
+check "remediation suggests --device /dev/fuse"      grep -q -- "--device /dev/fuse" "$ERR"
+check "remediation suggests --privileged"            grep -q -- "--privileged" "$ERR"
+check "still names the fuse-overlayfs driver"        grep -q "DIND_STORAGE_DRIVER=fuse-overlayfs" "$ERR"
+
+echo "== Case 7: function returns success so the start_dockerd success branch is unaffected =="
+# warn_if_vfs_storage_driver runs immediately before `return 0`; under `set -e`
+# a non-zero return would abort startup. Assert exit status 0 for both vfs and
+# non-vfs drivers.
+check "returns 0 for vfs"      bash -c 'DIND_ENTRYPOINT_SOURCE_ONLY=1 . "$1"; DIND_FUSE_DEVICE="$2" warn_if_vfs_storage_driver vfs >/dev/null 2>&1' _ "$ENTRYPOINT" "$PRESENT_FUSE"
+check "returns 0 for overlay2" bash -c 'DIND_ENTRYPOINT_SOURCE_ONLY=1 . "$1"; warn_if_vfs_storage_driver overlay2 >/dev/null 2>&1' _ "$ENTRYPOINT"
+
+echo
+echo "RESULT: $pass passed, $fail failed"
+[ "$fail" -eq 0 ]
diff --git a/tests/dind/example-storage-driver-vfs.sh b/tests/dind/example-storage-driver-vfs.sh
index ec8dc9e..94ed706 100755
--- a/tests/dind/example-storage-driver-vfs.sh
+++ b/tests/dind/example-storage-driver-vfs.sh
@@ -19,3 +19,16 @@ if [ "$driver" != "vfs" ]; then
 fi
 
 log "inner dockerd is using the vfs storage driver"
+
+# issue #104: landing on vfs must not be silent. The entrypoint (PID 1) emits a
+# copy-on-write warning to stderr, which docker captures in the container logs.
+log "verifying the vfs copy-on-write warning was emitted (issue #104)"
+logs="$(docker logs "$container" 2>&1 || true)"
+for needle in "'vfs' storage driver" "no space left on device" "DIND_STORAGE_DRIVER=fuse-overlayfs"; do
+  if ! printf '%s' "$logs" | grep -qF "$needle"; then
+    printf '%s\n' "$logs" >&2
+    fail "expected the vfs warning to mention \"${needle}\", but it was absent from the container logs"
+  fi
+done
+
+log "vfs copy-on-write warning is present in the container logs"
diff --git a/ubuntu/24.04/dind/dind-entrypoint.sh b/ubuntu/24.04/dind/dind-entrypoint.sh
index 44b7907..efb581c 100644
--- a/ubuntu/24.04/dind/dind-entrypoint.sh
+++ b/ubuntu/24.04/dind/dind-entrypoint.sh
@@ -16,7 +16,14 @@
 #   - Sysbox :  docker run --runtime=sysbox-runc konard/<base>-dind   (no --privileged)
 #
 # Environment overrides:
-#   DIND_STORAGE_DRIVER  Override storage driver (default: auto-detected: overlay2, fuse-overlayfs, vfs)
+#   DIND_STORAGE_DRIVER  Override storage driver (default: auto-detected: overlay2,
+#                        fuse-overlayfs, vfs). Note: vfs has NO copy-on-write — it
+#                        stores every image layer as a full, independent copy, so
+#                        large (multi-GB) images consume many times their size on
+#                        disk and 'docker pull'/'docker run' can fail with 'no
+#                        space left on device'. When the active driver ends up
+#                        being vfs the entrypoint emits a one-time warning naming
+#                        the fuse-overlayfs (copy-on-write) remediation. (issue #104)
 #   DIND_DATA_ROOT       Override --data-root for dockerd (default: /var/lib/docker)
 #   DIND_LOG_FILE        Where to write dockerd logs (default: /var/log/dockerd.log)
 #   DIND_WAIT_SECONDS    How long to wait for dockerd to come up (default: 30)
@@ -189,6 +196,40 @@ wait_for_dockerd_ready() {
   return 2
 }
 
+# Device node fuse-overlayfs needs for copy-on-write. Overridable so the unit
+# test can exercise both the "present" and "missing" remediation branches without
+# a real device node; in production it is always /dev/fuse.
+DIND_FUSE_DEVICE="${DIND_FUSE_DEVICE:-/dev/fuse}"
+
+# When the active storage driver is vfs, emit a one-time warning explaining the
+# copy-on-write footgun. vfs is a safe last-resort fallback (and a legitimate
+# explicit pin for overlay-on-overlay compatibility), so this is observability,
+# not a default change. vfs stores every image layer as a full, independent copy,
+# so a multi-GB image's on-disk footprint becomes the SUM of all cumulative layer
+# sizes — many times the image size — and 'docker pull'/'docker run' can fail with
+# 'failed to register layer: no space left on device' on a disk far larger than
+# the image. Without this breadcrumb the generic disk error is easily misdiagnosed
+# as "not enough disk" instead of "wrong driver wastes the disk"
+# (link-assistant/hive-mind#1914). The remediation depends on whether the
+# copy-on-write fuse-overlayfs driver's device node is available. (issue #104)
+warn_if_vfs_storage_driver() {
+  [ "$1" = "vfs" ] || return 0
+
+  warn "dockerd is using the 'vfs' storage driver, which has NO copy-on-write:"
+  warn "every image layer is stored as a full copy, so a multi-GB image's on-disk"
+  warn "footprint becomes the SUM of all cumulative layer sizes (many times the"
+  warn "image size). 'docker pull'/'docker run' can then fail with 'failed to"
+  warn "register layer: no space left on device' on a disk far larger than the image."
+  if [ -e "$DIND_FUSE_DEVICE" ]; then
+    warn "For copy-on-write here, set DIND_STORAGE_DRIVER=fuse-overlayfs (works"
+    warn "overlay-on-overlay; ${DIND_FUSE_DEVICE} is present)."
+  else
+    warn "fuse-overlayfs (copy-on-write, works overlay-on-overlay) is unavailable"
+    warn "because ${DIND_FUSE_DEVICE} is missing; run with --privileged or"
+    warn "--device /dev/fuse, then set DIND_STORAGE_DRIVER=fuse-overlayfs."
+  fi
+}
+
 start_dockerd() {
   if pgrep -x dockerd >/dev/null 2>&1; then
     log "dockerd already running (pid $(pgrep -x dockerd | head -n1))"
@@ -215,6 +256,7 @@ start_dockerd() {
     launch_dockerd "$DIND_STORAGE_DRIVER"
 
     if wait_for_dockerd_ready "$DIND_DOCKERD_PID" "$DIND_STORAGE_DRIVER"; then
+      warn_if_vfs_storage_driver "$DIND_STORAGE_DRIVER"
       return 0
     else
       result="$?"

From 3958614d6e37812218597c2b17b5143fef770747 Mon Sep 17 00:00:00 2001
From: konard <drakonard@gmail.com>
Date: Sun, 14 Jun 2026 23:24:20 +0000
Subject: [PATCH 3/4] dind/js CI: kill two transient build/test flakes (issue
 #104)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #105's three red checks (docker-build-test, pr-test / dind-js,
pr-test / lean) were not caused by the vfs-warning feature itself — both
the dind-js and lean jobs were failing on pre-existing transient flakes
that the feature's new test surfaced. docker-build-test only aggregates
the matrix, so it goes green once the other two do.

pr-test / dind-js — SIGPIPE false-negative in the example suite
  The dind example tests asserted on container logs with
  `docker logs … | grep -q "NEEDLE"`. Under `set -o pipefail`, `grep -q`
  closes the pipe the instant it matches; the still-streaming `docker
  logs` upstream gets SIGPIPE (exit 141), pipefail propagates the 141,
  and a message that WAS present reads as absent — failing the one-shot
  checks spuriously (confirmed in the CI log: the expected preload line
  and the "complete" marker were both present, yet grep reported absent).
  tests/dind/lib.sh now provides a pipe-free `logs_contain` helper
  (capture once, match with a `case` glob — no pipe, no SIGPIPE) and
  every example assertion uses it. A focused regression test shows the
  old pattern false-negatives 30/30 under load while the new one never
  does.

pr-test / lean — un-retried transient npm failure
  The JS image build ran `npm install -g npm@latest --no-fund --silent`
  with no retry; a single transient registry blip aborted the whole
  build, and `--silent` hid the cause. ubuntu/24.04/js/install.sh now
  routes its npm registry installs through a `run_with_retry` wrapper
  (exponential backoff, env-overridable budget mirroring
  apt_update_with_retry in common.sh). Build-time resilience only — the
  image is unchanged on success.

Also hardened the issue-104 vfs example test to assert each warning
needle via logs_contain. New unit tests:
experiments/test-issue104-npm-retry.sh and
experiments/test-issue104-logs-contain.sh. Changeset: bump patch.
---
 .changeset/issue-104-ci-reliability.md    | 26 +++++++
 experiments/test-issue104-logs-contain.sh | 89 ++++++++++++++++++++++
 experiments/test-issue104-npm-retry.sh    | 91 +++++++++++++++++++++++
 tests/dind/example-preload-images.sh      | 12 +--
 tests/dind/example-storage-driver-vfs.sh  |  5 +-
 tests/dind/lib.sh                         | 18 +++++
 ubuntu/24.04/js/install.sh                | 31 +++++++-
 7 files changed, 261 insertions(+), 11 deletions(-)
 create mode 100644 .changeset/issue-104-ci-reliability.md
 create mode 100755 experiments/test-issue104-logs-contain.sh
 create mode 100755 experiments/test-issue104-npm-retry.sh

diff --git a/.changeset/issue-104-ci-reliability.md b/.changeset/issue-104-ci-reliability.md
new file mode 100644
index 0000000..657150a
--- /dev/null
+++ b/.changeset/issue-104-ci-reliability.md
@@ -0,0 +1,26 @@
+---
+bump: patch
+---
+
+CI reliability: retry transient npm failures in the JS image build and remove a
+SIGPIPE false-negative from the dind example tests (issue #104 / PR #105).
+
+The JS image build occasionally failed on a single transient npm registry error
+during `npm install -g npm@latest` (and the Playwright/Puppeteer install), aborting
+the whole build with no retry. Those npm registry operations now go through a
+`run_with_retry` wrapper in `ubuntu/24.04/js/install.sh` that retries with
+exponential backoff (mirroring `apt_update_with_retry` in `common.sh`, with the
+same overridable retry budget so it stays unit-testable). This is build-time
+resilience only — the resulting image is unchanged on success.
+
+Separately, the dind example suite asserted on container logs with
+`docker logs … | grep -q "needle"`. Under `set -o pipefail`, `grep -q` closes the
+pipe the instant it matches, which can deliver SIGPIPE to the still-streaming
+`docker logs`; pipefail then propagates that 141 and a present message reads as
+absent, failing the test spuriously (observed on the preload test even though the
+expected line was right there in the logs). `tests/dind/lib.sh` now provides a
+pipe-free `logs_contain` helper (capture once, match with a `case` glob) and all
+example assertions use it.
+
+Covered by new unit tests `experiments/test-issue104-npm-retry.sh` and
+`experiments/test-issue104-logs-contain.sh`.
diff --git a/experiments/test-issue104-logs-contain.sh b/experiments/test-issue104-logs-contain.sh
new file mode 100755
index 0000000..918160c
--- /dev/null
+++ b/experiments/test-issue104-logs-contain.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# Regression test for the dind example-suite log assertions (issue #104 / PR #105).
+#
+# The pr-test / dind-js job intermittently failed on assertions shaped like
+#   if ! docker logs "$c" 2>&1 | grep -q "NEEDLE"; then fail ...; fi
+# Under `set -o pipefail`, `grep -q` closes the pipe the instant it matches, which
+# delivers SIGPIPE (exit 141) to the still-streaming `docker logs`. pipefail then
+# propagates that 141, so a needle that WAS present reads as absent and the test
+# fails spuriously. tests/dind/lib.sh now provides logs_contain(), which captures
+# the logs first and matches with a `case` glob -- no pipe, no SIGPIPE.
+#
+# This test asserts the POLICY (the helper exists, every example uses it, and no
+# raw `docker logs | grep` survives) and the BEHAVIOR (capture+case is correct and
+# immune to the pipefail false-negative the old pattern suffered).
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+
+fail=0
+pass() { echo "  PASS: $1"; }
+miss() { echo "  FAIL: $1"; fail=1; }
+
+lib="tests/dind/lib.sh"
+
+echo "== Case 1: lib.sh defines the pipe-free logs_contain helper =="
+if grep -qE '^logs_contain\(\) \{' "$lib"; then pass "defines logs_contain()"; else miss "defines logs_contain()"; fi
+if grep -q 'docker logs' "$lib"; then pass "logs_contain captures docker logs"; else miss "logs_contain captures docker logs"; fi
+
+echo "== Case 2: no raw 'docker logs | grep' assertion survives in tests/dind =="
+# A pipe straight from docker logs into grep is the vulnerable shape we removed.
+if grep -rnE 'docker logs [^|]*\| *grep' tests/dind/ >/dev/null 2>&1; then
+  grep -rnE 'docker logs [^|]*\| *grep' tests/dind/ >&2
+  miss "no docker-logs|grep pipelines remain"
+else
+  pass "no docker-logs|grep pipelines remain"
+fi
+
+echo "== Case 3: example scripts assert via logs_contain =="
+for f in tests/dind/example-preload-images.sh tests/dind/example-storage-driver-vfs.sh; do
+  if grep -q 'logs_contain' "$f"; then pass "$(basename "$f") uses logs_contain"; else miss "$(basename "$f") uses logs_contain"; fi
+done
+
+echo "== Case 4: capture+case is correct and SIGPIPE-immune =="
+NEEDLE="image preload/passthrough complete"
+# Producer prints the needle EARLY then streams a large tail, so when a matcher
+# short-circuits, the still-writing producer is reliably killed by SIGPIPE --
+# the exact shape of `docker logs` on a busy dind container.
+producer() {
+  printf '%s\n' "starting dockerd"
+  printf '%s\n' "$NEEDLE"
+  for n in $(seq 1 5000); do
+    printf 'trailing log line %s aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n' "$n"
+  done
+}
+old_match() { producer | grep -q "$NEEDLE"; }            # vulnerable
+new_match() {                                            # logs_contain's core
+  local logs
+  logs="$(producer 2>&1 || true)"
+  case "$logs" in *"$NEEDLE"*) return 0 ;; *) return 1 ;; esac
+}
+
+iterations=30
+old_fn=0
+new_fn=0
+for _ in $(seq 1 "$iterations"); do
+  if ! old_match; then old_fn=$((old_fn + 1)); fi
+  if ! new_match; then new_fn=$((new_fn + 1)); fi
+done
+echo "  (old pipe|grep -q false negatives: ${old_fn}/${iterations}; new capture+case: ${new_fn}/${iterations})"
+if [ "$new_fn" -eq 0 ]; then pass "capture+case never false-negatives under pipefail"; else miss "capture+case false-negatived ${new_fn}/${iterations}"; fi
+
+# Correctness: reject an absent needle; match needles containing glob/regex
+# metacharacters as literals (the vfs warning needles do).
+absent="no marker here"
+case "$absent" in *"$NEEDLE"*) miss "matched an absent needle" ;; *) pass "rejects an absent needle" ;; esac
+meta="warning: 'vfs' storage driver [no copy-on-write] DIND_STORAGE_DRIVER=fuse-overlayfs"
+meta_ok=1
+for n in "'vfs' storage driver" "[no copy-on-write]" "DIND_STORAGE_DRIVER=fuse-overlayfs"; do
+  case "$meta" in *"$n"*) ;; *) meta_ok=0 ;; esac
+done
+if [ "$meta_ok" -eq 1 ]; then pass "matches glob/regex metacharacters as literals"; else miss "failed to match a literal metacharacter needle"; fi
+
+echo ""
+if [ "$fail" -eq 0 ]; then
+  echo "RESULT: PASS - logs_contain is wired in and SIGPIPE-immune"
+else
+  echo "RESULT: FAIL"
+  exit 1
+fi
diff --git a/experiments/test-issue104-npm-retry.sh b/experiments/test-issue104-npm-retry.sh
new file mode 100755
index 0000000..2223360
--- /dev/null
+++ b/experiments/test-issue104-npm-retry.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# Unit test for the JS image's npm retry wrapper (issue #104 / PR #105).
+#
+# The lean (pr-test / language) image build intermittently failed on a transient
+# npm registry error during `npm install -g npm@latest`, with no retry. js/install.sh
+# now wraps its npm registry operations in run_with_retry(), which retries with
+# exponential backoff. This test asserts both the POLICY (the wrapper exists and
+# wraps every npm install) and the BEHAVIOR (it retries, then succeeds or gives up).
+set -euo pipefail
+
+cd "$(dirname "$0")/.."
+
+install="ubuntu/24.04/js/install.sh"
+fail=0
+
+assert_grep() {  # extended-regex, description
+  if grep -qE "$1" "$install"; then
+    echo "  PASS: $2"
+  else
+    echo "  FAIL: $2"
+    fail=1
+  fi
+}
+
+assert_not_grep() {  # extended-regex, description
+  if grep -qE "$1" "$install"; then
+    echo "  FAIL: $2"
+    fail=1
+  else
+    echo "  PASS: $2"
+  fi
+}
+
+echo "== Case 1: js/install.sh defines and uses run_with_retry =="
+assert_grep '^run_with_retry\(\) \{' "defines run_with_retry()"
+assert_grep 'NPM_RETRY_MAX_RETRIES' "retry budget is overridable (NPM_RETRY_MAX_RETRIES)"
+assert_grep 'NPM_RETRY_INITIAL_DELAY' "initial delay is overridable (NPM_RETRY_INITIAL_DELAY)"
+assert_grep 'run_with_retry npm install -g npm@latest' "wraps the npm@latest self-update"
+assert_grep 'run_with_retry npm install -g playwright' "wraps the playwright/puppeteer install"
+
+echo "== Case 2: no un-retried npm install -g remains =="
+# A bare line that *starts* with `npm install -g` would be un-retried; the wrapped
+# calls start with `run_with_retry`, so they must not match.
+assert_not_grep '^[[:space:]]*npm install -g' "every 'npm install -g' goes through run_with_retry"
+
+echo "== Case 3: run_with_retry retry semantics =="
+# install.sh as a whole performs real installs, so load just the function body.
+fn="$(awk '/^run_with_retry\(\) \{/{p=1} p{print} p&&/^\}/{exit}' "$install")"
+eval "$fn"
+# Stub the helpers the wrapper leans on so the test stays fast and silent.
+log_warning() { :; }
+sleep() { :; }
+
+attempts=0
+succeed_now() { attempts=$((attempts + 1)); return 0; }
+if NPM_RETRY_INITIAL_DELAY=0 run_with_retry succeed_now && [ "$attempts" -eq 1 ]; then
+  echo "  PASS: succeeds on the first attempt without retrying"
+else
+  echo "  FAIL: expected exactly one attempt, got ${attempts}"
+  fail=1
+fi
+
+attempts=0
+fail_then_succeed() { attempts=$((attempts + 1)); [ "$attempts" -ge 3 ]; }
+if NPM_RETRY_INITIAL_DELAY=0 NPM_RETRY_MAX_RETRIES=5 run_with_retry fail_then_succeed \
+  && [ "$attempts" -eq 3 ]; then
+  echo "  PASS: retries transient failures and ultimately succeeds (3 attempts)"
+else
+  echo "  FAIL: expected success on the 3rd attempt, got ${attempts}"
+  fail=1
+fi
+
+attempts=0
+always_fail() { attempts=$((attempts + 1)); return 1; }
+if NPM_RETRY_INITIAL_DELAY=0 NPM_RETRY_MAX_RETRIES=3 run_with_retry always_fail; then
+  echo "  FAIL: should have given up after exhausting retries"
+  fail=1
+elif [ "$attempts" -eq 3 ]; then
+  echo "  PASS: gives up after NPM_RETRY_MAX_RETRIES attempts and returns non-zero"
+else
+  echo "  FAIL: expected exactly 3 attempts before giving up, got ${attempts}"
+  fail=1
+fi
+
+echo ""
+if [ "$fail" -eq 0 ]; then
+  echo "RESULT: PASS - npm retry wrapper is present and behaves correctly"
+else
+  echo "RESULT: FAIL"
+  exit 1
+fi
diff --git a/tests/dind/example-preload-images.sh b/tests/dind/example-preload-images.sh
index a1968f5..db1a9b6 100755
--- a/tests/dind/example-preload-images.sh
+++ b/tests/dind/example-preload-images.sh
@@ -50,7 +50,7 @@ wait_for_preload_complete() {
   local i=0
 
   while [ "$i" -lt "$limit" ]; do
-    if docker logs "$container" 2>&1 | grep -q "image preload/passthrough complete"; then
+    if logs_contain "$container" "image preload/passthrough complete"; then
       log "image preload/passthrough completed in ${container} after ${i}s"
       return 0
     fi
@@ -92,7 +92,7 @@ wait_for_inner_docker "$dir_container"
 wait_for_preload_complete "$dir_container"
 assert_inner_has_image "$dir_container"
 
-if ! docker logs "$dir_container" 2>&1 | grep -q "preload image already present, skipping pull"; then
+if ! logs_contain "$dir_container" "preload image already present, skipping pull"; then
   docker logs "$dir_container" >&2 || true
   fail "expected DIND_PRELOAD_IMAGES to skip the pull for an already-loaded image"
 fi
@@ -181,7 +181,7 @@ if docker exec "$public_container" docker image inspect "$fixture_image" >/dev/n
   docker logs "$public_container" >&2 || true
   fail "public mode must NOT pass through the local fixture image (no RepoDigest)"
 fi
-if ! docker logs "$public_container" 2>&1 | grep -q "host-image passthrough (mode=public)"; then
+if ! logs_contain "$public_container" "host-image passthrough (mode=public)"; then
   docker logs "$public_container" >&2 || true
   fail "expected the consumer to run host-image passthrough in public mode"
 fi
@@ -194,7 +194,7 @@ if ! docker exec "$public_container" docker image inspect "$public_image" >/dev/
   docker exec "$public_container" docker images >&2 || true
   fail "public mode must pass through a host image that has a public RepoDigest (${public_image})"
 fi
-if ! docker logs "$public_container" 2>&1 | grep -q "passthrough loading host image: ${public_image}"; then
+if ! logs_contain "$public_container" "passthrough loading host image: ${public_image}"; then
   docker logs "$public_container" >&2 || true
   fail "expected public mode to log loading the public host image (${public_image})"
 fi
@@ -222,7 +222,7 @@ if docker exec "$images_container" docker image inspect "$public_image" >/dev/nu
   docker logs "$images_container" >&2 || true
   fail "DIND_HOST_PASSTHROUGH_IMAGES must exclude ${public_image} (not in the allowlist)"
 fi
-if ! docker logs "$images_container" 2>&1 | grep -q "images=${fixture_repo}"; then
+if ! logs_contain "$images_container" "images=${fixture_repo}"; then
   docker logs "$images_container" >&2 || true
   fail "expected the consumer to log the active DIND_HOST_PASSTHROUGH_IMAGES allowlist"
 fi
@@ -241,7 +241,7 @@ run_dind_container "$no_sock_container" \
   -e "DIND_HOST_PASSTHROUGH_IMAGES=$fixture_repo"
 wait_for_inner_docker "$no_sock_container"
 wait_for_preload_complete "$no_sock_container"
-if ! docker logs "$no_sock_container" 2>&1 | grep -q "DIND_HOST_PASSTHROUGH_IMAGES is set, but no host docker socket is mounted"; then
+if ! logs_contain "$no_sock_container" "DIND_HOST_PASSTHROUGH_IMAGES is set, but no host docker socket is mounted"; then
   docker logs "$no_sock_container" >&2 || true
   fail "expected a warning when DIND_HOST_PASSTHROUGH_IMAGES is set but no host socket is mounted"
 fi
diff --git a/tests/dind/example-storage-driver-vfs.sh b/tests/dind/example-storage-driver-vfs.sh
index 94ed706..35e988a 100755
--- a/tests/dind/example-storage-driver-vfs.sh
+++ b/tests/dind/example-storage-driver-vfs.sh
@@ -23,10 +23,9 @@ log "inner dockerd is using the vfs storage driver"
 # issue #104: landing on vfs must not be silent. The entrypoint (PID 1) emits a
 # copy-on-write warning to stderr, which docker captures in the container logs.
 log "verifying the vfs copy-on-write warning was emitted (issue #104)"
-logs="$(docker logs "$container" 2>&1 || true)"
 for needle in "'vfs' storage driver" "no space left on device" "DIND_STORAGE_DRIVER=fuse-overlayfs"; do
-  if ! printf '%s' "$logs" | grep -qF "$needle"; then
-    printf '%s\n' "$logs" >&2
+  if ! logs_contain "$container" "$needle"; then
+    docker logs "$container" >&2 || true
     fail "expected the vfs warning to mention \"${needle}\", but it was absent from the container logs"
   fi
 done
diff --git a/tests/dind/lib.sh b/tests/dind/lib.sh
index 83c0ea7..d0b2727 100755
--- a/tests/dind/lib.sh
+++ b/tests/dind/lib.sh
@@ -66,6 +66,24 @@ require_docker() {
   fi
 }
 
+# logs_contain CONTAINER NEEDLE
+# Returns 0 when CONTAINER's combined stdout+stderr logs contain the literal
+# NEEDLE substring, 1 otherwise. A failing `docker logs` is tolerated (`|| true`),
+# so whatever it printed is what gets searched. The logs are captured into a
+# variable and matched with a bash `case` glob instead of being piped into
+# `grep -q`: under `set -o pipefail`, `grep -q` closes the pipe the instant it
+# matches, which can deliver SIGPIPE (exit 141) to a still-streaming `docker
+# logs`, and pipefail then reports a genuine match as "not found". The needle is
+# quoted in the case pattern, so glob metacharacters in it match literally.
+logs_contain() {
+  local container="$1" needle="$2" logs
+  logs="$(docker logs "$container" 2>&1 || true)"
+  case "$logs" in
+    *"$needle"*) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
 run_container_from_image() {
   local name="$1"
   local image="$2"
diff --git a/ubuntu/24.04/js/install.sh b/ubuntu/24.04/js/install.sh
index 095a09a..1d84645 100644
--- a/ubuntu/24.04/js/install.sh
+++ b/ubuntu/24.04/js/install.sh
@@ -17,6 +17,33 @@ else
   command_exists() { command -v "$1" &>/dev/null; }
 fi
 
+# npm registry operations are network-bound and occasionally fail transiently in
+# CI (ECONNRESET, 429, registry 5xx), which used to fail the whole image build on
+# a single blip. Retry a command a few times with exponential backoff before
+# giving up. Mirrors apt_update_with_retry() in ../common.sh, including the
+# overridable retry budget so it can be unit-tested with a zero delay.
+run_with_retry() {
+  local max_retries="${NPM_RETRY_MAX_RETRIES:-5}"
+  local delay="${NPM_RETRY_INITIAL_DELAY:-5}"
+  local attempt=1
+
+  while [ "$attempt" -le "$max_retries" ]; do
+    "$@" && return 0
+    if [ "$attempt" -eq "$max_retries" ]; then
+      log_warning "command still failing after ${max_retries} attempts: $*"
+      return 1
+    fi
+    log_warning "attempt ${attempt}/${max_retries} failed: $* — retrying in ${delay}s"
+    sleep "$delay"
+    attempt=$((attempt + 1))
+    delay=$((delay * 2))
+  done
+  # Every in-loop path returns, so reaching here means the command never ran
+  # (retry budget <= 0 or not a number). Fail instead of reporting success.
+  log_warning "invalid retry budget '${max_retries}', command was not run: $*"
+  return 1
+}
+
 log_step "Installing JavaScript/TypeScript runtimes"
 
 # --- Bun ---
@@ -75,14 +102,14 @@ fi
 nvm use 20
 
 log_info "Updating npm to latest version..."
-npm install -g npm@latest --no-fund --silent
+run_with_retry npm install -g npm@latest --no-fund --silent
 log_success "npm updated to latest version"
 
 # --- Playwright CLI + @playwright/test + @puppeteer/browsers ---
 log_step "Installing Playwright, @playwright/test, and @puppeteer/browsers CLIs"
 
 log_info "Installing playwright, @playwright/test, and @puppeteer/browsers globally via npm..."
-npm install -g playwright @playwright/test @puppeteer/browsers --no-fund --force
+run_with_retry npm install -g playwright @playwright/test @puppeteer/browsers --no-fund --force
 log_success "playwright, @playwright/test, and @puppeteer/browsers CLIs installed"
 
 # Verify installations

From 55e2efa2e108634eedde7f781e8a0dbd7d58f3df Mon Sep 17 00:00:00 2001
From: konard <drakonard@gmail.com>
Date: Mon, 15 Jun 2026 00:31:43 +0000
Subject: [PATCH 4/4] js CI: retry the Playwright browser install to survive a
 transient msedge GPG flake (issue #104)

The dind-swift image build failed on `playwright install ... msedge ...` when
packages.microsoft.com served an invalid GPG key body ("gpg: no valid OpenPGP
data found" -> "Failed to install msedge"). msedge and chrome are fetched from
third-party apt repos (packages.microsoft.com / Google), so they are subject to
the same transient blips that already motivated wrapping the npm installs.

Wrap both `playwright install` browser-download invocations in run_with_retry so
a single blip retries instead of failing the whole image build. Playwright skips
already-present browsers, so a retry only re-attempts the one that blipped.
Generalize the retry wrapper's env-var prefix NPM_RETRY_* -> BUILD_RETRY_* to
reflect that it now covers every network-bound build step (npm self-update,
Playwright/Puppeteer CLI install, browser-binary download).

Rename experiments/test-issue104-npm-retry.sh -> test-issue104-build-retry.sh
and extend it to assert the browser download is wrapped and no bare
`playwright install` survives; update the changeset accordingly.
---
 .changeset/issue-104-ci-reliability.md        | 24 +++++++----
 ...-retry.sh => test-issue104-build-retry.sh} | 43 +++++++++++--------
 ubuntu/24.04/js/install.sh                    | 28 ++++++++----
 3 files changed, 60 insertions(+), 35 deletions(-)
 rename experiments/{test-issue104-npm-retry.sh => test-issue104-build-retry.sh} (50%)

diff --git a/.changeset/issue-104-ci-reliability.md b/.changeset/issue-104-ci-reliability.md
index 657150a..ccfe930 100644
--- a/.changeset/issue-104-ci-reliability.md
+++ b/.changeset/issue-104-ci-reliability.md
@@ -2,15 +2,21 @@
 bump: patch
 ---
 
-CI reliability: retry transient npm failures in the JS image build and remove a
-SIGPIPE false-negative from the dind example tests (issue #104 / PR #105).
+CI reliability: retry transient network failures in the JS image build and remove
+a SIGPIPE false-negative from the dind example tests (issue #104 / PR #105).
 
-The JS image build occasionally failed on a single transient npm registry error
-during `npm install -g npm@latest` (and the Playwright/Puppeteer install), aborting
-the whole build with no retry. Those npm registry operations now go through a
-`run_with_retry` wrapper in `ubuntu/24.04/js/install.sh` that retries with
-exponential backoff (mirroring `apt_update_with_retry` in `common.sh`, with the
-same overridable retry budget so it stays unit-testable). This is build-time
+The JS image build (`ubuntu/24.04/js/install.sh`, `COPY`'d into every dind/language
+image) occasionally died on a single transient third-party error, with no retry:
+the lean/language build hit a flaky npm registry response during
+`npm install -g npm@latest`, and the dind-swift build hit
+`playwright install … msedge …` getting an invalid GPG key body from
+packages.microsoft.com ("gpg: no valid OpenPGP data found" → "Failed to install
+msedge"). Every network-bound build step — the npm self-update, the
+Playwright/Puppeteer CLI install, and the Playwright browser-binary download — now
+goes through a `run_with_retry` wrapper that retries with exponential backoff
+(mirroring `apt_update_with_retry` in `common.sh`, with the same overridable retry
+budget so it stays unit-testable). `playwright install` skips already-present
+browsers, so a retry only re-attempts the one that blipped. This is build-time
 resilience only — the resulting image is unchanged on success.
 
 Separately, the dind example suite asserted on container logs with
@@ -22,5 +28,5 @@ expected line was right there in the logs). `tests/dind/lib.sh` now provides a
 pipe-free `logs_contain` helper (capture once, match with a `case` glob) and all
 example assertions use it.
 
-Covered by new unit tests `experiments/test-issue104-npm-retry.sh` and
+Covered by new unit tests `experiments/test-issue104-build-retry.sh` and
 `experiments/test-issue104-logs-contain.sh`.
diff --git a/experiments/test-issue104-npm-retry.sh b/experiments/test-issue104-build-retry.sh
similarity index 50%
rename from experiments/test-issue104-npm-retry.sh
rename to experiments/test-issue104-build-retry.sh
index 2223360..ef90f45 100755
--- a/experiments/test-issue104-npm-retry.sh
+++ b/experiments/test-issue104-build-retry.sh
@@ -1,11 +1,18 @@
 #!/usr/bin/env bash
-# Unit test for the JS image's npm retry wrapper (issue #104 / PR #105).
+# Unit test for the JS image's network-bound build-step retry wrapper
+# (issue #104 / PR #105).
 #
-# The lean (pr-test / language) image build intermittently failed on a transient
-# npm registry error during `npm install -g npm@latest`, with no retry. js/install.sh
-# now wraps its npm registry operations in run_with_retry(), which retries with
-# exponential backoff. This test asserts both the POLICY (the wrapper exists and
-# wraps every npm install) and the BEHAVIOR (it retries, then succeeds or gives up).
+# Two transient, third-party failures used to abort the whole JS image build with
+# no retry:
+#   * the lean (pr-test / language) build died on a transient npm registry error
+#     during `npm install -g npm@latest`; and
+#   * the dind-swift build died on `playwright install ... msedge ...` when
+#     packages.microsoft.com served an invalid GPG key body
+#     ("gpg: no valid OpenPGP data found" -> "Failed to install msedge").
+# js/install.sh now routes every such network-bound step through run_with_retry(),
+# which retries with exponential backoff. This test asserts both the POLICY (the
+# wrapper exists and wraps every npm install AND the playwright browser download)
+# and the BEHAVIOR (it retries, then succeeds or gives up).
 set -euo pipefail
 
 cd "$(dirname "$0")/.."
@@ -33,15 +40,17 @@ assert_not_grep() {  # extended-regex, description
 
 echo "== Case 1: js/install.sh defines and uses run_with_retry =="
 assert_grep '^run_with_retry\(\) \{' "defines run_with_retry()"
-assert_grep 'NPM_RETRY_MAX_RETRIES' "retry budget is overridable (NPM_RETRY_MAX_RETRIES)"
-assert_grep 'NPM_RETRY_INITIAL_DELAY' "initial delay is overridable (NPM_RETRY_INITIAL_DELAY)"
+assert_grep 'BUILD_RETRY_MAX_RETRIES' "retry budget is overridable (BUILD_RETRY_MAX_RETRIES)"
+assert_grep 'BUILD_RETRY_INITIAL_DELAY' "initial delay is overridable (BUILD_RETRY_INITIAL_DELAY)"
 assert_grep 'run_with_retry npm install -g npm@latest' "wraps the npm@latest self-update"
-assert_grep 'run_with_retry npm install -g playwright' "wraps the playwright/puppeteer install"
+assert_grep 'run_with_retry npm install -g playwright' "wraps the playwright/puppeteer CLI install"
+assert_grep 'run_with_retry playwright install ' "wraps the playwright browser binary download (msedge/chrome flake)"
 
-echo "== Case 2: no un-retried npm install -g remains =="
-# A bare line that *starts* with `npm install -g` would be un-retried; the wrapped
-# calls start with `run_with_retry`, so they must not match.
+echo "== Case 2: no un-retried network install remains =="
+# A bare line that *starts* with `npm install -g` or `playwright install` would be
+# un-retried; the wrapped calls start with `run_with_retry`, so they must not match.
 assert_not_grep '^[[:space:]]*npm install -g' "every 'npm install -g' goes through run_with_retry"
+assert_not_grep '^[[:space:]]*playwright install ' "every 'playwright install' goes through run_with_retry"
 
 echo "== Case 3: run_with_retry retry semantics =="
 # install.sh as a whole performs real installs, so load just the function body.
@@ -53,7 +62,7 @@ sleep() { :; }
 
 attempts=0
 succeed_now() { attempts=$((attempts + 1)); return 0; }
-if NPM_RETRY_INITIAL_DELAY=0 run_with_retry succeed_now && [ "$attempts" -eq 1 ]; then
+if BUILD_RETRY_INITIAL_DELAY=0 run_with_retry succeed_now && [ "$attempts" -eq 1 ]; then
   echo "  PASS: succeeds on the first attempt without retrying"
 else
   echo "  FAIL: expected exactly one attempt, got ${attempts}"
@@ -62,7 +71,7 @@ fi
 
 attempts=0
 fail_then_succeed() { attempts=$((attempts + 1)); [ "$attempts" -ge 3 ]; }
-if NPM_RETRY_INITIAL_DELAY=0 NPM_RETRY_MAX_RETRIES=5 run_with_retry fail_then_succeed \
+if BUILD_RETRY_INITIAL_DELAY=0 BUILD_RETRY_MAX_RETRIES=5 run_with_retry fail_then_succeed \
   && [ "$attempts" -eq 3 ]; then
   echo "  PASS: retries transient failures and ultimately succeeds (3 attempts)"
 else
@@ -72,11 +81,11 @@ fi
 
 attempts=0
 always_fail() { attempts=$((attempts + 1)); return 1; }
-if NPM_RETRY_INITIAL_DELAY=0 NPM_RETRY_MAX_RETRIES=3 run_with_retry always_fail; then
+if BUILD_RETRY_INITIAL_DELAY=0 BUILD_RETRY_MAX_RETRIES=3 run_with_retry always_fail; then
   echo "  FAIL: should have given up after exhausting retries"
   fail=1
 elif [ "$attempts" -eq 3 ]; then
-  echo "  PASS: gives up after NPM_RETRY_MAX_RETRIES attempts and returns non-zero"
+  echo "  PASS: gives up after BUILD_RETRY_MAX_RETRIES attempts and returns non-zero"
 else
   echo "  FAIL: expected exactly 3 attempts before giving up, got ${attempts}"
   fail=1
@@ -84,7 +93,7 @@ fi
 
 echo ""
 if [ "$fail" -eq 0 ]; then
-  echo "RESULT: PASS - npm retry wrapper is present and behaves correctly"
+  echo "RESULT: PASS - build-step retry wrapper is present and behaves correctly"
 else
   echo "RESULT: FAIL"
   exit 1
diff --git a/ubuntu/24.04/js/install.sh b/ubuntu/24.04/js/install.sh
index 1d84645..ddd3db0 100644
--- a/ubuntu/24.04/js/install.sh
+++ b/ubuntu/24.04/js/install.sh
@@ -17,14 +17,17 @@ else
   command_exists() { command -v "$1" &>/dev/null; }
 fi
 
-# npm registry operations are network-bound and occasionally fail transiently in
-# CI (ECONNRESET, 429, registry 5xx), which used to fail the whole image build on
-# a single blip. Retry a command a few times with exponential backoff before
-# giving up. Mirrors apt_update_with_retry() in ../common.sh, including the
-# overridable retry budget so it can be unit-tested with a zero delay.
+# Network-bound build steps — npm registry installs and Playwright browser
+# downloads — occasionally fail transiently in CI (ECONNRESET, 429, registry 5xx,
+# or a flaky third-party repo such as packages.microsoft.com serving an invalid
+# GPG key body when Playwright installs the 'msedge' browser), which used to fail
+# the whole image build on a single blip. Retry a command a few times with
+# exponential backoff before giving up. Mirrors apt_update_with_retry() in
+# ../common.sh, including the overridable retry budget so it can be unit-tested
+# with a zero delay.
 run_with_retry() {
-  local max_retries="${NPM_RETRY_MAX_RETRIES:-5}"
-  local delay="${NPM_RETRY_INITIAL_DELAY:-5}"
+  local max_retries="${BUILD_RETRY_MAX_RETRIES:-5}"
+  local delay="${BUILD_RETRY_INITIAL_DELAY:-5}"
   local attempt=1
 
   while [ "$attempt" -le "$max_retries" ]; do
@@ -119,13 +122,20 @@ log_success "playwright CLI verified"
 # --- Download Playwright browser binaries ---
 log_step "Downloading Playwright browser binaries"
 
+# 'playwright install' downloads browser binaries: chromium/firefox/webkit/
+# chromium-headless-shell come from Playwright's CDN, but msedge and chrome are
+# fetched from third-party apt repos (packages.microsoft.com / Google) that
+# occasionally return a transient error — e.g. an invalid GPG key body that makes
+# the install abort with "gpg: no valid OpenPGP data found" / "Failed to install
+# msedge". Retry the whole step; Playwright skips already-installed browsers, so a
+# retry only re-attempts the one that blipped.
 ARCH=$(uname -m)
 if [ "$ARCH" = "x86_64" ] || [ "$ARCH" = "amd64" ]; then
   log_info "x86_64 detected: installing all browsers (chromium, firefox, webkit, msedge, chromium-headless-shell, chrome)"
-  playwright install chromium firefox webkit msedge chromium-headless-shell chrome
+  run_with_retry playwright install chromium firefox webkit msedge chromium-headless-shell chrome
 else
   log_info "$ARCH detected: installing compatible browsers (chromium, firefox, webkit, chromium-headless-shell)"
-  playwright install chromium firefox webkit chromium-headless-shell
+  run_with_retry playwright install chromium firefox webkit chromium-headless-shell
 fi
 log_success "Playwright browser binaries downloaded"