From f9ede6165a75a7e40a2cd88d472626212afac5d7 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 6 Mar 2026 10:37:56 +0100 Subject: [PATCH] fix: retry rctx.execute() on SIGKILL (exit 137) for macOS 26 macOS 26 (Tahoe) has a bug in its networking framework's pthread_atfork handler that intermittently kills child processes with SIGKILL (exit 137) when spawned via fork+exec. This affects all rctx.execute() calls in repository rules, causing yq, cp, and mkdir commands to fail randomly during npm_translate_lock. Add an execute_with_retry() wrapper in utils.bzl that catches exit code 137 and makes up to 3 attempts in total (the initial call plus up to 2 retries). Apply it to all rctx.execute() call sites in npm_translate_lock_state.bzl (yq lockfile parsing, mkdir, cp) and utils.bzl (reverse_force_copy). The failure is intermittent so retries reliably work around it. References: - https://github.com/GoogleContainerTools/skaffold/issues/9925 - https://github.com/bazelbuild/bazel/issues/27026 Made-with: Cursor --- npm/private/npm_translate_lock_state.bzl | 8 +++--- npm/private/utils.bzl | 34 ++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/npm/private/npm_translate_lock_state.bzl b/npm/private/npm_translate_lock_state.bzl index 138dd596be..317eb01e25 100644 --- a/npm/private/npm_translate_lock_state.bzl +++ b/npm/private/npm_translate_lock_state.bzl @@ -335,7 +335,8 @@ def _copy_input_file(priv, rctx, attr, path, repository_path): if len(dst_segments) > 1: dirname = "/".join(dst_segments[:-1]) mkdir_args = [coreutils, "mkdir", "-p", dirname] - result = rctx.execute( + result = utils.execute_with_retry( + rctx, mkdir_args, quiet = attr.quiet, ) @@ -344,7 +345,8 @@ def _copy_input_file(priv, rctx, attr, path, repository_path): fail(msg) cp_args = [coreutils, "cp", path, repository_path] - result = rctx.execute( + result = utils.execute_with_retry( + rctx, cp_args, quiet = attr.quiet, ) @@ -395,7 +397,7 @@ def _yaml_to_json(rctx, yaml_path, is_windows): yaml_path, "-o=json", ] -
result = rctx.execute(yq_args) + result = utils.execute_with_retry(rctx, yq_args) if result.return_code: return None, "failed to parse {} with yq. '{}' exited with {}: \nSTDOUT:\n{}\nSTDERR:\n{}".format(yq_args, yaml_path, result.return_code, result.stdout, result.stderr) diff --git a/npm/private/utils.bzl b/npm/private/utils.bzl index 6bb5ae55a2..6df124f357 100644 --- a/npm/private/utils.bzl +++ b/npm/private/utils.bzl @@ -12,6 +12,35 @@ load("@bazel_skylib//lib:paths.bzl", "paths") _SUPPORTS_SYMLINK_TARGET_TYPE = bazel_features.rules.symlink_action_has_target_type INTERNAL_ERROR_MSG = "ERROR: rules_js internal error, please file an issue: https://github.com/aspect-build/rules_js/issues" + +# macOS 26 (Tahoe) has a bug in its networking framework's pthread_atfork +# handler that intermittently kills child processes with SIGKILL (exit 137) +# when spawned via fork+exec. This retry wrapper handles the transient failure. +# See https://github.com/GoogleContainerTools/skaffold/issues/9925 +_MACOS_SIGKILL_EXIT_CODE = 137 +_MACOS_SIGKILL_MAX_ATTEMPTS = 3 + +def _execute_with_retry(rctx, args, **kwargs): + """Wrapper around rctx.execute that retries on SIGKILL (exit 137). + + On macOS 26+, child processes spawned by Bazel repository rules can be + intermittently killed with SIGKILL due to a bug in the OS networking + framework's pthread_atfork handler. This wrapper retries the command + to work around the transient failure. 
+ """ + result = None + for _ in range(_MACOS_SIGKILL_MAX_ATTEMPTS): + result = rctx.execute(args, **kwargs) + if result.return_code != _MACOS_SIGKILL_EXIT_CODE: + return result + + # buildifier: disable=print + print("WARNING: command '{}' was killed (exit 137) after {} attempts".format( + " ".join([str(a) for a in args]), + _MACOS_SIGKILL_MAX_ATTEMPTS, + )) + return result + DEFAULT_REGISTRY_DOMAIN = "registry.npmjs.org" DEFAULT_REGISTRY_DOMAIN_SLASH = "{}/".format(DEFAULT_REGISTRY_DOMAIN) DEFAULT_REGISTRY_PROTOCOL = "https" @@ -212,7 +241,7 @@ def _reverse_force_copy(rctx, label, dst = None): dst_dirname = paths.dirname(dst) if dst_dirname: mkdir_args = [coreutils, "mkdir", "-p", dst_dirname] - result = rctx.execute(mkdir_args) + result = _execute_with_retry(rctx, mkdir_args) if result.return_code != 0: msg = """ @@ -229,7 +258,7 @@ STDERR: fail(msg) cp_args = [coreutils, "cp", src, dst] - result = rctx.execute(cp_args) + result = _execute_with_retry(rctx, cp_args) if result.return_code != 0: msg = """ @@ -375,6 +404,7 @@ utils = struct( hash = _hash, dicts_match = _dicts_match, reverse_force_copy = _reverse_force_copy, + execute_with_retry = _execute_with_retry, replace_npmrc_token_envvar = _replace_npmrc_token_envvar, is_tarball_extension = _is_tarball_extension, hex_to_base64 = _hex_to_base64,