Skip to content

Commit 9f4106e

Browse files
fix: retry rctx.execute() on SIGKILL (exit 137) for macOS 26
macOS 26 (Tahoe) has a bug in its networking framework's pthread_atfork handler that intermittently kills child processes with SIGKILL (exit 137) when they are spawned via fork+exec. This affects all rctx.execute() calls in repository rules, causing yq, cp, and mkdir commands to fail randomly during npm_translate_lock.

Add an execute_with_retry() wrapper in utils.bzl that detects exit code 137 and re-runs the command, making up to 3 attempts in total. Apply it to all rctx.execute() call sites in npm_translate_lock_state.bzl (yq lockfile parsing, mkdir, cp) and utils.bzl (reverse_force_copy). Because the failure is intermittent, retrying reliably works around it.

References:
- GoogleContainerTools/skaffold#9925
- bazelbuild/bazel#27026

Made-with: Cursor
1 parent 786a74a commit 9f4106e

2 files changed

Lines changed: 38 additions & 5 deletions

File tree

npm/private/npm_translate_lock_state.bzl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,8 @@ def _copy_input_file(priv, rctx, attr, path, repository_path):
335335
if len(dst_segments) > 1:
336336
dirname = "/".join(dst_segments[:-1])
337337
mkdir_args = [coreutils, "mkdir", "-p", dirname]
338-
result = rctx.execute(
338+
result = utils.execute_with_retry(
339+
rctx,
339340
mkdir_args,
340341
quiet = attr.quiet,
341342
)
@@ -344,7 +345,8 @@ def _copy_input_file(priv, rctx, attr, path, repository_path):
344345
fail(msg)
345346

346347
cp_args = [coreutils, "cp", path, repository_path]
347-
result = rctx.execute(
348+
result = utils.execute_with_retry(
349+
rctx,
348350
cp_args,
349351
quiet = attr.quiet,
350352
)
@@ -395,7 +397,7 @@ def _yaml_to_json(rctx, yaml_path, is_windows):
395397
yaml_path,
396398
"-o=json",
397399
]
398-
result = rctx.execute(yq_args)
400+
result = utils.execute_with_retry(rctx, yq_args)
399401
if result.return_code:
400402
return None, "failed to parse {} with yq. '{}' exited with {}: \nSTDOUT:\n{}\nSTDERR:\n{}".format(yq_args, yaml_path, result.return_code, result.stdout, result.stderr)
401403

npm/private/utils.bzl

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,36 @@ load("@bazel_skylib//lib:paths.bzl", "paths")
1212
_SUPPORTS_SYMLINK_TARGET_TYPE = bazel_features.rules.symlink_action_has_target_type
1313

1414
INTERNAL_ERROR_MSG = "ERROR: rules_js internal error, please file an issue: https://github.com/aspect-build/rules_js/issues"
15+
16+
# macOS 26 (Tahoe) has a bug in its networking framework's pthread_atfork
17+
# handler that intermittently kills child processes with SIGKILL (exit 137)
18+
# when spawned via fork+exec. This retry wrapper handles the transient failure.
19+
# See https://github.com/GoogleContainerTools/skaffold/issues/9925
20+
_MACOS_SIGKILL_EXIT_CODE = 137
21+
_MACOS_SIGKILL_MAX_ATTEMPTS = 3
22+
23+
def _execute_with_retry(rctx, args, **kwargs):
    """Wrapper around rctx.execute that retries on SIGKILL (exit 137).

    On macOS 26+, child processes spawned by Bazel repository rules can be
    intermittently killed with SIGKILL due to a bug in the OS networking
    framework's pthread_atfork handler. This wrapper re-runs the command
    to work around the transient failure.

    Args:
        rctx: the repository context whose execute() method runs the command
        args: the command line, forwarded unchanged to rctx.execute
        **kwargs: extra keyword arguments forwarded unchanged to rctx.execute

    Returns:
        The result of the first attempt whose return code is not
        _MACOS_SIGKILL_EXIT_CODE, or the result of the last attempt when all
        _MACOS_SIGKILL_MAX_ATTEMPTS attempts were killed. Callers still need
        to check return_code themselves.
    """
    result = None
    for attempt in range(_MACOS_SIGKILL_MAX_ATTEMPTS):
        result = rctx.execute(args, **kwargs)
        if result.return_code != _MACOS_SIGKILL_EXIT_CODE:
            # Success or an unrelated failure: hand it straight back.
            return result

        # Say explicitly whether another attempt follows, so the log of the
        # final attempt is not mistaken for one that was retried.
        attempts_left = _MACOS_SIGKILL_MAX_ATTEMPTS - (attempt + 1)
        outcome = "retrying" if attempts_left > 0 else "giving up"

        # buildifier: disable=print
        print("WARNING: command '{}' was killed (exit {}), attempt {}/{} failed; {}".format(
            # args entries are stringified because they are not guaranteed to be strings.
            " ".join([str(a) for a in args]),
            _MACOS_SIGKILL_EXIT_CODE,
            attempt + 1,
            _MACOS_SIGKILL_MAX_ATTEMPTS,
            outcome,
        ))

    # Every attempt was killed; return the last result for the caller to report.
    return result
44+
1545
DEFAULT_REGISTRY_DOMAIN = "registry.npmjs.org"
1646
DEFAULT_REGISTRY_DOMAIN_SLASH = "{}/".format(DEFAULT_REGISTRY_DOMAIN)
1747
DEFAULT_REGISTRY_PROTOCOL = "https"
@@ -212,7 +242,7 @@ def _reverse_force_copy(rctx, label, dst = None):
212242
dst_dirname = paths.dirname(dst)
213243
if dst_dirname:
214244
mkdir_args = [coreutils, "mkdir", "-p", dst_dirname]
215-
result = rctx.execute(mkdir_args)
245+
result = _execute_with_retry(rctx, mkdir_args)
216246
if result.return_code != 0:
217247
msg = """
218248
@@ -229,7 +259,7 @@ STDERR:
229259
fail(msg)
230260

231261
cp_args = [coreutils, "cp", src, dst]
232-
result = rctx.execute(cp_args)
262+
result = _execute_with_retry(rctx, cp_args)
233263
if result.return_code != 0:
234264
msg = """
235265
@@ -375,6 +405,7 @@ utils = struct(
375405
hash = _hash,
376406
dicts_match = _dicts_match,
377407
reverse_force_copy = _reverse_force_copy,
408+
execute_with_retry = _execute_with_retry,
378409
replace_npmrc_token_envvar = _replace_npmrc_token_envvar,
379410
is_tarball_extension = _is_tarball_extension,
380411
hex_to_base64 = _hex_to_base64,

0 commit comments

Comments
 (0)