diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 5d2f340..072ae42 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -19,3 +19,24 @@ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright RUN npm install -g playwright \ && playwright install --with-deps chromium \ && chmod -R a+rx /ms-playwright + +# Tell dev container features where the target user's home directory is. +# The devcontainer CLI passes `_REMOTE_USER_HOME`/`_CONTAINER_USER_HOME` to every +# feature's install.sh, but envbuilder (used by Coder) only sets `_REMOTE_USER` +# and `_CONTAINER_USER` -- it leaves the *_HOME variables empty. Features that +# rely on them then break: e.g. the claude-code feature runs +# `cp "$_REMOTE_USER_HOME/.local/bin/claude" /usr/local/bin/claude`, which under +# envbuilder expands to `cp /.local/bin/claude ...` and fails the whole build. +# Supplying the values here (as build ARGs, so they don't leak into the running +# container's environment) keeps feature installs working on the envbuilder path. +# ARGs declared here remain in scope for the feature install steps envbuilder +# appends after this Dockerfile, but are reset under the devcontainer CLI (which sets +# these variables itself), so this is a no-op there. +ARG _REMOTE_USER_HOME=/home/node +ARG _CONTAINER_USER_HOME=/home/node + +# Default to the non-root `node` user. envbuilder (used by Coder) picks its target +# user from the last `USER` directive when `containerUser`/`remoteUser` are not +# applied from image metadata, so make the intended user explicit here to avoid +# dropping into a root shell. +USER node diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ee38034..92b0684 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -31,6 +31,20 @@ }, "overrideFeatureInstallOrder": ["ghcr.io/devcontainers/features/rust"], + // Connect as the `node` user. 
Set this explicitly rather than relying on the + // base image's `remoteUser` metadata: envbuilder (used by Coder) does not honor + // image-metadata `remoteUser` and falls back to the image's `USER` (root), + // which would otherwise drop Coder workspaces into a root shell. + "remoteUser": "node", + "containerUser": "node", + + // Don't remap the container user's UID/GID to match the host user. The sshd + // feature creates an `ssh` group at GID 1001, which collides with the host + // UID/GID (1001) used by CI runners. The CLI's automatic remap then fails with + // `groupmod: GID '1001' already exists`, breaking `devcontainer up`. + // (envbuilder ignores this key; it only affects the devcontainer CLI.) + "updateRemoteUserUID": false, + // Features to add to the dev container. More info: https://containers.dev/features. // "features": {}, @@ -38,7 +52,9 @@ // "forwardPorts": [], // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "kubectl version --client --short 2>/dev/null || kubectl version --client 2>/dev/null", + // The sentinel file lets CI verify that the lifecycle actually ran to + // completion (both the devcontainer CLI and envbuilder honor this hook). + "postCreateCommand": "kubectl version --client 2>/dev/null || true; touch \"$HOME/.devcontainer-postcreate-done\"", // Configure tool-specific properties. 
"customizations": { diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml index 24ac6ca..7e3afd2 100644 --- a/.github/workflows/build-containers.yml +++ b/.github/workflows/build-containers.yml @@ -4,12 +4,15 @@ on: pull_request: branches: [ main ] +permissions: + contents: read + jobs: build-dockerfile: name: Build Dockerfile runs-on: namespace-profile-devcontainer timeout-minutes: 20 - + steps: - name: Checkout code uses: actions/checkout@v6 @@ -23,11 +26,32 @@ jobs: docker build -t devcontainer-dockerfile .devcontainer/ echo "Dockerfile build completed successfully" + build-envbuilder: + name: Build Envbuilder + runs-on: namespace-profile-devcontainer + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + # Coder builds workspaces with envbuilder, NOT the devcontainer CLI. The two + # diverge on user selection, UID/GID remapping and feature install order, so + # the CLI job below can pass while a Coder workspace fails to start. This job + # exercises the envbuilder path and runs scripts/verify-devcontainer.sh as the + # post-build init script to assert: user is `node` (not root), tmux works, and + # the lifecycle (postCreateCommand) ran to completion. A non-zero exit from the + # init script fails the job. + - name: Build and verify with envbuilder + run: | + echo "Building dev container with envbuilder (Coder's builder)..." + scripts/test-envbuilder.sh + build-devcontainer: name: Build Devcontainer runs-on: namespace-profile-devcontainer timeout-minutes: 30 - + steps: - name: Checkout code uses: actions/checkout@v6 @@ -39,8 +63,26 @@ jobs: run: | npm install -g @devcontainers/cli - - name: Build devcontainer + # `devcontainer build` only builds the image (Dockerfile + features); it does + # NOT create the container or run lifecycle commands such as postCreateCommand. 
+      # Use `devcontainer up` so the container is actually started and the lifecycle
+      # commands run, which catches startup failures the plain build step misses.
+      - name: Start devcontainer
+        run: |
+          echo "Starting devcontainer (build + create + lifecycle commands)..."
+          devcontainer up --workspace-folder .
+          echo "Devcontainer started successfully"
+
+      - name: Smoke test devcontainer
+        run: |
+          echo "Running smoke test inside the devcontainer..."
+          devcontainer exec --workspace-folder . bash -lc 'echo "Devcontainer is up and exec works"'
+
+      - name: Tear down devcontainer
+        if: always()
         run: |
-          echo "Building devcontainer..."
-          devcontainer build --workspace-folder .
-          echo "Devcontainer build completed successfully"
\ No newline at end of file
+          # The label filter can match several containers (e.g. leftovers); remove them all.
+          mapfile -t container_ids < <(docker ps -aq --filter "label=devcontainer.local_folder=${GITHUB_WORKSPACE}")
+          if [ "${#container_ids[@]}" -gt 0 ]; then
+            docker rm -f "${container_ids[@]}"
+          fi
diff --git a/scripts/test-envbuilder.sh b/scripts/test-envbuilder.sh
new file mode 100755
index 0000000..6662818
--- /dev/null
+++ b/scripts/test-envbuilder.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+#
+# Build this dev container the same way Coder does (with envbuilder) and run the
+# verification checks, so you can reproduce and debug Coder-only startup issues
+# (e.g. "drops to root", "tmux won't run", "something isn't completing")
+# locally and read the full envbuilder build log.
+#
+# Coder builds workspaces with envbuilder, NOT the `@devcontainers/cli`. The two
+# diverge on user selection, UID/GID remapping and feature install order, so the
+# CLI can succeed while Coder fails. This script exercises the envbuilder path.
+#
+# Usage:
+#   scripts/test-envbuilder.sh
+#
+# Requirements:
+#   - docker
+#
+# Optional environment variables:
+#   ENVBUILDER_IMAGE  envbuilder image to use (default: ghcr.io/coder/envbuilder:latest)
+#   CACHE_DIR         host directory for the envbuilder layer cache (speeds up reruns)
+
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+envbuilder_image="${ENVBUILDER_IMAGE:-ghcr.io/coder/envbuilder:latest}"
+
+if ! command -v docker >/dev/null 2>&1; then
+  echo "error: docker is required but not found on PATH" >&2
+  exit 1
+fi
+
+# The repo is mounted into the envbuilder workspace. envbuilder discovers
+# `.devcontainer/devcontainer.json` automatically. After the build + lifecycle
+# commands complete, it runs ENVBUILDER_INIT_SCRIPT as the target user; we point
+# that at the verification script (resolved from the mounted workspace).
+workspace_folder="/workspaces/devcontainer"
+init_script="bash ${workspace_folder}/scripts/verify-devcontainer.sh"
+
+docker_args=(
+  run --rm
+  -e "ENVBUILDER_WORKSPACE_FOLDER=${workspace_folder}"
+  -e "ENVBUILDER_INIT_SCRIPT=${init_script}"
+  -v "${repo_root}:${workspace_folder}"
+)
+
+# Optionally persist the layer cache (absolute path: `-v` reads a bare relative one as a volume name).
+if [ -n "${CACHE_DIR:-}" ]; then
+  cache_dir="$(mkdir -p -- "${CACHE_DIR}" && cd -- "${CACHE_DIR}" && pwd)"
+  docker_args+=(-v "${cache_dir}:/cache" -e "ENVBUILDER_LAYER_CACHE_DIR=/cache")
+fi
+
+docker_args+=("${envbuilder_image}")
+
+echo "=== Running envbuilder ($envbuilder_image) on $repo_root ==="
+printf 'docker%s\n' "$(printf ' %q' "${docker_args[@]}")" # shell-quoted, so the log can be re-run
+exec docker "${docker_args[@]}"
diff --git a/scripts/verify-devcontainer.sh b/scripts/verify-devcontainer.sh
new file mode 100755
index 0000000..f2148c4
--- /dev/null
+++ b/scripts/verify-devcontainer.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+#
+# Verify that the built dev container started up correctly.
+#
+# This runs *inside* the container after the build and lifecycle commands have
+# completed.
It is used both as the envbuilder `ENVBUILDER_INIT_SCRIPT` (the
+# source of truth for how Coder builds this workspace) and can be run by hand
+# inside any container started from this dev container.
+#
+# It asserts the things that have previously broken on Coder/envbuilder:
+#   1. the effective user is `node`, not root,
+#   2. tmux is on PATH and can start a session that reads ~/.tmux.conf,
+#   3. the lifecycle ran to completion (postCreateCommand sentinel exists).
+#
+# Any failed check exits non-zero so CI fails loudly instead of silently
+# dropping you into a broken shell.
+
+set -euo pipefail
+
+failures=0
+home_dir="${HOME:-}" # a bare ${HOME} would abort under `set -u` before the summary
+
+# Check reporters: `fail` logs to stderr and counts the failure; `pass` logs to stdout.
+fail() {
+  echo "FAIL: $*" >&2
+  failures=$((failures + 1))
+}
+pass() { echo "PASS: $*"; }
+
+echo "=== Verifying dev container startup ==="
+echo "whoami: $(whoami)"
+echo "id: $(id)"
+echo "HOME: ${home_dir}"
+[ -n "$home_dir" ] || fail "HOME is not set"
+
+# 1. Effective user must be `node`, not root.
+if [ "$(id -un)" = "node" ]; then
+  pass "running as expected user 'node'"
+else
+  fail "expected to run as 'node' but running as '$(id -un)' (uid=$(id -u))"
+fi
+
+# 2. tmux must be on PATH and able to start a session that reads ~/.tmux.conf.
+if command -v tmux >/dev/null 2>&1; then
+  pass "tmux is on PATH ($(command -v tmux))"
+
+  session="verify-$$"
+  tmux_conf="${home_dir}/.tmux.conf"
+  tmux_err="$(mktemp)"
+  trap 'rm -f -- "$tmux_err"' EXIT # also cleans up if the script aborts early
+
+  if [ -f "$tmux_conf" ]; then
+    pass "tmux config present at $tmux_conf"
+  else
+    fail "tmux config missing at $tmux_conf"
+  fi
+
+  # Start a detached session, explicitly loading the user's config so a broken
+  # config (or a missing HOME) fails the check rather than silently starting a
+  # default session.
+  if tmux -f "$tmux_conf" new-session -d -s "$session" 'sleep 5' 2>"$tmux_err"; then
+    if tmux has-session -t "$session" 2>/dev/null; then
+      pass "tmux started a session using $tmux_conf"
+    else
+      fail "tmux session '$session' did not stay alive"
+    fi
+    tmux kill-session -t "$session" 2>/dev/null || true
+  else
+    fail "tmux could not start a session: $(cat "$tmux_err" 2>/dev/null)"
+  fi
+else
+  fail "tmux is not on PATH"
+fi
+
+# 3. The lifecycle (postCreateCommand) must have completed.
+sentinel="${home_dir}/.devcontainer-postcreate-done"
+if [ -f "$sentinel" ]; then
+  pass "lifecycle sentinel present ($sentinel)"
+else
+  fail "lifecycle sentinel missing ($sentinel); postCreateCommand did not complete"
+fi
+
+echo "=== Verification complete ==="
+if [ "$failures" -ne 0 ]; then
+  echo "$failures check(s) failed." >&2
+  exit 1
+fi
+echo "All checks passed."