diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 38e87c12..9f6245fd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -186,6 +186,117 @@ jobs: if-no-files-found: ignore retention-days: 14 + # Sanitizer builds: AddressSanitizer and UndefinedBehaviorSanitizer, one per + # matrix leg so each reports and gates independently (combining them in one + # build would couple their pass/fail and stop UBSan findings from surfacing + # after an ASan abort). Single config (ubuntu-24.04 / gcc / MPICH) like the + # coverage job — sanitizers catch bad memory/UB in the code, not platform or + # MPI-variant breakage (that's the matrix above), and MPICH is much quieter + # under the sanitizers than OpenMPI. ROSS is built Debug WITHOUT sanitizers; + # only CODES is instrumented (via the asan/ubsan presets -> CODES_SANITIZER), + # so bugs reached through ROSS internals aren't flagged yet — a follow-up can + # instrument ROSS too. + # + # Both lanes gate: a finding fails the job and blocks merge. UBSan aborts on + # any UB via -fno-sanitize-recover (baked into the preset); ASan aborts on the + # first memory error. LeakSanitizer stays disabled (detect_leaks=0) so the + # lane delivers the high-value detections (heap-buffer-overflow, use-after-free, + # stack overflow) without MPICH and teardown leak noise — enabling leaks with a + # suppression file is a follow-up. The asan preset builds at -O1 (ASan- + # recommended) so the full suite runs in ~5 min instead of ~1 h at -O0. + sanitizers: + name: sanitizers (${{ matrix.sanitizer }}) + runs-on: ubuntu-24.04 + strategy: + # See every sanitizer's result on every run. + fail-fast: false + matrix: + sanitizer: [asan, ubsan] + env: + # ROSS install prefix; find_package(ROSS) honors ROSS_ROOT (CODES presets). + ROSS_ROOT: ${{ github.workspace }}/ross-install + # detect_leaks=0: corruption detection first; leaks are the noisiest part + # (MPICH internals + CODES's known unfreed globals) and come later with a + # suppression file. abort_on_error=1: guarantee a nonzero exit so ctest + # fails on a finding. log_path: CODES test harnesses redirect per-rank + # stderr (the ping-pong determinism test does `2> model-output-error.txt`), + # so a report would otherwise reach neither the ctest log nor the artifact — + # only the abort code. log_path writes each report to . + # files in sanitizer-logs/, which is uploaded on failure. (No-op on ubsan.) + ASAN_OPTIONS: detect_leaks=0:abort_on_error=1:strict_string_checks=1:log_path=${{ github.workspace }}/sanitizer-logs/asan + # Abort with a symbolized stack on the first UB (pairs with the preset's + # -fno-sanitize-recover); log_path captures it past harness redirects as + # above. (No-op on the asan leg.) + UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1:log_path=${{ github.workspace }}/sanitizer-logs/ubsan + steps: + - name: Checkout CODES + uses: actions/checkout@v4 + with: + path: codes + + - name: Checkout ROSS + uses: actions/checkout@v4 + with: + repository: ROSS-org/ROSS + # Push/PR runs use the pin; the nightly scheduled run tracks + # ROSS master so pin drift surfaces within a day. + ref: ${{ github.event_name == 'schedule' && 'master' || env.ROSS_REF }} + path: ross + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + mpich libmpich-dev \ + cmake ninja-build pkg-config \ + flex bison + + - name: Configure ROSS + # ROSS is built without sanitizers (default gcc); CODES links it and is + # the only side instrumented for now. + run: > + cmake -S ross -B ross/build -G Ninja + -DCMAKE_BUILD_TYPE=Debug + -DROSS_BUILD_MODELS=ON + -DCMAKE_INSTALL_PREFIX=$PWD/ross-install + + - name: Build and install ROSS + run: cmake --build ross/build --target install -j + + - name: Configure CODES (${{ matrix.sanitizer }}) + working-directory: codes + # The asan/ubsan preset sets CODES_SANITIZER; heavy deps forced off to + # match the stock matrix legs. + run: cmake --preset ${{ matrix.sanitizer }} -DCODES_USE_TORCH=OFF -DCODES_USE_ZEROMQ=OFF + + - name: Build CODES + working-directory: codes + run: cmake --build --preset ${{ matrix.sanitizer }} + + - name: Run CODES tests + working-directory: codes + # Pre-create the sanitizer log dir: ASAN/UBSAN_OPTIONS log_path writes + # . report files into it, and the runtime needs the dir + # to exist. + run: | + mkdir -p "$GITHUB_WORKSPACE/sanitizer-logs" + ctest --preset ${{ matrix.sanitizer }} + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: build-logs-sanitizer-${{ matrix.sanitizer }} + path: | + sanitizer-logs/ + codes/build/${{ matrix.sanitizer }}/Testing/Temporary/LastTest.log + codes/build/${{ matrix.sanitizer }}/Testing/Temporary/LastTestsFailed.log + codes/build/${{ matrix.sanitizer }}/CMakeFiles/CMakeError.log + codes/build/${{ matrix.sanitizer }}/CMakeFiles/CMakeOutput.log + codes/build/${{ matrix.sanitizer }}/CMakeCache.txt + if-no-files-found: ignore + retention-days: 14 + # Dedicated coverage build: instrument the codes library (--coverage), run the # test suite, capture with lcov, and upload to Codecov. Single config # (ubuntu-24.04 / gcc / MPICH) — coverage is about which lines run, not diff --git a/CMakeLists.txt b/CMakeLists.txt index 93e05c30..c7955236 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,33 @@ add_compile_options(-Wundef) # default; the coverage CI job turns it on. gcc/clang only. option(CODES_ENABLE_COVERAGE "Instrument the codes library with --coverage (gcov)" OFF) +# Optional sanitizer instrumentation, applied globally to ALL CODES targets built +# below (the library, the executables, and the tests) via add_*_options — unlike +# coverage, which scopes to the `codes` target. ROSS is a prebuilt imported target +# (find_package(ROSS) below) and is NOT reinstrumented, so this is CODES-only for +# now; bugs reached through ROSS internals won't be flagged until ROSS itself is +# built with a sanitizer. +# +# One string knob rather than a bool per sanitizer, so a build dir carries exactly +# one sanitizer (matching the separate asan/ubsan presets and CI lanes); pass a +# comma list ("address,undefined") to combine if ever wanted. Off by default; the +# sanitizer CI jobs turn one on. gcc/clang only, and mutually exclusive with +# coverage (the two instrumentations don't compose). +set(CODES_SANITIZER "" CACHE STRING "Build CODES with a sanitizer: '' (none), 'address', or 'undefined'") +set_property(CACHE CODES_SANITIZER PROPERTY STRINGS "" "address" "undefined") +if(CODES_SANITIZER) + if(CODES_ENABLE_COVERAGE) + message(FATAL_ERROR "CODES_SANITIZER and CODES_ENABLE_COVERAGE are mutually exclusive; pick one.") + endif() + message(STATUS "Building CODES with sanitizer: ${CODES_SANITIZER}") + # -fno-sanitize-recover=all makes a UBSan finding abort with a nonzero exit + # (UBSan otherwise prints and continues, so ctest/CI would pass over real UB). + # -fno-omit-frame-pointer keeps stacks readable. Both compile and link must + # carry -fsanitize so the runtime is linked into every executable. + add_compile_options(-fsanitize=${CODES_SANITIZER} -fno-omit-frame-pointer -fno-sanitize-recover=all) + add_link_options(-fsanitize=${CODES_SANITIZER}) +endif() + #prevent cmake from stripping the runtime path (important if shared libraries are imported) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) diff --git a/CMakePresets.json b/CMakePresets.json index 96fadc8f..59254d7a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -53,6 +53,26 @@ "CODES_ENABLE_COVERAGE": "ON" } }, + { + "name": "asan", + "inherits": "debug", + "displayName": "ASan", + "description": "Debug build with AddressSanitizer (CODES_SANITIZER=address); mirrors the CI asan job. CODES is instrumented; ROSS is not. Run ctest with ASAN_OPTIONS as needed — the CI lane sets detect_leaks=0 initially to skip MPI/teardown leak noise. WIP: builds at -O1 (ASan-recommended) instead of Debug's -O0 to cut the slow ping-pong tests' runtime; remove the CMAKE_*_FLAGS_DEBUG overrides to revert.", + "cacheVariables": { + "CODES_SANITIZER": "address", + "CMAKE_C_FLAGS_DEBUG": "-g -O1", + "CMAKE_CXX_FLAGS_DEBUG": "-g -O1" + } + }, + { + "name": "ubsan", + "inherits": "debug", + "displayName": "UBSan", + "description": "Debug build with UndefinedBehaviorSanitizer (CODES_SANITIZER=undefined). -fno-sanitize-recover is baked in so any UB aborts and fails ctest; mirrors the CI ubsan job. CODES is instrumented; ROSS is not.", + "cacheVariables": { + "CODES_SANITIZER": "undefined" + } + }, { "name": "full-ci", "inherits": "debug", @@ -120,6 +140,16 @@ "configurePreset": "coverage", "displayName": "Coverage" }, + { + "name": "asan", + "configurePreset": "asan", + "displayName": "ASan" + }, + { + "name": "ubsan", + "configurePreset": "ubsan", + "displayName": "UBSan" + }, { "name": "full-ci", "configurePreset": "full-ci", @@ -166,6 +196,18 @@ "configurePreset": "coverage", "displayName": "Coverage" }, + { + "name": "asan", + "inherits": "base", + "configurePreset": "asan", + "displayName": "ASan" + }, + { + "name": "ubsan", + "inherits": "base", + "configurePreset": "ubsan", + "displayName": "UBSan" + }, { "name": "full-ci", "inherits": "base", diff --git a/doc/dev/ci.md b/doc/dev/ci.md index 05eb785d..9486953e 100644 --- a/doc/dev/ci.md +++ b/doc/dev/ci.md @@ -10,7 +10,7 @@ All workflows live in [`.github/workflows/`](../../.github/workflows). | Workflow | File | Triggers | What it does | | --- | --- | --- | --- | -| **Build** | `build.yml` | push/PR to `master`, nightly `0 7 * * *` | Builds ROSS + CODES and runs `ctest` across an OS × MPI × compiler matrix, plus a coverage job and a heavy-deps `full` job. | +| **Build** | `build.yml` | push/PR to `master`, nightly `0 7 * * *` | Builds ROSS + CODES and runs `ctest` across an OS × MPI × compiler matrix, plus a coverage job, ASan/UBSan sanitizer lanes, and a heavy-deps `full` job. | | **Format check** | `format.yml` | push/PR to `master` | Runs `clang-format-20 --dry-run --Werror` over the C/C++ tree. Any drift fails the PR. Reformat with `clang-format-20 -i `. | | **Full CI image** | `full-ci-image.yml` | weekly `0 8 * * 1`, manual, and on changes to `ci/full/Dockerfile` | Builds and publishes `ghcr.io/codes-org/codes-ci-full` (the heavy-dependency image the `build.yml` `full` job runs inside). Not run on every PR. | @@ -18,6 +18,7 @@ All workflows live in [`.github/workflows/`](../../.github/workflows). - **`build`** — the core matrix. `{ubuntu-22.04, ubuntu-24.04, macos-14} × {mpich, openmpi} × {gcc, clang}`, pruned to 8 legs (macOS only ships Apple clang, so its "gcc" leg is dropped; ubuntu-22.04 carries the older glibc/gcc-11 toolchain and skips clang since the gcc-vs-clang signal comes from 24.04). Heavy optional deps are OFF so every leg runs on a stock runner with no custom image. The macOS leg is the safety net for Mac-specific link/include breakage. - **`coverage`** — single config (ubuntu-24.04 / gcc / MPICH). Builds CODES with `--coverage`, runs the suite, and uploads to Codecov. Informational only — it never gates a PR, and the upload is a no-op until the `CODECOV_TOKEN` repo secret is set. +- **`sanitizers`** — AddressSanitizer (`asan`) and UndefinedBehaviorSanitizer (`ubsan`) as two independent matrix legs, single config (ubuntu-24.04 / gcc / MPICH). Both gate PRs; only CODES is instrumented (ROSS is built without sanitizers). See [Sanitizer lanes](#sanitizer-lanes) below. - **`full`** — every optional subsystem ON (SWM, UNION, DUMPI, Torch, ZeroMQ). Runs inside `ghcr.io/codes-org/codes-ci-full` so the slow dependency compiles happen once in that image; this job only builds ROSS + the zmqml requester + CODES. Each leg builds ROSS fresh from source (see the pinning section below), installs it, @@ -100,6 +101,47 @@ ROSS + the zmqml requester + CODES. `full-ci-image.yml` once via *Actions → Full CI image → Run workflow* to publish `:latest`. +## Sanitizer lanes + +The `sanitizers` job builds CODES under AddressSanitizer (`asan`) and +UndefinedBehaviorSanitizer (`ubsan`) as two independent matrix legs, on a single +ubuntu-24.04 / gcc / MPICH config. They're kept separate (not one combined build) +so each reports and gates on its own, and MPICH is used because it's far quieter +under the sanitizers than OpenMPI. Select a lane with `cmake --preset asan|ubsan`, +which sets the `CODES_SANITIZER` cache variable; **only CODES targets are +instrumented** — ROSS is a prebuilt imported target and is not reinstrumented. + +- **Both lanes gate.** UBSan aborts on any UB (`-fno-sanitize-recover` is baked + into the preset); ASan aborts on the first memory error. A finding fails the job + and blocks merge. +- **ASan builds at `-O1`** (the `asan` preset overrides Debug's `-O0`). At `-O0` + the suite took ~1 h on CI — the six ping-pong tests ran 10–15 min each. `-O1` brings the + full suite to ~5 min. UBSan stays `-O0`. +- **Leak detection is off** (`detect_leaks=0`), so the lane delivers + heap-overflow / use-after-free / stack-overflow detection without MPICH and + teardown leak noise. +- **`log_path`** writes each sanitizer report to a file that's uploaded as an + artifact on failure — the test harnesses redirect per-rank stderr, so otherwise + a bare abort code would be all CI shows. + +Run a lane locally with `export ROSS_ROOT=` then +`cmake --preset ubsan && cmake --build --preset ubsan && ctest --preset ubsan` +(same for `asan`). + +### Deferred follow-ups + +Lower-priority improvements intentionally left out of the initial lanes — pick +them up later: + +1. **Enable LeakSanitizer.** Flip `detect_leaks=1` and add an LSan suppression + file for MPICH internals and CODES's known unfreed globals. Expect the lane to + go red until the suppressions are tuned, so land it as its own focused change, + not folded into unrelated work. + +If ASan runtime ever balloons again, the fallback is to run a fast test subset on +PRs and the full suite on the nightly `schedule` (the cron already exists); at +~5 min the full suite is currently fine to gate as-is. + ## Note for workflow edits Changes to `.github/workflows/*.yml` only run in a PR's CI when they're pushed to the diff --git a/src/networks/model-net/dragonfly-dally.cxx b/src/networks/model-net/dragonfly-dally.cxx index 47a36c7f..03d6549c 100644 --- a/src/networks/model-net/dragonfly-dally.cxx +++ b/src/networks/model-net/dragonfly-dally.cxx @@ -6120,7 +6120,12 @@ static void packet_arrive_rc(terminal_state* s, tw_bf* bf, terminal_dally_messag key.sender_id = msg->sender_lp; hash_link = qhash_search(s->rank_tbl, &key); - tmp = qhash_entry(hash_link, struct dfly_qhash_entry, hash_link); + // qhash_search misses return NULL; qhash_entry is a container_of, so feeding + // it NULL forms an out-of-bounds pointer (NULL - offsetof) — undefined + // behavior even before any dereference (caught by UBSan). Guard like the + // forward handler (packet_arrive) and the sibling reverse handler do. + if (hash_link) + tmp = qhash_entry(hash_link, struct dfly_qhash_entry, hash_link); mn_stats* stat; stat = model_net_find_stats(msg->category, s->dragonfly_stats_array); diff --git a/src/networks/model-net/dragonfly.c b/src/networks/model-net/dragonfly.c index 3769144d..d5b08bf4 100644 --- a/src/networks/model-net/dragonfly.c +++ b/src/networks/model-net/dragonfly.c @@ -2376,7 +2376,12 @@ static void dragonfly_sample_fin(terminal_state* s, tw_lp* lp) { FILE* fp = fopen(rt_fn, "a"); fseek(fp, sample_bytes_written, SEEK_SET); - fwrite(s->sample_stat, sizeof(struct dfly_cn_sample), s->op_arr_size, fp); + // A terminal that recorded no CN samples has op_arr_size == 0 and a NULL + // sample_stat (never allocated). fwrite declares its buffer nonnull, so + // passing NULL is undefined behavior even for a zero-element write — UBSan + // flags it on glibc (where fwrite carries the nonnull attribute). + if (s->op_arr_size > 0) + fwrite(s->sample_stat, sizeof(struct dfly_cn_sample), s->op_arr_size, fp); fclose(fp); sample_bytes_written += (s->op_arr_size * sizeof(struct dfly_cn_sample));