Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,117 @@ jobs:
if-no-files-found: ignore
retention-days: 14

# Sanitizer builds: AddressSanitizer and UndefinedBehaviorSanitizer, one per
# matrix leg so each reports and gates independently (combining them in one
# build would couple their pass/fail and stop UBSan findings from surfacing
# after an ASan abort). Single config (ubuntu-24.04 / gcc / MPICH) like the
# coverage job — sanitizers catch bad memory/UB in the code, not platform or
# MPI-variant breakage (that's the matrix above), and MPICH is much quieter
# under the sanitizers than OpenMPI. ROSS is built Debug WITHOUT sanitizers;
# only CODES is instrumented (via the asan/ubsan presets -> CODES_SANITIZER),
# so bugs reached through ROSS internals aren't flagged yet — a follow-up can
# instrument ROSS too.
#
# Both lanes gate: a finding fails the job and blocks merge. UBSan aborts on
# any UB via -fno-sanitize-recover (baked into the preset); ASan aborts on the
# first memory error. LeakSanitizer stays disabled (detect_leaks=0) so the
# lane delivers the high-value detections (heap-buffer-overflow, use-after-free,
# stack overflow) without MPICH and teardown leak noise — enabling leaks with a
# suppression file is a follow-up. The asan preset builds at -O1 (ASan-
# recommended) so the full suite runs in ~5 min instead of ~1 h at -O0.
sanitizers:
name: sanitizers (${{ matrix.sanitizer }})
runs-on: ubuntu-24.04
strategy:
# See every sanitizer's result on every run.
fail-fast: false
matrix:
sanitizer: [asan, ubsan]
env:
# ROSS install prefix; find_package(ROSS) honors ROSS_ROOT (CODES presets).
ROSS_ROOT: ${{ github.workspace }}/ross-install
# detect_leaks=0: corruption detection first; leaks are the noisiest part
# (MPICH internals + CODES's known unfreed globals) and come later with a
# suppression file. abort_on_error=1: guarantee a nonzero exit so ctest
# fails on a finding. log_path: CODES test harnesses redirect per-rank
# stderr (the ping-pong determinism test does `2> model-output-error.txt`),
# so a report would otherwise reach neither the ctest log nor the artifact —
# only the abort code. log_path writes each report to <log_path>.<pid>
# files in sanitizer-logs/, which is uploaded on failure. (No-op on ubsan.)
ASAN_OPTIONS: detect_leaks=0:abort_on_error=1:strict_string_checks=1:log_path=${{ github.workspace }}/sanitizer-logs/asan
# Abort with a symbolized stack on the first UB (pairs with the preset's
# -fno-sanitize-recover); log_path captures it past harness redirects as
# above. (No-op on the asan leg.)
UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1:log_path=${{ github.workspace }}/sanitizer-logs/ubsan
steps:
- name: Checkout CODES
uses: actions/checkout@v4
with:
path: codes

- name: Checkout ROSS
uses: actions/checkout@v4
with:
repository: ROSS-org/ROSS
# Push/PR runs use the pin; the nightly scheduled run tracks
# ROSS master so pin drift surfaces within a day.
ref: ${{ github.event_name == 'schedule' && 'master' || env.ROSS_REF }}
path: ross

- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
mpich libmpich-dev \
cmake ninja-build pkg-config \
flex bison

- name: Configure ROSS
# ROSS is built without sanitizers (default gcc); CODES links it and is
# the only side instrumented for now.
run: >
cmake -S ross -B ross/build -G Ninja
-DCMAKE_BUILD_TYPE=Debug
-DROSS_BUILD_MODELS=ON
-DCMAKE_INSTALL_PREFIX=$PWD/ross-install

- name: Build and install ROSS
run: cmake --build ross/build --target install -j

- name: Configure CODES (${{ matrix.sanitizer }})
working-directory: codes
# The asan/ubsan preset sets CODES_SANITIZER; heavy deps forced off to
# match the stock matrix legs.
run: cmake --preset ${{ matrix.sanitizer }} -DCODES_USE_TORCH=OFF -DCODES_USE_ZEROMQ=OFF

- name: Build CODES
working-directory: codes
run: cmake --build --preset ${{ matrix.sanitizer }}

- name: Run CODES tests
working-directory: codes
# Pre-create the sanitizer log dir: ASAN/UBSAN_OPTIONS log_path writes
# <log_path>.<pid> report files into it, and the runtime needs the dir
# to exist.
run: |
mkdir -p "$GITHUB_WORKSPACE/sanitizer-logs"
ctest --preset ${{ matrix.sanitizer }}

- name: Upload logs on failure
if: failure()
uses: actions/upload-artifact@v4
with:
name: build-logs-sanitizer-${{ matrix.sanitizer }}
path: |
sanitizer-logs/
codes/build/${{ matrix.sanitizer }}/Testing/Temporary/LastTest.log
codes/build/${{ matrix.sanitizer }}/Testing/Temporary/LastTestsFailed.log
codes/build/${{ matrix.sanitizer }}/CMakeFiles/CMakeError.log
codes/build/${{ matrix.sanitizer }}/CMakeFiles/CMakeOutput.log
codes/build/${{ matrix.sanitizer }}/CMakeCache.txt
if-no-files-found: ignore
retention-days: 14

# Dedicated coverage build: instrument the codes library (--coverage), run the
# test suite, capture with lcov, and upload to Codecov. Single config
# (ubuntu-24.04 / gcc / MPICH) — coverage is about which lines run, not
Expand Down
27 changes: 27 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,33 @@ add_compile_options(-Wundef)
# default; the coverage CI job turns it on. gcc/clang only.
option(CODES_ENABLE_COVERAGE "Instrument the codes library with --coverage (gcov)" OFF)

# Optional sanitizer instrumentation, applied globally to ALL CODES targets built
# below (the library, the executables, and the tests) via add_*_options — unlike
# coverage, which scopes to the `codes` target. ROSS is a prebuilt imported target
# (find_package(ROSS) below) and is NOT reinstrumented, so this is CODES-only for
# now; bugs reached through ROSS internals won't be flagged until ROSS itself is
# built with a sanitizer.
#
# One string knob rather than a bool per sanitizer, so a build dir carries exactly
# one sanitizer (matching the separate asan/ubsan presets and CI lanes); pass a
# comma list ("address,undefined") to combine if ever wanted. Off by default; the
# sanitizer CI jobs turn one on. gcc/clang only, and mutually exclusive with
# coverage (the two instrumentations don't compose).
set(CODES_SANITIZER "" CACHE STRING "Build CODES with a sanitizer: '' (none), 'address', or 'undefined'")
set_property(CACHE CODES_SANITIZER PROPERTY STRINGS "" "address" "undefined")
if(CODES_SANITIZER)
if(CODES_ENABLE_COVERAGE)
message(FATAL_ERROR "CODES_SANITIZER and CODES_ENABLE_COVERAGE are mutually exclusive; pick one.")
endif()
message(STATUS "Building CODES with sanitizer: ${CODES_SANITIZER}")
# -fno-sanitize-recover=all makes a UBSan finding abort with a nonzero exit
# (UBSan otherwise prints and continues, so ctest/CI would pass over real UB).
# -fno-omit-frame-pointer keeps stacks readable. Both compile and link must
# carry -fsanitize so the runtime is linked into every executable.
add_compile_options(-fsanitize=${CODES_SANITIZER} -fno-omit-frame-pointer -fno-sanitize-recover=all)
add_link_options(-fsanitize=${CODES_SANITIZER})
endif()

#prevent cmake from stripping the runtime path (important if shared libraries are imported)
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)

Expand Down
42 changes: 42 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,26 @@
"CODES_ENABLE_COVERAGE": "ON"
}
},
{
"name": "asan",
"inherits": "debug",
"displayName": "ASan",
"description": "Debug build with AddressSanitizer (CODES_SANITIZER=address); mirrors the CI asan job. CODES is instrumented; ROSS is not. Run ctest with ASAN_OPTIONS as needed — the CI lane sets detect_leaks=0 initially to skip MPI/teardown leak noise. WIP: builds at -O1 (ASan-recommended) instead of Debug's -O0 to cut the slow ping-pong tests' runtime; remove the CMAKE_*_FLAGS_DEBUG overrides to revert.",
"cacheVariables": {
"CODES_SANITIZER": "address",
"CMAKE_C_FLAGS_DEBUG": "-g -O1",
"CMAKE_CXX_FLAGS_DEBUG": "-g -O1"
}
},
{
"name": "ubsan",
"inherits": "debug",
"displayName": "UBSan",
"description": "Debug build with UndefinedBehaviorSanitizer (CODES_SANITIZER=undefined). -fno-sanitize-recover is baked in so any UB aborts and fails ctest; mirrors the CI ubsan job. CODES is instrumented; ROSS is not.",
"cacheVariables": {
"CODES_SANITIZER": "undefined"
}
},
{
"name": "full-ci",
"inherits": "debug",
Expand Down Expand Up @@ -120,6 +140,16 @@
"configurePreset": "coverage",
"displayName": "Coverage"
},
{
"name": "asan",
"configurePreset": "asan",
"displayName": "ASan"
},
{
"name": "ubsan",
"configurePreset": "ubsan",
"displayName": "UBSan"
},
{
"name": "full-ci",
"configurePreset": "full-ci",
Expand Down Expand Up @@ -166,6 +196,18 @@
"configurePreset": "coverage",
"displayName": "Coverage"
},
{
"name": "asan",
"inherits": "base",
"configurePreset": "asan",
"displayName": "ASan"
},
{
"name": "ubsan",
"inherits": "base",
"configurePreset": "ubsan",
"displayName": "UBSan"
},
{
"name": "full-ci",
"inherits": "base",
Expand Down
44 changes: 43 additions & 1 deletion doc/dev/ci.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ All workflows live in [`.github/workflows/`](../../.github/workflows).

| Workflow | File | Triggers | What it does |
| --- | --- | --- | --- |
| **Build** | `build.yml` | push/PR to `master`, nightly `0 7 * * *` | Builds ROSS + CODES and runs `ctest` across an OS × MPI × compiler matrix, plus a coverage job and a heavy-deps `full` job. |
| **Build** | `build.yml` | push/PR to `master`, nightly `0 7 * * *` | Builds ROSS + CODES and runs `ctest` across an OS × MPI × compiler matrix, plus a coverage job, ASan/UBSan sanitizer lanes, and a heavy-deps `full` job. |
| **Format check** | `format.yml` | push/PR to `master` | Runs `clang-format-20 --dry-run --Werror` over the C/C++ tree. Any drift fails the PR. Reformat with `clang-format-20 -i <file>`. |
| **Full CI image** | `full-ci-image.yml` | weekly `0 8 * * 1`, manual, and on changes to `ci/full/Dockerfile` | Builds and publishes `ghcr.io/codes-org/codes-ci-full` (the heavy-dependency image the `build.yml` `full` job runs inside). Not run on every PR. |

### `build.yml` jobs

- **`build`** — the core matrix. `{ubuntu-22.04, ubuntu-24.04, macos-14} × {mpich, openmpi} × {gcc, clang}`, pruned to 8 legs (macOS only ships Apple clang, so its "gcc" leg is dropped; ubuntu-22.04 carries the older glibc/gcc-11 toolchain and skips clang since the gcc-vs-clang signal comes from 24.04). Heavy optional deps are OFF so every leg runs on a stock runner with no custom image. The macOS leg is the safety net for Mac-specific link/include breakage.
- **`coverage`** — single config (ubuntu-24.04 / gcc / MPICH). Builds CODES with `--coverage`, runs the suite, and uploads to Codecov. Informational only — it never gates a PR, and the upload is a no-op until the `CODECOV_TOKEN` repo secret is set.
- **`sanitizers`** — AddressSanitizer (`asan`) and UndefinedBehaviorSanitizer (`ubsan`) as two independent matrix legs, single config (ubuntu-24.04 / gcc / MPICH). Both gate PRs; only CODES is instrumented (ROSS is built without sanitizers). See [Sanitizer lanes](#sanitizer-lanes) below.
- **`full`** — every optional subsystem ON (SWM, UNION, DUMPI, Torch, ZeroMQ). Runs inside `ghcr.io/codes-org/codes-ci-full` so the slow dependency compiles happen once in that image; this job only builds ROSS + the zmqml requester + CODES.

Each leg builds ROSS fresh from source (see the pinning section below), installs it,
Expand Down Expand Up @@ -100,6 +101,47 @@ ROSS + the zmqml requester + CODES.
`full-ci-image.yml` once via *Actions → Full CI image → Run workflow* to publish
`:latest`.

## Sanitizer lanes

The `sanitizers` job builds CODES under AddressSanitizer (`asan`) and
UndefinedBehaviorSanitizer (`ubsan`) as two independent matrix legs, on a single
ubuntu-24.04 / gcc / MPICH config. They're kept separate (not one combined build)
so each reports and gates on its own, and MPICH is used because it's far quieter
under the sanitizers than OpenMPI. Select a lane with `cmake --preset asan|ubsan`,
which sets the `CODES_SANITIZER` cache variable; **only CODES targets are
instrumented** — ROSS is a prebuilt imported target and is not reinstrumented.

- **Both lanes gate.** UBSan aborts on any UB (`-fno-sanitize-recover` is baked
into the preset); ASan aborts on the first memory error. A finding fails the job
and blocks merge.
- **ASan builds at `-O1`** (the `asan` preset overrides Debug's `-O0`). At `-O0`
the suite took ~1 h on CI — the six ping-pong tests ran 10–15 min each. `-O1` brings the
full suite to ~5 min. UBSan stays `-O0`.
- **Leak detection is off** (`detect_leaks=0`), so the lane delivers
heap-overflow / use-after-free / stack-overflow detection without MPICH and
teardown leak noise.
- **`log_path`** writes each sanitizer report to a file that's uploaded as an
artifact on failure — the test harnesses redirect per-rank stderr, so otherwise
a bare abort code would be all CI shows.

Run a lane locally with `export ROSS_ROOT=<ross-install>` then
`cmake --preset ubsan && cmake --build --preset ubsan && ctest --preset ubsan`
(same for `asan`).

### Deferred follow-ups

Lower-priority improvements intentionally left out of the initial lanes — pick
them up later:

1. **Enable LeakSanitizer.** Flip `detect_leaks=1` and add an LSan suppression
file for MPICH internals and CODES's known unfreed globals. Expect the lane to
go red until the suppressions are tuned, so land it as its own focused change,
not folded into unrelated work.

If ASan runtime ever balloons again, the fallback is to run a fast test subset on
PRs and the full suite on the nightly `schedule` (the cron already exists); at
~5 min the full suite is currently fine to gate as-is.

## Note for workflow edits

Changes to `.github/workflows/*.yml` only run in a PR's CI when they're pushed to the
Expand Down
7 changes: 6 additions & 1 deletion src/networks/model-net/dragonfly-dally.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -6120,7 +6120,12 @@ static void packet_arrive_rc(terminal_state* s, tw_bf* bf, terminal_dally_messag
key.sender_id = msg->sender_lp;

hash_link = qhash_search(s->rank_tbl, &key);
tmp = qhash_entry(hash_link, struct dfly_qhash_entry, hash_link);
// qhash_search misses return NULL; qhash_entry is a container_of, so feeding
// it NULL forms an out-of-bounds pointer (NULL - offsetof) — undefined
// behavior even before any dereference (caught by UBSan). Guard like the
// forward handler (packet_arrive) and the sibling reverse handler do.
if (hash_link)
tmp = qhash_entry(hash_link, struct dfly_qhash_entry, hash_link);

mn_stats* stat;
stat = model_net_find_stats(msg->category, s->dragonfly_stats_array);
Expand Down
7 changes: 6 additions & 1 deletion src/networks/model-net/dragonfly.c
Original file line number Diff line number Diff line change
Expand Up @@ -2376,7 +2376,12 @@ static void dragonfly_sample_fin(terminal_state* s, tw_lp* lp) {

FILE* fp = fopen(rt_fn, "a");
fseek(fp, sample_bytes_written, SEEK_SET);
fwrite(s->sample_stat, sizeof(struct dfly_cn_sample), s->op_arr_size, fp);
// A terminal that recorded no CN samples has op_arr_size == 0 and a NULL
// sample_stat (never allocated). fwrite declares its buffer nonnull, so
// passing NULL is undefined behavior even for a zero-element write — UBSan
// flags it on glibc (where fwrite carries the nonnull attribute).
if (s->op_arr_size > 0)
fwrite(s->sample_stat, sizeof(struct dfly_cn_sample), s->op_arr_size, fp);
fclose(fp);

sample_bytes_written += (s->op_arr_size * sizeof(struct dfly_cn_sample));
Expand Down
Loading