diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e81fce..afd2d18 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,8 @@ endif()
 option(CUDECOMP_BUILD_FORTRAN "Build Fortran bindings" ON)
 option(CUDECOMP_ENABLE_NVTX "Enable NVTX ranges" ON)
 option(CUDECOMP_ENABLE_NVSHMEM "Enable NVSHMEM" OFF)
+option(CUDECOMP_BUILD_TESTS "Build tests" OFF)
+option(CUDECOMP_TEST_FETCH_GTEST "Fetch GoogleTest if a system installation is unavailable" OFF)
 option(CUDECOMP_BUILD_EXTRAS "Build benchmark, examples, and tests" OFF)
 set(CUDECOMP_NCCL_HOME CACHE STRING "Path to search for NCCL installation. Use to override NVHPC provided NCCL version.")
 set(CUDECOMP_NVSHMEM_HOME CACHE STRING "Path to search for NVSHMEM installation. Use to override NVHPC provided NVSHMEM version.")
@@ -72,6 +74,10 @@ if (CRAY_CC_BIN)
 endif()
 
 # MPI
+if (CUDECOMP_BUILD_TESTS)
+  # The CTest launcher defaults use MPI_CXX_LIBRARY_VERSION_STRING to identify Open MPI.
+  set(MPI_DETERMINE_LIBRARY_VERSION ON)
+endif()
 find_package(MPI REQUIRED)
 
 if (CRAY_CC_BIN)
@@ -290,18 +296,28 @@ if (CUDECOMP_BUILD_FORTRAN)
    install(FILES ${CMAKE_BINARY_DIR}/include/cudecomp.mod DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
 endif()
 
+if (CUDECOMP_BUILD_TESTS OR CUDECOMP_BUILD_EXTRAS)
+  enable_testing()
+  add_subdirectory(tests/cc)
+
+  if (CUDECOMP_BUILD_FORTRAN)
+    add_subdirectory(tests/fortran)
+  endif()
+
+  if (CUDECOMP_BUILD_TESTS)
+    add_subdirectory(tests/ctest)
+  endif()
+endif()
+
 if (CUDECOMP_BUILD_EXTRAS)
   add_subdirectory(benchmark)
 
-  add_subdirectory(tests/cc)
   add_subdirectory(examples/cc/basic_usage)
   add_subdirectory(examples/cc/taylor_green)
 
   if (CUDECOMP_BUILD_FORTRAN)
-    add_subdirectory(tests/fortran)
     add_subdirectory(examples/fortran/basic_usage)
     add_subdirectory(examples/fortran/poisson)
     add_subdirectory(examples/fortran/taylor_green)
   endif()
 endif()
-
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2ccd9d9..517273f 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -3,3 +3,4 @@ sphinx_rtd_theme==3.0.2
 breathe==4.36.0
 sphinx-tabs==3.4.7
 sphinx-fortran==1.1.1
+six==1.17.0
diff --git a/tests/README.md b/tests/README.md
index 2f15086..554817e 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -1,5 +1,92 @@
 # Tests
-This subdirectory contains tests for both the transpose and halo communication routines in cuDecomp, in both C++ and Fortran.
+
+This directory contains cuDecomp functional tests. The primary routine
+development test workflow is the CTest suite under [`ctest/`](ctest/),
+which covers public API behavior plus transpose and halo correctness for the
+backends available in the current build. When Fortran bindings are enabled, the
+CTest workflow also includes focused Fortran API coverage plus MPI functional
+coverage across the Fortran dtype-specialized executables. The
+legacy C++ and Fortran executables are still available for broader sweeps and
+targeted configuration testing.
+
+## CTest Suite
+
+### Build
+
+The default project build does not build tests. Build the CTest suite with
+`CUDECOMP_BUILD_TESTS=ON`:
+
+```shell
+mkdir -p build
+cd build
+cmake -DCUDECOMP_BUILD_TESTS=ON ..
+make -j"$(nproc)"
+```
+
+`CUDECOMP_BUILD_TESTS=ON` also builds the legacy C++/Fortran test executables
+under this directory. The C++ CTest executables use GoogleTest, which is found
+with `find_package(GTest)` by default. If a system GoogleTest package is
+unavailable, configure with `-DCUDECOMP_TEST_FETCH_GTEST=ON` to fetch the pinned
+test dependency.
+
+### Running Tests
+
+List registered tests with:
+
+```shell
+cd build
+ctest -N
+```
+
+The default test order is API, regular transpose tests, regular halo tests, then
+specialized CUDA Graphs, NCCL user-buffer-registration tests, and focused
+Fortran API/functional tests when Fortran bindings are enabled. NVSHMEM tests are
+registered only with NVSHMEM-enabled builds.
+
+Useful labels:
+
+| Label | Tests selected |
+| --- | --- |
+| `api` | Public C API and focused Fortran API behavior tests |
+| `transpose` | All transpose correctness tests |
+| `halo` | All halo correctness tests |
+| `fortran` | Focused Fortran API and MPI functional tests across Fortran dtypes, when built |
+| `mpi` | MPI-backend tests |
+| `nccl` | NCCL-backend tests |
+| `nvshmem` | NVSHMEM-backend tests, when built |
+| `cuda_graphs` | CUDA Graphs functional coverage |
+| `nccl_ubr` | NCCL user buffer registration functional coverage |
+
+Run tests by name or label:
+
+```shell
+cd build
+ctest --output-on-failure -R "cudecomp_(api|transpose_mpi|halo_mpi)$"
+ctest --output-on-failure -L mpi
+ctest --output-on-failure -L cuda_graphs
+```
+
+### GPU Requirements
+
+The CTest suites require GPUs. They run four MPI ranks by default and work best
+on systems with four visible GPUs available. Test setup fails when no CUDA
+device is visible.
+
+On systems with fewer than four visible GPUs, CUDA MPS is required so multiple
+local MPI ranks can share a GPU. Set `CUDA_MPS_ACTIVE_THREAD_PERCENTAGE` to
+`100 / nranks` for the local ranks sharing a GPU. For the default four-rank
+CTest suites on one GPU, this value is `25`.
+
+NCCL tests using MPS also require NCCL 2.30 or newer and
+`NCCL_MULTI_RANK_GPU_ENABLE=1`. If these NCCL-specific requirements are not met,
+the NCCL cases skip so MPI-capable systems can still run the non-NCCL tests.
+
+## Legacy Executables
+
+The CTest suites above are the recommended routine development tests. The legacy
+runners remain useful for broader manual sweeps or for targeting a specific
+configuration that is not part of the focused CTest matrix.
+
 The testing executables accept a number of flags to control the configuration of the test (run `cc/transpose_test -h` or
 `cc/halo_test -h` for a listing of available options). You can use these binaries to test particular configurations of cuDecomp (i.e.
 global grid, process grid, communication backends, datatype, etc.) to verify functionality.
diff --git a/tests/ctest/CMakeLists.txt b/tests/ctest/CMakeLists.txt
new file mode 100644
index 0000000..79b6024
--- /dev/null
+++ b/tests/ctest/CMakeLists.txt
@@ -0,0 +1,304 @@
+find_package(GTest CONFIG QUIET)
+if (NOT GTest_FOUND)
+  find_package(GTest MODULE QUIET)
+endif()
+
+if (NOT GTest_FOUND AND CUDECOMP_TEST_FETCH_GTEST)
+  include(FetchContent)
+  FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG v1.14.0
+  )
+  set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
+  FetchContent_MakeAvailable(googletest)
+endif()
+
+if (TARGET GTest::gtest)
+  set(CUDECOMP_TEST_FRAMEWORK_TARGET GTest::gtest)
+elseif (TARGET GTest::GTest)
+  set(CUDECOMP_TEST_FRAMEWORK_TARGET GTest::GTest)
+endif()
+
+if (NOT CUDECOMP_TEST_FRAMEWORK_TARGET)
+  message(FATAL_ERROR
+    "GoogleTest was not found. Install a system GoogleTest package or configure with "
+    "-DCUDECOMP_TEST_FETCH_GTEST=ON to fetch the pinned test dependency."
+  )
+endif()
+
+set(CUDECOMP_TEST_MPIEXEC_PREFLAGS_DEFAULT "")
+if (MPI_CXX_LIBRARY_VERSION_STRING MATCHES "Open MPI")
+  set(CUDECOMP_TEST_MPIEXEC_PREFLAGS_DEFAULT "--oversubscribe")
+endif()
+
+set(CUDECOMP_TEST_MPIEXEC_PREFLAGS "${CUDECOMP_TEST_MPIEXEC_PREFLAGS_DEFAULT}" CACHE STRING
+    "Extra MPI launcher flags for cuDecomp CTest tests.")
+separate_arguments(CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST UNIX_COMMAND "${CUDECOMP_TEST_MPIEXEC_PREFLAGS}")
+
+set(CUDECOMP_TEST_TIMEOUT 90)
+
+set(CUDECOMP_TEST_ENABLE_NVSHMEM ${CUDECOMP_ENABLE_NVSHMEM})
+configure_file(
+  ${CMAKE_CURRENT_SOURCE_DIR}/backend_config.h.in
+  ${CMAKE_CURRENT_BINARY_DIR}/backend_config.h
+)
+
+add_library(cudecomp_test_support STATIC)
+target_sources(cudecomp_test_support
+  PRIVATE
+  backend_test_context.cc
+  backend_utils.cc
+  gpu_test_utils.cc
+  mpi_test_utils.cc
+  test_utils.cc
+)
+target_include_directories(cudecomp_test_support
+  PUBLIC
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+  ${NVHPC_CUDA_INCLUDE_DIR}
+  ${NCCL_INCLUDE_DIR}
+)
+target_link_libraries(cudecomp_test_support
+  PUBLIC
+  MPI::MPI_CXX
+  NVHPC::CUDART
+  ${NCCL_LIBRARY}
+  ${CUDECOMP_TEST_FRAMEWORK_TARGET}
+)
+
+add_executable(cudecomp_test_api)
+target_sources(cudecomp_test_api
+  PRIVATE
+  api_tests.cc
+  mpi_test_main.cc
+)
+target_link_libraries(cudecomp_test_api
+  PRIVATE
+  cudecomp
+  cudecomp_test_support
+)
+set_target_properties(cudecomp_test_api PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest
+)
+
+set(CUDECOMP_TEST_API_MPI_RANKS 4)
+add_test(
+  NAME cudecomp_api
+  COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST}
+          ${MPIEXEC_NUMPROC_FLAG} ${CUDECOMP_TEST_API_MPI_RANKS}
+          $<TARGET_FILE:cudecomp_test_api>
+)
+set_tests_properties(cudecomp_api PROPERTIES
+  LABELS "api;mpi"
+  TIMEOUT ${CUDECOMP_TEST_TIMEOUT}
+)
+
+add_executable(cudecomp_test_transpose)
+target_sources(cudecomp_test_transpose
+  PRIVATE
+  mpi_test_main.cc
+  transpose_tests.cc
+)
+target_link_libraries(cudecomp_test_transpose
+  PRIVATE
+  cudecomp
+  cudecomp_test_support
+  NVHPC::CUTENSOR
+)
+set_target_properties(cudecomp_test_transpose PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest
+)
+
+set(CUDECOMP_TEST_TRANSPOSE_MPI_RANKS 4)
+function(add_cudecomp_transpose_test test_name filter labels)
+  add_test(
+    NAME ${test_name}
+    COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST}
+            ${MPIEXEC_NUMPROC_FLAG} ${CUDECOMP_TEST_TRANSPOSE_MPI_RANKS}
+            $<TARGET_FILE:cudecomp_test_transpose>
+            --gtest_filter=${filter}
+  )
+  set_tests_properties(${test_name} PROPERTIES
+    LABELS "${labels}"
+    TIMEOUT ${CUDECOMP_TEST_TIMEOUT}
+  )
+endfunction()
+
+add_cudecomp_transpose_test(cudecomp_transpose_mpi "MpiBackends/*" "transpose;mpi")
+add_cudecomp_transpose_test(cudecomp_transpose_nccl "NcclBackends/*" "transpose;nccl")
+set_tests_properties(cudecomp_transpose_nccl PROPERTIES
+  ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nccl"
+)
+
+if (CUDECOMP_ENABLE_NVSHMEM)
+  add_cudecomp_transpose_test(cudecomp_transpose_nvshmem "NvshmemBackends/*" "transpose;nvshmem")
+  set_tests_properties(cudecomp_transpose_nvshmem PROPERTIES
+    ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nvshmem;NVSHMEM_DISABLE_NCCL=1"
+  )
+endif()
+
+add_executable(cudecomp_test_halo)
+target_sources(cudecomp_test_halo
+  PRIVATE
+  mpi_test_main.cc
+  halo_tests.cc
+)
+target_link_libraries(cudecomp_test_halo
+  PRIVATE
+  cudecomp
+  cudecomp_test_support
+)
+set_target_properties(cudecomp_test_halo PROPERTIES
+  RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest
+)
+
+set(CUDECOMP_TEST_HALO_MPI_RANKS 4)
+function(add_cudecomp_halo_test test_name filter labels)
+  add_test(
+    NAME ${test_name}
+    COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST}
+            ${MPIEXEC_NUMPROC_FLAG} ${CUDECOMP_TEST_HALO_MPI_RANKS}
+            $<TARGET_FILE:cudecomp_test_halo>
+            --gtest_filter=${filter}
+  )
+  set_tests_properties(${test_name} PROPERTIES
+    LABELS "${labels}"
+    TIMEOUT ${CUDECOMP_TEST_TIMEOUT}
+  )
+endfunction()
+
+add_cudecomp_halo_test(cudecomp_halo_mpi "MpiBackends/*" "halo;mpi")
+add_cudecomp_halo_test(cudecomp_halo_nccl "NcclBackends/*" "halo;nccl")
+set_tests_properties(cudecomp_halo_nccl PROPERTIES
+  ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nccl"
+)
+
+if (CUDECOMP_ENABLE_NVSHMEM)
+  add_cudecomp_halo_test(cudecomp_halo_nvshmem "NvshmemBackends/*" "halo;nvshmem")
+  set_tests_properties(cudecomp_halo_nvshmem PROPERTIES
+    ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nvshmem;NVSHMEM_DISABLE_NCCL=1"
+  )
+endif()
+
+add_cudecomp_transpose_test(cudecomp_cuda_graphs "CudaGraphMpiBackends/*" "cuda_graphs;mpi")
+set_tests_properties(cudecomp_cuda_graphs PROPERTIES
+  ENVIRONMENT "CUDECOMP_ENABLE_CUDA_GRAPHS=1"
+)
+add_cudecomp_transpose_test(cudecomp_nccl_user_buffer_registration "NcclUserBufferRegistration/*"
+                            "nccl;nccl_ubr")
+set_tests_properties(cudecomp_nccl_user_buffer_registration PROPERTIES
+  ENVIRONMENT "CUDECOMP_ENABLE_NCCL_UBR=1;CUDECOMP_TEST_KEEPALIVE_BACKEND=nccl"
+)
+
+if (CUDECOMP_BUILD_FORTRAN)
+  add_executable(cudecomp_test_fortran_api)
+  target_sources(cudecomp_test_fortran_api
+    PRIVATE
+    fortran_api_test.f90
+  )
+  target_include_directories(cudecomp_test_fortran_api
+    PRIVATE
+    ${CMAKE_BINARY_DIR}/include
+    ${MPI_Fortran_INCLUDE_DIRS}
+  )
+  target_link_libraries(cudecomp_test_fortran_api
+    PRIVATE
+    MPI::MPI_Fortran
+    cudecomp
+    cudecomp_fort
+  )
+  target_compile_options(cudecomp_test_fortran_api PRIVATE
+    $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>
+  )
+  target_link_options(cudecomp_test_fortran_api PRIVATE
+    $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>
+  )
+  target_compile_options(cudecomp_test_fortran_api PRIVATE -O3)
+  set_target_properties(cudecomp_test_fortran_api PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest
+  )
+
+  add_executable(cudecomp_test_fortran_transpose)
+  target_sources(cudecomp_test_fortran_transpose
+    PRIVATE
+    fortran_transpose_tests.f90
+  )
+  target_include_directories(cudecomp_test_fortran_transpose
+    PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_BINARY_DIR}/include
+    ${MPI_Fortran_INCLUDE_DIRS}
+  )
+  target_link_libraries(cudecomp_test_fortran_transpose
+    PRIVATE
+    MPI::MPI_Fortran
+    cudecomp
+    cudecomp_fort
+  )
+  target_compile_options(cudecomp_test_fortran_transpose PRIVATE
+    $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>
+  )
+  target_link_options(cudecomp_test_fortran_transpose PRIVATE
+    $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>
+  )
+  target_compile_options(cudecomp_test_fortran_transpose PRIVATE -O3)
+  set_target_properties(cudecomp_test_fortran_transpose PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest
+  )
+
+  add_executable(cudecomp_test_fortran_halo)
+  target_sources(cudecomp_test_fortran_halo
+    PRIVATE
+    fortran_halo_tests.f90
+  )
+  target_include_directories(cudecomp_test_fortran_halo
+    PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_BINARY_DIR}/include
+    ${MPI_Fortran_INCLUDE_DIRS}
+  )
+  target_link_libraries(cudecomp_test_fortran_halo
+    PRIVATE
+    MPI::MPI_Fortran
+    cudecomp
+    cudecomp_fort
+  )
+  target_compile_options(cudecomp_test_fortran_halo PRIVATE
+    $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>
+  )
+  target_link_options(cudecomp_test_fortran_halo PRIVATE
+    $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>
+  )
+  target_compile_options(cudecomp_test_fortran_halo PRIVATE -O3)
+  set_target_properties(cudecomp_test_fortran_halo PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest
+  )
+
+  set(CUDECOMP_TEST_FORTRAN_API_MPI_RANKS 4)
+  set(CUDECOMP_TEST_FORTRAN_TRANSPOSE_MPI_RANKS 4)
+  set(CUDECOMP_TEST_FORTRAN_HALO_MPI_RANKS 4)
+
+  function(add_cudecomp_fortran_test test_name target labels mpi_ranks)
+    add_test(
+      NAME ${test_name}
+      COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST}
+              ${MPIEXEC_NUMPROC_FLAG} ${mpi_ranks}
+              $<TARGET_FILE:${target}>
+              ${ARGN}
+    )
+    set_tests_properties(${test_name} PROPERTIES
+      LABELS "${labels}"
+      TIMEOUT ${CUDECOMP_TEST_TIMEOUT}
+    )
+  endfunction()
+
+  add_cudecomp_fortran_test(cudecomp_fortran_api cudecomp_test_fortran_api "fortran;api;mpi"
+                            ${CUDECOMP_TEST_FORTRAN_API_MPI_RANKS})
+  add_cudecomp_fortran_test(cudecomp_fortran_transpose cudecomp_test_fortran_transpose "fortran;transpose;mpi"
+                            ${CUDECOMP_TEST_FORTRAN_TRANSPOSE_MPI_RANKS})
+  add_cudecomp_fortran_test(cudecomp_fortran_halo cudecomp_test_fortran_halo "fortran;halo;mpi"
+                            ${CUDECOMP_TEST_FORTRAN_HALO_MPI_RANKS})
+endif()
diff --git a/tests/ctest/api_tests.cc b/tests/ctest/api_tests.cc
new file mode 100644
index 0000000..9418ba2
--- /dev/null
+++ b/tests/ctest/api_tests.cc
@@ -0,0 +1,769 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <array>
+#include <cstdint>
+#include <memory>
+
+#include <mpi.h>
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+#include "cudecomp.h"
+
+#include "gpu_test_utils.h"
+#include "mpi_test_utils.h"
+#include "test_utils.h"
+
+namespace {
+
+constexpr int kApiTestRanks = 4;
+constexpr std::array<int32_t, 3> kGdims{9, 10, 11};
+constexpr std::array<int32_t, 3> kGdimsDist{8, 9, 10};
+constexpr std::array<int32_t, 2> kPdims{2, 2};
+constexpr std::array<int32_t, 3> kHaloExtents{1, 2, 1};
+constexpr std::array<int32_t, 3> kPadding{1, 0, 2};
+constexpr std::array<bool, 3> kHaloPeriods{false, true, false};
+
+struct ExpectedPencilInfo {
+  std::array<int32_t, 3> shape;
+  std::array<int32_t, 3> lo;
+  std::array<int32_t, 3> hi;
+  std::array<int32_t, 3> order;
+  std::array<int32_t, 3> halo_extents;
+  std::array<int32_t, 3> padding;
+  int64_t size;
+};
+
+constexpr ExpectedPencilInfo kExpectedDefaultPencilInfo[3][kApiTestRanks] = {
+    {
+        {{12, 9, 10}, {0, 0, 0}, {8, 4, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{12, 9, 9}, {0, 0, 6}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972},
+        {{12, 9, 10}, {0, 5, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{12, 9, 9}, {0, 5, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972},
+    },
+    {
+        {{8, 14, 10}, {0, 0, 0}, {4, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1120},
+        {{8, 14, 9}, {0, 0, 6}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1008},
+        {{7, 14, 10}, {5, 0, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 980},
+        {{7, 14, 9}, {5, 0, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 882},
+    },
+    {
+        {{8, 9, 15}, {0, 0, 0}, {4, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{8, 9, 15}, {0, 5, 0}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{7, 9, 15}, {5, 0, 0}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945},
+        {{7, 9, 15}, {5, 5, 0}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945},
+    },
+};
+
+constexpr ExpectedPencilInfo kExpectedColumnMajorPencilInfo[3][kApiTestRanks] = {
+    {
+        {{12, 9, 10}, {0, 0, 0}, {8, 4, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{12, 9, 10}, {0, 5, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{12, 9, 9}, {0, 0, 6}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972},
+        {{12, 9, 9}, {0, 5, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972},
+    },
+    {
+        {{8, 14, 10}, {0, 0, 0}, {4, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1120},
+        {{7, 14, 10}, {5, 0, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 980},
+        {{8, 14, 9}, {0, 0, 6}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1008},
+        {{7, 14, 9}, {5, 0, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 882},
+    },
+    {
+        {{8, 9, 15}, {0, 0, 0}, {4, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{7, 9, 15}, {5, 0, 0}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945},
+        {{8, 9, 15}, {0, 5, 0}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{7, 9, 15}, {5, 5, 0}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945},
+    },
+};
+
+constexpr ExpectedPencilInfo kExpectedGdimsDistPencilInfo[3][kApiTestRanks] = {
+    {
+        {{12, 9, 9}, {0, 0, 0}, {8, 4, 4}, {0, 1, 2}, kHaloExtents, kPadding, 972},
+        {{12, 9, 10}, {0, 0, 5}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{12, 9, 9}, {0, 5, 0}, {8, 9, 4}, {0, 1, 2}, kHaloExtents, kPadding, 972},
+        {{12, 9, 10}, {0, 5, 5}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+    },
+    {
+        {{7, 14, 9}, {0, 0, 0}, {3, 9, 4}, {0, 1, 2}, kHaloExtents, kPadding, 882},
+        {{7, 14, 10}, {0, 0, 5}, {3, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 980},
+        {{8, 14, 9}, {4, 0, 0}, {8, 9, 4}, {0, 1, 2}, kHaloExtents, kPadding, 1008},
+        {{8, 14, 10}, {4, 0, 5}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1120},
+    },
+    {
+        {{7, 9, 15}, {0, 0, 0}, {3, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945},
+        {{7, 9, 15}, {0, 5, 0}, {3, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945},
+        {{8, 9, 15}, {4, 0, 0}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+        {{8, 9, 15}, {4, 5, 0}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080},
+    },
+};
+
+void setDistributedConfig(cudecompGridDescConfig_t& config) {
+  config.gdims[0] = kGdims[0];
+  config.gdims[1] = kGdims[1];
+  config.gdims[2] = kGdims[2];
+  config.pdims[0] = kPdims[0];
+  config.pdims[1] = kPdims[1];
+}
+
+void setMemOrder(cudecompGridDescConfig_t& config, const std::array<int32_t, 3>& order) {
+  for (int axis = 0; axis < 3; ++axis) {
+    for (int i = 0; i < 3; ++i) {
+      config.transpose_mem_order[axis][i] = order[i];
+    }
+  }
+}
+
+void setGdimsDist(cudecompGridDescConfig_t& config) {
+  config.gdims_dist[0] = kGdimsDist[0];
+  config.gdims_dist[1] = kGdimsDist[1];
+  config.gdims_dist[2] = kGdimsDist[2];
+}
+
+void expectPencilInfoEquals(const cudecompPencilInfo_t& actual, const ExpectedPencilInfo& expected) {
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(expected.shape[i], actual.shape[i]);
+    EXPECT_EQ(expected.lo[i], actual.lo[i]);
+    EXPECT_EQ(expected.hi[i], actual.hi[i]);
+    EXPECT_EQ(expected.order[i], actual.order[i]);
+    EXPECT_EQ(expected.halo_extents[i], actual.halo_extents[i]);
+    EXPECT_EQ(expected.padding[i], actual.padding[i]);
+  }
+  EXPECT_EQ(expected.size, actual.size);
+}
+
+void expectGridDescConfigEquals(const cudecompGridDescConfig_t& actual, const cudecompGridDescConfig_t& expected) {
+  EXPECT_EQ(expected.rank_order, actual.rank_order);
+  EXPECT_EQ(expected.transpose_comm_backend, actual.transpose_comm_backend);
+  EXPECT_EQ(expected.halo_comm_backend, actual.halo_comm_backend);
+  for (int i = 0; i < 2; ++i) {
+    EXPECT_EQ(expected.pdims[i], actual.pdims[i]);
+  }
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(expected.gdims[i], actual.gdims[i]);
+    EXPECT_EQ(expected.gdims_dist[i], actual.gdims_dist[i]);
+    EXPECT_EQ(expected.transpose_axis_contiguous[i], actual.transpose_axis_contiguous[i]);
+    for (int j = 0; j < 3; ++j) {
+      EXPECT_EQ(expected.transpose_mem_order[i][j], actual.transpose_mem_order[i][j]);
+    }
+  }
+}
+
+bool isMpiTransposeBackend(cudecompTransposeCommBackend_t backend) {
+  return backend == CUDECOMP_TRANSPOSE_COMM_MPI_P2P || backend == CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL ||
+         backend == CUDECOMP_TRANSPOSE_COMM_MPI_A2A;
+}
+
+bool isMpiHaloBackend(cudecompHaloCommBackend_t backend) {
+  return backend == CUDECOMP_HALO_COMM_MPI || backend == CUDECOMP_HALO_COMM_MPI_BLOCKING;
+}
+
+void expectShiftedRanks(const cudecomp_test::MpiTestComm& comm, cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
+                        int axis, int dim, int displacement, bool periodic,
+                        const std::array<int32_t, kApiTestRanks>& expected_ranks) {
+  int32_t shifted_rank = -2;
+  CHECK_CUDECOMP_GLOBAL(comm,
+                        cudecompGetShiftedRank(handle, grid_desc, axis, dim, displacement, periodic, &shifted_rank));
+  EXPECT_EQ(expected_ranks[comm.rank()], shifted_rank);
+}
+
+TEST(ApiGridDescConfigSetDefaultsTest, SetsDocumentedDefaults) {
+  cudecompGridDescConfig_t config;
+  ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config));
+
+  EXPECT_EQ(CUDECOMP_RANK_ORDER_DEFAULT, config.rank_order);
+  EXPECT_EQ(CUDECOMP_TRANSPOSE_COMM_MPI_P2P, config.transpose_comm_backend);
+  EXPECT_EQ(CUDECOMP_HALO_COMM_MPI, config.halo_comm_backend);
+  for (int i = 0; i < 2; ++i) {
+    EXPECT_EQ(0, config.pdims[i]);
+  }
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(0, config.gdims[i]);
+    EXPECT_EQ(0, config.gdims_dist[i]);
+    EXPECT_FALSE(config.transpose_axis_contiguous[i]);
+    for (int j = 0; j < 3; ++j) {
+      EXPECT_EQ(-1, config.transpose_mem_order[i][j]);
+    }
+  }
+}
+
+TEST(ApiGridDescConfigSetDefaultsTest, RejectsInvalidArguments) {
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescConfigSetDefaults(nullptr));
+}
+
+TEST(ApiGridDescAutotuneOptionsSetDefaultsTest, SetsDocumentedDefaults) {
+  cudecompGridDescAutotuneOptions_t options;
+  ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescAutotuneOptionsSetDefaults(&options));
+
+  EXPECT_EQ(3, options.n_warmup_trials);
+  EXPECT_EQ(5, options.n_trials);
+  EXPECT_EQ(CUDECOMP_AUTOTUNE_GRID_TRANSPOSE, options.grid_mode);
+  EXPECT_EQ(CUDECOMP_DOUBLE, options.dtype);
+  EXPECT_TRUE(options.allow_uneven_decompositions);
+  EXPECT_FALSE(options.disable_nccl_backends);
+  EXPECT_FALSE(options.disable_nvshmem_backends);
+  EXPECT_EQ(0.0, options.skip_threshold);
+  EXPECT_FALSE(options.autotune_transpose_backend);
+  EXPECT_FALSE(options.autotune_halo_backend);
+  EXPECT_EQ(0, options.halo_axis);
+
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_FALSE(options.transpose_use_inplace_buffers[i]);
+    EXPECT_EQ(1.0, options.transpose_op_weights[i]);
+    for (int j = 0; j < 3; ++j) {
+      EXPECT_EQ(0, options.transpose_input_halo_extents[i][j]);
+      EXPECT_EQ(0, options.transpose_output_halo_extents[i][j]);
+      EXPECT_EQ(0, options.transpose_input_padding[i][j]);
+      EXPECT_EQ(0, options.transpose_output_padding[i][j]);
+    }
+  }
+
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(0, options.halo_extents[i]);
+    EXPECT_FALSE(options.halo_periods[i]);
+    EXPECT_EQ(0, options.halo_padding[i]);
+  }
+}
+
+TEST(ApiGridDescAutotuneOptionsSetDefaultsTest, RejectsInvalidArguments) {
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescAutotuneOptionsSetDefaults(nullptr));
+}
+
+TEST(ApiGetDataTypeSizeTest, ReturnsSupportedTypeSizes) {
+  int64_t dtype_size = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_FLOAT, &dtype_size));
+  EXPECT_EQ(4, dtype_size);
+  EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_DOUBLE, &dtype_size));
+  EXPECT_EQ(8, dtype_size);
+  EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_FLOAT_COMPLEX, &dtype_size));
+  EXPECT_EQ(8, dtype_size);
+  EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_DOUBLE_COMPLEX, &dtype_size));
+  EXPECT_EQ(16, dtype_size);
+}
+
+TEST(ApiGetDataTypeSizeTest, RejectsInvalidArguments) {
+  int64_t dtype_size = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetDataTypeSize(CUDECOMP_FLOAT, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetDataTypeSize(static_cast<cudecompDataType_t>(999), &dtype_size));
+}
+
+TEST(ApiTransposeCommBackendToStringTest, ReturnsBackendNames) {
+  EXPECT_STREQ("MPI_P2P", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_P2P));
+  EXPECT_STREQ("MPI_P2P (pipelined)", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL));
+  EXPECT_STREQ("MPI_A2A", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_A2A));
+  EXPECT_STREQ("NCCL", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NCCL));
+  EXPECT_STREQ("NCCL (pipelined)", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NCCL_PL));
+  EXPECT_STREQ("NVSHMEM", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NVSHMEM));
+  EXPECT_STREQ("NVSHMEM (pipelined)", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL));
+  EXPECT_STREQ("NVSHMEM_SM", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM));
+}
+
+TEST(ApiTransposeCommBackendToStringTest, ReturnsErrorForInvalidBackend) {
+  EXPECT_STREQ("ERROR", cudecompTransposeCommBackendToString(static_cast<cudecompTransposeCommBackend_t>(999)));
+}
+
+TEST(ApiHaloCommBackendToStringTest, ReturnsBackendNames) {
+  EXPECT_STREQ("MPI", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_MPI));
+  EXPECT_STREQ("MPI (blocking)", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_MPI_BLOCKING));
+  EXPECT_STREQ("NCCL", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_NCCL));
+  EXPECT_STREQ("NVSHMEM", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_NVSHMEM));
+  EXPECT_STREQ("NVSHMEM (blocking)", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING));
+}
+
+TEST(ApiHaloCommBackendToStringTest, ReturnsErrorForInvalidBackend) {
+  EXPECT_STREQ("ERROR", cudecompHaloCommBackendToString(static_cast<cudecompHaloCommBackend_t>(999)));
+}
+
+class ApiMpiTestBase : public ::testing::Test {
+protected:
+  void SetUp() override {
+    auto world_comm = cudecomp_test::MpiTestComm::world();
+    if (world_comm.size() < kApiTestRanks) {
+      GTEST_SKIP() << "API tests require " << kApiTestRanks << " ranks, launched with " << world_comm.size();
+    }
+
+    active_comm_ = cudecomp_test::MpiTestComm::split(world_comm, kApiTestRanks);
+    if (!active_comm_.valid()) { GTEST_SKIP() << "inactive rank for " << kApiTestRanks << "-rank API case"; }
+
+    const auto setup_decision = cudecomp_test::initializeGpuForTest(active_comm_);
+    ASSERT_FALSE(setup_decision.fail) << setup_decision.reason;
+    if (setup_decision.skip) { GTEST_SKIP() << setup_decision.reason; }
+
+    const cudecompResult_t init_result = cudecompInit(&handle_, active_comm_.mpiComm());
+    handle_guard_ = std::make_unique<cudecomp_test::cudecompHandleGuard>(handle_);
+    CHECK_CUDECOMP_GLOBAL(active_comm_, init_result);
+  }
+
+  void TearDown() override {
+    handle_guard_.reset();
+    active_comm_.reset();
+  }
+
+  cudecompGridDescConfig_t distributedConfig() {
+    cudecompGridDescConfig_t config;
+    EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config));
+    setDistributedConfig(config);
+    return config;
+  }
+
+  cudecompGridDescConfig_t emptyPencilConfig() {
+    auto config = distributedConfig();
+    config.gdims_dist[0] = kGdims[0];
+    config.gdims_dist[1] = 1;
+    config.gdims_dist[2] = kGdims[2];
+    return config;
+  }
+
+  cudecompGridDescAutotuneOptions_t fastAutotuneOptions() {
+    cudecompGridDescAutotuneOptions_t options;
+    EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescAutotuneOptionsSetDefaults(&options));
+    options.n_warmup_trials = 0;
+    options.n_trials = 1;
+    options.dtype = CUDECOMP_FLOAT;
+    options.disable_nccl_backends = true;
+    options.disable_nvshmem_backends = true;
+    return options;
+  }
+
+  void expectGridDescCreateInvalid(cudecompGridDescConfig_t config) {
+    cudecompGridDesc_t grid_desc = nullptr;
+    const cudecompResult_t result = cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr);
+    EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, result);
+    if (grid_desc) { (void)cudecompGridDescDestroy(handle_, grid_desc); }
+  }
+
+  cudecomp_test::MpiTestComm active_comm_;
+  cudecompHandle_t handle_ = nullptr;
+  std::unique_ptr<cudecomp_test::cudecompHandleGuard> handle_guard_;
+};
+
+class ApiInitTest : public ApiMpiTestBase {};
+class ApiFinalizeTest : public ApiMpiTestBase {};
+class ApiGridDescCreateTest : public ApiMpiTestBase {};
+class ApiGridDescDestroyTest : public ApiMpiTestBase {};
+class ApiGetGridDescConfigTest : public ApiMpiTestBase {};
+class ApiGetPencilInfoTest : public ApiMpiTestBase {};
+class ApiGetTransposeWorkspaceSizeTest : public ApiMpiTestBase {};
+class ApiGetHaloWorkspaceSizeTest : public ApiMpiTestBase {};
+class ApiGetShiftedRankTest : public ApiMpiTestBase {};
+class ApiMallocTest : public ApiMpiTestBase {};
+class ApiFreeTest : public ApiMpiTestBase {};
+class ApiTransposeTest : public ApiMpiTestBase {};
+class ApiHaloTest : public ApiMpiTestBase {};
+
+TEST_F(ApiInitTest, RejectsInvalidArguments) {
+  cudecompHandle_t second_handle = nullptr;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompInit(&second_handle, active_comm_.mpiComm()));
+  EXPECT_EQ(nullptr, second_handle);
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompInit(nullptr, active_comm_.mpiComm()));
+}
+
+TEST_F(ApiFinalizeTest, RejectsInvalidArguments) {
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompFinalize(nullptr));
+}
+
+TEST_F(ApiGridDescCreateTest, RejectsInvalidConfigs) {
+  auto config = distributedConfig();
+  config.pdims[0] = 1;
+  config.pdims[1] = 1;
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  config.pdims[0] = 0;
+  config.pdims[1] = 1;
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  config.rank_order = static_cast<cudecompRankOrder_t>(999);
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  config.transpose_comm_backend = static_cast<cudecompTransposeCommBackend_t>(999);
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  config.halo_comm_backend = static_cast<cudecompHaloCommBackend_t>(999);
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  config.transpose_mem_order[0][0] = 0;
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  setMemOrder(config, {0, 1, 1});
+  expectGridDescCreateInvalid(config);
+
+  config = distributedConfig();
+  config.gdims_dist[0] = kGdims[0] + 1;
+  config.gdims_dist[1] = kGdims[1];
+  config.gdims_dist[2] = kGdims[2];
+  expectGridDescCreateInvalid(config);
+}
+
+TEST_F(ApiGridDescCreateTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t unused_grid_desc = nullptr;
+
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(nullptr, &unused_grid_desc, &config, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, nullptr, &config, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, &unused_grid_desc, nullptr, nullptr));
+}
+
+TEST_F(ApiGridDescCreateTest, RejectsInvalidAutotuneInputs) {
+  auto config = distributedConfig();
+  config.pdims[0] = 0;
+  config.pdims[1] = 0;
+  cudecompGridDesc_t unused_grid_desc = nullptr;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, &unused_grid_desc, &config, nullptr));
+
+  config = distributedConfig();
+  auto options = fastAutotuneOptions();
+  options.grid_mode = static_cast<cudecompAutotuneGridMode_t>(999);
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, &unused_grid_desc, &config, &options));
+}
+
+TEST_F(ApiGridDescDestroyTest, RejectsInvalidArguments) {
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescDestroy(handle_, nullptr));
+}
+
+TEST_F(ApiGetGridDescConfigTest, CreatePreservesConfigSettings) {
+  auto config = distributedConfig();
+  setGdimsDist(config);
+  config.rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR;
+  config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_A2A;
+  config.halo_comm_backend = CUDECOMP_HALO_COMM_MPI_BLOCKING;
+  for (int i = 0; i < 3; ++i) {
+    config.transpose_axis_contiguous[i] = true;
+  }
+
+  auto expected_config = config;
+
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  cudecompGridDescConfig_t queried_config;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetGridDescConfig(handle_, grid_desc, &queried_config));
+  expectGridDescConfigEquals(config, expected_config);
+  expectGridDescConfigEquals(queried_config, expected_config);
+}
+
+TEST_F(ApiGetGridDescConfigTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  cudecompGridDescConfig_t queried_config;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetGridDescConfig(handle_, grid_desc, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetGridDescConfig(handle_, nullptr, &queried_config));
+}
+
+TEST_F(ApiGridDescCreateTest, AutotunesTransposeConfig) {
+  auto config = distributedConfig();
+  config.pdims[0] = 0;
+  config.pdims[1] = 0;
+  auto options = fastAutotuneOptions();
+  options.grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE;
+  options.autotune_transpose_backend = true;
+
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, &options));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  EXPECT_GT(config.pdims[0], 0);
+  EXPECT_GT(config.pdims[1], 0);
+  EXPECT_EQ(kApiTestRanks, config.pdims[0] * config.pdims[1]);
+  EXPECT_TRUE(isMpiTransposeBackend(config.transpose_comm_backend))
+      << cudecompTransposeCommBackendToString(config.transpose_comm_backend);
+  EXPECT_EQ(CUDECOMP_HALO_COMM_MPI, config.halo_comm_backend);
+
+  cudecompGridDescConfig_t queried_config;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetGridDescConfig(handle_, grid_desc, &queried_config));
+  expectGridDescConfigEquals(queried_config, config);
+}
+
+TEST_F(ApiGridDescCreateTest, AutotunesHaloConfig) {
+  auto config = distributedConfig();
+  config.pdims[0] = 0;
+  config.pdims[1] = 0;
+  auto options = fastAutotuneOptions();
+  options.grid_mode = CUDECOMP_AUTOTUNE_GRID_HALO;
+  options.autotune_halo_backend = true;
+  for (int i = 0; i < 3; ++i) {
+    options.halo_extents[i] = kHaloExtents[i];
+    options.halo_periods[i] = kHaloPeriods[i];
+    options.halo_padding[i] = kPadding[i];
+  }
+
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, &options));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  EXPECT_GT(config.pdims[0], 0);
+  EXPECT_GT(config.pdims[1], 0);
+  EXPECT_EQ(kApiTestRanks, config.pdims[0] * config.pdims[1]);
+  EXPECT_EQ(CUDECOMP_TRANSPOSE_COMM_MPI_P2P, config.transpose_comm_backend);
+  EXPECT_TRUE(isMpiHaloBackend(config.halo_comm_backend)) << cudecompHaloCommBackendToString(config.halo_comm_backend);
+
+  cudecompGridDescConfig_t queried_config;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetGridDescConfig(handle_, grid_desc, &queried_config));
+  expectGridDescConfigEquals(queried_config, config);
+}
+
+TEST_F(ApiGetPencilInfoTest, MatchesExpectedDefaultDecomposition) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  for (int axis = 0; axis < 3; ++axis) {
+    cudecompPencilInfo_t pinfo;
+    CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, axis, kHaloExtents.data(),
+                                                              kPadding.data()));
+    expectPencilInfoEquals(pinfo, kExpectedDefaultPencilInfo[axis][active_comm_.rank()]);
+  }
+}
+
+TEST_F(ApiGetPencilInfoTest, MatchesExpectedColumnMajorDecomposition) {
+  auto config = distributedConfig();
+  config.rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR;
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  for (int axis = 0; axis < 3; ++axis) {
+    cudecompPencilInfo_t pinfo;
+    CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, axis, kHaloExtents.data(),
+                                                              kPadding.data()));
+    expectPencilInfoEquals(pinfo, kExpectedColumnMajorPencilInfo[axis][active_comm_.rank()]);
+  }
+}
+
+TEST_F(ApiGetPencilInfoTest, MatchesExpectedGdimsDistDecomposition) {
+  auto config = distributedConfig();
+  setGdimsDist(config);
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  for (int axis = 0; axis < 3; ++axis) {
+    cudecompPencilInfo_t pinfo;
+    CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, axis, kHaloExtents.data(),
+                                                              kPadding.data()));
+    expectPencilInfoEquals(pinfo, kExpectedGdimsDistPencilInfo[axis][active_comm_.rank()]);
+  }
+}
+
+TEST_F(ApiGetPencilInfoTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  cudecompPencilInfo_t pinfo;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetPencilInfo(handle_, grid_desc, nullptr, 0, nullptr, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, -1, nullptr, nullptr));
+}
+
+TEST_F(ApiGetTransposeWorkspaceSizeTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int64_t workspace_size = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetTransposeWorkspaceSize(handle_, grid_desc, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetTransposeWorkspaceSize(handle_, nullptr, &workspace_size));
+}
+
+TEST_F(ApiGetHaloWorkspaceSizeTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int64_t workspace_size = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompGetHaloWorkspaceSize(handle_, grid_desc, 0, nullptr, &workspace_size));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompGetHaloWorkspaceSize(handle_, grid_desc, 0, kHaloExtents.data(), nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompGetHaloWorkspaceSize(handle_, grid_desc, 3, kHaloExtents.data(), &workspace_size));
+}
+
+TEST_F(ApiGetShiftedRankTest, ReturnsExpectedRanksForRowMajorLayout) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, false, {2, 3, -1, -1});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, -1, false, {-1, -1, 0, 1});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, true, {2, 3, 0, 1});
+
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, false, {1, -1, 3, -1});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, -1, false, {-1, 0, -1, 2});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, true, {1, 0, 3, 2});
+}
+
+TEST_F(ApiGetShiftedRankTest, ReturnsExpectedRanksForColumnMajorLayout) {
+  auto config = distributedConfig();
+  config.rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR;
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, false, {1, -1, 3, -1});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, -1, false, {-1, 0, -1, 2});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, true, {1, 0, 3, 2});
+
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, false, {2, 3, -1, -1});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, -1, false, {-1, -1, 0, 1});
+  expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, true, {2, 3, 0, 1});
+}
+
+TEST_F(ApiGetShiftedRankTest, HandlesAxisAlignedAndZeroDisplacements) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int32_t shifted_rank = -2;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 1, 0, false, &shifted_rank));
+  EXPECT_EQ(active_comm_.rank(), shifted_rank);
+
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 0, 1, false, &shifted_rank));
+  EXPECT_EQ(-1, shifted_rank);
+
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 0, 1, true, &shifted_rank));
+  EXPECT_EQ(active_comm_.rank(), shifted_rank);
+
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 1, kPdims[0], true, &shifted_rank));
+  EXPECT_EQ(active_comm_.rank(), shifted_rank);
+
+  CHECK_CUDECOMP_GLOBAL(active_comm_,
+                        cudecompGetShiftedRank(handle_, grid_desc, 0, 1, kPdims[0], false, &shifted_rank));
+  EXPECT_EQ(-1, shifted_rank);
+}
+
+TEST_F(ApiGetShiftedRankTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int32_t shifted_rank = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetShiftedRank(handle_, grid_desc, 0, 1, 1, false, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetShiftedRank(handle_, grid_desc, 3, 1, 1, false, &shifted_rank));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetShiftedRank(handle_, grid_desc, 0, 3, 1, false, &shifted_rank));
+}
+
+TEST_F(ApiMallocTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  void* buffer = nullptr;
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompMalloc(handle_, grid_desc, nullptr, 16));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompMalloc(handle_, grid_desc, &buffer, 0));
+}
+
+TEST_F(ApiFreeTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompFree(nullptr, grid_desc, nullptr));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompFree(handle_, nullptr, nullptr));
+}
+
+TEST_F(ApiTransposeTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int placeholder = 0;
+  void* valid_pointer = &placeholder;
+
+  // X-to-Y is a representative placeholder for all transpose APIs; the directional variants route to the same
+  // generic transpose implementation.
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompTransposeXToY(handle_, grid_desc, nullptr, valid_pointer, valid_pointer, CUDECOMP_FLOAT, nullptr,
+                                  nullptr, nullptr, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompTransposeXToY(handle_, grid_desc, valid_pointer, nullptr, valid_pointer, CUDECOMP_FLOAT, nullptr,
+                                  nullptr, nullptr, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompTransposeXToY(handle_, grid_desc, valid_pointer, valid_pointer, nullptr, CUDECOMP_FLOAT, nullptr,
+                                  nullptr, nullptr, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompTransposeXToY(handle_, grid_desc, valid_pointer, valid_pointer, valid_pointer,
+                                  static_cast<cudecompDataType_t>(999), nullptr, nullptr, nullptr, nullptr, 0));
+}
+
+TEST_F(ApiTransposeTest, RejectsEmptyPencilDecomposition) {
+  auto config = emptyPencilConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int input = 0;
+  int output = 0;
+  int work = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_NOT_SUPPORTED,
+            cudecompTransposeXToY(handle_, grid_desc, &input, &output, &work, CUDECOMP_FLOAT, nullptr, nullptr, nullptr,
+                                  nullptr, 0));
+}
+
+TEST_F(ApiHaloTest, RejectsInvalidArguments) {
+  auto config = distributedConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int placeholder = 0;
+  void* valid_pointer = &placeholder;
+
+  // X-halo updates are a representative placeholder for all halo APIs; the directional variants route to the same
+  // generic halo implementation.
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, CUDECOMP_FLOAT, nullptr,
+                                 kHaloPeriods.data(), 0, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, CUDECOMP_FLOAT, kHaloExtents.data(),
+                                 nullptr, 0, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompUpdateHalosX(handle_, grid_desc, nullptr, valid_pointer, CUDECOMP_FLOAT, kHaloExtents.data(),
+                                 kHaloPeriods.data(), 0, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, nullptr, CUDECOMP_FLOAT, kHaloExtents.data(),
+                                 kHaloPeriods.data(), 0, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, CUDECOMP_FLOAT, kHaloExtents.data(),
+                                 kHaloPeriods.data(), 3, nullptr, 0));
+  EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE,
+            cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, static_cast<cudecompDataType_t>(999),
+                                 kHaloExtents.data(), kHaloPeriods.data(), 0, nullptr, 0));
+}
+
+TEST_F(ApiHaloTest, RejectsEmptyPencilDecomposition) {
+  auto config = emptyPencilConfig();
+  cudecompGridDesc_t grid_desc = nullptr;
+  CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr));
+  cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc);
+
+  int input = 0;
+  int work = 0;
+  EXPECT_EQ(CUDECOMP_RESULT_NOT_SUPPORTED,
+            cudecompUpdateHalosX(handle_, grid_desc, &input, &work, CUDECOMP_FLOAT, kHaloExtents.data(),
+                                 kHaloPeriods.data(), 0, nullptr, 0));
+}
+
+} // namespace
diff --git a/tests/ctest/backend_config.h.in b/tests/ctest/backend_config.h.in
new file mode 100644
index 0000000..9ba017b
--- /dev/null
+++ b/tests/ctest/backend_config.h.in
@@ -0,0 +1,11 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUDECOMP_TEST_BACKEND_CONFIG_H
+#define CUDECOMP_TEST_BACKEND_CONFIG_H
+
+#cmakedefine01 CUDECOMP_TEST_ENABLE_NVSHMEM
+
+#endif
diff --git a/tests/ctest/backend_test_context.cc b/tests/ctest/backend_test_context.cc
new file mode 100644
index 0000000..093f780
--- /dev/null
+++ b/tests/ctest/backend_test_context.cc
@@ -0,0 +1,139 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "backend_test_context.h"
+
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "test_utils.h"
+
+namespace cudecomp_test {
+namespace {
+
+constexpr const char* kKeepaliveBackendEnv = "CUDECOMP_TEST_KEEPALIVE_BACKEND";
+
+class SharedBackendTestContextState {
+public:
+  SharedBackendTestContextState(std::string backend_label, int active_ranks, MpiTestComm active_comm,
+                                cudecompHandle_t handle)
+      : backend_label_(std::move(backend_label)), active_ranks_(active_ranks), active_comm_(std::move(active_comm)),
+        handle_(handle), handle_guard_(handle) {}
+
+  const MpiTestComm& comm() const { return active_comm_; }
+  cudecompHandle_t handle() const { return handle_; }
+
+  bool compatibleWith(const char* backend_label, int active_ranks) const {
+    return backend_label_ == backend_label && active_ranks_ == active_ranks;
+  }
+
+  testing::AssertionResult ensureKeepaliveGridDesc(const cudecompGridDescConfig_t& config) {
+    if (keepalive_grid_desc_guard_) return testing::AssertionSuccess();
+
+    cudecompGridDescConfig_t keepalive_config = config;
+    cudecompGridDesc_t grid_desc = nullptr;
+    const cudecompResult_t result = cudecompGridDescCreate(handle_, &grid_desc, &keepalive_config, nullptr);
+    keepalive_grid_desc_guard_ = std::make_unique<gridDescGuard>(handle_, grid_desc);
+    return checkCudecompGlobal(active_comm_, result, __FILE__, __LINE__);
+  }
+
+private:
+  std::string backend_label_;
+  int active_ranks_ = 0;
+  MpiTestComm active_comm_;
+  cudecompHandle_t handle_ = nullptr;
+  cudecompHandleGuard handle_guard_;
+  std::unique_ptr<gridDescGuard> keepalive_grid_desc_guard_;
+};
+
+std::unique_ptr<SharedBackendTestContextState> shared_context;
+
+std::string requestedKeepaliveBackend() {
+  const char* value = std::getenv(kKeepaliveBackendEnv);
+  if (!value || value[0] == '\0') return {};
+
+  const std::string backend(value);
+  if (backend == "nccl" || backend == "nvshmem") return backend;
+  return {};
+}
+
+} // namespace
+
+testing::AssertionResult BackendTestContext::initialize(const MpiTestComm& world_comm, int active_ranks,
+                                                        const char* backend_label, bool check_nccl,
+                                                        const cudecompGridDescConfig_t& config,
+                                                        TestSetupDecision* setup_decision) {
+  if (!setup_decision) { return testing::AssertionFailure() << "test setup decision argument cannot be null"; }
+
+  *setup_decision = {};
+  active_comm_ = nullptr;
+  handle_ = nullptr;
+
+  const std::string keepalive_backend = requestedKeepaliveBackend();
+  const bool use_shared_context =
+      !keepalive_backend.empty() && keepalive_backend == backend_label && active_ranks == world_comm.size();
+
+  if (use_shared_context) {
+    if (shared_context && !shared_context->compatibleWith(backend_label, active_ranks)) {
+      resetSharedBackendTestContext();
+    }
+
+    if (!shared_context) {
+      auto active_comm = MpiTestComm::split(world_comm, active_ranks);
+      if (!active_comm.valid()) {
+        *setup_decision = {true, false, "inactive rank for shared backend test context"};
+        return testing::AssertionSuccess();
+      }
+
+      *setup_decision = initializeGpuForTest(active_comm, check_nccl);
+      if (setup_decision->skip || setup_decision->fail) return testing::AssertionSuccess();
+
+      cudecompHandle_t handle = nullptr;
+      const cudecompResult_t init_result = cudecompInit(&handle, active_comm.mpiComm());
+      auto state =
+          std::make_unique<SharedBackendTestContextState>(backend_label, active_ranks, std::move(active_comm), handle);
+
+      testing::AssertionResult init_status = checkCudecompGlobal(state->comm(), init_result, __FILE__, __LINE__);
+      if (!init_status) return init_status;
+
+      testing::AssertionResult keepalive_status = state->ensureKeepaliveGridDesc(config);
+      if (!keepalive_status) return keepalive_status;
+
+      shared_context = std::move(state);
+    }
+
+    active_comm_ = &shared_context->comm();
+    handle_ = shared_context->handle();
+    return testing::AssertionSuccess();
+  }
+
+  resetSharedBackendTestContext();
+
+  local_active_comm_ = MpiTestComm::split(world_comm, active_ranks);
+  if (!local_active_comm_.valid()) {
+    *setup_decision = {true, false,
+                       std::string("inactive rank for ") + std::to_string(active_ranks) + "-rank " + backend_label +
+                           " test case"};
+    return testing::AssertionSuccess();
+  }
+
+  *setup_decision = initializeGpuForTest(local_active_comm_, check_nccl);
+  if (setup_decision->skip || setup_decision->fail) return testing::AssertionSuccess();
+
+  const cudecompResult_t init_result = cudecompInit(&local_handle_, local_active_comm_.mpiComm());
+  local_handle_guard_ = std::make_unique<cudecompHandleGuard>(local_handle_);
+  testing::AssertionResult init_status = checkCudecompGlobal(local_active_comm_, init_result, __FILE__, __LINE__);
+  if (!init_status) return init_status;
+
+  active_comm_ = &local_active_comm_;
+  handle_ = local_handle_;
+  return testing::AssertionSuccess();
+}
+
+void resetSharedBackendTestContext() { shared_context.reset(); }
+
+} // namespace cudecomp_test
diff --git a/tests/ctest/backend_test_context.h b/tests/ctest/backend_test_context.h
new file mode 100644
index 0000000..1e63646
--- /dev/null
+++ b/tests/ctest/backend_test_context.h
@@ -0,0 +1,41 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUDECOMP_TEST_BACKEND_TEST_CONTEXT_H
+#define CUDECOMP_TEST_BACKEND_TEST_CONTEXT_H
+
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include "cudecomp.h"
+#include "gpu_test_utils.h"
+#include "mpi_test_utils.h"
+#include "test_utils.h"
+
+namespace cudecomp_test {
+
+class BackendTestContext {
+public:
+  testing::AssertionResult initialize(const MpiTestComm& world_comm, int active_ranks, const char* backend_label,
+                                      bool check_nccl, const cudecompGridDescConfig_t& config,
+                                      TestSetupDecision* setup_decision);
+
+  const MpiTestComm& comm() const { return *active_comm_; }
+  cudecompHandle_t handle() const { return handle_; }
+
+private:
+  MpiTestComm local_active_comm_;
+  cudecompHandle_t local_handle_ = nullptr;
+  std::unique_ptr<cudecompHandleGuard> local_handle_guard_;
+  const MpiTestComm* active_comm_ = nullptr;
+  cudecompHandle_t handle_ = nullptr;
+};
+
+void resetSharedBackendTestContext();
+
+} // namespace cudecomp_test
+
+#endif
diff --git a/tests/ctest/backend_utils.cc b/tests/ctest/backend_utils.cc
new file mode 100644
index 0000000..19d2e6a
--- /dev/null
+++ b/tests/ctest/backend_utils.cc
@@ -0,0 +1,43 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "backend_utils.h"
+
+#include "backend_config.h"
+
+namespace cudecomp_test {
+
+std::vector<TransposeBackend> transposeBackends() {
+  std::vector<TransposeBackend> backends = {
+      {CUDECOMP_TRANSPOSE_COMM_MPI_P2P, "mpi-p2p", "mpi"},  {CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL, "mpi-p2p-pl", "mpi"},
+      {CUDECOMP_TRANSPOSE_COMM_MPI_A2A, "mpi-a2a", "mpi"},  {CUDECOMP_TRANSPOSE_COMM_NCCL, "nccl", "nccl"},
+      {CUDECOMP_TRANSPOSE_COMM_NCCL_PL, "nccl-pl", "nccl"},
+  };
+
+#if CUDECOMP_TEST_ENABLE_NVSHMEM
+  backends.push_back({CUDECOMP_TRANSPOSE_COMM_NVSHMEM, "nvshmem", "nvshmem"});
+  backends.push_back({CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL, "nvshmem-pl", "nvshmem"});
+  backends.push_back({CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM, "nvshmem-sm", "nvshmem"});
+#endif
+
+  return backends;
+}
+
+std::vector<HaloBackend> haloBackends() {
+  std::vector<HaloBackend> backends = {
+      {CUDECOMP_HALO_COMM_MPI, "mpi", "mpi"},
+      {CUDECOMP_HALO_COMM_MPI_BLOCKING, "mpi-blocking", "mpi"},
+      {CUDECOMP_HALO_COMM_NCCL, "nccl", "nccl"},
+  };
+
+#if CUDECOMP_TEST_ENABLE_NVSHMEM
+  backends.push_back({CUDECOMP_HALO_COMM_NVSHMEM, "nvshmem", "nvshmem"});
+  backends.push_back({CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING, "nvshmem-blocking", "nvshmem"});
+#endif
+
+  return backends;
+}
+
+} // namespace cudecomp_test
diff --git a/tests/ctest/backend_utils.h b/tests/ctest/backend_utils.h
new file mode 100644
index 0000000..b532d3d
--- /dev/null
+++ b/tests/ctest/backend_utils.h
@@ -0,0 +1,32 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUDECOMP_TEST_BACKEND_UTILS_H
+#define CUDECOMP_TEST_BACKEND_UTILS_H
+
+#include <vector>
+
+#include "cudecomp.h"
+
+namespace cudecomp_test {
+
+struct TransposeBackend {
+  cudecompTransposeCommBackend_t backend;
+  const char* name;
+  const char* label;
+};
+
+struct HaloBackend {
+  cudecompHaloCommBackend_t backend;
+  const char* name;
+  const char* label;
+};
+
+std::vector<TransposeBackend> transposeBackends();
+std::vector<HaloBackend> haloBackends();
+
+} // namespace cudecomp_test
+
+#endif
diff --git a/tests/ctest/fortran_api_test.f90 b/tests/ctest/fortran_api_test.f90
new file mode 100644
index 0000000..37ed0a8
--- /dev/null
+++ b/tests/ctest/fortran_api_test.f90
@@ -0,0 +1,420 @@
+! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+! SPDX-License-Identifier: Apache-2.0
+
+program cudecomp_fortran_api_test
+  use, intrinsic :: iso_fortran_env, only: real32, real64
+  use cudafor
+  use cudecomp
+  use mpi
+
+  implicit none
+
+  integer, parameter :: api_test_ranks = 4
+  integer, parameter :: k_gdims(3) = [9, 10, 11]
+  integer, parameter :: k_gdims_dist(3) = [8, 9, 10]
+  integer, parameter :: k_pdims(2) = [2, 2]
+  integer, parameter :: k_halo_extents(3) = [1, 2, 1]
+  integer, parameter :: k_padding(3) = [1, 0, 2]
+
+  integer :: rank = -1
+  integer :: nranks = 0
+  integer :: ierr = 0
+  integer :: local_comm = MPI_COMM_NULL
+  integer :: local_rank = 0
+  integer :: failures = 0
+  integer :: global_failures = 0
+  logical :: handle_initialized = .false.
+  type(cudecompHandle) :: handle
+
+  call MPI_Init(ierr)
+  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
+  call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr)
+
+  if (nranks /= api_test_ranks) then
+    call record_failure("Fortran API test requires exactly 4 MPI ranks")
+  else
+    call initialize_gpu()
+  endif
+
+  if (failures == 0) then
+    call expect_success(cudecompInit(handle, MPI_COMM_WORLD), "cudecompInit")
+    handle_initialized = failures == 0
+  endif
+
+  if (handle_initialized) then
+    call test_default_values()
+    call test_grid_descriptor_contracts(handle)
+    call test_descriptor_queries(handle)
+    call test_workspace_and_shifted_rank(handle)
+    call test_dtype_sizes_and_strings()
+    call test_typed_malloc_free(handle)
+    call expect_success(cudecompFinalize(handle), "cudecompFinalize")
+  endif
+
+  call MPI_Allreduce(failures, global_failures, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
+  if (rank == 0 .and. global_failures == 0) then
+    write(*, '("Fortran API test passed")')
+  elseif (rank == 0) then
+    write(*, '("Fortran API test failed with ", i0, " rank-local failure(s)")') global_failures
+  endif
+
+  if (local_comm /= MPI_COMM_NULL) call MPI_Comm_free(local_comm, ierr)
+  call MPI_Finalize(ierr)
+  if (global_failures /= 0) call exit(1)
+
+contains
+
+  subroutine initialize_gpu()
+    integer :: status
+    integer :: num_devices
+
+    call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr)
+    call MPI_Comm_rank(local_comm, local_rank, ierr)
+
+    status = cudaGetDeviceCount(num_devices)
+    if (status /= cudaSuccess) then
+      call record_failure("cudaGetDeviceCount failed")
+      return
+    endif
+    if (num_devices <= 0) then
+      call record_failure("Fortran API test requires at least one visible CUDA device")
+      return
+    endif
+
+    status = cudaSetDevice(mod(local_rank, num_devices))
+    if (status /= cudaSuccess) call record_failure("cudaSetDevice failed")
+  end subroutine initialize_gpu
+
+  subroutine test_default_values()
+    type(cudecompGridDescConfig) :: config
+    type(cudecompGridDescAutotuneOptions) :: options
+
+    call expect_success(cudecompGridDescConfigSetDefaults(config), "cudecompGridDescConfigSetDefaults")
+    call expect_equal_int(config%rank_order, CUDECOMP_RANK_ORDER_DEFAULT, "default rank order")
+    call expect_equal_int(config%transpose_comm_backend, CUDECOMP_TRANSPOSE_COMM_MPI_P2P, &
+                          "default transpose backend")
+    call expect_equal_int(config%halo_comm_backend, CUDECOMP_HALO_COMM_MPI, "default halo backend")
+    call expect_int_array(config%pdims, [0, 0], "default pdims")
+    call expect_int_array(config%gdims, [0, 0, 0], "default gdims")
+    call expect_int_array(config%gdims_dist, [0, 0, 0], "default gdims_dist")
+    call expect_int_array(reshape(config%transpose_mem_order, [9]), [-1, -1, -1, -1, -1, -1, -1, -1, -1], &
+                          "default transpose_mem_order")
+    if (any(config%transpose_axis_contiguous)) call record_failure("default axis-contiguous flags are not false")
+
+    call expect_success(cudecompGridDescAutotuneOptionsSetDefaults(options), &
+                        "cudecompGridDescAutotuneOptionsSetDefaults")
+    call expect_equal_int(options%n_warmup_trials, 3, "default warmup trials")
+    call expect_equal_int(options%n_trials, 5, "default trials")
+    call expect_equal_int(options%grid_mode, CUDECOMP_AUTOTUNE_GRID_TRANSPOSE, "default grid mode")
+    call expect_equal_int(options%dtype, CUDECOMP_DOUBLE, "default autotune dtype")
+    call expect_equal_int(options%halo_axis, 1, "default Fortran halo axis")
+    if (.not. options%allow_uneven_decompositions) call record_failure("default uneven decompositions flag is false")
+    if (options%autotune_transpose_backend) call record_failure("default transpose backend autotune flag is true")
+    if (options%autotune_halo_backend) call record_failure("default halo backend autotune flag is true")
+  end subroutine test_default_values
+
+  subroutine test_grid_descriptor_contracts(handle)
+    type(cudecompHandle) :: handle
+    type(cudecompGridDescConfig) :: config
+    type(cudecompGridDescConfig) :: expected_config
+    type(cudecompGridDescConfig) :: queried_config
+    type(cudecompGridDescAutotuneOptions) :: options
+    type(cudecompGridDesc) :: grid_desc
+    integer :: res
+
+    call setup_explicit_config(config)
+    expected_config = config
+    res = cudecompGridDescCreate(handle, grid_desc, config)
+    call expect_success(res, "cudecompGridDescCreate without options")
+    call expect_config_equal(config, expected_config, "config restored after create without options")
+    if (res == CUDECOMP_RESULT_SUCCESS) then
+      call expect_success(cudecompGetGridDescConfig(handle, grid_desc, queried_config), &
+                          "cudecompGetGridDescConfig without options")
+      call expect_config_equal(queried_config, expected_config, "queried config without options")
+      call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor without options")
+    endif
+
+    call setup_explicit_config(config)
+    expected_config = config
+    call expect_success(cudecompGridDescAutotuneOptionsSetDefaults(options), &
+                        "set defaults before create with options")
+    options%n_warmup_trials = 0
+    options%n_trials = 1
+    options%dtype = CUDECOMP_FLOAT
+    options%disable_nccl_backends = .true.
+    options%disable_nvshmem_backends = .true.
+    options%halo_axis = 3
+    options%halo_extents = k_halo_extents
+    options%halo_periods = [.false., .true., .false.]
+    options%halo_padding = k_padding
+
+    res = cudecompGridDescCreate(handle, grid_desc, config, options)
+    call expect_success(res, "cudecompGridDescCreate with options")
+    call expect_config_equal(config, expected_config, "config restored after create with options")
+    call expect_equal_int(options%halo_axis, 3, "options halo_axis restored after create")
+    if (res == CUDECOMP_RESULT_SUCCESS) then
+      call expect_success(cudecompGetGridDescConfig(handle, grid_desc, queried_config), &
+                          "cudecompGetGridDescConfig with options")
+      call expect_config_equal(queried_config, expected_config, "queried config with options")
+      call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor with options")
+    endif
+  end subroutine test_grid_descriptor_contracts
+
+  subroutine test_descriptor_queries(handle)
+    type(cudecompHandle) :: handle
+    type(cudecompGridDescConfig) :: config
+    type(cudecompGridDesc) :: grid_desc
+    type(cudecompPencilInfo) :: pinfo
+    integer :: res
+
+    call setup_distributed_config(config)
+    res = cudecompGridDescCreate(handle, grid_desc, config)
+    call expect_success(res, "create descriptor for pencil queries")
+    if (res /= CUDECOMP_RESULT_SUCCESS) return
+
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, 1), &
+                        "cudecompGetPencilInfo without optional arrays")
+    call expect_axis1_pencil_without_halo(pinfo)
+
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, 2, k_halo_extents, k_padding), &
+                        "cudecompGetPencilInfo with halo and padding")
+    call expect_int_array(pinfo%order, [1, 2, 3], "Fortran pencil order is one-based")
+    call expect_int_array(pinfo%halo_extents, k_halo_extents, "explicit pencil halo extents")
+    call expect_int_array(pinfo%padding, k_padding, "explicit pencil padding")
+    if (minval(pinfo%lo) < 1) call record_failure("Fortran pencil lower bounds are not one-based")
+    if (minval(pinfo%hi) < 1) call record_failure("Fortran pencil upper bounds are not one-based")
+
+    call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor for pencil queries")
+  end subroutine test_descriptor_queries
+
+  subroutine test_workspace_and_shifted_rank(handle)
+    type(cudecompHandle) :: handle
+    type(cudecompGridDescConfig) :: config
+    type(cudecompGridDesc) :: grid_desc
+    integer :: res
+    integer(8) :: workspace_size
+    integer :: shifted_rank
+    integer, parameter :: expected_shifted_ranks(api_test_ranks) = [2, 3, -1, -1]
+
+    call setup_distributed_config(config)
+    res = cudecompGridDescCreate(handle, grid_desc, config)
+    call expect_success(res, "create descriptor for workspace queries")
+    if (res /= CUDECOMP_RESULT_SUCCESS) return
+
+    workspace_size = -1_8
+    call expect_success(cudecompGetTransposeWorkspaceSize(handle, grid_desc, workspace_size), &
+                        "cudecompGetTransposeWorkspaceSize")
+    if (workspace_size < 0_8) call record_failure("transpose workspace size is negative")
+
+    workspace_size = -1_8
+    call expect_success(cudecompGetHaloWorkspaceSize(handle, grid_desc, 2, k_halo_extents, workspace_size), &
+                        "cudecompGetHaloWorkspaceSize")
+    if (workspace_size < 0_8) call record_failure("halo workspace size is negative")
+
+    shifted_rank = -2
+    call expect_success(cudecompGetShiftedRank(handle, grid_desc, 1, 2, 1, .false., shifted_rank), &
+                        "cudecompGetShiftedRank")
+    call expect_equal_int(shifted_rank, expected_shifted_ranks(rank + 1), "one-based shifted rank query")
+
+    call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor for workspace queries")
+  end subroutine test_workspace_and_shifted_rank
+
+  subroutine test_dtype_sizes_and_strings()
+    integer(8) :: dtype_size
+
+    call expect_success(cudecompGetDataTypeSize(CUDECOMP_FLOAT, dtype_size), "float dtype size query")
+    call expect_equal_integer8(dtype_size, 4_8, "float dtype size")
+    call expect_success(cudecompGetDataTypeSize(CUDECOMP_DOUBLE, dtype_size), "double dtype size query")
+    call expect_equal_integer8(dtype_size, 8_8, "double dtype size")
+    call expect_success(cudecompGetDataTypeSize(CUDECOMP_FLOAT_COMPLEX, dtype_size), "float complex dtype size query")
+    call expect_equal_integer8(dtype_size, 8_8, "float complex dtype size")
+    call expect_success(cudecompGetDataTypeSize(CUDECOMP_DOUBLE_COMPLEX, dtype_size), &
+                        "double complex dtype size query")
+    call expect_equal_integer8(dtype_size, 16_8, "double complex dtype size")
+
+    call expect_string(cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_P2P), "MPI_P2P", &
+                       "transpose backend string")
+    call expect_string(cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_MPI_BLOCKING), "MPI (blocking)", &
+                       "halo backend string")
+  end subroutine test_dtype_sizes_and_strings
+
+  subroutine test_typed_malloc_free(handle)
+    type(cudecompHandle) :: handle
+    type(cudecompGridDescConfig) :: config
+    type(cudecompGridDesc) :: grid_desc
+    real(real32), pointer, device, contiguous :: r4(:)
+    real(real64), pointer, device, contiguous :: r8(:)
+    complex(real32), pointer, device, contiguous :: c4(:)
+    complex(real64), pointer, device, contiguous :: c8(:)
+    integer :: res
+
+    nullify(r4)
+    nullify(r8)
+    nullify(c4)
+    nullify(c8)
+
+    call setup_distributed_config(config)
+    res = cudecompGridDescCreate(handle, grid_desc, config)
+    call expect_success(res, "create descriptor for typed allocation")
+    if (res /= CUDECOMP_RESULT_SUCCESS) return
+
+    call expect_success(cudecompMalloc(handle, grid_desc, r4, 4_8), "cudecompMalloc real32")
+    if (.not. associated(r4)) call record_failure("real32 device pointer is not associated")
+    call expect_success(cudecompFree(handle, grid_desc, r4), "cudecompFree real32")
+
+    call expect_success(cudecompMalloc(handle, grid_desc, r8, 4_8), "cudecompMalloc real64")
+    if (.not. associated(r8)) call record_failure("real64 device pointer is not associated")
+    call expect_success(cudecompFree(handle, grid_desc, r8), "cudecompFree real64")
+
+    call expect_success(cudecompMalloc(handle, grid_desc, c4, 4_8), "cudecompMalloc complex32")
+    if (.not. associated(c4)) call record_failure("complex32 device pointer is not associated")
+    call expect_success(cudecompFree(handle, grid_desc, c4), "cudecompFree complex32")
+
+    call expect_success(cudecompMalloc(handle, grid_desc, c8, 4_8), "cudecompMalloc complex64")
+    if (.not. associated(c8)) call record_failure("complex64 device pointer is not associated")
+    call expect_success(cudecompFree(handle, grid_desc, c8), "cudecompFree complex64")
+
+    call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor for typed allocation")
+  end subroutine test_typed_malloc_free
+
+  subroutine setup_distributed_config(config)
+    type(cudecompGridDescConfig) :: config
+
+    call expect_success(cudecompGridDescConfigSetDefaults(config), "set distributed config defaults")
+    config%gdims = k_gdims
+    config%pdims = k_pdims
+  end subroutine setup_distributed_config
+
+  subroutine setup_explicit_config(config)
+    type(cudecompGridDescConfig) :: config
+
+    call setup_distributed_config(config)
+    config%gdims_dist = k_gdims_dist
+    config%rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR
+    config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_A2A
+    config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI_BLOCKING
+    config%transpose_axis_contiguous = [.true., .false., .true.]
+    config%transpose_mem_order(:, 1) = [3, 2, 1]
+    config%transpose_mem_order(:, 2) = [1, 3, 2]
+    config%transpose_mem_order(:, 3) = [2, 1, 3]
+  end subroutine setup_explicit_config
+
+  subroutine expect_axis1_pencil_without_halo(pinfo)
+    type(cudecompPencilInfo) :: pinfo
+    integer :: expected_shape(3)
+    integer :: expected_lo(3)
+    integer :: expected_hi(3)
+    integer(8) :: expected_size
+
+    select case (rank)
+    case (0)
+      expected_shape = [9, 5, 6]
+      expected_lo = [1, 1, 1]
+      expected_hi = [9, 5, 6]
+    case (1)
+      expected_shape = [9, 5, 5]
+      expected_lo = [1, 1, 7]
+      expected_hi = [9, 5, 11]
+    case (2)
+      expected_shape = [9, 5, 6]
+      expected_lo = [1, 6, 1]
+      expected_hi = [9, 10, 6]
+    case default
+      expected_shape = [9, 5, 5]
+      expected_lo = [1, 6, 7]
+      expected_hi = [9, 10, 11]
+    end select
+
+    expected_size = product(expected_shape)
+    call expect_int_array(pinfo%shape, expected_shape, "axis-1 shape without halo")
+    call expect_int_array(pinfo%lo, expected_lo, "axis-1 lo without halo")
+    call expect_int_array(pinfo%hi, expected_hi, "axis-1 hi without halo")
+    call expect_int_array(pinfo%order, [1, 2, 3], "axis-1 order without halo")
+    call expect_int_array(pinfo%halo_extents, [0, 0, 0], "axis-1 omitted halo defaults")
+    call expect_int_array(pinfo%padding, [0, 0, 0], "axis-1 omitted padding defaults")
+    call expect_equal_integer8(pinfo%size, expected_size, "axis-1 size without halo")
+  end subroutine expect_axis1_pencil_without_halo
+
+  subroutine expect_config_equal(actual, expected, context)
+    type(cudecompGridDescConfig) :: actual
+    type(cudecompGridDescConfig) :: expected
+    character(len=*) :: context
+
+    call expect_int_array(actual%gdims, expected%gdims, trim(context)//" gdims")
+    call expect_int_array(actual%gdims_dist, expected%gdims_dist, trim(context)//" gdims_dist")
+    call expect_int_array(actual%pdims, expected%pdims, trim(context)//" pdims")
+    call expect_equal_int(actual%rank_order, expected%rank_order, trim(context)//" rank_order")
+    call expect_equal_int(actual%transpose_comm_backend, expected%transpose_comm_backend, &
+                          trim(context)//" transpose backend")
+    call expect_equal_int(actual%halo_comm_backend, expected%halo_comm_backend, trim(context)//" halo backend")
+    if (any(actual%transpose_axis_contiguous .neqv. expected%transpose_axis_contiguous)) then
+      call record_failure(trim(context)//" axis-contiguous flags mismatch")
+    endif
+    call expect_int_array(reshape(actual%transpose_mem_order, [9]), reshape(expected%transpose_mem_order, [9]), &
+                          trim(context)//" transpose_mem_order")
+  end subroutine expect_config_equal
+
+  subroutine expect_success(result, context)
+    integer :: result
+    character(len=*) :: context
+
+    if (result /= CUDECOMP_RESULT_SUCCESS) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "returned", result
+      failures = failures + 1
+    endif
+  end subroutine expect_success
+
+  subroutine expect_equal_int(actual, expected, context)
+    integer :: actual
+    integer :: expected
+    character(len=*) :: context
+
+    if (actual /= expected) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "expected", expected, "actual", actual
+      failures = failures + 1
+    endif
+  end subroutine expect_equal_int
+
+  subroutine expect_equal_integer8(actual, expected, context)
+    integer(8) :: actual
+    integer(8) :: expected
+    character(len=*) :: context
+
+    if (actual /= expected) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "expected", expected, "actual", actual
+      failures = failures + 1
+    endif
+  end subroutine expect_equal_integer8
+
+  subroutine expect_int_array(actual, expected, context)
+    integer :: actual(:)
+    integer :: expected(:)
+    character(len=*) :: context
+
+    if (size(actual) /= size(expected)) then
+      call record_failure(trim(context)//" array sizes differ")
+    elseif (any(actual /= expected)) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "expected", expected, "actual", actual
+      failures = failures + 1
+    endif
+  end subroutine expect_int_array
+
+  subroutine expect_string(actual, expected, context)
+    character(len=*) :: actual
+    character(len=*) :: expected
+    character(len=*) :: context
+
+    if (actual /= expected) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "expected", trim(expected), "actual", trim(actual)
+      failures = failures + 1
+    endif
+  end subroutine expect_string
+
+  subroutine record_failure(context)
+    character(len=*) :: context
+
+    write(*, *) "rank", rank, ": FAIL", trim(context)
+    failures = failures + 1
+  end subroutine record_failure
+
+end program cudecomp_fortran_api_test
diff --git a/tests/ctest/fortran_halo_case.inc b/tests/ctest/fortran_halo_case.inc
new file mode 100644
index 0000000..9583b56
--- /dev/null
+++ b/tests/ctest/fortran_halo_case.inc
@@ -0,0 +1,233 @@
+! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+! SPDX-License-Identifier: Apache-2.0
+
+subroutine RUN_HALO_CASE()
+  implicit none
+
+  ! Pass explicit padding while using a non-default decomposition layout. Each
+  ! scenario below runs X, Y, and Z halo updates for all update dimensions.
+  call RUN_HALO_SCENARIO("custom_layout_padded_halo", k_halo_extents, k_explicit_padding, k_halo_periods, &
+                         k_mixed_mem_order, k_default_axis_contiguous, CUDECOMP_RANK_ORDER_COL_MAJOR, .true.)
+  ! Omit the Fortran-only optional padding argument and verify the default
+  ! unpadded pencil shape still updates halos correctly.
+  call RUN_HALO_SCENARIO("default_padding", k_halo_extents, k_zero_extents, k_halo_periods, k_default_mem_order, &
+                         k_default_axis_contiguous, CUDECOMP_RANK_ORDER_DEFAULT, .false.)
+end subroutine RUN_HALO_CASE
+
+subroutine RUN_HALO_SCENARIO(scenario, halo_extents, padding, halo_periods, mem_order, axis_contiguous, rank_order, &
+                             use_optional_padding)
+  implicit none
+
+  character(len=*), intent(in) :: scenario
+  integer, intent(in) :: halo_extents(3)
+  integer, intent(in) :: padding(3)
+  logical, intent(in) :: halo_periods(3)
+  integer, intent(in) :: mem_order(3, 3)
+  logical, intent(in) :: axis_contiguous(3)
+  integer, intent(in) :: rank_order
+  logical, intent(in) :: use_optional_padding
+  integer :: axis
+
+  do axis = 1, 3
+    call RUN_HALO_AXIS(scenario, axis, halo_extents, padding, halo_periods, mem_order, axis_contiguous, rank_order, &
+                       use_optional_padding)
+  enddo
+end subroutine RUN_HALO_SCENARIO
+
+subroutine RUN_HALO_AXIS(scenario, axis, halo_extents, padding, halo_periods, mem_order, axis_contiguous, rank_order, &
+                         use_optional_padding)
+  implicit none
+
+  character(len=*), intent(in) :: scenario
+  integer, intent(in) :: axis
+  integer, intent(in) :: halo_extents(3)
+  integer, intent(in) :: padding(3)
+  logical, intent(in) :: halo_periods(3)
+  integer, intent(in) :: mem_order(3, 3)
+  logical, intent(in) :: axis_contiguous(3)
+  integer, intent(in) :: rank_order
+  logical, intent(in) :: use_optional_padding
+  type(cudecompGridDescConfig) :: config
+  type(cudecompGridDesc) :: grid_desc
+  type(cudecompPencilInfo) :: pinfo
+  integer(int64) :: workspace_num_elements
+  integer :: dim
+  integer :: res
+  logical :: grid_desc_created
+  logical :: work_allocated
+  ARRTYPE, allocatable :: initial(:)
+  ARRTYPE, allocatable :: expected(:)
+  ARRTYPE, allocatable :: actual(:)
+  ARRTYPE, allocatable, device, target :: data_d(:)
+  ARRTYPE, pointer, device, contiguous :: work_d(:)
+
+  grid_desc_created = .false.
+  work_allocated = .false.
+  nullify(work_d)
+
+  call setup_halo_config(config, axis_contiguous, mem_order, rank_order)
+  res = cudecompGridDescCreate(handle, grid_desc, config)
+  call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompGridDescCreate")
+  if (res /= CUDECOMP_RESULT_SUCCESS) return
+  grid_desc_created = .true.
+
+  if (use_optional_padding) then
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents, padding), &
+                        DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" pencil info")
+  else
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents), &
+                        DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" pencil info")
+  endif
+  call expect_success(cudecompGetHaloWorkspaceSize(handle, grid_desc, axis, halo_extents, workspace_num_elements), &
+                      DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" halo workspace size")
+
+  allocate(data_d(pinfo%size))
+  call INITIALIZE_HALO_PENCIL(initial, pinfo)
+  call INITIALIZE_HALO_REFERENCE(expected, pinfo, halo_periods)
+
+  res = cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)
+  call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompMalloc workspace")
+  if (res == CUDECOMP_RESULT_SUCCESS) work_allocated = .true.
+
+  if (work_allocated) then
+    data_d = UNSET_HALO_VALUE()
+    data_d(1:pinfo%size) = initial
+    work_d = 0
+
+    do dim = 1, 3
+      select case (axis)
+      case (1)
+        if (use_optional_padding) then
+          res = cudecompUpdateHalosX(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, &
+                                     dim, pinfo%padding)
+        else
+          res = cudecompUpdateHalosX(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, dim)
+        endif
+      case (2)
+        if (use_optional_padding) then
+          res = cudecompUpdateHalosY(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, &
+                                     dim, pinfo%padding)
+        else
+          res = cudecompUpdateHalosY(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, dim)
+        endif
+      case (3)
+        if (use_optional_padding) then
+          res = cudecompUpdateHalosZ(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, &
+                                     dim, pinfo%padding)
+        else
+          res = cudecompUpdateHalosZ(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, dim)
+        endif
+      end select
+      call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompUpdateHalos")
+    enddo
+
+    allocate(actual(pinfo%size))
+    actual = data_d(1:pinfo%size)
+    call EXPECT_HALO_PENCIL_MATCH(expected, actual, pinfo, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis))
+    deallocate(actual)
+  endif
+
+  if (work_allocated) call expect_success(cudecompFree(handle, grid_desc, work_d), &
+                                          DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompFree")
+  if (grid_desc_created) call expect_success(cudecompGridDescDestroy(handle, grid_desc), &
+                                             DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)// &
+                                             " cudecompGridDescDestroy")
+end subroutine RUN_HALO_AXIS
+
+subroutine INITIALIZE_HALO_PENCIL(values, pinfo)
+  implicit none
+
+  ARRTYPE, allocatable, intent(out) :: values(:)
+  type(cudecompPencilInfo), intent(in) :: pinfo
+  integer(int64) :: i
+  integer(int64) :: local(3)
+  integer(int64) :: global(3)
+
+  allocate(values(pinfo%size))
+  values = UNSET_HALO_VALUE()
+
+  do i = 1_int64, pinfo%size
+    call local_coordinate(i, pinfo, local)
+    if (.not. is_internal_coordinate(pinfo, local)) cycle
+
+    call global_coordinate(pinfo, local, global)
+    values(i) = HALO_PENCIL_VALUE(global_linear_index(global))
+  enddo
+end subroutine INITIALIZE_HALO_PENCIL
+
+subroutine INITIALIZE_HALO_REFERENCE(values, pinfo, halo_periods)
+  implicit none
+
+  ARRTYPE, allocatable, intent(out) :: values(:)
+  type(cudecompPencilInfo), intent(in) :: pinfo
+  logical, intent(in) :: halo_periods(3)
+  integer(int64) :: i
+  integer(int64) :: local(3)
+  integer(int64) :: global(3)
+  integer :: dim
+  logical :: unset
+
+  allocate(values(pinfo%size))
+  values = UNSET_HALO_VALUE()
+
+  do i = 1_int64, pinfo%size
+    call local_coordinate(i, pinfo, local)
+    call global_coordinate(pinfo, local, global)
+    unset = is_padding_coordinate(pinfo, local)
+
+    do dim = 1, 3
+      if (global(dim) >= 1_int64 .and. global(dim) <= int(k_gdims(dim), int64)) cycle
+
+      if (halo_periods(dim)) then
+        global(dim) = wrap_index(global(dim), int(k_gdims(dim), int64))
+      else
+        unset = .true.
+      endif
+    enddo
+
+    if (.not. unset) values(i) = HALO_PENCIL_VALUE(global_linear_index(global))
+  enddo
+end subroutine INITIALIZE_HALO_REFERENCE
+
+function HALO_PENCIL_VALUE(linear_index) result(value)
+  implicit none
+
+  integer(int64), intent(in) :: linear_index
+  ARRTYPE :: value
+
+  value = linear_index
+end function HALO_PENCIL_VALUE
+
+function UNSET_HALO_VALUE() result(value)
+  implicit none
+
+  ARRTYPE :: value
+
+  value = -1
+end function UNSET_HALO_VALUE
+
+subroutine EXPECT_HALO_PENCIL_MATCH(expected, actual, pinfo, context)
+  implicit none
+
+  ARRTYPE, intent(in) :: expected(:)
+  ARRTYPE, intent(in) :: actual(:)
+  type(cudecompPencilInfo), intent(in) :: pinfo
+  character(len=*), intent(in) :: context
+  integer(int64) :: i
+  integer(int64) :: local(3)
+
+  if (size(expected, kind=int64) /= pinfo%size .or. size(actual, kind=int64) /= pinfo%size) then
+    call record_failure(trim(context)//" result size mismatch")
+    return
+  endif
+
+  do i = 1_int64, pinfo%size
+    if (expected(i) == actual(i)) cycle
+
+    call local_coordinate(i, pinfo, local)
+    write(*, *) "rank", rank, ": FAIL", trim(context), "mismatch at local coordinate", local, &
+                "expected", expected(i), "actual", actual(i)
+    failures = failures + 1
+    return
+  enddo
+end subroutine EXPECT_HALO_PENCIL_MATCH
diff --git a/tests/ctest/fortran_halo_tests.f90 b/tests/ctest/fortran_halo_tests.f90
new file mode 100644
index 0000000..92adcf4
--- /dev/null
+++ b/tests/ctest/fortran_halo_tests.f90
@@ -0,0 +1,346 @@
+! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+! SPDX-License-Identifier: Apache-2.0
+
+module cudecomp_fortran_halo_tests
+  use, intrinsic :: iso_fortran_env, only: int64, real32, real64
+  use cudafor
+  use cudecomp
+  use mpi
+
+  implicit none
+
+  ! Standalone CTest fixture for the Fortran halo API. The included fixture body
+  ! is instantiated once per dtype so the axis and optional-padding matrix stays
+  ! identical across real and complex coverage.
+  integer, parameter :: halo_test_ranks = 4
+  integer, parameter :: k_gdims(3) = [9, 10, 11]
+  integer, parameter :: k_gdims_dist(3) = [8, 8, 8]
+  integer, parameter :: k_pdims(2) = [2, 2]
+  integer, parameter :: k_halo_extents(3) = [1, 1, 1]
+  integer, parameter :: k_zero_extents(3) = [0, 0, 0]
+  integer, parameter :: k_explicit_padding(3) = [1, 0, 2]
+  logical, parameter :: k_halo_periods(3) = [.true., .false., .true.]
+  logical, parameter :: k_default_axis_contiguous(3) = [.false., .false., .false.]
+  integer, parameter :: k_default_mem_order(3, 3) = reshape([-1, -1, -1, -1, -1, -1, -1, -1, -1], [3, 3])
+  integer, parameter :: k_mixed_mem_order(3, 3) = reshape([3, 2, 1, 3, 2, 1, 3, 2, 1], [3, 3])
+
+  integer :: rank = -1
+  integer :: nranks = 0
+  integer :: ierr = 0
+  integer :: local_comm = MPI_COMM_NULL
+  integer :: local_rank = 0
+  integer :: failures = 0
+  integer :: global_failures = 0
+  logical :: handle_initialized = .false.
+  type(cudecompHandle) :: handle
+
+contains
+
+  subroutine run_all_tests()
+    implicit none
+
+    call MPI_Init(ierr)
+    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
+    call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr)
+
+    if (nranks /= halo_test_ranks) then
+      call record_failure("Fortran halo test requires exactly 4 MPI ranks")
+    else
+      call initialize_gpu()
+    endif
+
+    if (failures == 0) then
+      call expect_success(cudecompInit(handle, MPI_COMM_WORLD), "cudecompInit")
+      handle_initialized = failures == 0
+    endif
+
+    if (handle_initialized) then
+      call run_halo_r32()
+      call run_halo_r64()
+      call run_halo_c32()
+      call run_halo_c64()
+      call expect_success(cudecompFinalize(handle), "cudecompFinalize")
+    endif
+
+    call MPI_Allreduce(failures, global_failures, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
+    if (rank == 0 .and. global_failures == 0) then
+      write(*, '("Fortran halo test passed")')
+    elseif (rank == 0) then
+      write(*, '("Fortran halo test failed with ", i0, " rank-local failure(s)")') global_failures
+    endif
+
+    if (local_comm /= MPI_COMM_NULL) call MPI_Comm_free(local_comm, ierr)
+    call MPI_Finalize(ierr)
+    if (global_failures /= 0) call exit(1)
+  end subroutine run_all_tests
+
+  subroutine initialize_gpu()
+    implicit none
+
+    integer :: status
+    integer :: num_devices
+
+    call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr)
+    call MPI_Comm_rank(local_comm, local_rank, ierr)
+
+    status = cudaGetDeviceCount(num_devices)
+    if (status /= cudaSuccess) then
+      call record_failure("cudaGetDeviceCount failed")
+      return
+    endif
+    if (num_devices <= 0) then
+      call record_failure("Fortran halo test requires at least one visible CUDA device")
+      return
+    endif
+
+    status = cudaSetDevice(mod(local_rank, num_devices))
+    if (status /= cudaSuccess) call record_failure("cudaSetDevice failed")
+  end subroutine initialize_gpu
+
+  subroutine setup_halo_config(config, axis_contiguous, mem_order, rank_order)
+    implicit none
+
+    type(cudecompGridDescConfig), intent(out) :: config
+    logical, intent(in) :: axis_contiguous(3)
+    integer, intent(in) :: mem_order(3, 3)
+    integer, intent(in) :: rank_order
+
+    call expect_success(cudecompGridDescConfigSetDefaults(config), "cudecompGridDescConfigSetDefaults")
+    config%gdims = k_gdims
+    config%gdims_dist = k_gdims_dist
+    config%pdims = k_pdims
+    config%rank_order = rank_order
+    config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI
+    config%transpose_axis_contiguous = axis_contiguous
+    config%transpose_mem_order = mem_order
+  end subroutine setup_halo_config
+
+  function axis_name(axis) result(name)
+    implicit none
+
+    integer, intent(in) :: axis
+    character(len=1) :: name
+
+    select case (axis)
+    case (1)
+      name = "X"
+    case (2)
+      name = "Y"
+    case (3)
+      name = "Z"
+    case default
+      name = "?"
+    end select
+  end function axis_name
+
+  function pencil_lower(pinfo, dim) result(lower)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer, intent(in) :: dim
+    integer(int64) :: lower
+
+    lower = int(pinfo%lo(dim), int64) - int(pinfo%halo_extents(pinfo%order(dim)), int64)
+  end function pencil_lower
+
+  subroutine local_coordinate(linear_index, pinfo, local)
+    implicit none
+
+    integer(int64), intent(in) :: linear_index
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(out) :: local(3)
+    integer(int64) :: offset
+    integer(int64) :: shape_1
+    integer(int64) :: shape_2
+
+    offset = linear_index - 1_int64
+    shape_1 = int(pinfo%shape(1), int64)
+    shape_2 = int(pinfo%shape(2), int64)
+    local(1) = pencil_lower(pinfo, 1) + modulo(offset, shape_1)
+    local(2) = pencil_lower(pinfo, 2) + modulo(offset / shape_1, shape_2)
+    local(3) = pencil_lower(pinfo, 3) + offset / (shape_1 * shape_2)
+  end subroutine local_coordinate
+
+  function is_internal_coordinate(pinfo, local) result(is_internal)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(in) :: local(3)
+    logical :: is_internal
+
+    is_internal = local(1) >= int(pinfo%lo(1), int64) .and. local(1) <= int(pinfo%hi(1), int64) .and. &
+                  local(2) >= int(pinfo%lo(2), int64) .and. local(2) <= int(pinfo%hi(2), int64) .and. &
+                  local(3) >= int(pinfo%lo(3), int64) .and. local(3) <= int(pinfo%hi(3), int64)
+  end function is_internal_coordinate
+
+  function is_padding_coordinate(pinfo, local) result(is_padding)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(in) :: local(3)
+    logical :: is_padding
+
+    is_padding = local(1) > int(pinfo%hi(1), int64) + int(pinfo%halo_extents(pinfo%order(1)), int64) .or. &
+                 local(2) > int(pinfo%hi(2), int64) + int(pinfo%halo_extents(pinfo%order(2)), int64) .or. &
+                 local(3) > int(pinfo%hi(3), int64) + int(pinfo%halo_extents(pinfo%order(3)), int64)
+  end function is_padding_coordinate
+
+  subroutine global_coordinate(pinfo, local, global)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(in) :: local(3)
+    integer(int64), intent(out) :: global(3)
+    integer :: dim
+
+    do dim = 1, 3
+      global(pinfo%order(dim)) = local(dim)
+    enddo
+  end subroutine global_coordinate
+
+  function wrap_index(index, size) result(wrapped)
+    implicit none
+
+    integer(int64), intent(in) :: index
+    integer(int64), intent(in) :: size
+    integer(int64) :: wrapped
+
+    wrapped = modulo(index - 1_int64, size) + 1_int64
+  end function wrap_index
+
+  function global_linear_index(global) result(linear_index)
+    implicit none
+
+    integer(int64), intent(in) :: global(3)
+    integer(int64) :: linear_index
+
+    linear_index = global(1) + int(k_gdims(1), int64) * &
+                              ((global(2) - 1_int64) + (global(3) - 1_int64) * int(k_gdims(2), int64))
+  end function global_linear_index
+
+  subroutine expect_success(result, context)
+    implicit none
+
+    integer, intent(in) :: result
+    character(len=*), intent(in) :: context
+
+    if (result /= CUDECOMP_RESULT_SUCCESS) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "returned", result
+      failures = failures + 1
+    endif
+  end subroutine expect_success
+
+  subroutine record_failure(context)
+    implicit none
+
+    character(len=*), intent(in) :: context
+
+    write(*, *) "rank", rank, ": FAIL", trim(context)
+    failures = failures + 1
+  end subroutine record_failure
+
+#define ARRTYPE real(real32)
+#define DTYPE CUDECOMP_FLOAT
+#define DTYPE_NAME "R32"
+#define RUN_HALO_CASE run_halo_r32
+#define RUN_HALO_SCENARIO run_halo_scenario_r32
+#define RUN_HALO_AXIS run_halo_axis_r32
+#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_r32
+#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_r32
+#define HALO_PENCIL_VALUE halo_pencil_value_r32
+#define UNSET_HALO_VALUE unset_halo_value_r32
+#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_r32
+#include "fortran_halo_case.inc"
+#undef EXPECT_HALO_PENCIL_MATCH
+#undef UNSET_HALO_VALUE
+#undef HALO_PENCIL_VALUE
+#undef INITIALIZE_HALO_REFERENCE
+#undef INITIALIZE_HALO_PENCIL
+#undef RUN_HALO_AXIS
+#undef RUN_HALO_SCENARIO
+#undef RUN_HALO_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+#define ARRTYPE real(real64)
+#define DTYPE CUDECOMP_DOUBLE
+#define DTYPE_NAME "R64"
+#define RUN_HALO_CASE run_halo_r64
+#define RUN_HALO_SCENARIO run_halo_scenario_r64
+#define RUN_HALO_AXIS run_halo_axis_r64
+#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_r64
+#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_r64
+#define HALO_PENCIL_VALUE halo_pencil_value_r64
+#define UNSET_HALO_VALUE unset_halo_value_r64
+#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_r64
+#include "fortran_halo_case.inc"
+#undef EXPECT_HALO_PENCIL_MATCH
+#undef UNSET_HALO_VALUE
+#undef HALO_PENCIL_VALUE
+#undef INITIALIZE_HALO_REFERENCE
+#undef INITIALIZE_HALO_PENCIL
+#undef RUN_HALO_AXIS
+#undef RUN_HALO_SCENARIO
+#undef RUN_HALO_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+#define ARRTYPE complex(real32)
+#define DTYPE CUDECOMP_FLOAT_COMPLEX
+#define DTYPE_NAME "C32"
+#define RUN_HALO_CASE run_halo_c32
+#define RUN_HALO_SCENARIO run_halo_scenario_c32
+#define RUN_HALO_AXIS run_halo_axis_c32
+#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_c32
+#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_c32
+#define HALO_PENCIL_VALUE halo_pencil_value_c32
+#define UNSET_HALO_VALUE unset_halo_value_c32
+#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_c32
+#include "fortran_halo_case.inc"
+#undef EXPECT_HALO_PENCIL_MATCH
+#undef UNSET_HALO_VALUE
+#undef HALO_PENCIL_VALUE
+#undef INITIALIZE_HALO_REFERENCE
+#undef INITIALIZE_HALO_PENCIL
+#undef RUN_HALO_AXIS
+#undef RUN_HALO_SCENARIO
+#undef RUN_HALO_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+#define ARRTYPE complex(real64)
+#define DTYPE CUDECOMP_DOUBLE_COMPLEX
+#define DTYPE_NAME "C64"
+#define RUN_HALO_CASE run_halo_c64
+#define RUN_HALO_SCENARIO run_halo_scenario_c64
+#define RUN_HALO_AXIS run_halo_axis_c64
+#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_c64
+#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_c64
+#define HALO_PENCIL_VALUE halo_pencil_value_c64
+#define UNSET_HALO_VALUE unset_halo_value_c64
+#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_c64
+#include "fortran_halo_case.inc"
+#undef EXPECT_HALO_PENCIL_MATCH
+#undef UNSET_HALO_VALUE
+#undef HALO_PENCIL_VALUE
+#undef INITIALIZE_HALO_REFERENCE
+#undef INITIALIZE_HALO_PENCIL
+#undef RUN_HALO_AXIS
+#undef RUN_HALO_SCENARIO
+#undef RUN_HALO_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+end module cudecomp_fortran_halo_tests
+
+program cudecomp_fortran_halo_test
+  use cudecomp_fortran_halo_tests
+
+  implicit none
+
+  call run_all_tests()
+end program cudecomp_fortran_halo_test
diff --git a/tests/ctest/fortran_transpose_case.inc b/tests/ctest/fortran_transpose_case.inc
new file mode 100644
index 0000000..1fae212
--- /dev/null
+++ b/tests/ctest/fortran_transpose_case.inc
@@ -0,0 +1,267 @@
+! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+! SPDX-License-Identifier: Apache-2.0
+
+subroutine RUN_TRANSPOSE_CASE()
+  implicit none
+
+  ! Exercise non-default layout, out-of-place transposes, and explicit
+  ! halo/padding optional arguments for every supported dtype.
+  call RUN_TRANSPOSE_SCENARIO("custom_layout_padded_halo", k_explicit_halo_extents, k_explicit_halo_extents, &
+                              k_explicit_halo_extents, k_explicit_padding_x, k_explicit_padding_y, &
+                              k_explicit_padding_z, k_mixed_mem_order, k_default_axis_contiguous, &
+                              CUDECOMP_RANK_ORDER_COL_MAJOR, .true., .true.)
+  ! Omit the Fortran-only optional halo/padding arguments and use the default
+  ! layout path with in-place transposes.
+  call RUN_TRANSPOSE_SCENARIO("default_layout_in_place", k_zero_extents, k_zero_extents, k_zero_extents, &
+                              k_zero_extents, k_zero_extents, k_zero_extents, k_default_mem_order, &
+                              k_default_axis_contiguous, CUDECOMP_RANK_ORDER_DEFAULT, .false., .false.)
+#ifdef RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE
+  ! Keep axis-contiguous coverage small because it is a layout sanity check, not
+  ! a dtype-specific numerical path.
+  call RUN_TRANSPOSE_SCENARIO("axis_contiguous_in_place", k_zero_extents, k_zero_extents, k_zero_extents, &
+                              k_zero_extents, k_zero_extents, k_zero_extents, k_default_mem_order, &
+                              k_all_axis_contiguous, CUDECOMP_RANK_ORDER_DEFAULT, .false., .false.)
+#endif
+end subroutine RUN_TRANSPOSE_CASE
+
+subroutine RUN_TRANSPOSE_SCENARIO(scenario, halo_extents_x, halo_extents_y, halo_extents_z, padding_x, padding_y, &
+                                  padding_z, mem_order, axis_contiguous, rank_order, out_of_place, &
+                                  use_optional_args)
+  implicit none
+
+  character(len=*), intent(in) :: scenario
+  integer, intent(in) :: halo_extents_x(3)
+  integer, intent(in) :: halo_extents_y(3)
+  integer, intent(in) :: halo_extents_z(3)
+  integer, intent(in) :: padding_x(3)
+  integer, intent(in) :: padding_y(3)
+  integer, intent(in) :: padding_z(3)
+  integer, intent(in) :: mem_order(3, 3)
+  logical, intent(in) :: axis_contiguous(3)
+  integer, intent(in) :: rank_order
+  logical, intent(in) :: out_of_place
+  logical, intent(in) :: use_optional_args
+  type(cudecompGridDescConfig) :: config
+  type(cudecompGridDesc) :: grid_desc
+  type(cudecompPencilInfo) :: pinfo_x
+  type(cudecompPencilInfo) :: pinfo_y
+  type(cudecompPencilInfo) :: pinfo_z
+  integer(int64) :: data_num_elements
+  integer(int64) :: workspace_num_elements
+  integer :: res
+  logical :: grid_desc_created
+  logical :: work_allocated
+  ARRTYPE, allocatable :: xref(:)
+  ARRTYPE, allocatable :: yref(:)
+  ARRTYPE, allocatable :: zref(:)
+  ARRTYPE, allocatable :: actual(:)
+  ARRTYPE, allocatable, device, target :: data_a_d(:)
+  ARRTYPE, allocatable, device, target :: data_b_d(:)
+  ARRTYPE, pointer, device, contiguous :: input_d(:)
+  ARRTYPE, pointer, device, contiguous :: output_d(:)
+  ARRTYPE, pointer, device, contiguous :: work_d(:)
+
+  grid_desc_created = .false.
+  work_allocated = .false.
+  nullify(input_d)
+  nullify(output_d)
+  nullify(work_d)
+
+  call setup_transpose_config(config, axis_contiguous, mem_order, rank_order)
+  res = cudecompGridDescCreate(handle, grid_desc, config)
+  call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompGridDescCreate")
+  if (res /= CUDECOMP_RESULT_SUCCESS) return
+  grid_desc_created = .true.
+
+  if (use_optional_args) then
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1, halo_extents_x, padding_x), &
+                        DTYPE_NAME//" "//trim(scenario)//" X pencil info")
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2, halo_extents_y, padding_y), &
+                        DTYPE_NAME//" "//trim(scenario)//" Y pencil info")
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3, halo_extents_z, padding_z), &
+                        DTYPE_NAME//" "//trim(scenario)//" Z pencil info")
+  else
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1), &
+                        DTYPE_NAME//" "//trim(scenario)//" X pencil info")
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2), &
+                        DTYPE_NAME//" "//trim(scenario)//" Y pencil info")
+    call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3), &
+                        DTYPE_NAME//" "//trim(scenario)//" Z pencil info")
+  endif
+  call expect_success(cudecompGetTransposeWorkspaceSize(handle, grid_desc, workspace_num_elements), &
+                      DTYPE_NAME//" "//trim(scenario)//" transpose workspace size")
+
+  data_num_elements = max(pinfo_x%size, max(pinfo_y%size, pinfo_z%size))
+  allocate(data_a_d(data_num_elements))
+  if (out_of_place) allocate(data_b_d(data_num_elements))
+
+  call INITIALIZE_TRANSPOSE_PENCIL(xref, pinfo_x)
+  call INITIALIZE_TRANSPOSE_PENCIL(yref, pinfo_y)
+  call INITIALIZE_TRANSPOSE_PENCIL(zref, pinfo_z)
+
+  res = cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)
+  call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompMalloc workspace")
+  if (res == CUDECOMP_RESULT_SUCCESS) work_allocated = .true.
+
+  if (work_allocated) then
+    data_a_d = UNSET_TRANSPOSE_VALUE()
+    if (out_of_place) data_b_d = UNSET_TRANSPOSE_VALUE()
+    data_a_d(1:pinfo_x%size) = xref
+    input_d => data_a_d
+    if (out_of_place) then
+      output_d => data_b_d
+    else
+      output_d => data_a_d
+    endif
+    work_d = 0
+    if (use_optional_args) then
+      res = cudecompTransposeXToY(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_x%halo_extents, &
+                                  pinfo_y%halo_extents, pinfo_x%padding, pinfo_y%padding)
+    else
+      res = cudecompTransposeXToY(handle, grid_desc, input_d, output_d, work_d, DTYPE)
+    endif
+    call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeXToY")
+    if (res == CUDECOMP_RESULT_SUCCESS) then
+      allocate(actual(pinfo_y%size))
+      actual = output_d(1:pinfo_y%size)
+      call EXPECT_TRANSPOSE_PENCIL_MATCH(yref, actual, pinfo_y, DTYPE_NAME//" "//trim(scenario)//" XToY")
+      deallocate(actual)
+    endif
+
+    if (out_of_place) then
+      input_d => data_b_d
+      output_d => data_a_d
+    else
+      input_d => data_a_d
+      output_d => data_a_d
+    endif
+    work_d = 0
+    if (use_optional_args) then
+      res = cudecompTransposeYToZ(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_y%halo_extents, &
+                                  pinfo_z%halo_extents, pinfo_y%padding, pinfo_z%padding)
+    else
+      res = cudecompTransposeYToZ(handle, grid_desc, input_d, output_d, work_d, DTYPE)
+    endif
+    call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeYToZ")
+    if (res == CUDECOMP_RESULT_SUCCESS) then
+      allocate(actual(pinfo_z%size))
+      actual = output_d(1:pinfo_z%size)
+      call EXPECT_TRANSPOSE_PENCIL_MATCH(zref, actual, pinfo_z, DTYPE_NAME//" "//trim(scenario)//" YToZ")
+      deallocate(actual)
+    endif
+
+    if (out_of_place) then
+      input_d => data_a_d
+      output_d => data_b_d
+    else
+      input_d => data_a_d
+      output_d => data_a_d
+    endif
+    work_d = 0
+    if (use_optional_args) then
+      res = cudecompTransposeZToY(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_z%halo_extents, &
+                                  pinfo_y%halo_extents, pinfo_z%padding, pinfo_y%padding)
+    else
+      res = cudecompTransposeZToY(handle, grid_desc, input_d, output_d, work_d, DTYPE)
+    endif
+    call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeZToY")
+    if (res == CUDECOMP_RESULT_SUCCESS) then
+      allocate(actual(pinfo_y%size))
+      actual = output_d(1:pinfo_y%size)
+      call EXPECT_TRANSPOSE_PENCIL_MATCH(yref, actual, pinfo_y, DTYPE_NAME//" "//trim(scenario)//" ZToY")
+      deallocate(actual)
+    endif
+
+    if (out_of_place) then
+      input_d => data_b_d
+      output_d => data_a_d
+    else
+      input_d => data_a_d
+      output_d => data_a_d
+    endif
+    work_d = 0
+    if (use_optional_args) then
+      res = cudecompTransposeYToX(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_y%halo_extents, &
+                                  pinfo_x%halo_extents, pinfo_y%padding, pinfo_x%padding)
+    else
+      res = cudecompTransposeYToX(handle, grid_desc, input_d, output_d, work_d, DTYPE)
+    endif
+    call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeYToX")
+    if (res == CUDECOMP_RESULT_SUCCESS) then
+      allocate(actual(pinfo_x%size))
+      actual = output_d(1:pinfo_x%size)
+      call EXPECT_TRANSPOSE_PENCIL_MATCH(xref, actual, pinfo_x, DTYPE_NAME//" "//trim(scenario)//" YToX")
+      deallocate(actual)
+    endif
+  endif
+
+  if (work_allocated) call expect_success(cudecompFree(handle, grid_desc, work_d), &
+                                          DTYPE_NAME//" "//trim(scenario)//" cudecompFree")
+  if (grid_desc_created) call expect_success(cudecompGridDescDestroy(handle, grid_desc), &
+                                             DTYPE_NAME//" "//trim(scenario)//" cudecompGridDescDestroy")
+end subroutine RUN_TRANSPOSE_SCENARIO
+
+subroutine INITIALIZE_TRANSPOSE_PENCIL(values, pinfo)
+  implicit none
+
+  ARRTYPE, allocatable, intent(out) :: values(:)
+  type(cudecompPencilInfo), intent(in) :: pinfo
+  integer(int64) :: i
+  integer(int64) :: local(3)
+  integer(int64) :: global(3)
+
+  allocate(values(pinfo%size))
+  values = UNSET_TRANSPOSE_VALUE()
+
+  do i = 1_int64, pinfo%size
+    call local_coordinate(i, pinfo, local)
+    if (.not. is_internal_coordinate(pinfo, local)) cycle
+
+    call global_coordinate(pinfo, local, global)
+    values(i) = TRANSPOSE_PENCIL_VALUE(global_linear_index(global))
+  enddo
+end subroutine INITIALIZE_TRANSPOSE_PENCIL
+
+function TRANSPOSE_PENCIL_VALUE(linear_index) result(value)
+  implicit none
+
+  integer(int64), intent(in) :: linear_index
+  ARRTYPE :: value
+
+  value = linear_index
+end function TRANSPOSE_PENCIL_VALUE
+
+function UNSET_TRANSPOSE_VALUE() result(value)
+  implicit none
+
+  ARRTYPE :: value
+
+  value = -1
+end function UNSET_TRANSPOSE_VALUE
+
+subroutine EXPECT_TRANSPOSE_PENCIL_MATCH(expected, actual, pinfo, context)
+  implicit none
+
+  ARRTYPE, intent(in) :: expected(:)
+  ARRTYPE, intent(in) :: actual(:)
+  type(cudecompPencilInfo), intent(in) :: pinfo
+  character(len=*), intent(in) :: context
+  integer(int64) :: i
+  integer(int64) :: local(3)
+
+  if (size(expected, kind=int64) /= pinfo%size .or. size(actual, kind=int64) /= pinfo%size) then
+    call record_failure(trim(context)//" result size mismatch")
+    return
+  endif
+
+  do i = 1_int64, pinfo%size
+    call local_coordinate(i, pinfo, local)
+    if (.not. is_internal_coordinate(pinfo, local)) cycle
+    if (expected(i) == actual(i)) cycle
+
+    write(*, *) "rank", rank, ": FAIL", trim(context), "mismatch at local coordinate", local, &
+                "expected", expected(i), "actual", actual(i)
+    failures = failures + 1
+    return
+  enddo
+end subroutine EXPECT_TRANSPOSE_PENCIL_MATCH
diff --git a/tests/ctest/fortran_transpose_tests.f90 b/tests/ctest/fortran_transpose_tests.f90
new file mode 100644
index 0000000..21fb4bb
--- /dev/null
+++ b/tests/ctest/fortran_transpose_tests.f90
@@ -0,0 +1,296 @@
+! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+! SPDX-License-Identifier: Apache-2.0
+
+module cudecomp_fortran_transpose_tests
+  use, intrinsic :: iso_fortran_env, only: int64, real32, real64
+  use cudafor
+  use cudecomp
+  use mpi
+
+  implicit none
+
+  ! Standalone CTest fixture for the Fortran transpose API. The dtype-specific
+  ! cases are generated by including fortran_transpose_case.inc with different
+  ! preprocessor aliases below.
+  integer, parameter :: transpose_test_ranks = 4
+  integer, parameter :: k_gdims(3) = [9, 10, 11]
+  integer, parameter :: k_gdims_dist(3) = [8, 8, 8]
+  integer, parameter :: k_pdims(2) = [2, 2]
+  integer, parameter :: k_zero_extents(3) = [0, 0, 0]
+  integer, parameter :: k_explicit_halo_extents(3) = [1, 1, 1]
+  integer, parameter :: k_explicit_padding_x(3) = [1, 0, 1]
+  integer, parameter :: k_explicit_padding_y(3) = [0, 1, 0]
+  integer, parameter :: k_explicit_padding_z(3) = [1, 0, 1]
+  logical, parameter :: k_default_axis_contiguous(3) = [.false., .false., .false.]
+  logical, parameter :: k_all_axis_contiguous(3) = [.true., .true., .true.]
+  integer, parameter :: k_default_mem_order(3, 3) = reshape([-1, -1, -1, -1, -1, -1, -1, -1, -1], [3, 3])
+  integer, parameter :: k_mixed_mem_order(3, 3) = reshape([3, 2, 1, 1, 3, 2, 2, 1, 3], [3, 3])
+
+  integer :: rank = -1
+  integer :: nranks = 0
+  integer :: ierr = 0
+  integer :: local_comm = MPI_COMM_NULL
+  integer :: local_rank = 0
+  integer :: failures = 0
+  integer :: global_failures = 0
+  logical :: handle_initialized = .false.
+  type(cudecompHandle) :: handle
+
+contains
+
+  subroutine run_all_tests()
+    implicit none
+
+    call MPI_Init(ierr)
+    call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
+    call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr)
+
+    if (nranks /= transpose_test_ranks) then
+      call record_failure("Fortran transpose test requires exactly 4 MPI ranks")
+    else
+      call initialize_gpu()
+    endif
+
+    if (failures == 0) then
+      call expect_success(cudecompInit(handle, MPI_COMM_WORLD), "cudecompInit")
+      handle_initialized = failures == 0
+    endif
+
+    if (handle_initialized) then
+      call run_transpose_r32()
+      call run_transpose_r64()
+      call run_transpose_c32()
+      call run_transpose_c64()
+      call expect_success(cudecompFinalize(handle), "cudecompFinalize")
+    endif
+
+    call MPI_Allreduce(failures, global_failures, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr)
+    if (rank == 0 .and. global_failures == 0) then
+      write(*, '("Fortran transpose test passed")')
+    elseif (rank == 0) then
+      write(*, '("Fortran transpose test failed with ", i0, " rank-local failure(s)")') global_failures
+    endif
+
+    if (local_comm /= MPI_COMM_NULL) call MPI_Comm_free(local_comm, ierr)
+    call MPI_Finalize(ierr)
+    if (global_failures /= 0) call exit(1)
+  end subroutine run_all_tests
+
+  subroutine initialize_gpu()
+    implicit none
+
+    integer :: status
+    integer :: num_devices
+
+    call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr)
+    call MPI_Comm_rank(local_comm, local_rank, ierr)
+
+    status = cudaGetDeviceCount(num_devices)
+    if (status /= cudaSuccess) then
+      call record_failure("cudaGetDeviceCount failed")
+      return
+    endif
+    if (num_devices <= 0) then
+      call record_failure("Fortran transpose test requires at least one visible CUDA device")
+      return
+    endif
+
+    status = cudaSetDevice(mod(local_rank, num_devices))
+    if (status /= cudaSuccess) call record_failure("cudaSetDevice failed")
+  end subroutine initialize_gpu
+
+  subroutine setup_transpose_config(config, axis_contiguous, mem_order, rank_order)
+    implicit none
+
+    type(cudecompGridDescConfig), intent(out) :: config
+    logical, intent(in) :: axis_contiguous(3)
+    integer, intent(in) :: mem_order(3, 3)
+    integer, intent(in) :: rank_order
+
+    call expect_success(cudecompGridDescConfigSetDefaults(config), "cudecompGridDescConfigSetDefaults")
+    config%gdims = k_gdims
+    config%gdims_dist = k_gdims_dist
+    config%pdims = k_pdims
+    config%rank_order = rank_order
+    config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P
+    config%transpose_axis_contiguous = axis_contiguous
+    config%transpose_mem_order = mem_order
+  end subroutine setup_transpose_config
+
+  function pencil_lower(pinfo, dim) result(lower)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer, intent(in) :: dim
+    integer(int64) :: lower
+
+    lower = int(pinfo%lo(dim), int64) - int(pinfo%halo_extents(pinfo%order(dim)), int64)
+  end function pencil_lower
+
+  subroutine local_coordinate(linear_index, pinfo, local)
+    implicit none
+
+    integer(int64), intent(in) :: linear_index
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(out) :: local(3)
+    integer(int64) :: offset
+    integer(int64) :: shape_1
+    integer(int64) :: shape_2
+
+    offset = linear_index - 1_int64
+    shape_1 = int(pinfo%shape(1), int64)
+    shape_2 = int(pinfo%shape(2), int64)
+    local(1) = pencil_lower(pinfo, 1) + modulo(offset, shape_1)
+    local(2) = pencil_lower(pinfo, 2) + modulo(offset / shape_1, shape_2)
+    local(3) = pencil_lower(pinfo, 3) + offset / (shape_1 * shape_2)
+  end subroutine local_coordinate
+
+  function is_internal_coordinate(pinfo, local) result(is_internal)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(in) :: local(3)
+    logical :: is_internal
+
+    is_internal = local(1) >= int(pinfo%lo(1), int64) .and. local(1) <= int(pinfo%hi(1), int64) .and. &
+                  local(2) >= int(pinfo%lo(2), int64) .and. local(2) <= int(pinfo%hi(2), int64) .and. &
+                  local(3) >= int(pinfo%lo(3), int64) .and. local(3) <= int(pinfo%hi(3), int64)
+  end function is_internal_coordinate
+
+  subroutine global_coordinate(pinfo, local, global)
+    implicit none
+
+    type(cudecompPencilInfo), intent(in) :: pinfo
+    integer(int64), intent(in) :: local(3)
+    integer(int64), intent(out) :: global(3)
+    integer :: dim
+
+    do dim = 1, 3
+      global(pinfo%order(dim)) = local(dim)
+    enddo
+  end subroutine global_coordinate
+
+  function global_linear_index(global) result(linear_index)
+    implicit none
+
+    integer(int64), intent(in) :: global(3)
+    integer(int64) :: linear_index
+
+    linear_index = global(1) + int(k_gdims(1), int64) * &
+                              ((global(2) - 1_int64) + (global(3) - 1_int64) * int(k_gdims(2), int64))
+  end function global_linear_index
+
+  subroutine expect_success(result, context)
+    implicit none
+
+    integer, intent(in) :: result
+    character(len=*), intent(in) :: context
+
+    if (result /= CUDECOMP_RESULT_SUCCESS) then
+      write(*, *) "rank", rank, ": FAIL", trim(context), "returned", result
+      failures = failures + 1
+    endif
+  end subroutine expect_success
+
+  subroutine record_failure(context)
+    implicit none
+
+    character(len=*), intent(in) :: context
+
+    write(*, *) "rank", rank, ": FAIL", trim(context)
+    failures = failures + 1
+  end subroutine record_failure
+
+#define ARRTYPE real(real32)
+#define DTYPE CUDECOMP_FLOAT
+#define DTYPE_NAME "R32"
+#define RUN_TRANSPOSE_CASE run_transpose_r32
+#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_r32
+#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_r32
+#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_r32
+#define UNSET_TRANSPOSE_VALUE unset_transpose_value_r32
+#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_r32
+#define RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE
+#include "fortran_transpose_case.inc"
+#undef RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE
+#undef EXPECT_TRANSPOSE_PENCIL_MATCH
+#undef UNSET_TRANSPOSE_VALUE
+#undef TRANSPOSE_PENCIL_VALUE
+#undef INITIALIZE_TRANSPOSE_PENCIL
+#undef RUN_TRANSPOSE_SCENARIO
+#undef RUN_TRANSPOSE_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+#define ARRTYPE real(real64)
+#define DTYPE CUDECOMP_DOUBLE
+#define DTYPE_NAME "R64"
+#define RUN_TRANSPOSE_CASE run_transpose_r64
+#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_r64
+#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_r64
+#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_r64
+#define UNSET_TRANSPOSE_VALUE unset_transpose_value_r64
+#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_r64
+#include "fortran_transpose_case.inc"
+#undef EXPECT_TRANSPOSE_PENCIL_MATCH
+#undef UNSET_TRANSPOSE_VALUE
+#undef TRANSPOSE_PENCIL_VALUE
+#undef INITIALIZE_TRANSPOSE_PENCIL
+#undef RUN_TRANSPOSE_SCENARIO
+#undef RUN_TRANSPOSE_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+#define ARRTYPE complex(real32)
+#define DTYPE CUDECOMP_FLOAT_COMPLEX
+#define DTYPE_NAME "C32"
+#define RUN_TRANSPOSE_CASE run_transpose_c32
+#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_c32
+#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_c32
+#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_c32
+#define UNSET_TRANSPOSE_VALUE unset_transpose_value_c32
+#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_c32
+#define RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE
+#include "fortran_transpose_case.inc"
+#undef RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE
+#undef EXPECT_TRANSPOSE_PENCIL_MATCH
+#undef UNSET_TRANSPOSE_VALUE
+#undef TRANSPOSE_PENCIL_VALUE
+#undef INITIALIZE_TRANSPOSE_PENCIL
+#undef RUN_TRANSPOSE_SCENARIO
+#undef RUN_TRANSPOSE_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+#define ARRTYPE complex(real64)
+#define DTYPE CUDECOMP_DOUBLE_COMPLEX
+#define DTYPE_NAME "C64"
+#define RUN_TRANSPOSE_CASE run_transpose_c64
+#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_c64
+#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_c64
+#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_c64
+#define UNSET_TRANSPOSE_VALUE unset_transpose_value_c64
+#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_c64
+#include "fortran_transpose_case.inc"
+#undef EXPECT_TRANSPOSE_PENCIL_MATCH
+#undef UNSET_TRANSPOSE_VALUE
+#undef TRANSPOSE_PENCIL_VALUE
+#undef INITIALIZE_TRANSPOSE_PENCIL
+#undef RUN_TRANSPOSE_SCENARIO
+#undef RUN_TRANSPOSE_CASE
+#undef DTYPE_NAME
+#undef DTYPE
+#undef ARRTYPE
+
+end module cudecomp_fortran_transpose_tests
+
+program cudecomp_fortran_transpose_test
+  use cudecomp_fortran_transpose_tests
+
+  implicit none
+
+  call run_all_tests()
+end program cudecomp_fortran_transpose_test
diff --git a/tests/ctest/gpu_test_utils.cc b/tests/ctest/gpu_test_utils.cc
new file mode 100644
index 0000000..fa314c3
--- /dev/null
+++ b/tests/ctest/gpu_test_utils.cc
@@ -0,0 +1,148 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <cstdlib>
+#include <string>
+#include <sys/stat.h>
+
+#include <cuda_runtime.h>
+#include <nccl.h>
+
+#include "gpu_test_utils.h"
+
+namespace cudecomp_test {
+namespace {
+
+bool envFlagEnabled(const char* name) {
+  const char* value = std::getenv(name);
+  if (!value || value[0] == '\0') return false;
+
+  const std::string flag(value);
+  return flag != "0" && flag != "false" && flag != "False" && flag != "FALSE";
+}
+
+bool pathExists(const std::string& path) {
+  struct stat info;
+  return stat(path.c_str(), &info) == 0;
+}
+
+std::string mpsPipeDirectory() {
+  const char* pipe_dir = std::getenv("CUDA_MPS_PIPE_DIRECTORY");
+  if (pipe_dir && pipe_dir[0] != '\0') return pipe_dir;
+  return "/tmp/nvidia-mps";
+}
+
+bool ranksShareGpu(int local_ranks, int visible_devices) {
+  return visible_devices > 0 && local_ranks > visible_devices;
+}
+
+TestSetupDecision queryCudaDeviceCount(int& count) {
+  cudaError_t status = cudaGetDeviceCount(&count);
+  if (status == cudaSuccess) return {};
+
+  count = 0;
+  return {false, true, std::string("unable to query CUDA device count: ") + cudaGetErrorString(status)};
+}
+
+GpuTestRuntime queryGpuTestRuntime() {
+  GpuTestRuntime runtime;
+  runtime.mps_active = pathExists(mpsPipeDirectory() + "/nvidia-cuda-mps-control.pid");
+  if (ncclGetVersion(&runtime.nccl_version) != ncclSuccess) { runtime.nccl_version = 0; }
+  runtime.nccl_multi_rank_gpu_enabled = envFlagEnabled("NCCL_MULTI_RANK_GPU_ENABLE");
+  return runtime;
+}
+
+TestSetupDecision checkGpuSetupGlobal(const MpiTestComm& comm, const TestSetupDecision& local_decision) {
+  const int local_state = local_decision.fail ? 2 : local_decision.skip ? 1 : 0;
+  int global_state = 0;
+  MPI_Allreduce(&local_state, &global_state, 1, MPI_INT, MPI_MAX, comm.mpiComm());
+
+  if (global_state == 0) return {};
+  if (local_decision.skip || local_decision.fail) return local_decision;
+  if (global_state == 2) return {false, true, "GPU setup failed on another rank"};
+  return {true, false, "GPU setup skipped on another rank"};
+}
+
+TestSetupDecision checkGpuTestRequirementsLocal(const MpiTestComm& comm) {
+  int device_count = 0;
+  const TestSetupDecision device_count_decision = queryCudaDeviceCount(device_count);
+  if (device_count_decision.skip || device_count_decision.fail) return device_count_decision;
+
+  if (device_count <= 0) { return {false, true, "GPU tests require at least one visible CUDA device"}; }
+
+  const int local_ranks = comm.localSize();
+  if (ranksShareGpu(local_ranks, device_count)) {
+    if (!mpsActive()) {
+      return {false, true,
+              std::to_string(local_ranks) + " local ranks require GPU sharing with only " +
+                  std::to_string(device_count) +
+                  " visible CUDA device(s); enable CUDA MPS or provide at least one visible GPU per local rank"};
+    }
+  }
+
+  const int device = comm.localRank() % device_count;
+  cudaError_t status = cudaSetDevice(device);
+  if (status != cudaSuccess) {
+    return {false, true, "unable to set CUDA device " + std::to_string(device) + ": " + cudaGetErrorString(status)};
+  }
+
+  return {};
+}
+
+TestSetupDecision initializeGpuForTestLocal(const MpiTestComm& comm, bool check_nccl) {
+  TestSetupDecision decision = checkGpuTestRequirementsLocal(comm);
+  if (decision.skip || decision.fail) return decision;
+
+  if (check_nccl) {
+    int device_count = 0;
+    decision = queryCudaDeviceCount(device_count);
+    if (decision.skip || decision.fail) return decision;
+
+    if (ranksShareGpu(comm.localSize(), device_count)) {
+      if (!ncclSupportsMultiRankPerGpu()) {
+        return {true, false,
+                "NCCL multi-rank-per-GPU testing with MPS requires NCCL 2.30 or newer; runtime reports NCCL version " +
+                    std::to_string(ncclVersion())};
+      }
+
+      if (!ncclMultiRankGpuEnabled()) {
+        return {true, false, "NCCL multi-rank-per-GPU testing with MPS requires NCCL_MULTI_RANK_GPU_ENABLE=1"};
+      }
+    }
+  }
+
+  return {};
+}
+
+} // namespace
+
+void initializeGpuTestRuntime() { (void)gpuTestRuntime(); }
+
+const GpuTestRuntime& gpuTestRuntime() {
+  static const GpuTestRuntime runtime = queryGpuTestRuntime();
+  return runtime;
+}
+
+bool mpsActive() { return gpuTestRuntime().mps_active; }
+
+int ncclVersion() { return gpuTestRuntime().nccl_version; }
+
+bool ncclSupportsMultiRankPerGpu() { return ncclVersion() >= 23000; }
+
+bool ncclMultiRankGpuEnabled() { return gpuTestRuntime().nccl_multi_rank_gpu_enabled; }
+
+TestSetupDecision checkGpuTestRequirements(const MpiTestComm& comm) {
+  if (!comm.valid()) { return {true, false, "inactive MPI communicator"}; }
+
+  return checkGpuSetupGlobal(comm, checkGpuTestRequirementsLocal(comm));
+}
+
+TestSetupDecision initializeGpuForTest(const MpiTestComm& comm, bool check_nccl) {
+  if (!comm.valid()) { return {true, false, "inactive MPI communicator"}; }
+
+  return checkGpuSetupGlobal(comm, initializeGpuForTestLocal(comm, check_nccl));
+}
+
+} // namespace cudecomp_test
diff --git a/tests/ctest/gpu_test_utils.h b/tests/ctest/gpu_test_utils.h
new file mode 100644
index 0000000..cd3645e
--- /dev/null
+++ b/tests/ctest/gpu_test_utils.h
@@ -0,0 +1,38 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUDECOMP_TEST_GPU_TEST_UTILS_H
+#define CUDECOMP_TEST_GPU_TEST_UTILS_H
+
+#include <string>
+
+#include "mpi_test_utils.h"
+
+namespace cudecomp_test {
+
+struct TestSetupDecision {
+  bool skip = false;
+  bool fail = false;
+  std::string reason;
+};
+
+struct GpuTestRuntime {
+  bool mps_active = false;
+  int nccl_version = 0;
+  bool nccl_multi_rank_gpu_enabled = false;
+};
+
+void initializeGpuTestRuntime();
+const GpuTestRuntime& gpuTestRuntime();
+bool mpsActive();
+int ncclVersion();
+bool ncclSupportsMultiRankPerGpu();
+bool ncclMultiRankGpuEnabled();
+TestSetupDecision checkGpuTestRequirements(const MpiTestComm& comm);
+TestSetupDecision initializeGpuForTest(const MpiTestComm& comm, bool check_nccl = false);
+
+} // namespace cudecomp_test
+
+#endif
diff --git a/tests/ctest/halo_tests.cc b/tests/ctest/halo_tests.cc
new file mode 100644
index 0000000..e645709
--- /dev/null
+++ b/tests/ctest/halo_tests.cc
@@ -0,0 +1,395 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <array>
+#include <cctype>
+#include <complex>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <mpi.h>
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+#include "cudecomp.h"
+
+#include "backend_test_context.h"
+#include "backend_utils.h"
+#include "gpu_test_utils.h"
+#include "mpi_test_utils.h"
+#include "test_utils.h"
+
+namespace {
+
+constexpr std::array<int32_t, 3> kBaselineGdims{9, 10, 11};
+constexpr std::array<int32_t, 3> kZeroExtents{0, 0, 0};
+constexpr std::array<int32_t, 3> kBaselineHaloExtents{1, 3, 2};
+constexpr std::array<int32_t, 3> kNonzeroPadding{1, 2, 1};
+constexpr std::array<bool, 3> kDefaultAxisContiguous{false, false, false};
+constexpr std::array<bool, 3> kAllAxisContiguous{true, true, true};
+constexpr std::array<std::array<int32_t, 3>, 3> kDefaultMemOrder{{{{-1, -1, -1}}, {{-1, -1, -1}}, {{-1, -1, -1}}}};
+constexpr std::array<bool, 3> kPeriodicHalos{true, true, true};
+constexpr std::array<bool, 3> kNonPeriodicHalos{false, false, false};
+
+struct HaloCase {
+  cudecomp_test::HaloBackend backend;
+  const char* scenario;
+  int axis;
+  std::array<int32_t, 3> gdims;
+  std::array<int32_t, 2> pdims;
+  cudecompDataType_t dtype;
+  std::array<bool, 3> axis_contiguous;
+  std::array<std::array<int32_t, 3>, 3> mem_order;
+  std::array<int32_t, 3> halo_extents;
+  std::array<bool, 3> halo_periods;
+  std::array<int32_t, 3> padding;
+  cudecompRankOrder_t rank_order;
+};
+
+const char* axisName(int axis) {
+  switch (axis) {
+  case 0: return "X";
+  case 1: return "Y";
+  case 2: return "Z";
+  }
+  return "UnknownAxis";
+}
+
+std::string sanitizeParamName(const std::string& value) {
+  std::string result;
+  for (char ch : value) {
+    if (std::isalnum(static_cast<unsigned char>(ch))) {
+      result.push_back(ch);
+    } else {
+      result.push_back('_');
+    }
+  }
+  return result;
+}
+
+const char* dtypeName(cudecompDataType_t dtype) {
+  switch (dtype) {
+  case CUDECOMP_FLOAT: return "R32";
+  case CUDECOMP_FLOAT_COMPLEX: return "C32";
+  case CUDECOMP_DOUBLE: return "R64";
+  case CUDECOMP_DOUBLE_COMPLEX: return "C64";
+  }
+  return "UnknownDtype";
+}
+
+std::string paramName(const testing::TestParamInfo<HaloCase>& info) {
+  const auto& test_case = info.param;
+  return sanitizeParamName(test_case.scenario) + "_Axis" + axisName(test_case.axis) + "_" +
+         sanitizeParamName(test_case.backend.name) + "_" + dtypeName(test_case.dtype) + "_P" +
+         std::to_string(test_case.pdims[0]) + "x" + std::to_string(test_case.pdims[1]);
+}
+
+HaloCase makeCase(cudecomp_test::HaloBackend backend, const char* scenario, int axis,
+                  std::array<int32_t, 3> gdims = kBaselineGdims, std::array<int32_t, 2> pdims = {2, 2},
+                  cudecompDataType_t dtype = CUDECOMP_FLOAT,
+                  std::array<bool, 3> axis_contiguous = kDefaultAxisContiguous,
+                  std::array<std::array<int32_t, 3>, 3> mem_order = kDefaultMemOrder,
+                  std::array<int32_t, 3> halo_extents = kBaselineHaloExtents,
+                  std::array<bool, 3> halo_periods = kPeriodicHalos, std::array<int32_t, 3> padding = kZeroExtents,
+                  cudecompRankOrder_t rank_order = CUDECOMP_RANK_ORDER_DEFAULT) {
+  return {backend,         scenario,  axis,         gdims,        pdims,   dtype,
+          axis_contiguous, mem_order, halo_extents, halo_periods, padding, rank_order};
+}
+
+void appendBaselineCases(std::vector<HaloCase>& cases, const cudecomp_test::HaloBackend& backend) {
+  // Baseline cases are the common halo sweep: all axis-aligned pencil layouts, default and all-axis-contiguous storage,
+  // periodic and non-periodic boundary behavior, nonuniform halo extents, and single-precision real/complex data.
+  struct LayoutCase {
+    const char* periodic_scenario;
+    const char* non_periodic_scenario;
+    std::array<bool, 3> axis_contiguous;
+  };
+
+  for (const auto& layout : {LayoutCase{"BaselineDefaultLayoutPeriodicBoundaries",
+                                        "BaselineDefaultLayoutNonPeriodicBoundaries", kDefaultAxisContiguous},
+                             LayoutCase{"BaselineAxisContiguousPeriodicBoundaries",
+                                        "BaselineAxisContiguousNonPeriodicBoundaries", kAllAxisContiguous}}) {
+    for (int axis = 0; axis < 3; ++axis) {
+      for (const auto dtype : {CUDECOMP_FLOAT, CUDECOMP_FLOAT_COMPLEX}) {
+        cases.push_back(makeCase(backend, layout.periodic_scenario, axis, kBaselineGdims, {2, 2}, dtype,
+                                 layout.axis_contiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos));
+        cases.push_back(makeCase(backend, layout.non_periodic_scenario, axis, kBaselineGdims, {2, 2}, dtype,
+                                 layout.axis_contiguous, kDefaultMemOrder, kBaselineHaloExtents, kNonPeriodicHalos));
+      }
+    }
+  }
+}
+
+void appendCoverageCases(std::vector<HaloCase>& cases, const cudecomp_test::HaloBackend& backend) {
+  // Coverage cases target halo paths not guaranteed by the baseline sweep. They stay in the MPI collection because
+  // these behaviors are shared halo logic rather than backend-specific communication coverage.
+  cases.push_back(makeCase(backend, "NonzeroPadding", 0, kBaselineGdims, {2, 2}, CUDECOMP_FLOAT, kDefaultAxisContiguous,
+                           kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos, kNonzeroPadding));
+
+  cases.push_back(makeCase(backend, "ColumnMajorRankOrder", 0, kBaselineGdims, {2, 2}, CUDECOMP_FLOAT,
+                           kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos, kZeroExtents,
+                           CUDECOMP_RANK_ORDER_COL_MAJOR));
+
+  cases.push_back(makeCase(backend, "InteriorNonPeriodicNeighbors", 0, kBaselineGdims, {3, 1}, CUDECOMP_FLOAT,
+                           kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kNonPeriodicHalos));
+
+  cases.push_back(makeCase(backend, "DtypeWorkspacePadding", 0, kBaselineGdims, {2, 2}, CUDECOMP_DOUBLE,
+                           kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos,
+                           kNonzeroPadding));
+  cases.push_back(makeCase(backend, "DtypeWorkspacePadding", 0, kBaselineGdims, {2, 2}, CUDECOMP_DOUBLE_COMPLEX,
+                           kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos,
+                           kNonzeroPadding));
+}
+
+std::vector<HaloCase> haloCasesForLabel(const char* label) {
+  std::vector<HaloCase> cases;
+  for (const auto& backend : cudecomp_test::haloBackends()) {
+    if (std::string(backend.label) != label) continue;
+    appendBaselineCases(cases, backend);
+    if (std::string(label) == "mpi") { appendCoverageCases(cases, backend); }
+  }
+  return cases;
+}
+
+bool isInternal(const cudecompPencilInfo_t& pinfo, const std::array<int64_t, 3>& local) {
+  return local[0] >= pinfo.halo_extents[pinfo.order[0]] &&
+         local[0] < pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]] &&
+         local[1] >= pinfo.halo_extents[pinfo.order[1]] &&
+         local[1] < pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]] &&
+         local[2] >= pinfo.halo_extents[pinfo.order[2]] &&
+         local[2] < pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]];
+}
+
+bool isPadding(const cudecompPencilInfo_t& pinfo, const std::array<int64_t, 3>& local) {
+  return local[0] >= pinfo.shape[0] - pinfo.padding[pinfo.order[0]] ||
+         local[1] >= pinfo.shape[1] - pinfo.padding[pinfo.order[1]] ||
+         local[2] >= pinfo.shape[2] - pinfo.padding[pinfo.order[2]];
+}
+
+std::array<int64_t, 3> localCoordinate(int64_t index, const cudecompPencilInfo_t& pinfo) {
+  std::array<int64_t, 3> local{};
+  local[0] = index % pinfo.shape[0];
+  local[1] = index / pinfo.shape[0] % pinfo.shape[1];
+  local[2] = index / (pinfo.shape[0] * pinfo.shape[1]);
+  return local;
+}
+
+std::array<int64_t, 3> globalCoordinate(const cudecompPencilInfo_t& pinfo, const std::array<int64_t, 3>& local) {
+  std::array<int64_t, 3> global{};
+  global[pinfo.order[0]] = local[0] + pinfo.lo[0] - pinfo.halo_extents[pinfo.order[0]];
+  global[pinfo.order[1]] = local[1] + pinfo.lo[1] - pinfo.halo_extents[pinfo.order[1]];
+  global[pinfo.order[2]] = local[2] + pinfo.lo[2] - pinfo.halo_extents[pinfo.order[2]];
+  return global;
+}
+
+int64_t wrapIndex(int64_t index, int64_t size) {
+  const int64_t wrapped = index % size;
+  return (wrapped < 0) ? wrapped + size : wrapped;
+}
+
+template <typename T> T unsetValue() { return static_cast<T>(-1); }
+
+template <> std::complex<float> unsetValue<std::complex<float>>() { return {-1.0f, 0.0f}; }
+
+template <> std::complex<double> unsetValue<std::complex<double>>() { return {-1.0, 0.0}; }
+
+template <typename T> T pencilValue(int64_t global_index) { return static_cast<T>(global_index); }
+
+template <> std::complex<float> pencilValue<std::complex<float>>(int64_t global_index) {
+  return {static_cast<float>(global_index), -static_cast<float>(global_index)};
+}
+
+template <> std::complex<double> pencilValue<std::complex<double>>(int64_t global_index) {
+  return {static_cast<double>(global_index), -static_cast<double>(global_index)};
+}
+
+int64_t globalLinearIndex(const std::array<int64_t, 3>& global, const std::array<int32_t, 3>& gdims) {
+  return global[0] + gdims[0] * (global[1] + global[2] * gdims[1]);
+}
+
+template <typename T>
+std::vector<T> initializePencil(const cudecompPencilInfo_t& pinfo, const std::array<int32_t, 3>& gdims) {
+  std::vector<T> data(pinfo.size, unsetValue<T>());
+
+  for (int64_t i = 0; i < pinfo.size; ++i) {
+    const auto local = localCoordinate(i, pinfo);
+    if (!isInternal(pinfo, local)) continue;
+
+    const auto global = globalCoordinate(pinfo, local);
+    data[i] = pencilValue<T>(globalLinearIndex(global, gdims));
+  }
+
+  return data;
+}
+
+template <typename T>
+std::vector<T> initializeReference(const cudecompPencilInfo_t& pinfo, const std::array<int32_t, 3>& gdims,
+                                   const std::array<bool, 3>& halo_periods) {
+  std::vector<T> data(pinfo.size, unsetValue<T>());
+
+  for (int64_t i = 0; i < pinfo.size; ++i) {
+    const auto local = localCoordinate(i, pinfo);
+    auto global = globalCoordinate(pinfo, local);
+    bool unset = isPadding(pinfo, local);
+
+    for (int dim = 0; dim < 3; ++dim) {
+      if (global[dim] >= 0 && global[dim] < gdims[dim]) continue;
+
+      if (halo_periods[dim]) {
+        global[dim] = wrapIndex(global[dim], gdims[dim]);
+      } else {
+        unset = true;
+      }
+    }
+
+    if (!unset) { data[i] = pencilValue<T>(globalLinearIndex(global, gdims)); }
+  }
+
+  return data;
+}
+
+template <typename T>
+testing::AssertionResult pencilMatches(const std::vector<T>& expected, const std::vector<T>& actual,
+                                       const cudecompPencilInfo_t& pinfo) {
+  if (expected.size() != actual.size()) {
+    return testing::AssertionFailure() << "size mismatch: expected " << expected.size() << ", got " << actual.size();
+  }
+
+  for (int64_t i = 0; i < pinfo.size; ++i) {
+    if (expected[i] == actual[i]) continue;
+
+    const auto local = localCoordinate(i, pinfo);
+    return testing::AssertionFailure() << "mismatch at local index " << i << " coordinate (" << local[0] << ", "
+                                       << local[1] << ", " << local[2] << "): expected " << expected[i] << ", got "
+                                       << actual[i];
+  }
+
+  return testing::AssertionSuccess();
+}
+
+template <typename T>
+cudecompResult_t runHalo(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, int axis, T* input, void* work,
+                         cudecompDataType_t dtype, const cudecompPencilInfo_t& pinfo,
+                         const std::array<bool, 3>& halo_periods, int dim) {
+  switch (axis) {
+  case 0:
+    return cudecompUpdateHalosX(handle, grid_desc, input, work, dtype, pinfo.halo_extents, halo_periods.data(), dim,
+                                pinfo.padding, 0);
+  case 1:
+    return cudecompUpdateHalosY(handle, grid_desc, input, work, dtype, pinfo.halo_extents, halo_periods.data(), dim,
+                                pinfo.padding, 0);
+  case 2:
+    return cudecompUpdateHalosZ(handle, grid_desc, input, work, dtype, pinfo.halo_extents, halo_periods.data(), dim,
+                                pinfo.padding, 0);
+  }
+  return CUDECOMP_RESULT_INVALID_USAGE;
+}
+
+} // namespace
+
+class HaloCorrectnessTest : public ::testing::TestWithParam<HaloCase> {};
+
+template <typename T> void runHaloCase(const HaloCase& test_case) {
+  const int active_ranks = test_case.pdims[0] * test_case.pdims[1];
+  const auto world_comm = cudecomp_test::MpiTestComm::world();
+
+  if (world_comm.size() < active_ranks) {
+    GTEST_SKIP() << "axis " << axisName(test_case.axis) << " halo case with pdims " << test_case.pdims[0] << "x"
+                 << test_case.pdims[1] << " requires " << active_ranks << " ranks, launched with " << world_comm.size();
+  }
+
+  cudecompGridDescConfig_t config;
+  ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config));
+  config.gdims[0] = test_case.gdims[0];
+  config.gdims[1] = test_case.gdims[1];
+  config.gdims[2] = test_case.gdims[2];
+  config.pdims[0] = test_case.pdims[0];
+  config.pdims[1] = test_case.pdims[1];
+  config.rank_order = test_case.rank_order;
+  config.halo_comm_backend = test_case.backend.backend;
+  config.transpose_axis_contiguous[0] = test_case.axis_contiguous[0];
+  config.transpose_axis_contiguous[1] = test_case.axis_contiguous[1];
+  config.transpose_axis_contiguous[2] = test_case.axis_contiguous[2];
+  for (int axis = 0; axis < 3; ++axis) {
+    for (int i = 0; i < 3; ++i) {
+      config.transpose_mem_order[axis][i] = test_case.mem_order[axis][i];
+    }
+  }
+
+  cudecomp_test::BackendTestContext test_context;
+  cudecomp_test::TestSetupDecision setup_decision;
+  ASSERT_TRUE(test_context.initialize(world_comm, active_ranks, test_case.backend.label,
+                                      std::string(test_case.backend.label) == "nccl", config, &setup_decision));
+  ASSERT_FALSE(setup_decision.fail) << setup_decision.reason;
+  if (setup_decision.skip) { GTEST_SKIP() << setup_decision.reason; }
+
+  const auto& active_comm = test_context.comm();
+  cudecompHandle_t handle = test_context.handle();
+  ASSERT_NE(handle, nullptr);
+
+  cudecompGridDesc_t grid_desc = nullptr;
+  const cudecompResult_t grid_desc_create_result = cudecompGridDescCreate(handle, &grid_desc, &config, nullptr);
+  cudecomp_test::gridDescGuard grid_desc_guard(handle, grid_desc);
+  CHECK_CUDECOMP_GLOBAL(active_comm, grid_desc_create_result);
+
+  cudecompPencilInfo_t pinfo;
+  CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetPencilInfo(handle, grid_desc, &pinfo, test_case.axis,
+                                                           test_case.halo_extents.data(), test_case.padding.data()));
+
+  int64_t workspace_num_elements = 0;
+  CHECK_CUDECOMP_GLOBAL(active_comm,
+                        cudecompGetHaloWorkspaceSize(handle, grid_desc, test_case.axis, test_case.halo_extents.data(),
+                                                     &workspace_num_elements));
+
+  int64_t dtype_size = 0;
+  CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetDataTypeSize(test_case.dtype, &dtype_size));
+
+  const auto initial = initializePencil<T>(pinfo, test_case.gdims);
+  const auto expected = initializeReference<T>(pinfo, test_case.gdims, test_case.halo_periods);
+
+  T* data_d = nullptr;
+  const cudaError_t data_alloc_result = cudaMalloc(&data_d, pinfo.size * sizeof(*data_d));
+  cudecomp_test::cudaBufferGuard data_buffer(data_d);
+  CHECK_CUDA_GLOBAL(active_comm, data_alloc_result);
+
+  void* work_d = nullptr;
+  const cudecompResult_t work_alloc_result =
+      cudecompMalloc(handle, grid_desc, &work_d, workspace_num_elements * dtype_size);
+  cudecomp_test::cudecompBufferGuard work_buffer(handle, grid_desc, work_d);
+  CHECK_CUDECOMP_GLOBAL(active_comm, work_alloc_result);
+
+  CHECK_CUDA_GLOBAL(active_comm, cudaMemset(data_d, 0, pinfo.size * sizeof(*data_d)));
+  CHECK_CUDA_GLOBAL(active_comm,
+                    cudaMemcpy(data_d, initial.data(), initial.size() * sizeof(*data_d), cudaMemcpyHostToDevice));
+  CHECK_CUDA_GLOBAL(active_comm, cudaMemset(work_d, 0, workspace_num_elements * dtype_size));
+
+  for (int dim = 0; dim < 3; ++dim) {
+    CHECK_CUDECOMP_GLOBAL(active_comm, runHalo(handle, grid_desc, test_case.axis, data_d, work_d, test_case.dtype,
+                                               pinfo, test_case.halo_periods, dim));
+  }
+
+  std::vector<T> actual(expected.size(), unsetValue<T>());
+  CHECK_CUDA_GLOBAL(active_comm,
+                    cudaMemcpy(actual.data(), data_d, actual.size() * sizeof(*data_d), cudaMemcpyDeviceToHost));
+  EXPECT_TRUE(pencilMatches(expected, actual, pinfo));
+}
+
+TEST_P(HaloCorrectnessTest, UpdateHalos) {
+  const auto test_case = GetParam();
+  switch (test_case.dtype) {
+  case CUDECOMP_FLOAT: runHaloCase<float>(test_case); break;
+  case CUDECOMP_FLOAT_COMPLEX: runHaloCase<std::complex<float>>(test_case); break;
+  case CUDECOMP_DOUBLE: runHaloCase<double>(test_case); break;
+  case CUDECOMP_DOUBLE_COMPLEX: runHaloCase<std::complex<double>>(test_case); break;
+  default: FAIL() << "unsupported test dtype " << test_case.dtype;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(MpiBackends, HaloCorrectnessTest, ::testing::ValuesIn(haloCasesForLabel("mpi")), paramName);
+INSTANTIATE_TEST_SUITE_P(NcclBackends, HaloCorrectnessTest, ::testing::ValuesIn(haloCasesForLabel("nccl")), paramName);
+INSTANTIATE_TEST_SUITE_P(NvshmemBackends, HaloCorrectnessTest, ::testing::ValuesIn(haloCasesForLabel("nvshmem")),
+                         paramName);
diff --git a/tests/ctest/mpi_test_main.cc b/tests/ctest/mpi_test_main.cc
new file mode 100644
index 0000000..82f2822
--- /dev/null
+++ b/tests/ctest/mpi_test_main.cc
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <iostream>
+
+#include <mpi.h>
+
+#include <gtest/gtest.h>
+
+#include "backend_test_context.h"
+#include "gpu_test_utils.h"
+#include "mpi_test_utils.h"
+
+namespace {
+
+class RankFailurePrinter : public ::testing::EmptyTestEventListener {
+public:
+  explicit RankFailurePrinter(int rank) : rank_(rank) {}
+
+  void OnTestPartResult(const ::testing::TestPartResult& result) override {
+    if (!result.failed()) return;
+
+    std::cerr << "[rank " << rank_ << "] " << result.file_name() << ":" << result.line_number() << ": "
+              << result.summary() << "\n";
+  }
+
+private:
+  int rank_;
+};
+
+bool gpuRequirementsSatisfied(const cudecomp_test::MpiTestComm& world_comm) {
+  const auto setup_decision = cudecomp_test::checkGpuTestRequirements(world_comm);
+  if (!setup_decision.fail) return true;
+
+  std::cerr << "[rank " << world_comm.rank() << "] GPU test requirements are not satisfied: " << setup_decision.reason
+            << "\n";
+  return false;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  MPI_Init(&argc, &argv);
+
+  int global_failure = 0;
+
+  {
+    const auto world_comm = cudecomp_test::MpiTestComm::world();
+    cudecomp_test::initializeGpuTestRuntime();
+
+    ::testing::InitGoogleTest(&argc, argv);
+    ::testing::UnitTest::GetInstance()->listeners().Append(new RankFailurePrinter(world_comm.rank()));
+
+    if (world_comm.rank() == 0) { std::cout << "[cuDecomp test] MPI ranks: " << world_comm.size() << "\n"; }
+
+    const int local_result = gpuRequirementsSatisfied(world_comm) ? RUN_ALL_TESTS() : 1;
+    cudecomp_test::resetSharedBackendTestContext();
+    const int local_failure = local_result == 0 ? 0 : 1;
+    MPI_Allreduce(&local_failure, &global_failure, 1, MPI_INT, MPI_MAX, world_comm.mpiComm());
+  }
+
+  MPI_Finalize();
+  return global_failure == 0 ? 0 : 1;
+}
diff --git a/tests/ctest/mpi_test_utils.cc b/tests/ctest/mpi_test_utils.cc
new file mode 100644
index 0000000..af0fdcc
--- /dev/null
+++ b/tests/ctest/mpi_test_utils.cc
@@ -0,0 +1,101 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "mpi_test_utils.h"
+
+namespace cudecomp_test {
+
+MpiTestComm::MpiTestComm(MPI_Comm comm) {
+  if (comm == MPI_COMM_NULL) return;
+
+  MPI_Comm_dup(comm, &comm_);
+  MPI_Comm_rank(comm_, &rank_);
+  MPI_Comm_size(comm_, &size_);
+  MPI_Comm_split_type(comm_, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm_);
+  MPI_Comm_rank(local_comm_, &local_rank_);
+  MPI_Comm_size(local_comm_, &local_size_);
+}
+
+MpiTestComm::MpiTestComm(MpiTestComm&& other) noexcept
+    : comm_(other.comm_), local_comm_(other.local_comm_), rank_(other.rank_), size_(other.size_),
+      local_rank_(other.local_rank_), local_size_(other.local_size_) {
+  other.comm_ = MPI_COMM_NULL;
+  other.local_comm_ = MPI_COMM_NULL;
+  other.rank_ = -1;
+  other.size_ = 0;
+  other.local_rank_ = -1;
+  other.local_size_ = 0;
+}
+
+MpiTestComm& MpiTestComm::operator=(MpiTestComm&& other) noexcept {
+  if (this != &other) {
+    reset();
+    comm_ = other.comm_;
+    local_comm_ = other.local_comm_;
+    rank_ = other.rank_;
+    size_ = other.size_;
+    local_rank_ = other.local_rank_;
+    local_size_ = other.local_size_;
+
+    other.comm_ = MPI_COMM_NULL;
+    other.local_comm_ = MPI_COMM_NULL;
+    other.rank_ = -1;
+    other.size_ = 0;
+    other.local_rank_ = -1;
+    other.local_size_ = 0;
+  }
+  return *this;
+}
+
+MpiTestComm::~MpiTestComm() { reset(); }
+
+MpiTestComm MpiTestComm::world() { return fromComm(MPI_COMM_WORLD); }
+
+MpiTestComm MpiTestComm::split(const MpiTestComm& parent_comm, int requested_ranks) {
+  const bool valid_request = requested_ranks > 0 && requested_ranks <= parent_comm.size();
+  const bool active = valid_request && parent_comm.rank() < requested_ranks;
+
+  MPI_Comm comm = MPI_COMM_NULL;
+  MPI_Comm_split(parent_comm.mpiComm(), active ? 0 : MPI_UNDEFINED, parent_comm.rank(), &comm);
+
+  MpiTestComm result = fromComm(comm);
+  if (comm != MPI_COMM_NULL) { MPI_Comm_free(&comm); }
+  return result;
+}
+
+MpiTestComm MpiTestComm::fromComm(MPI_Comm comm) { return MpiTestComm(comm); }
+
+MPI_Comm MpiTestComm::mpiComm() const { return comm_; }
+
+MPI_Comm MpiTestComm::localComm() const { return local_comm_; }
+
+bool MpiTestComm::valid() const { return comm_ != MPI_COMM_NULL; }
+
+int MpiTestComm::rank() const { return rank_; }
+
+int MpiTestComm::size() const { return size_; }
+
+int MpiTestComm::localRank() const { return local_rank_; }
+
+int MpiTestComm::localSize() const { return local_size_; }
+
+void MpiTestComm::reset() {
+  int finalized = 0;
+  MPI_Finalized(&finalized);
+
+  if (!finalized) {
+    if (local_comm_ != MPI_COMM_NULL) { MPI_Comm_free(&local_comm_); }
+    if (comm_ != MPI_COMM_NULL) { MPI_Comm_free(&comm_); }
+  }
+
+  local_comm_ = MPI_COMM_NULL;
+  comm_ = MPI_COMM_NULL;
+  rank_ = -1;
+  size_ = 0;
+  local_rank_ = -1;
+  local_size_ = 0;
+}
+
+} // namespace cudecomp_test
diff --git a/tests/ctest/mpi_test_utils.h b/tests/ctest/mpi_test_utils.h
new file mode 100644
index 0000000..209a24b
--- /dev/null
+++ b/tests/ctest/mpi_test_utils.h
@@ -0,0 +1,48 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUDECOMP_TEST_MPI_TEST_UTILS_H
+#define CUDECOMP_TEST_MPI_TEST_UTILS_H
+
+#include <mpi.h>
+
+namespace cudecomp_test {
+
+class MpiTestComm {
+public:
+  MpiTestComm() = default;
+  MpiTestComm(const MpiTestComm&) = delete;
+  MpiTestComm& operator=(const MpiTestComm&) = delete;
+  MpiTestComm(MpiTestComm&& other) noexcept;
+  MpiTestComm& operator=(MpiTestComm&& other) noexcept;
+  ~MpiTestComm();
+
+  static MpiTestComm world();
+  static MpiTestComm split(const MpiTestComm& parent_comm, int requested_ranks);
+  static MpiTestComm fromComm(MPI_Comm comm);
+
+  MPI_Comm mpiComm() const;
+  MPI_Comm localComm() const;
+  bool valid() const;
+  int rank() const;
+  int size() const;
+  int localRank() const;
+  int localSize() const;
+  void reset();
+
+private:
+  explicit MpiTestComm(MPI_Comm comm);
+
+  MPI_Comm comm_ = MPI_COMM_NULL;
+  MPI_Comm local_comm_ = MPI_COMM_NULL;
+  int rank_ = -1;
+  int size_ = 0;
+  int local_rank_ = -1;
+  int local_size_ = 0;
+};
+
+} // namespace cudecomp_test
+
+#endif
diff --git a/tests/ctest/test_utils.cc b/tests/ctest/test_utils.cc
new file mode 100644
index 0000000..f57947f
--- /dev/null
+++ b/tests/ctest/test_utils.cc
@@ -0,0 +1,82 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "test_utils.h"
+
+#include <string>
+
+namespace cudecomp_test {
+namespace {
+
+std::string sourceLocation(const char* file, int line) { return std::string(file) + ":" + std::to_string(line); }
+
+testing::AssertionResult checkLocalFailureGlobal(const MpiTestComm& comm, bool local_success, const char* error_kind,
+                                                 const std::string& local_error, const char* file, int line) {
+  const int local_failure = local_success ? 0 : 1;
+  int global_failure = 0;
+  const int reduce_result = MPI_Allreduce(&local_failure, &global_failure, 1, MPI_INT, MPI_MAX, comm.mpiComm());
+  if (reduce_result != MPI_SUCCESS) {
+    return testing::AssertionFailure() << "MPI_Allreduce failed in global check at " << sourceLocation(file, line);
+  }
+
+  if (global_failure == 0) return testing::AssertionSuccess();
+  if (!local_success) {
+    return testing::AssertionFailure() << error_kind << " failed at " << sourceLocation(file, line) << " with "
+                                       << local_error;
+  }
+  return testing::AssertionFailure() << error_kind << " failed at " << sourceLocation(file, line) << " on another rank";
+}
+
+} // namespace
+
+cudecompHandleGuard::cudecompHandleGuard(cudecompHandle_t handle) : handle_(handle) {}
+
+cudecompHandleGuard::~cudecompHandleGuard() {
+  if (handle_) { (void)cudecompFinalize(handle_); }
+}
+
+gridDescGuard::gridDescGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc)
+    : handle_(handle), grid_desc_(grid_desc) {}
+
+gridDescGuard::~gridDescGuard() {
+  if (grid_desc_) { (void)cudecompGridDescDestroy(handle_, grid_desc_); }
+}
+
+cudaBufferGuard::cudaBufferGuard(void* ptr) : ptr_(ptr) {}
+
+cudaBufferGuard::~cudaBufferGuard() { reset(); }
+
+void cudaBufferGuard::reset(void* ptr) {
+  if (ptr_) { (void)cudaFree(ptr_); }
+  ptr_ = ptr;
+}
+
+cudecompBufferGuard::cudecompBufferGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* ptr)
+    : handle_(handle), grid_desc_(grid_desc), ptr_(ptr) {}
+
+cudecompBufferGuard::~cudecompBufferGuard() {
+  if (ptr_) { (void)cudecompFree(handle_, grid_desc_, ptr_); }
+}
+
+void cudecompBufferGuard::release() noexcept { ptr_ = nullptr; }
+
+testing::AssertionResult checkCudaGlobal(const MpiTestComm& comm, cudaError_t result, const char* file, int line) {
+  return checkLocalFailureGlobal(comm, result == cudaSuccess, "CUDA", cudaGetErrorString(result), file, line);
+}
+
+testing::AssertionResult checkCudecompGlobal(const MpiTestComm& comm, cudecompResult_t result, const char* file,
+                                             int line) {
+  return checkLocalFailureGlobal(comm, result == CUDECOMP_RESULT_SUCCESS, "cuDecomp", std::to_string(result), file,
+                                 line);
+}
+
+testing::AssertionResult checkMpiGlobal(const MpiTestComm& comm, int result, const char* file, int line) {
+  char error_string[MPI_MAX_ERROR_STRING] = {};
+  int error_string_len = 0;
+  if (result != MPI_SUCCESS) { MPI_Error_string(result, error_string, &error_string_len); }
+  return checkLocalFailureGlobal(comm, result == MPI_SUCCESS, "MPI", error_string, file, line);
+}
+
+} // namespace cudecomp_test
diff --git a/tests/ctest/test_utils.h b/tests/ctest/test_utils.h
new file mode 100644
index 0000000..e910f50
--- /dev/null
+++ b/tests/ctest/test_utils.h
@@ -0,0 +1,82 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef CUDECOMP_TEST_UTILS_H
+#define CUDECOMP_TEST_UTILS_H
+
+#include <mpi.h>
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+#include "cudecomp.h"
+#include "mpi_test_utils.h"
+
+namespace cudecomp_test {
+
+class cudecompHandleGuard {
+public:
+  explicit cudecompHandleGuard(cudecompHandle_t handle = nullptr);
+  cudecompHandleGuard(const cudecompHandleGuard&) = delete;
+  cudecompHandleGuard& operator=(const cudecompHandleGuard&) = delete;
+  ~cudecompHandleGuard();
+
+private:
+  cudecompHandle_t handle_ = nullptr;
+};
+
+class gridDescGuard {
+public:
+  gridDescGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc);
+  gridDescGuard(const gridDescGuard&) = delete;
+  gridDescGuard& operator=(const gridDescGuard&) = delete;
+  ~gridDescGuard();
+
+private:
+  cudecompHandle_t handle_ = nullptr;
+  cudecompGridDesc_t grid_desc_ = nullptr;
+};
+
+class cudaBufferGuard {
+public:
+  explicit cudaBufferGuard(void* ptr = nullptr);
+  cudaBufferGuard(const cudaBufferGuard&) = delete;
+  cudaBufferGuard& operator=(const cudaBufferGuard&) = delete;
+  ~cudaBufferGuard();
+
+  void reset(void* ptr = nullptr);
+
+private:
+  void* ptr_ = nullptr;
+};
+
+class cudecompBufferGuard {
+public:
+  cudecompBufferGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* ptr);
+  cudecompBufferGuard(const cudecompBufferGuard&) = delete;
+  cudecompBufferGuard& operator=(const cudecompBufferGuard&) = delete;
+  ~cudecompBufferGuard();
+
+  void release() noexcept;
+
+private:
+  cudecompHandle_t handle_ = nullptr;
+  cudecompGridDesc_t grid_desc_ = nullptr;
+  void* ptr_ = nullptr;
+};
+
+testing::AssertionResult checkCudaGlobal(const MpiTestComm& comm, cudaError_t result, const char* file, int line);
+testing::AssertionResult checkCudecompGlobal(const MpiTestComm& comm, cudecompResult_t result, const char* file,
+                                             int line);
+testing::AssertionResult checkMpiGlobal(const MpiTestComm& comm, int result, const char* file, int line);
+
+} // namespace cudecomp_test
+
+#define CHECK_CUDA_GLOBAL(comm, call) ASSERT_TRUE(::cudecomp_test::checkCudaGlobal((comm), (call), __FILE__, __LINE__))
+#define CHECK_CUDECOMP_GLOBAL(comm, call)                                                                              \
+  ASSERT_TRUE(::cudecomp_test::checkCudecompGlobal((comm), (call), __FILE__, __LINE__))
+#define CHECK_MPI_GLOBAL(comm, call) ASSERT_TRUE(::cudecomp_test::checkMpiGlobal((comm), (call), __FILE__, __LINE__))
+
+#endif
diff --git a/tests/ctest/transpose_tests.cc b/tests/ctest/transpose_tests.cc
new file mode 100644
index 0000000..e97382e
--- /dev/null
+++ b/tests/ctest/transpose_tests.cc
@@ -0,0 +1,671 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <complex>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <mpi.h>
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+#include "cudecomp.h"
+#include "internal/common.h"
+
+#include "backend_test_context.h"
+#include "backend_utils.h"
+#include "gpu_test_utils.h"
+#include "mpi_test_utils.h"
+#include "test_utils.h"
+
+namespace {
+
+enum class TransposeOperation { XToY, YToX, YToZ, ZToY };
+
+constexpr std::array<TransposeOperation, 4> kTransposeOperations{TransposeOperation::XToY, TransposeOperation::YToX,
+                                                                 TransposeOperation::YToZ, TransposeOperation::ZToY};
+constexpr std::array<int32_t, 3> kBaselineGdims{9, 10, 11};
+constexpr std::array<int32_t, 3> kZeroExtents{0, 0, 0};
+constexpr std::array<int32_t, 3> kInputHaloExtents{1, 2, 1};
+constexpr std::array<int32_t, 3> kOutputHaloExtents{2, 1, 1};
+constexpr std::array<int32_t, 3> kInputPadding{1, 1, 2};
+constexpr std::array<int32_t, 3> kOutputPadding{2, 1, 1};
+constexpr std::array<bool, 3> kDefaultAxisContiguous{false, false, false};
+constexpr std::array<bool, 3> kAllAxisContiguous{true, true, true};
+constexpr std::array<std::array<int32_t, 3>, 3> kDefaultMemOrder{{{{-1, -1, -1}}, {{-1, -1, -1}}, {{-1, -1, -1}}}};
+
+struct TransposeCase {
+  cudecomp_test::TransposeBackend backend;
+  const char* scenario;
+  TransposeOperation operation;
+  std::array<int32_t, 3> gdims;
+  std::array<int32_t, 2> pdims;
+  cudecompDataType_t dtype;
+  bool out_of_place;
+  std::array<bool, 3> axis_contiguous;
+  std::array<std::array<int32_t, 3>, 3> mem_order;
+  std::array<int32_t, 3> input_halo_extents;
+  std::array<int32_t, 3> output_halo_extents;
+  std::array<int32_t, 3> input_padding;
+  std::array<int32_t, 3> output_padding;
+  cudecompRankOrder_t rank_order;
+  std::vector<int32_t> synthetic_host_groups;
+};
+
+const char* operationName(TransposeOperation operation) {
+  switch (operation) {
+  case TransposeOperation::XToY: return "XToY";
+  case TransposeOperation::YToX: return "YToX";
+  case TransposeOperation::YToZ: return "YToZ";
+  case TransposeOperation::ZToY: return "ZToY";
+  }
+  return "Unknown";
+}
+
+int inputAxis(TransposeOperation operation) {
+  switch (operation) {
+  case TransposeOperation::XToY: return 0;
+  case TransposeOperation::YToX: return 1;
+  case TransposeOperation::YToZ: return 1;
+  case TransposeOperation::ZToY: return 2;
+  }
+  return 0;
+}
+
+int outputAxis(TransposeOperation operation) {
+  switch (operation) {
+  case TransposeOperation::XToY: return 1;
+  case TransposeOperation::YToX: return 0;
+  case TransposeOperation::YToZ: return 2;
+  case TransposeOperation::ZToY: return 1;
+  }
+  return 0;
+}
+
+std::string sanitizeParamName(const std::string& value) {
+  std::string result;
+  for (char ch : value) {
+    if (std::isalnum(static_cast<unsigned char>(ch))) {
+      result.push_back(ch);
+    } else {
+      result.push_back('_');
+    }
+  }
+  return result;
+}
+
+const char* dtypeName(cudecompDataType_t dtype) {
+  switch (dtype) {
+  case CUDECOMP_FLOAT: return "R32";
+  case CUDECOMP_FLOAT_COMPLEX: return "C32";
+  case CUDECOMP_DOUBLE: return "R64";
+  case CUDECOMP_DOUBLE_COMPLEX: return "C64";
+  }
+  return "UnknownDtype";
+}
+
+std::string paramName(const testing::TestParamInfo<TransposeCase>& info) {
+  const auto& test_case = info.param;
+  return sanitizeParamName(test_case.scenario) + "_" + operationName(test_case.operation) + "_" +
+         sanitizeParamName(test_case.backend.name) + "_" + dtypeName(test_case.dtype) + "_P" +
+         std::to_string(test_case.pdims[0]) + "x" + std::to_string(test_case.pdims[1]) + "_" +
+         (test_case.out_of_place ? "OutOfPlace" : "InPlace");
+}
+
+TransposeCase makeCase(cudecomp_test::TransposeBackend backend, const char* scenario, TransposeOperation operation,
+                       std::array<int32_t, 3> gdims = kBaselineGdims, std::array<int32_t, 2> pdims = {2, 2},
+                       cudecompDataType_t dtype = CUDECOMP_FLOAT, bool out_of_place = false,
+                       std::array<bool, 3> axis_contiguous = kDefaultAxisContiguous,
+                       std::array<std::array<int32_t, 3>, 3> mem_order = kDefaultMemOrder,
+                       std::array<int32_t, 3> input_halo_extents = kZeroExtents,
+                       std::array<int32_t, 3> output_halo_extents = kZeroExtents,
+                       std::array<int32_t, 3> input_padding = kZeroExtents,
+                       std::array<int32_t, 3> output_padding = kZeroExtents,
+                       cudecompRankOrder_t rank_order = CUDECOMP_RANK_ORDER_DEFAULT) {
+  return {backend,
+          scenario,
+          operation,
+          gdims,
+          pdims,
+          dtype,
+          out_of_place,
+          axis_contiguous,
+          mem_order,
+          input_halo_extents,
+          output_halo_extents,
+          input_padding,
+          output_padding,
+          rank_order,
+          {}};
+}
+
+TransposeCase withSyntheticHostGroups(TransposeCase test_case, std::vector<int32_t> synthetic_host_groups) {
+  test_case.synthetic_host_groups = std::move(synthetic_host_groups);
+  return test_case;
+}
+
+TransposeCase withHaloAndPadding(TransposeCase test_case) {
+  test_case.input_halo_extents = kInputHaloExtents;
+  test_case.output_halo_extents = kOutputHaloExtents;
+  test_case.input_padding = kInputPadding;
+  test_case.output_padding = kOutputPadding;
+  return test_case;
+}
+
+void appendBaselineCases(std::vector<TransposeCase>& cases, const cudecomp_test::TransposeBackend& backend) {
+  // Baseline cases are the common transpose sweep: all direct transpose operations, in-place and out-of-place, default
+  // layout and all-axis-contiguous layout, and single-precision real/complex data. MPI also keeps P1x1 coverage for
+  // the single-rank local special cases. NCCL and NVSHMEM skip P1x1 because it bypasses backend communication; for
+  // NVSHMEM, it also avoids changing the PE count across init/finalize cycles inside one CTest process set.
+  struct LayoutCase {
+    const char* scenario;
+    std::array<bool, 3> axis_contiguous;
+  };
+
+  for (const auto& layout : {LayoutCase{"BaselineDefaultLayout", kDefaultAxisContiguous},
+                             LayoutCase{"BaselineAxisContiguous", kAllAxisContiguous}}) {
+    for (const auto pdims : {std::array<int32_t, 2>{1, 1}, std::array<int32_t, 2>{2, 2}}) {
+      if (pdims == std::array<int32_t, 2>{1, 1} && std::string(backend.label) != "mpi") continue;
+
+      for (const auto dtype : {CUDECOMP_FLOAT, CUDECOMP_FLOAT_COMPLEX}) {
+        for (const auto operation : kTransposeOperations) {
+          for (const bool out_of_place : {false, true}) {
+            cases.push_back(makeCase(backend, layout.scenario, operation, kBaselineGdims, pdims, dtype, out_of_place,
+                                     layout.axis_contiguous));
+          }
+        }
+      }
+    }
+  }
+}
+
+void appendCoverageCases(std::vector<TransposeCase>& cases, const cudecomp_test::TransposeBackend& backend) {
+  // Coverage cases select explicit memory orders, nonzero halo/padding, dtypes, rank order, and rank counts to reach
+  // transpose paths not guaranteed by the baseline sweep. These are not inherently MPI-only, but running them in the
+  // MPI collection keeps NCCL/NVSHMEM focused on backend baseline coverage while still exercising shared transpose
+  // logic.
+  constexpr std::array<std::array<int32_t, 3>, 3> unpack_mem_order{{{{0, 1, 2}}, {{0, 1, 2}}, {{0, 1, 2}}}};
+  constexpr std::array<std::array<int32_t, 3>, 3> transpose_unpack_mem_order{{{{0, 2, 1}}, {{0, 1, 2}}, {{0, 1, 2}}}};
+  // These last-axis choices force the multi-rank split transpose/unpack path for each direct operation.
+  constexpr std::array<std::array<int32_t, 3>, 3> split_unpack_mem_order{{{{0, 1, 2}}, {{0, 2, 1}}, {{1, 2, 0}}}};
+
+  cases.push_back(
+      withHaloAndPadding(makeCase(backend, "ExplicitMemOrderUnpack", TransposeOperation::XToY, kBaselineGdims, {2, 2},
+                                  CUDECOMP_FLOAT, true, kDefaultAxisContiguous, unpack_mem_order)));
+  cases.push_back(
+      withHaloAndPadding(makeCase(backend, "ExplicitMemOrderTransposeUnpack", TransposeOperation::XToY, kBaselineGdims,
+                                  {2, 2}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, transpose_unpack_mem_order)));
+  for (const auto operation : kTransposeOperations) {
+    cases.push_back(
+        withHaloAndPadding(makeCase(backend, "ExplicitMemOrderSplitUnpack", operation, kBaselineGdims, {2, 2},
+                                    CUDECOMP_FLOAT, true, kDefaultAxisContiguous, split_unpack_mem_order)));
+  }
+
+  cases.push_back(makeCase(backend, "ColumnMajorRankOrder", TransposeOperation::XToY, kBaselineGdims, {2, 2},
+                           CUDECOMP_FLOAT, false, kDefaultAxisContiguous, kDefaultMemOrder, kZeroExtents, kZeroExtents,
+                           kZeroExtents, kZeroExtents, CUDECOMP_RANK_ORDER_COL_MAJOR));
+
+  cases.push_back(
+      withHaloAndPadding(makeCase(backend, "NonPowerOfTwoCommunicator", TransposeOperation::XToY, kBaselineGdims,
+                                  {3, 1}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, unpack_mem_order)));
+
+  cases.push_back(
+      withHaloAndPadding(makeCase(backend, "DtypeWorkspacePadding", TransposeOperation::XToY, kBaselineGdims, {2, 2},
+                                  CUDECOMP_DOUBLE, true, kDefaultAxisContiguous, split_unpack_mem_order)));
+  cases.push_back(
+      withHaloAndPadding(makeCase(backend, "DtypeWorkspacePadding", TransposeOperation::YToZ, kBaselineGdims, {2, 2},
+                                  CUDECOMP_DOUBLE_COMPLEX, true, kDefaultAxisContiguous, split_unpack_mem_order)));
+
+  if (backend.backend == CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL) {
+    // Pipelined MPI has additional paths for direct offset handling and inter-group scheduling. The synthetic host
+    // groups below make a single-node test look like a multi-host communicator so those inter-group paths are covered.
+    constexpr std::array<int32_t, 2> inter_group_pdims{3, 1};
+    const std::vector<int32_t> inter_group_hosts{0, 1, 2};
+    constexpr std::array<std::array<int32_t, 3>, 3> transpose_pack_offset_mem_order{
+        {{{1, 0, 2}}, {{1, 2, 0}}, {{0, 1, 2}}}};
+    constexpr std::array<std::array<int32_t, 3>, 3> direct_transpose_pack_offset_mem_order{
+        {{{1, 0, 2}}, {{2, 1, 0}}, {{0, 1, 2}}}};
+    constexpr std::array<std::array<int32_t, 3>, 3> direct_transpose_unpack_offset_mem_order{
+        {{{1, 0, 2}}, {{0, 1, 2}}, {{0, 1, 2}}}};
+
+    cases.push_back(withHaloAndPadding(makeCase(backend, "ExplicitMemOrderTransposePackOffset",
+                                                TransposeOperation::XToY, kBaselineGdims, {2, 2}, CUDECOMP_FLOAT, true,
+                                                kDefaultAxisContiguous, transpose_pack_offset_mem_order)));
+    cases.push_back(withHaloAndPadding(makeCase(backend, "DirectTransposePackOffset", TransposeOperation::XToY,
+                                                kBaselineGdims, {1, 1}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous,
+                                                direct_transpose_pack_offset_mem_order)));
+    cases.push_back(withHaloAndPadding(makeCase(backend, "DirectTransposeUnpackOffset", TransposeOperation::XToY,
+                                                kBaselineGdims, {1, 1}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous,
+                                                direct_transpose_unpack_offset_mem_order)));
+
+    cases.push_back(withSyntheticHostGroups(
+        withHaloAndPadding(makeCase(backend, "SyntheticInterGroupUnpack", TransposeOperation::XToY, kBaselineGdims,
+                                    inter_group_pdims, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, unpack_mem_order)),
+        inter_group_hosts));
+    cases.push_back(
+        withSyntheticHostGroups(withHaloAndPadding(makeCase(backend, "SyntheticInterGroupTransposeUnpack",
+                                                            TransposeOperation::XToY, kBaselineGdims, inter_group_pdims,
+                                                            CUDECOMP_FLOAT, true, kDefaultAxisContiguous,
+                                                            transpose_unpack_mem_order)),
+                                inter_group_hosts));
+    cases.push_back(
+        withSyntheticHostGroups(withHaloAndPadding(makeCase(backend, "SyntheticInterGroupSplitUnpack",
+                                                            TransposeOperation::XToY, kBaselineGdims, inter_group_pdims,
+                                                            CUDECOMP_FLOAT, true, kDefaultAxisContiguous,
+                                                            split_unpack_mem_order)),
+                                inter_group_hosts));
+  }
+}
+
+std::vector<TransposeCase> transposeCasesForLabel(const char* label) {
+  std::vector<TransposeCase> cases;
+  for (const auto& backend : cudecomp_test::transposeBackends()) {
+    if (std::string(backend.label) != label) continue;
+
+    appendBaselineCases(cases, backend);
+    if (std::string(label) == "mpi") { appendCoverageCases(cases, backend); }
+  }
+  return cases;
+}
+
+std::vector<TransposeCase> cudaGraphTransposeCases() {
+  std::vector<TransposeCase> cases;
+  for (const auto& backend : cudecomp_test::transposeBackends()) {
+    if (backend.backend != CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL) continue;
+
+    constexpr std::array<std::array<int32_t, 3>, 3> pack_mem_order{{{{0, 1, 2}}, {{0, 1, 2}}, {{0, 1, 2}}}};
+    constexpr std::array<std::array<int32_t, 3>, 3> transpose_pack_mem_order{{{{0, 1, 2}}, {{1, 2, 0}}, {{0, 1, 2}}}};
+
+    cases.push_back(withHaloAndPadding(makeCase(backend, "CudaGraphsPack", TransposeOperation::XToY, kBaselineGdims,
+                                                {2, 2}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, pack_mem_order)));
+    cases.push_back(
+        withHaloAndPadding(makeCase(backend, "CudaGraphsTransposePack", TransposeOperation::XToY, kBaselineGdims,
+                                    {2, 2}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, transpose_pack_mem_order)));
+  }
+  return cases;
+}
+
+std::vector<TransposeCase> ncclUserBufferRegistrationCases() {
+  std::vector<TransposeCase> cases;
+  for (const auto& backend : cudecomp_test::transposeBackends()) {
+    if (backend.backend != CUDECOMP_TRANSPOSE_COMM_NCCL) continue;
+    cases.push_back(makeCase(backend, "UserBufferRegistration", TransposeOperation::XToY, kBaselineGdims, {2, 2},
+                             CUDECOMP_FLOAT, true));
+  }
+  return cases;
+}
+
+bool isInternal(const cudecompPencilInfo_t& pinfo, const std::array<int64_t, 3>& local) {
+  return local[0] >= pinfo.halo_extents[pinfo.order[0]] &&
+         local[0] < pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]] &&
+         local[1] >= pinfo.halo_extents[pinfo.order[1]] &&
+         local[1] < pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]] &&
+         local[2] >= pinfo.halo_extents[pinfo.order[2]] &&
+         local[2] < pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]];
+}
+
+template <typename T> T pencilValue(int64_t global_index) { return static_cast<T>(global_index); }
+
+template <> std::complex<float> pencilValue<std::complex<float>>(int64_t global_index) {
+  return {static_cast<float>(global_index), -static_cast<float>(global_index)};
+}
+
+template <> std::complex<double> pencilValue<std::complex<double>>(int64_t global_index) {
+  return {static_cast<double>(global_index), -static_cast<double>(global_index)};
+}
+
+template <typename T>
+std::vector<T> initializePencil(const cudecompPencilInfo_t& pinfo, const std::array<int32_t, 3>& gdims) {
+  std::vector<T> data(pinfo.size, T{-1});
+
+  for (int64_t i = 0; i < pinfo.size; ++i) {
+    std::array<int64_t, 3> local{};
+    local[0] = i % pinfo.shape[0];
+    local[1] = i / pinfo.shape[0] % pinfo.shape[1];
+    local[2] = i / (pinfo.shape[0] * pinfo.shape[1]);
+
+    if (!isInternal(pinfo, local)) continue;
+
+    std::array<int64_t, 3> global{};
+    global[pinfo.order[0]] = local[0] + pinfo.lo[0] - pinfo.halo_extents[pinfo.order[0]];
+    global[pinfo.order[1]] = local[1] + pinfo.lo[1] - pinfo.halo_extents[pinfo.order[1]];
+    global[pinfo.order[2]] = local[2] + pinfo.lo[2] - pinfo.halo_extents[pinfo.order[2]];
+
+    data[i] = pencilValue<T>(global[0] + gdims[0] * (global[1] + global[2] * gdims[1]));
+  }
+
+  return data;
+}
+
+template <typename T>
+testing::AssertionResult pencilMatches(const std::vector<T>& expected, const std::vector<T>& actual,
+                                       const cudecompPencilInfo_t& pinfo) {
+  if (expected.size() != actual.size()) {
+    return testing::AssertionFailure() << "size mismatch: expected " << expected.size() << ", got " << actual.size();
+  }
+
+  for (int64_t i = 0; i < pinfo.size; ++i) {
+    if (expected[i] == actual[i]) continue;
+
+    std::array<int64_t, 3> local{};
+    local[0] = i % pinfo.shape[0];
+    local[1] = i / pinfo.shape[0] % pinfo.shape[1];
+    local[2] = i / (pinfo.shape[0] * pinfo.shape[1]);
+
+    if (!isInternal(pinfo, local)) continue;
+    return testing::AssertionFailure() << "mismatch at local index " << i << " coordinate (" << local[0] << ", "
+                                       << local[1] << ", " << local[2] << "): expected " << expected[i] << ", got "
+                                       << actual[i];
+  }
+
+  return testing::AssertionSuccess();
+}
+
+template <typename T>
+cudecompResult_t runTranspose(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, TransposeOperation operation,
+                              T* input, T* output, void* work, cudecompDataType_t dtype,
+                              const cudecompPencilInfo_t& input_info, const cudecompPencilInfo_t& output_info) {
+  switch (operation) {
+  case TransposeOperation::XToY:
+    return cudecompTransposeXToY(handle, grid_desc, input, output, work, dtype, input_info.halo_extents,
+                                 output_info.halo_extents, input_info.padding, output_info.padding, 0);
+  case TransposeOperation::YToX:
+    return cudecompTransposeYToX(handle, grid_desc, input, output, work, dtype, input_info.halo_extents,
+                                 output_info.halo_extents, input_info.padding, output_info.padding, 0);
+  case TransposeOperation::YToZ:
+    return cudecompTransposeYToZ(handle, grid_desc, input, output, work, dtype, input_info.halo_extents,
+                                 output_info.halo_extents, input_info.padding, output_info.padding, 0);
+  case TransposeOperation::ZToY:
+    return cudecompTransposeZToY(handle, grid_desc, input, output, work, dtype, input_info.halo_extents,
+                                 output_info.halo_extents, input_info.padding, output_info.padding, 0);
+  }
+  return CUDECOMP_RESULT_INVALID_USAGE;
+}
+
+template <typename T>
+void runAndVerifyTranspose(const cudecomp_test::MpiTestComm& active_comm, cudecompHandle_t handle,
+                           cudecompGridDesc_t grid_desc, const TransposeCase& test_case, T* input_d, T* output_d,
+                           void* work_d, int64_t data_num_elements, int64_t workspace_num_elements, int64_t dtype_size,
+                           const std::vector<T>& input_ref, const std::vector<T>& output_ref,
+                           const cudecompPencilInfo_t& input_info, const cudecompPencilInfo_t& output_info) {
+  CHECK_CUDA_GLOBAL(active_comm, cudaMemset(input_d, 0, data_num_elements * sizeof(*input_d)));
+  if (output_d != input_d) {
+    CHECK_CUDA_GLOBAL(active_comm, cudaMemset(output_d, 0, data_num_elements * sizeof(*output_d)));
+  }
+  CHECK_CUDA_GLOBAL(active_comm,
+                    cudaMemcpy(input_d, input_ref.data(), input_ref.size() * sizeof(*input_d), cudaMemcpyHostToDevice));
+  CHECK_CUDA_GLOBAL(active_comm, cudaMemset(work_d, 0, workspace_num_elements * dtype_size));
+
+  CHECK_CUDECOMP_GLOBAL(active_comm, runTranspose(handle, grid_desc, test_case.operation, input_d, output_d, work_d,
+                                                  test_case.dtype, input_info, output_info));
+
+  std::vector<T> output(output_ref.size(), T{});
+  CHECK_CUDA_GLOBAL(active_comm,
+                    cudaMemcpy(output.data(), output_d, output.size() * sizeof(*output_d), cudaMemcpyDeviceToHost));
+  EXPECT_TRUE(pencilMatches(output_ref, output, output_info));
+}
+
+cudecomp::cudecompCommAxis communicationAxis(TransposeOperation operation) {
+  const int ax_a = inputAxis(operation);
+  const int ax_b = outputAxis(operation);
+  return (ax_a == 2 || ax_b == 2) ? cudecomp::CUDECOMP_COMM_ROW : cudecomp::CUDECOMP_COMM_COL;
+}
+
+testing::AssertionResult applySyntheticHostnames(cudecompHandle_t handle, const std::vector<int32_t>& host_groups) {
+  // Inter-group transpose paths are normally selected from MPI processor names and topology metadata. Tests override
+  // the handle before grid descriptor creation so communicator setup observes deterministic host groupings on one node.
+  if (host_groups.empty()) return testing::AssertionSuccess();
+  if (!handle) return testing::AssertionFailure() << "cannot apply synthetic hostnames to null cuDecomp handle";
+
+  if (host_groups.size() != handle->hostnames.size()) {
+    return testing::AssertionFailure() << "synthetic hostname group count " << host_groups.size()
+                                       << " does not match handle rank count " << handle->hostnames.size();
+  }
+
+  for (int rank = 0; rank < static_cast<int>(host_groups.size()); ++rank) {
+    std::string hostname = "cudecomp-test-host-" + std::to_string(host_groups[rank]);
+    if (hostname.size() >= handle->hostnames[rank].size()) {
+      return testing::AssertionFailure() << "synthetic hostname '" << hostname << "' exceeds MPI processor name limit";
+    }
+    handle->hostnames[rank].fill('\0');
+    std::copy(hostname.begin(), hostname.end(), handle->hostnames[rank].begin());
+  }
+
+  // Force communicator setup to use the synthetic hostnames even on systems with MNNVL topology data.
+  handle->rank_to_mnnvl_info.clear();
+  handle->rank_to_clique.clear();
+  handle->rank_to_clique_rank = handle->rank_to_local_rank;
+
+  return testing::AssertionSuccess();
+}
+
+testing::AssertionResult syntheticTopologyIsActive(cudecompGridDesc_t grid_desc, TransposeOperation operation) {
+  const auto comm_axis = communicationAxis(operation);
+  const auto& comm_info =
+      (comm_axis == cudecomp::CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
+
+  if (comm_info.ngroups <= 1) {
+    return testing::AssertionFailure() << "synthetic topology did not create inter-group communicator: ngroups="
+                                       << comm_info.ngroups << ", npergroup=" << comm_info.npergroup
+                                       << ", nranks=" << comm_info.nranks;
+  }
+
+  return testing::AssertionSuccess();
+}
+
+} // namespace
+
+class TransposeCorrectnessTest : public ::testing::TestWithParam<TransposeCase> {};
+class CudaGraphTransposeCorrectnessTest : public ::testing::TestWithParam<TransposeCase> {};
+class NcclUserBufferRegistrationTest : public ::testing::TestWithParam<TransposeCase> {};
+
+testing::AssertionResult ncclUserBufferRegistrationIsActive(cudecompHandle_t handle, void* buffer) {
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0)
+  if (!handle->nccl_enable_ubr) { return testing::AssertionFailure() << "NCCL user buffer registration is disabled"; }
+
+  auto entry = handle->nccl_ubr_handles.find(buffer);
+  if (entry == handle->nccl_ubr_handles.end() || entry->second.empty()) {
+    return testing::AssertionFailure() << "NCCL user buffer registration handle was not recorded";
+  }
+
+  return testing::AssertionSuccess();
+#else
+  return testing::AssertionFailure() << "NCCL user buffer registration requires NCCL 2.19 or newer";
+#endif
+}
+
+template <typename T>
+void runTransposeCase(const TransposeCase& test_case, bool check_cuda_graph_replay = false,
+                      bool check_nccl_user_buffer_registration = false) {
+  const int active_ranks = test_case.pdims[0] * test_case.pdims[1];
+  const auto world_comm = cudecomp_test::MpiTestComm::world();
+
+  if (world_comm.size() < active_ranks) {
+    GTEST_SKIP() << operationName(test_case.operation) << " with pdims " << test_case.pdims[0] << "x"
+                 << test_case.pdims[1] << " requires " << active_ranks << " ranks, launched with " << world_comm.size();
+  }
+
+  cudecompGridDescConfig_t config;
+  ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config));
+  config.gdims[0] = test_case.gdims[0];
+  config.gdims[1] = test_case.gdims[1];
+  config.gdims[2] = test_case.gdims[2];
+  config.pdims[0] = test_case.pdims[0];
+  config.pdims[1] = test_case.pdims[1];
+  config.rank_order = test_case.rank_order;
+  config.transpose_comm_backend = test_case.backend.backend;
+  config.transpose_axis_contiguous[0] = test_case.axis_contiguous[0];
+  config.transpose_axis_contiguous[1] = test_case.axis_contiguous[1];
+  config.transpose_axis_contiguous[2] = test_case.axis_contiguous[2];
+  for (int axis = 0; axis < 3; ++axis) {
+    for (int i = 0; i < 3; ++i) {
+      config.transpose_mem_order[axis][i] = test_case.mem_order[axis][i];
+    }
+  }
+
+  cudecomp_test::BackendTestContext test_context;
+  cudecomp_test::TestSetupDecision setup_decision;
+  ASSERT_TRUE(test_context.initialize(world_comm, active_ranks, test_case.backend.label,
+                                      std::string(test_case.backend.label) == "nccl", config, &setup_decision));
+  ASSERT_FALSE(setup_decision.fail) << setup_decision.reason;
+  if (setup_decision.skip) { GTEST_SKIP() << setup_decision.reason; }
+
+  const auto& active_comm = test_context.comm();
+  cudecompHandle_t handle = test_context.handle();
+  ASSERT_NE(handle, nullptr);
+  ASSERT_TRUE(applySyntheticHostnames(handle, test_case.synthetic_host_groups));
+  if (check_cuda_graph_replay) { ASSERT_TRUE(handle->cuda_graphs_enable); }
+
+  cudecompGridDesc_t grid_desc = nullptr;
+  const cudecompResult_t grid_desc_create_result = cudecompGridDescCreate(handle, &grid_desc, &config, nullptr);
+  cudecomp_test::gridDescGuard grid_desc_guard(handle, grid_desc);
+  CHECK_CUDECOMP_GLOBAL(active_comm, grid_desc_create_result);
+  if (!test_case.synthetic_host_groups.empty()) {
+    ASSERT_TRUE(syntheticTopologyIsActive(grid_desc, test_case.operation));
+  }
+
+  cudecompPencilInfo_t input_info;
+  cudecompPencilInfo_t output_info;
+  CHECK_CUDECOMP_GLOBAL(active_comm,
+                        cudecompGetPencilInfo(handle, grid_desc, &input_info, inputAxis(test_case.operation),
+                                              test_case.input_halo_extents.data(), test_case.input_padding.data()));
+  CHECK_CUDECOMP_GLOBAL(active_comm,
+                        cudecompGetPencilInfo(handle, grid_desc, &output_info, outputAxis(test_case.operation),
+                                              test_case.output_halo_extents.data(), test_case.output_padding.data()));
+
+  int64_t workspace_num_elements = 0;
+  CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetTransposeWorkspaceSize(handle, grid_desc, &workspace_num_elements));
+
+  int64_t dtype_size = 0;
+  CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetDataTypeSize(test_case.dtype, &dtype_size));
+
+  const auto input_ref = initializePencil<T>(input_info, test_case.gdims);
+  const auto output_ref = initializePencil<T>(output_info, test_case.gdims);
+  const int64_t data_num_elements = std::max(input_info.size, output_info.size);
+
+  T* input_d = nullptr;
+  const cudaError_t input_alloc_result = cudaMalloc(&input_d, data_num_elements * sizeof(*input_d));
+  cudecomp_test::cudaBufferGuard input_buffer(input_d);
+  CHECK_CUDA_GLOBAL(active_comm, input_alloc_result);
+
+  if (check_cuda_graph_replay) {
+    ASSERT_TRUE(test_case.out_of_place);
+
+    T* output_a_d = nullptr;
+    const cudaError_t output_a_alloc_result = cudaMalloc(&output_a_d, data_num_elements * sizeof(*output_a_d));
+    cudecomp_test::cudaBufferGuard output_a_buffer(output_a_d);
+    CHECK_CUDA_GLOBAL(active_comm, output_a_alloc_result);
+
+    T* output_b_d = nullptr;
+    const cudaError_t output_b_alloc_result = cudaMalloc(&output_b_d, data_num_elements * sizeof(*output_b_d));
+    cudecomp_test::cudaBufferGuard output_b_buffer(output_b_d);
+    CHECK_CUDA_GLOBAL(active_comm, output_b_alloc_result);
+
+    void* work_a_d = nullptr;
+    const cudecompResult_t work_a_alloc_result =
+        cudecompMalloc(handle, grid_desc, &work_a_d, workspace_num_elements * dtype_size);
+    cudecomp_test::cudecompBufferGuard work_a_buffer(handle, grid_desc, work_a_d);
+    CHECK_CUDECOMP_GLOBAL(active_comm, work_a_alloc_result);
+
+    void* work_b_d = nullptr;
+    const cudecompResult_t work_b_alloc_result =
+        cudecompMalloc(handle, grid_desc, &work_b_d, workspace_num_elements * dtype_size);
+    cudecomp_test::cudecompBufferGuard work_b_buffer(handle, grid_desc, work_b_d);
+    CHECK_CUDECOMP_GLOBAL(active_comm, work_b_alloc_result);
+
+    runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_a_d, work_a_d, data_num_elements,
+                          workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info);
+    runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_a_d, work_a_d, data_num_elements,
+                          workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info);
+    runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_b_d, work_b_d, data_num_elements,
+                          workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info);
+    runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_b_d, work_b_d, data_num_elements,
+                          workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info);
+    return;
+  }
+
+  T* output_d = input_d;
+  cudecomp_test::cudaBufferGuard output_buffer;
+  if (test_case.out_of_place) {
+    T* allocated_output_d = nullptr;
+    const cudaError_t output_alloc_result = cudaMalloc(&allocated_output_d, data_num_elements * sizeof(*output_d));
+    output_buffer.reset(allocated_output_d);
+    CHECK_CUDA_GLOBAL(active_comm, output_alloc_result);
+    output_d = allocated_output_d;
+  }
+
+  void* work_d = nullptr;
+  const cudecompResult_t work_alloc_result =
+      cudecompMalloc(handle, grid_desc, &work_d, workspace_num_elements * dtype_size);
+  cudecomp_test::cudecompBufferGuard work_buffer(handle, grid_desc, work_d);
+  CHECK_CUDECOMP_GLOBAL(active_comm, work_alloc_result);
+  if (check_nccl_user_buffer_registration) { ASSERT_TRUE(ncclUserBufferRegistrationIsActive(handle, work_d)); }
+
+  runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_d, work_d, data_num_elements,
+                        workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info);
+
+  if (check_nccl_user_buffer_registration) {
+    const cudecompResult_t work_free_result = cudecompFree(handle, grid_desc, work_d);
+    if (work_free_result == CUDECOMP_RESULT_SUCCESS) { work_buffer.release(); }
+    CHECK_CUDECOMP_GLOBAL(active_comm, work_free_result);
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0)
+    EXPECT_EQ(handle->nccl_ubr_handles.count(work_d), 0);
+#endif
+  }
+}
+
+TEST_P(TransposeCorrectnessTest, DirectOperation) {
+  const auto test_case = GetParam();
+  switch (test_case.dtype) {
+  case CUDECOMP_FLOAT: runTransposeCase<float>(test_case); break;
+  case CUDECOMP_FLOAT_COMPLEX: runTransposeCase<std::complex<float>>(test_case); break;
+  case CUDECOMP_DOUBLE: runTransposeCase<double>(test_case); break;
+  case CUDECOMP_DOUBLE_COMPLEX: runTransposeCase<std::complex<double>>(test_case); break;
+  default: FAIL() << "unsupported test dtype " << test_case.dtype;
+  }
+}
+
+TEST_P(CudaGraphTransposeCorrectnessTest, CapturesAndReplaysPackingGraph) {
+  const auto test_case = GetParam();
+  switch (test_case.dtype) {
+  case CUDECOMP_FLOAT: runTransposeCase<float>(test_case, true); break;
+  case CUDECOMP_FLOAT_COMPLEX: runTransposeCase<std::complex<float>>(test_case, true); break;
+  case CUDECOMP_DOUBLE: runTransposeCase<double>(test_case, true); break;
+  case CUDECOMP_DOUBLE_COMPLEX: runTransposeCase<std::complex<double>>(test_case, true); break;
+  default: FAIL() << "unsupported test dtype " << test_case.dtype;
+  }
+}
+
+TEST_P(NcclUserBufferRegistrationTest, DirectOperation) {
+#if NCCL_VERSION_CODE < NCCL_VERSION(2, 19, 0)
+  GTEST_SKIP() << "NCCL user buffer registration requires NCCL 2.19 or newer";
+#else
+  const auto test_case = GetParam();
+  switch (test_case.dtype) {
+  case CUDECOMP_FLOAT: runTransposeCase<float>(test_case, false, true); break;
+  case CUDECOMP_FLOAT_COMPLEX: runTransposeCase<std::complex<float>>(test_case, false, true); break;
+  case CUDECOMP_DOUBLE: runTransposeCase<double>(test_case, false, true); break;
+  case CUDECOMP_DOUBLE_COMPLEX: runTransposeCase<std::complex<double>>(test_case, false, true); break;
+  default: FAIL() << "unsupported test dtype " << test_case.dtype;
+  }
+#endif
+}
+
+INSTANTIATE_TEST_SUITE_P(MpiBackends, TransposeCorrectnessTest, ::testing::ValuesIn(transposeCasesForLabel("mpi")),
+                         paramName);
+INSTANTIATE_TEST_SUITE_P(NcclBackends, TransposeCorrectnessTest, ::testing::ValuesIn(transposeCasesForLabel("nccl")),
+                         paramName);
+INSTANTIATE_TEST_SUITE_P(NvshmemBackends, TransposeCorrectnessTest,
+                         ::testing::ValuesIn(transposeCasesForLabel("nvshmem")), paramName);
+INSTANTIATE_TEST_SUITE_P(CudaGraphMpiBackends, CudaGraphTransposeCorrectnessTest,
+                         ::testing::ValuesIn(cudaGraphTransposeCases()), paramName);
+INSTANTIATE_TEST_SUITE_P(NcclUserBufferRegistration, NcclUserBufferRegistrationTest,
+                         ::testing::ValuesIn(ncclUserBufferRegistrationCases()), paramName);