diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e81fce..afd2d18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ endif() option(CUDECOMP_BUILD_FORTRAN "Build Fortran bindings" ON) option(CUDECOMP_ENABLE_NVTX "Enable NVTX ranges" ON) option(CUDECOMP_ENABLE_NVSHMEM "Enable NVSHMEM" OFF) +option(CUDECOMP_BUILD_TESTS "Build tests" OFF) +option(CUDECOMP_TEST_FETCH_GTEST "Fetch GoogleTest if a system installation is unavailable" OFF) option(CUDECOMP_BUILD_EXTRAS "Build benchmark, examples, and tests" OFF) set(CUDECOMP_NCCL_HOME CACHE STRING "Path to search for NCCL installation. Use to override NVHPC provided NCCL version.") set(CUDECOMP_NVSHMEM_HOME CACHE STRING "Path to search for NVSHMEM installation. Use to override NVHPC provided NVSHMEM version.") @@ -72,6 +74,10 @@ if (CRAY_CC_BIN) endif() # MPI +if (CUDECOMP_BUILD_TESTS) + # The CTest launcher defaults use MPI_CXX_LIBRARY_VERSION_STRING to identify Open MPI. + set(MPI_DETERMINE_LIBRARY_VERSION ON) +endif() find_package(MPI REQUIRED) if (CRAY_CC_BIN) @@ -290,18 +296,28 @@ if (CUDECOMP_BUILD_FORTRAN) install(FILES ${CMAKE_BINARY_DIR}/include/cudecomp.mod DESTINATION ${CMAKE_INSTALL_PREFIX}/include) endif() +if (CUDECOMP_BUILD_TESTS OR CUDECOMP_BUILD_EXTRAS) + enable_testing() + add_subdirectory(tests/cc) + + if (CUDECOMP_BUILD_FORTRAN) + add_subdirectory(tests/fortran) + endif() + + if (CUDECOMP_BUILD_TESTS) + add_subdirectory(tests/ctest) + endif() +endif() + if (CUDECOMP_BUILD_EXTRAS) add_subdirectory(benchmark) - add_subdirectory(tests/cc) add_subdirectory(examples/cc/basic_usage) add_subdirectory(examples/cc/taylor_green) if (CUDECOMP_BUILD_FORTRAN) - add_subdirectory(tests/fortran) add_subdirectory(examples/fortran/basic_usage) add_subdirectory(examples/fortran/poisson) add_subdirectory(examples/fortran/taylor_green) endif() endif() - diff --git a/docs/requirements.txt b/docs/requirements.txt index 2ccd9d9..517273f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,3 +3,4 @@ sphinx_rtd_theme==3.0.2 breathe==4.36.0 sphinx-tabs==3.4.7 sphinx-fortran==1.1.1 +six==1.17.0 diff --git a/tests/README.md b/tests/README.md index 2f15086..554817e 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,5 +1,92 @@ # Tests -This subdirectory contains tests for both the transpose and halo communication routines in cuDecomp, in both C++ and Fortran. + +This directory contains cuDecomp functional tests. The primary routine +development test workflow is the CTest suite under [`ctest/`](ctest/), +which covers public API behavior plus transpose and halo correctness for the +backends available in the current build. When Fortran bindings are enabled, the +CTest workflow also includes focused Fortran API coverage plus MPI functional +coverage across the Fortran dtype-specialized executables. The +legacy C++ and Fortran executables are still available for broader sweeps and +targeted configuration testing. + +## CTest Suite + +### Build + +The default project build does not build tests. Build the CTest suite with +`CUDECOMP_BUILD_TESTS=ON`: + +```shell +mkdir -p build +cd build +cmake -DCUDECOMP_BUILD_TESTS=ON .. +make -j"$(nproc)" +``` + +`CUDECOMP_BUILD_TESTS=ON` also builds the legacy C++/Fortran test executables +under this directory. The C++ CTest executables use GoogleTest, which is found +with `find_package(GTest)` by default. If a system GoogleTest package is +unavailable, configure with `-DCUDECOMP_TEST_FETCH_GTEST=ON` to fetch the pinned +test dependency. + +### Running Tests + +List registered tests with: + +```shell +cd build +ctest -N +``` + +The default test order is API, regular transpose tests, regular halo tests, then +specialized CUDA Graphs, NCCL user-buffer-registration tests, and focused +Fortran API/functional tests when Fortran bindings are enabled. NVSHMEM tests are +registered only with NVSHMEM-enabled builds. + +Useful labels: + +| Label | Tests selected | +| --- | --- | +| `api` | Public C API and focused Fortran API behavior tests | +| `transpose` | All transpose correctness tests | +| `halo` | All halo correctness tests | +| `fortran` | Focused Fortran API and MPI functional tests across Fortran dtypes, when built | +| `mpi` | MPI-backend tests | +| `nccl` | NCCL-backend tests | +| `nvshmem` | NVSHMEM-backend tests, when built | +| `cuda_graphs` | CUDA Graphs functional coverage | +| `nccl_ubr` | NCCL user buffer registration functional coverage | + +Run tests by name or label: + +```shell +cd build +ctest --output-on-failure -R "cudecomp_(api|transpose_mpi|halo_mpi)$" +ctest --output-on-failure -L mpi +ctest --output-on-failure -L cuda_graphs +``` + +### GPU Requirements + +The CTest suites require GPUs. They run four MPI ranks by default and work best +on systems with four visible GPUs available. Test setup fails when no CUDA +device is visible. + +On systems with fewer than four visible GPUs, CUDA MPS is required so multiple +local MPI ranks can share a GPU. Set `CUDA_MPS_ACTIVE_THREAD_PERCENTAGE` to +`100 / nranks` for the local ranks sharing a GPU. For the default four-rank +CTest suites on one GPU, this value is `25`. + +NCCL tests using MPS also require NCCL 2.30 or newer and +`NCCL_MULTI_RANK_GPU_ENABLE=1`. If these NCCL-specific requirements are not met, +the NCCL cases skip so MPI-capable systems can still run the non-NCCL tests. + +## Legacy Executables + +The CTest suites above are the recommended routine development tests. The legacy +runners remain useful for broader manual sweeps or for targeting a specific +configuration that is not part of the focused CTest matrix. + The testing executables accept a number of flags to control the configuration of the test (run `cc/transpose_test -h` or `cc/halo_test -h` for a listing of available options). You can use these binaries to test particular configurations of cuDecomp (i.e. global grid, process grid, communication backends, datatype, etc.) to verify functionality. diff --git a/tests/ctest/CMakeLists.txt b/tests/ctest/CMakeLists.txt new file mode 100644 index 0000000..79b6024 --- /dev/null +++ b/tests/ctest/CMakeLists.txt @@ -0,0 +1,304 @@ +find_package(GTest CONFIG QUIET) +if (NOT GTest_FOUND) + find_package(GTest MODULE QUIET) +endif() + +if (NOT GTest_FOUND AND CUDECOMP_TEST_FETCH_GTEST) + include(FetchContent) + FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.14.0 + ) + set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) +endif() + +if (TARGET GTest::gtest) + set(CUDECOMP_TEST_FRAMEWORK_TARGET GTest::gtest) +elseif (TARGET GTest::GTest) + set(CUDECOMP_TEST_FRAMEWORK_TARGET GTest::GTest) +endif() + +if (NOT CUDECOMP_TEST_FRAMEWORK_TARGET) + message(FATAL_ERROR + "GoogleTest was not found. Install a system GoogleTest package or configure with " + "-DCUDECOMP_TEST_FETCH_GTEST=ON to fetch the pinned test dependency." + ) +endif() + +set(CUDECOMP_TEST_MPIEXEC_PREFLAGS_DEFAULT "") +if (MPI_CXX_LIBRARY_VERSION_STRING MATCHES "Open MPI") + set(CUDECOMP_TEST_MPIEXEC_PREFLAGS_DEFAULT "--oversubscribe") +endif() + +set(CUDECOMP_TEST_MPIEXEC_PREFLAGS "${CUDECOMP_TEST_MPIEXEC_PREFLAGS_DEFAULT}" CACHE STRING + "Extra MPI launcher flags for cuDecomp CTest tests.") +separate_arguments(CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST UNIX_COMMAND "${CUDECOMP_TEST_MPIEXEC_PREFLAGS}") + +set(CUDECOMP_TEST_TIMEOUT 90) + +set(CUDECOMP_TEST_ENABLE_NVSHMEM ${CUDECOMP_ENABLE_NVSHMEM}) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/backend_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/backend_config.h +) + +add_library(cudecomp_test_support STATIC) +target_sources(cudecomp_test_support + PRIVATE + backend_test_context.cc + backend_utils.cc + gpu_test_utils.cc + mpi_test_utils.cc + test_utils.cc +) +target_include_directories(cudecomp_test_support + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${NVHPC_CUDA_INCLUDE_DIR} + ${NCCL_INCLUDE_DIR} +) +target_link_libraries(cudecomp_test_support + PUBLIC + MPI::MPI_CXX + NVHPC::CUDART + ${NCCL_LIBRARY} + ${CUDECOMP_TEST_FRAMEWORK_TARGET} +) + +add_executable(cudecomp_test_api) +target_sources(cudecomp_test_api + PRIVATE + api_tests.cc + mpi_test_main.cc +) +target_link_libraries(cudecomp_test_api + PRIVATE + cudecomp + cudecomp_test_support +) +set_target_properties(cudecomp_test_api PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest +) + +set(CUDECOMP_TEST_API_MPI_RANKS 4) +add_test( + NAME cudecomp_api + COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST} + ${MPIEXEC_NUMPROC_FLAG} ${CUDECOMP_TEST_API_MPI_RANKS} + $ +) +set_tests_properties(cudecomp_api PROPERTIES + LABELS "api;mpi" + TIMEOUT ${CUDECOMP_TEST_TIMEOUT} +) + +add_executable(cudecomp_test_transpose) +target_sources(cudecomp_test_transpose + PRIVATE + mpi_test_main.cc + transpose_tests.cc +) +target_link_libraries(cudecomp_test_transpose + PRIVATE + cudecomp + cudecomp_test_support + NVHPC::CUTENSOR +) +set_target_properties(cudecomp_test_transpose PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest +) + +set(CUDECOMP_TEST_TRANSPOSE_MPI_RANKS 4) +function(add_cudecomp_transpose_test test_name filter labels) + add_test( + NAME ${test_name} + COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST} + ${MPIEXEC_NUMPROC_FLAG} ${CUDECOMP_TEST_TRANSPOSE_MPI_RANKS} + $ + --gtest_filter=${filter} + ) + set_tests_properties(${test_name} PROPERTIES + LABELS "${labels}" + TIMEOUT ${CUDECOMP_TEST_TIMEOUT} + ) +endfunction() + +add_cudecomp_transpose_test(cudecomp_transpose_mpi "MpiBackends/*" "transpose;mpi") +add_cudecomp_transpose_test(cudecomp_transpose_nccl "NcclBackends/*" "transpose;nccl") +set_tests_properties(cudecomp_transpose_nccl PROPERTIES + ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nccl" +) + +if (CUDECOMP_ENABLE_NVSHMEM) + add_cudecomp_transpose_test(cudecomp_transpose_nvshmem "NvshmemBackends/*" "transpose;nvshmem") + set_tests_properties(cudecomp_transpose_nvshmem PROPERTIES + ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nvshmem;NVSHMEM_DISABLE_NCCL=1" + ) +endif() + +add_executable(cudecomp_test_halo) +target_sources(cudecomp_test_halo + PRIVATE + mpi_test_main.cc + halo_tests.cc +) +target_link_libraries(cudecomp_test_halo + PRIVATE + cudecomp + cudecomp_test_support +) +set_target_properties(cudecomp_test_halo PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest +) + +set(CUDECOMP_TEST_HALO_MPI_RANKS 4) +function(add_cudecomp_halo_test test_name filter labels) + add_test( + NAME ${test_name} + COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST} + ${MPIEXEC_NUMPROC_FLAG} ${CUDECOMP_TEST_HALO_MPI_RANKS} + $ + --gtest_filter=${filter} + ) + set_tests_properties(${test_name} PROPERTIES + LABELS "${labels}" + TIMEOUT ${CUDECOMP_TEST_TIMEOUT} + ) +endfunction() + +add_cudecomp_halo_test(cudecomp_halo_mpi "MpiBackends/*" "halo;mpi") +add_cudecomp_halo_test(cudecomp_halo_nccl "NcclBackends/*" "halo;nccl") +set_tests_properties(cudecomp_halo_nccl PROPERTIES + ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nccl" +) + +if (CUDECOMP_ENABLE_NVSHMEM) + add_cudecomp_halo_test(cudecomp_halo_nvshmem "NvshmemBackends/*" "halo;nvshmem") + set_tests_properties(cudecomp_halo_nvshmem PROPERTIES + ENVIRONMENT "CUDECOMP_TEST_KEEPALIVE_BACKEND=nvshmem;NVSHMEM_DISABLE_NCCL=1" + ) +endif() + +add_cudecomp_transpose_test(cudecomp_cuda_graphs "CudaGraphMpiBackends/*" "cuda_graphs;mpi") +set_tests_properties(cudecomp_cuda_graphs PROPERTIES + ENVIRONMENT "CUDECOMP_ENABLE_CUDA_GRAPHS=1" +) +add_cudecomp_transpose_test(cudecomp_nccl_user_buffer_registration "NcclUserBufferRegistration/*" + "nccl;nccl_ubr") +set_tests_properties(cudecomp_nccl_user_buffer_registration PROPERTIES + ENVIRONMENT "CUDECOMP_ENABLE_NCCL_UBR=1;CUDECOMP_TEST_KEEPALIVE_BACKEND=nccl" +) + +if (CUDECOMP_BUILD_FORTRAN) + add_executable(cudecomp_test_fortran_api) + target_sources(cudecomp_test_fortran_api + PRIVATE + fortran_api_test.f90 + ) + target_include_directories(cudecomp_test_fortran_api + PRIVATE + ${CMAKE_BINARY_DIR}/include + ${MPI_Fortran_INCLUDE_DIRS} + ) + target_link_libraries(cudecomp_test_fortran_api + PRIVATE + MPI::MPI_Fortran + cudecomp + cudecomp_fort + ) + target_compile_options(cudecomp_test_fortran_api PRIVATE + $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}> + ) + target_link_options(cudecomp_test_fortran_api PRIVATE + $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}> + ) + target_compile_options(cudecomp_test_fortran_api PRIVATE -O3) + set_target_properties(cudecomp_test_fortran_api PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest + ) + + add_executable(cudecomp_test_fortran_transpose) + target_sources(cudecomp_test_fortran_transpose + PRIVATE + fortran_transpose_tests.f90 + ) + target_include_directories(cudecomp_test_fortran_transpose + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_BINARY_DIR}/include + ${MPI_Fortran_INCLUDE_DIRS} + ) + target_link_libraries(cudecomp_test_fortran_transpose + PRIVATE + MPI::MPI_Fortran + cudecomp + cudecomp_fort + ) + target_compile_options(cudecomp_test_fortran_transpose PRIVATE + $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}> + ) + target_link_options(cudecomp_test_fortran_transpose PRIVATE + $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}> + ) + target_compile_options(cudecomp_test_fortran_transpose PRIVATE -O3) + set_target_properties(cudecomp_test_fortran_transpose PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest + ) + + add_executable(cudecomp_test_fortran_halo) + target_sources(cudecomp_test_fortran_halo + PRIVATE + fortran_halo_tests.f90 + ) + target_include_directories(cudecomp_test_fortran_halo + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_BINARY_DIR}/include + ${MPI_Fortran_INCLUDE_DIRS} + ) + target_link_libraries(cudecomp_test_fortran_halo + PRIVATE + MPI::MPI_Fortran + cudecomp + cudecomp_fort + ) + target_compile_options(cudecomp_test_fortran_halo PRIVATE + $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}> + ) + target_link_options(cudecomp_test_fortran_halo PRIVATE + $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}> + ) + target_compile_options(cudecomp_test_fortran_halo PRIVATE -O3) + set_target_properties(cudecomp_test_fortran_halo PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/tests/ctest + ) + + set(CUDECOMP_TEST_FORTRAN_API_MPI_RANKS 4) + set(CUDECOMP_TEST_FORTRAN_TRANSPOSE_MPI_RANKS 4) + set(CUDECOMP_TEST_FORTRAN_HALO_MPI_RANKS 4) + + function(add_cudecomp_fortran_test test_name target labels mpi_ranks) + add_test( + NAME ${test_name} + COMMAND ${MPIEXEC_EXECUTABLE} ${CUDECOMP_TEST_MPIEXEC_PREFLAGS_LIST} + ${MPIEXEC_NUMPROC_FLAG} ${mpi_ranks} + $ + ${ARGN} + ) + set_tests_properties(${test_name} PROPERTIES + LABELS "${labels}" + TIMEOUT ${CUDECOMP_TEST_TIMEOUT} + ) + endfunction() + + add_cudecomp_fortran_test(cudecomp_fortran_api cudecomp_test_fortran_api "fortran;api;mpi" + ${CUDECOMP_TEST_FORTRAN_API_MPI_RANKS}) + add_cudecomp_fortran_test(cudecomp_fortran_transpose cudecomp_test_fortran_transpose "fortran;transpose;mpi" + ${CUDECOMP_TEST_FORTRAN_TRANSPOSE_MPI_RANKS}) + add_cudecomp_fortran_test(cudecomp_fortran_halo cudecomp_test_fortran_halo "fortran;halo;mpi" + ${CUDECOMP_TEST_FORTRAN_HALO_MPI_RANKS}) +endif() diff --git a/tests/ctest/api_tests.cc b/tests/ctest/api_tests.cc new file mode 100644 index 0000000..9418ba2 --- /dev/null +++ b/tests/ctest/api_tests.cc @@ -0,0 +1,769 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include + +#include +#include + +#include "cudecomp.h" + +#include "gpu_test_utils.h" +#include "mpi_test_utils.h" +#include "test_utils.h" + +namespace { + +constexpr int kApiTestRanks = 4; +constexpr std::array kGdims{9, 10, 11}; +constexpr std::array kGdimsDist{8, 9, 10}; +constexpr std::array kPdims{2, 2}; +constexpr std::array kHaloExtents{1, 2, 1}; +constexpr std::array kPadding{1, 0, 2}; +constexpr std::array kHaloPeriods{false, true, false}; + +struct ExpectedPencilInfo { + std::array shape; + std::array lo; + std::array hi; + std::array order; + std::array halo_extents; + std::array padding; + int64_t size; +}; + +constexpr ExpectedPencilInfo kExpectedDefaultPencilInfo[3][kApiTestRanks] = { + { + {{12, 9, 10}, {0, 0, 0}, {8, 4, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{12, 9, 9}, {0, 0, 6}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972}, + {{12, 9, 10}, {0, 5, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{12, 9, 9}, {0, 5, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972}, + }, + { + {{8, 14, 10}, {0, 0, 0}, {4, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1120}, + {{8, 14, 9}, {0, 0, 6}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1008}, + {{7, 14, 10}, {5, 0, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 980}, + {{7, 14, 9}, {5, 0, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 882}, + }, + { + {{8, 9, 15}, {0, 0, 0}, {4, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{8, 9, 15}, {0, 5, 0}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{7, 9, 15}, {5, 0, 0}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945}, + {{7, 9, 15}, {5, 5, 0}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945}, + }, +}; + +constexpr ExpectedPencilInfo kExpectedColumnMajorPencilInfo[3][kApiTestRanks] = { + { + {{12, 9, 10}, {0, 0, 0}, {8, 4, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{12, 9, 10}, {0, 5, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{12, 9, 9}, {0, 0, 6}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972}, + {{12, 9, 9}, {0, 5, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 972}, + }, + { + {{8, 14, 10}, {0, 0, 0}, {4, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 1120}, + {{7, 14, 10}, {5, 0, 0}, {8, 9, 5}, {0, 1, 2}, kHaloExtents, kPadding, 980}, + {{8, 14, 9}, {0, 0, 6}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1008}, + {{7, 14, 9}, {5, 0, 6}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 882}, + }, + { + {{8, 9, 15}, {0, 0, 0}, {4, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{7, 9, 15}, {5, 0, 0}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945}, + {{8, 9, 15}, {0, 5, 0}, {4, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{7, 9, 15}, {5, 5, 0}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945}, + }, +}; + +constexpr ExpectedPencilInfo kExpectedGdimsDistPencilInfo[3][kApiTestRanks] = { + { + {{12, 9, 9}, {0, 0, 0}, {8, 4, 4}, {0, 1, 2}, kHaloExtents, kPadding, 972}, + {{12, 9, 10}, {0, 0, 5}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{12, 9, 9}, {0, 5, 0}, {8, 9, 4}, {0, 1, 2}, kHaloExtents, kPadding, 972}, + {{12, 9, 10}, {0, 5, 5}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + }, + { + {{7, 14, 9}, {0, 0, 0}, {3, 9, 4}, {0, 1, 2}, kHaloExtents, kPadding, 882}, + {{7, 14, 10}, {0, 0, 5}, {3, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 980}, + {{8, 14, 9}, {4, 0, 0}, {8, 9, 4}, {0, 1, 2}, kHaloExtents, kPadding, 1008}, + {{8, 14, 10}, {4, 0, 5}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1120}, + }, + { + {{7, 9, 15}, {0, 0, 0}, {3, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945}, + {{7, 9, 15}, {0, 5, 0}, {3, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 945}, + {{8, 9, 15}, {4, 0, 0}, {8, 4, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + {{8, 9, 15}, {4, 5, 0}, {8, 9, 10}, {0, 1, 2}, kHaloExtents, kPadding, 1080}, + }, +}; + +void setDistributedConfig(cudecompGridDescConfig_t& config) { + config.gdims[0] = kGdims[0]; + config.gdims[1] = kGdims[1]; + config.gdims[2] = kGdims[2]; + config.pdims[0] = kPdims[0]; + config.pdims[1] = kPdims[1]; +} + +void setMemOrder(cudecompGridDescConfig_t& config, const std::array& order) { + for (int axis = 0; axis < 3; ++axis) { + for (int i = 0; i < 3; ++i) { + config.transpose_mem_order[axis][i] = order[i]; + } + } +} + +void setGdimsDist(cudecompGridDescConfig_t& config) { + config.gdims_dist[0] = kGdimsDist[0]; + config.gdims_dist[1] = kGdimsDist[1]; + config.gdims_dist[2] = kGdimsDist[2]; +} + +void expectPencilInfoEquals(const cudecompPencilInfo_t& actual, const ExpectedPencilInfo& expected) { + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(expected.shape[i], actual.shape[i]); + EXPECT_EQ(expected.lo[i], actual.lo[i]); + EXPECT_EQ(expected.hi[i], actual.hi[i]); + EXPECT_EQ(expected.order[i], actual.order[i]); + EXPECT_EQ(expected.halo_extents[i], actual.halo_extents[i]); + EXPECT_EQ(expected.padding[i], actual.padding[i]); + } + EXPECT_EQ(expected.size, actual.size); +} + +void expectGridDescConfigEquals(const cudecompGridDescConfig_t& actual, const cudecompGridDescConfig_t& expected) { + EXPECT_EQ(expected.rank_order, actual.rank_order); + EXPECT_EQ(expected.transpose_comm_backend, actual.transpose_comm_backend); + EXPECT_EQ(expected.halo_comm_backend, actual.halo_comm_backend); + for (int i = 0; i < 2; ++i) { + EXPECT_EQ(expected.pdims[i], actual.pdims[i]); + } + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(expected.gdims[i], actual.gdims[i]); + EXPECT_EQ(expected.gdims_dist[i], actual.gdims_dist[i]); + EXPECT_EQ(expected.transpose_axis_contiguous[i], actual.transpose_axis_contiguous[i]); + for (int j = 0; j < 3; ++j) { + EXPECT_EQ(expected.transpose_mem_order[i][j], actual.transpose_mem_order[i][j]); + } + } +} + +bool isMpiTransposeBackend(cudecompTransposeCommBackend_t backend) { + return backend == CUDECOMP_TRANSPOSE_COMM_MPI_P2P || backend == CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL || + backend == CUDECOMP_TRANSPOSE_COMM_MPI_A2A; +} + +bool isMpiHaloBackend(cudecompHaloCommBackend_t backend) { + return backend == CUDECOMP_HALO_COMM_MPI || backend == CUDECOMP_HALO_COMM_MPI_BLOCKING; +} + +void expectShiftedRanks(const cudecomp_test::MpiTestComm& comm, cudecompHandle_t handle, cudecompGridDesc_t grid_desc, + int axis, int dim, int displacement, bool periodic, + const std::array& expected_ranks) { + int32_t shifted_rank = -2; + CHECK_CUDECOMP_GLOBAL(comm, + cudecompGetShiftedRank(handle, grid_desc, axis, dim, displacement, periodic, &shifted_rank)); + EXPECT_EQ(expected_ranks[comm.rank()], shifted_rank); +} + +TEST(ApiGridDescConfigSetDefaultsTest, SetsDocumentedDefaults) { + cudecompGridDescConfig_t config; + ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config)); + + EXPECT_EQ(CUDECOMP_RANK_ORDER_DEFAULT, config.rank_order); + EXPECT_EQ(CUDECOMP_TRANSPOSE_COMM_MPI_P2P, config.transpose_comm_backend); + EXPECT_EQ(CUDECOMP_HALO_COMM_MPI, config.halo_comm_backend); + for (int i = 0; i < 2; ++i) { + EXPECT_EQ(0, config.pdims[i]); + } + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(0, config.gdims[i]); + EXPECT_EQ(0, config.gdims_dist[i]); + EXPECT_FALSE(config.transpose_axis_contiguous[i]); + for (int j = 0; j < 3; ++j) { + EXPECT_EQ(-1, config.transpose_mem_order[i][j]); + } + } +} + +TEST(ApiGridDescConfigSetDefaultsTest, RejectsInvalidArguments) { + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescConfigSetDefaults(nullptr)); +} + +TEST(ApiGridDescAutotuneOptionsSetDefaultsTest, SetsDocumentedDefaults) { + cudecompGridDescAutotuneOptions_t options; + ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescAutotuneOptionsSetDefaults(&options)); + + EXPECT_EQ(3, options.n_warmup_trials); + EXPECT_EQ(5, options.n_trials); + EXPECT_EQ(CUDECOMP_AUTOTUNE_GRID_TRANSPOSE, options.grid_mode); + EXPECT_EQ(CUDECOMP_DOUBLE, options.dtype); + EXPECT_TRUE(options.allow_uneven_decompositions); + EXPECT_FALSE(options.disable_nccl_backends); + EXPECT_FALSE(options.disable_nvshmem_backends); + EXPECT_EQ(0.0, options.skip_threshold); + EXPECT_FALSE(options.autotune_transpose_backend); + EXPECT_FALSE(options.autotune_halo_backend); + EXPECT_EQ(0, options.halo_axis); + + for (int i = 0; i < 4; ++i) { + EXPECT_FALSE(options.transpose_use_inplace_buffers[i]); + EXPECT_EQ(1.0, options.transpose_op_weights[i]); + for (int j = 0; j < 3; ++j) { + EXPECT_EQ(0, options.transpose_input_halo_extents[i][j]); + EXPECT_EQ(0, options.transpose_output_halo_extents[i][j]); + EXPECT_EQ(0, options.transpose_input_padding[i][j]); + EXPECT_EQ(0, options.transpose_output_padding[i][j]); + } + } + + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(0, options.halo_extents[i]); + EXPECT_FALSE(options.halo_periods[i]); + EXPECT_EQ(0, options.halo_padding[i]); + } +} + +TEST(ApiGridDescAutotuneOptionsSetDefaultsTest, RejectsInvalidArguments) { + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescAutotuneOptionsSetDefaults(nullptr)); +} + +TEST(ApiGetDataTypeSizeTest, ReturnsSupportedTypeSizes) { + int64_t dtype_size = 0; + EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_FLOAT, &dtype_size)); + EXPECT_EQ(4, dtype_size); + EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_DOUBLE, &dtype_size)); + EXPECT_EQ(8, dtype_size); + EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_FLOAT_COMPLEX, &dtype_size)); + EXPECT_EQ(8, dtype_size); + EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGetDataTypeSize(CUDECOMP_DOUBLE_COMPLEX, &dtype_size)); + EXPECT_EQ(16, dtype_size); +} + +TEST(ApiGetDataTypeSizeTest, RejectsInvalidArguments) { + int64_t dtype_size = 0; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetDataTypeSize(CUDECOMP_FLOAT, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetDataTypeSize(static_cast(999), &dtype_size)); +} + +TEST(ApiTransposeCommBackendToStringTest, ReturnsBackendNames) { + EXPECT_STREQ("MPI_P2P", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_P2P)); + EXPECT_STREQ("MPI_P2P (pipelined)", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL)); + EXPECT_STREQ("MPI_A2A", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_A2A)); + EXPECT_STREQ("NCCL", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NCCL)); + EXPECT_STREQ("NCCL (pipelined)", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NCCL_PL)); + EXPECT_STREQ("NVSHMEM", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NVSHMEM)); + EXPECT_STREQ("NVSHMEM (pipelined)", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL)); + EXPECT_STREQ("NVSHMEM_SM", cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM)); +} + +TEST(ApiTransposeCommBackendToStringTest, ReturnsErrorForInvalidBackend) { + EXPECT_STREQ("ERROR", cudecompTransposeCommBackendToString(static_cast(999))); +} + +TEST(ApiHaloCommBackendToStringTest, ReturnsBackendNames) { + EXPECT_STREQ("MPI", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_MPI)); + EXPECT_STREQ("MPI (blocking)", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_MPI_BLOCKING)); + EXPECT_STREQ("NCCL", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_NCCL)); + EXPECT_STREQ("NVSHMEM", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_NVSHMEM)); + EXPECT_STREQ("NVSHMEM (blocking)", cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING)); +} + +TEST(ApiHaloCommBackendToStringTest, ReturnsErrorForInvalidBackend) { + EXPECT_STREQ("ERROR", cudecompHaloCommBackendToString(static_cast(999))); +} + +class ApiMpiTestBase : public ::testing::Test { +protected: + void SetUp() override { + auto world_comm = cudecomp_test::MpiTestComm::world(); + if (world_comm.size() < kApiTestRanks) { + GTEST_SKIP() << "API tests require " << kApiTestRanks << " ranks, launched with " << world_comm.size(); + } + + active_comm_ = cudecomp_test::MpiTestComm::split(world_comm, kApiTestRanks); + if (!active_comm_.valid()) { GTEST_SKIP() << "inactive rank for " << kApiTestRanks << "-rank API case"; } + + const auto setup_decision = cudecomp_test::initializeGpuForTest(active_comm_); + ASSERT_FALSE(setup_decision.fail) << setup_decision.reason; + if (setup_decision.skip) { GTEST_SKIP() << setup_decision.reason; } + + const cudecompResult_t init_result = cudecompInit(&handle_, active_comm_.mpiComm()); + handle_guard_ = std::make_unique(handle_); + CHECK_CUDECOMP_GLOBAL(active_comm_, init_result); + } + + void TearDown() override { + handle_guard_.reset(); + active_comm_.reset(); + } + + cudecompGridDescConfig_t distributedConfig() { + cudecompGridDescConfig_t config; + EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config)); + setDistributedConfig(config); + return config; + } + + cudecompGridDescConfig_t emptyPencilConfig() { + auto config = distributedConfig(); + config.gdims_dist[0] = kGdims[0]; + config.gdims_dist[1] = 1; + config.gdims_dist[2] = kGdims[2]; + return config; + } + + cudecompGridDescAutotuneOptions_t fastAutotuneOptions() { + cudecompGridDescAutotuneOptions_t options; + EXPECT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescAutotuneOptionsSetDefaults(&options)); + options.n_warmup_trials = 0; + options.n_trials = 1; + options.dtype = CUDECOMP_FLOAT; + options.disable_nccl_backends = true; + options.disable_nvshmem_backends = true; + return options; + } + + void expectGridDescCreateInvalid(cudecompGridDescConfig_t config) { + cudecompGridDesc_t grid_desc = nullptr; + const cudecompResult_t result = cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, result); + if (grid_desc) { (void)cudecompGridDescDestroy(handle_, grid_desc); } + } + + cudecomp_test::MpiTestComm active_comm_; + cudecompHandle_t handle_ = nullptr; + std::unique_ptr handle_guard_; +}; + +class ApiInitTest : public ApiMpiTestBase {}; +class ApiFinalizeTest : public ApiMpiTestBase {}; +class ApiGridDescCreateTest : public ApiMpiTestBase {}; +class ApiGridDescDestroyTest : public ApiMpiTestBase {}; +class ApiGetGridDescConfigTest : public ApiMpiTestBase {}; +class ApiGetPencilInfoTest : public ApiMpiTestBase {}; +class ApiGetTransposeWorkspaceSizeTest : public ApiMpiTestBase {}; +class ApiGetHaloWorkspaceSizeTest : public ApiMpiTestBase {}; +class ApiGetShiftedRankTest : public ApiMpiTestBase {}; +class ApiMallocTest : public ApiMpiTestBase {}; +class ApiFreeTest : public ApiMpiTestBase {}; +class ApiTransposeTest : public ApiMpiTestBase {}; +class ApiHaloTest : public ApiMpiTestBase {}; + +TEST_F(ApiInitTest, RejectsInvalidArguments) { + cudecompHandle_t second_handle = nullptr; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompInit(&second_handle, active_comm_.mpiComm())); + EXPECT_EQ(nullptr, second_handle); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompInit(nullptr, active_comm_.mpiComm())); +} + +TEST_F(ApiFinalizeTest, RejectsInvalidArguments) { + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompFinalize(nullptr)); +} + +TEST_F(ApiGridDescCreateTest, RejectsInvalidConfigs) { + auto config = distributedConfig(); + config.pdims[0] = 1; + config.pdims[1] = 1; + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + config.pdims[0] = 0; + config.pdims[1] = 1; + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + config.rank_order = static_cast(999); + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + config.transpose_comm_backend = static_cast(999); + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + config.halo_comm_backend = static_cast(999); + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + config.transpose_mem_order[0][0] = 0; + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + setMemOrder(config, {0, 1, 1}); + expectGridDescCreateInvalid(config); + + config = distributedConfig(); + config.gdims_dist[0] = kGdims[0] + 1; + config.gdims_dist[1] = kGdims[1]; + config.gdims_dist[2] = kGdims[2]; + expectGridDescCreateInvalid(config); +} + +TEST_F(ApiGridDescCreateTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t unused_grid_desc = nullptr; + + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(nullptr, &unused_grid_desc, &config, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, nullptr, &config, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, &unused_grid_desc, nullptr, nullptr)); +} + +TEST_F(ApiGridDescCreateTest, RejectsInvalidAutotuneInputs) { + auto config = distributedConfig(); + config.pdims[0] = 0; + config.pdims[1] = 0; + cudecompGridDesc_t unused_grid_desc = nullptr; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, &unused_grid_desc, &config, nullptr)); + + config = distributedConfig(); + auto options = fastAutotuneOptions(); + options.grid_mode = static_cast(999); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescCreate(handle_, &unused_grid_desc, &config, &options)); +} + +TEST_F(ApiGridDescDestroyTest, RejectsInvalidArguments) { + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGridDescDestroy(handle_, nullptr)); +} + +TEST_F(ApiGetGridDescConfigTest, CreatePreservesConfigSettings) { + auto config = distributedConfig(); + setGdimsDist(config); + config.rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR; + config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_A2A; + config.halo_comm_backend = CUDECOMP_HALO_COMM_MPI_BLOCKING; + for (int i = 0; i < 3; ++i) { + config.transpose_axis_contiguous[i] = true; + } + + auto expected_config = config; + + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + cudecompGridDescConfig_t queried_config; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetGridDescConfig(handle_, grid_desc, &queried_config)); + expectGridDescConfigEquals(config, expected_config); + expectGridDescConfigEquals(queried_config, expected_config); +} + +TEST_F(ApiGetGridDescConfigTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + cudecompGridDescConfig_t queried_config; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetGridDescConfig(handle_, grid_desc, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetGridDescConfig(handle_, nullptr, &queried_config)); +} + +TEST_F(ApiGridDescCreateTest, AutotunesTransposeConfig) { + auto config = distributedConfig(); + config.pdims[0] = 0; + config.pdims[1] = 0; + auto options = fastAutotuneOptions(); + options.grid_mode = CUDECOMP_AUTOTUNE_GRID_TRANSPOSE; + options.autotune_transpose_backend = true; + + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, &options)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + EXPECT_GT(config.pdims[0], 0); + EXPECT_GT(config.pdims[1], 0); + EXPECT_EQ(kApiTestRanks, config.pdims[0] * config.pdims[1]); + EXPECT_TRUE(isMpiTransposeBackend(config.transpose_comm_backend)) + << cudecompTransposeCommBackendToString(config.transpose_comm_backend); + EXPECT_EQ(CUDECOMP_HALO_COMM_MPI, config.halo_comm_backend); + + cudecompGridDescConfig_t queried_config; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetGridDescConfig(handle_, grid_desc, &queried_config)); + expectGridDescConfigEquals(queried_config, config); +} + +TEST_F(ApiGridDescCreateTest, AutotunesHaloConfig) { + auto config = distributedConfig(); + config.pdims[0] = 0; + config.pdims[1] = 0; + auto options = fastAutotuneOptions(); + options.grid_mode = CUDECOMP_AUTOTUNE_GRID_HALO; + options.autotune_halo_backend = true; + for (int i = 0; i < 3; ++i) { + options.halo_extents[i] = kHaloExtents[i]; + options.halo_periods[i] = kHaloPeriods[i]; + options.halo_padding[i] = kPadding[i]; + } + + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, &options)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + EXPECT_GT(config.pdims[0], 0); + EXPECT_GT(config.pdims[1], 0); + EXPECT_EQ(kApiTestRanks, config.pdims[0] * config.pdims[1]); + EXPECT_EQ(CUDECOMP_TRANSPOSE_COMM_MPI_P2P, config.transpose_comm_backend); + EXPECT_TRUE(isMpiHaloBackend(config.halo_comm_backend)) << cudecompHaloCommBackendToString(config.halo_comm_backend); + + cudecompGridDescConfig_t queried_config; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetGridDescConfig(handle_, grid_desc, &queried_config)); + expectGridDescConfigEquals(queried_config, config); +} + +TEST_F(ApiGetPencilInfoTest, MatchesExpectedDefaultDecomposition) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + for (int axis = 0; axis < 3; ++axis) { + cudecompPencilInfo_t pinfo; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, axis, kHaloExtents.data(), + kPadding.data())); + expectPencilInfoEquals(pinfo, kExpectedDefaultPencilInfo[axis][active_comm_.rank()]); + } +} + +TEST_F(ApiGetPencilInfoTest, MatchesExpectedColumnMajorDecomposition) { + auto config = distributedConfig(); + config.rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR; + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + for (int axis = 0; axis < 3; ++axis) { + cudecompPencilInfo_t pinfo; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, axis, kHaloExtents.data(), + kPadding.data())); + expectPencilInfoEquals(pinfo, kExpectedColumnMajorPencilInfo[axis][active_comm_.rank()]); + } +} + +TEST_F(ApiGetPencilInfoTest, MatchesExpectedGdimsDistDecomposition) { + auto config = distributedConfig(); + setGdimsDist(config); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + for (int axis = 0; axis < 3; ++axis) { + cudecompPencilInfo_t pinfo; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, axis, kHaloExtents.data(), + kPadding.data())); + expectPencilInfoEquals(pinfo, kExpectedGdimsDistPencilInfo[axis][active_comm_.rank()]); + } +} + +TEST_F(ApiGetPencilInfoTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + cudecompPencilInfo_t pinfo; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetPencilInfo(handle_, grid_desc, nullptr, 0, nullptr, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetPencilInfo(handle_, grid_desc, &pinfo, -1, nullptr, nullptr)); +} + +TEST_F(ApiGetTransposeWorkspaceSizeTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int64_t workspace_size = 0; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetTransposeWorkspaceSize(handle_, grid_desc, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetTransposeWorkspaceSize(handle_, nullptr, &workspace_size)); +} + +TEST_F(ApiGetHaloWorkspaceSizeTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int64_t workspace_size = 0; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompGetHaloWorkspaceSize(handle_, grid_desc, 0, nullptr, &workspace_size)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompGetHaloWorkspaceSize(handle_, grid_desc, 0, kHaloExtents.data(), nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompGetHaloWorkspaceSize(handle_, grid_desc, 3, kHaloExtents.data(), &workspace_size)); +} + +TEST_F(ApiGetShiftedRankTest, ReturnsExpectedRanksForRowMajorLayout) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, false, {2, 3, -1, -1}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, -1, false, {-1, -1, 0, 1}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, true, {2, 3, 0, 1}); + + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, false, {1, -1, 3, -1}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, -1, false, {-1, 0, -1, 2}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, true, {1, 0, 3, 2}); +} + +TEST_F(ApiGetShiftedRankTest, ReturnsExpectedRanksForColumnMajorLayout) { + auto config = distributedConfig(); + config.rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR; + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, false, {1, -1, 3, -1}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, -1, false, {-1, 0, -1, 2}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 1, 1, true, {1, 0, 3, 2}); + + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, false, {2, 3, -1, -1}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, -1, false, {-1, -1, 0, 1}); + expectShiftedRanks(active_comm_, handle_, grid_desc, 0, 2, 1, true, {2, 3, 0, 1}); +} + +TEST_F(ApiGetShiftedRankTest, HandlesAxisAlignedAndZeroDisplacements) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int32_t shifted_rank = -2; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 1, 0, false, &shifted_rank)); + EXPECT_EQ(active_comm_.rank(), shifted_rank); + + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 0, 1, false, &shifted_rank)); + EXPECT_EQ(-1, shifted_rank); + + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 0, 1, true, &shifted_rank)); + EXPECT_EQ(active_comm_.rank(), shifted_rank); + + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGetShiftedRank(handle_, grid_desc, 0, 1, kPdims[0], true, &shifted_rank)); + EXPECT_EQ(active_comm_.rank(), shifted_rank); + + CHECK_CUDECOMP_GLOBAL(active_comm_, + cudecompGetShiftedRank(handle_, grid_desc, 0, 1, kPdims[0], false, &shifted_rank)); + EXPECT_EQ(-1, shifted_rank); +} + +TEST_F(ApiGetShiftedRankTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int32_t shifted_rank = 0; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetShiftedRank(handle_, grid_desc, 0, 1, 1, false, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetShiftedRank(handle_, grid_desc, 3, 1, 1, false, &shifted_rank)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompGetShiftedRank(handle_, grid_desc, 0, 3, 1, false, &shifted_rank)); +} + +TEST_F(ApiMallocTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + void* buffer = nullptr; + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompMalloc(handle_, grid_desc, nullptr, 16)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompMalloc(handle_, grid_desc, &buffer, 0)); +} + +TEST_F(ApiFreeTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompFree(nullptr, grid_desc, nullptr)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, cudecompFree(handle_, nullptr, nullptr)); +} + +TEST_F(ApiTransposeTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int placeholder = 0; + void* valid_pointer = &placeholder; + + // X-to-Y is a representative placeholder for all transpose APIs; the directional variants route to the same + // generic transpose implementation. + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompTransposeXToY(handle_, grid_desc, nullptr, valid_pointer, valid_pointer, CUDECOMP_FLOAT, nullptr, + nullptr, nullptr, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompTransposeXToY(handle_, grid_desc, valid_pointer, nullptr, valid_pointer, CUDECOMP_FLOAT, nullptr, + nullptr, nullptr, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompTransposeXToY(handle_, grid_desc, valid_pointer, valid_pointer, nullptr, CUDECOMP_FLOAT, nullptr, + nullptr, nullptr, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompTransposeXToY(handle_, grid_desc, valid_pointer, valid_pointer, valid_pointer, + static_cast(999), nullptr, nullptr, nullptr, nullptr, 0)); +} + +TEST_F(ApiTransposeTest, RejectsEmptyPencilDecomposition) { + auto config = emptyPencilConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int input = 0; + int output = 0; + int work = 0; + EXPECT_EQ(CUDECOMP_RESULT_NOT_SUPPORTED, + cudecompTransposeXToY(handle_, grid_desc, &input, &output, &work, CUDECOMP_FLOAT, nullptr, nullptr, nullptr, + nullptr, 0)); +} + +TEST_F(ApiHaloTest, RejectsInvalidArguments) { + auto config = distributedConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int placeholder = 0; + void* valid_pointer = &placeholder; + + // X-halo updates are a representative placeholder for all halo APIs; the directional variants route to the same + // generic halo implementation. + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, CUDECOMP_FLOAT, nullptr, + kHaloPeriods.data(), 0, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, CUDECOMP_FLOAT, kHaloExtents.data(), + nullptr, 0, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompUpdateHalosX(handle_, grid_desc, nullptr, valid_pointer, CUDECOMP_FLOAT, kHaloExtents.data(), + kHaloPeriods.data(), 0, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, nullptr, CUDECOMP_FLOAT, kHaloExtents.data(), + kHaloPeriods.data(), 0, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, CUDECOMP_FLOAT, kHaloExtents.data(), + kHaloPeriods.data(), 3, nullptr, 0)); + EXPECT_EQ(CUDECOMP_RESULT_INVALID_USAGE, + cudecompUpdateHalosX(handle_, grid_desc, valid_pointer, valid_pointer, static_cast(999), + kHaloExtents.data(), kHaloPeriods.data(), 0, nullptr, 0)); +} + +TEST_F(ApiHaloTest, RejectsEmptyPencilDecomposition) { + auto config = emptyPencilConfig(); + cudecompGridDesc_t grid_desc = nullptr; + CHECK_CUDECOMP_GLOBAL(active_comm_, cudecompGridDescCreate(handle_, &grid_desc, &config, nullptr)); + cudecomp_test::gridDescGuard grid_desc_guard(handle_, grid_desc); + + int input = 0; + int work = 0; + EXPECT_EQ(CUDECOMP_RESULT_NOT_SUPPORTED, + cudecompUpdateHalosX(handle_, grid_desc, &input, &work, CUDECOMP_FLOAT, kHaloExtents.data(), + kHaloPeriods.data(), 0, nullptr, 0)); +} + +} // namespace diff --git a/tests/ctest/backend_config.h.in b/tests/ctest/backend_config.h.in new file mode 100644 index 0000000..9ba017b --- /dev/null +++ b/tests/ctest/backend_config.h.in @@ -0,0 +1,11 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUDECOMP_TEST_BACKEND_CONFIG_H +#define CUDECOMP_TEST_BACKEND_CONFIG_H + +#cmakedefine01 CUDECOMP_TEST_ENABLE_NVSHMEM + +#endif diff --git a/tests/ctest/backend_test_context.cc b/tests/ctest/backend_test_context.cc new file mode 100644 index 0000000..093f780 --- /dev/null +++ b/tests/ctest/backend_test_context.cc @@ -0,0 +1,139 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backend_test_context.h" + +#include +#include +#include +#include + +#include "test_utils.h" + +namespace cudecomp_test { +namespace { + +constexpr const char* kKeepaliveBackendEnv = "CUDECOMP_TEST_KEEPALIVE_BACKEND"; + +class SharedBackendTestContextState { +public: + SharedBackendTestContextState(std::string backend_label, int active_ranks, MpiTestComm active_comm, + cudecompHandle_t handle) + : backend_label_(std::move(backend_label)), active_ranks_(active_ranks), active_comm_(std::move(active_comm)), + handle_(handle), handle_guard_(handle) {} + + const MpiTestComm& comm() const { return active_comm_; } + cudecompHandle_t handle() const { return handle_; } + + bool compatibleWith(const char* backend_label, int active_ranks) const { + return backend_label_ == backend_label && active_ranks_ == active_ranks; + } + + testing::AssertionResult ensureKeepaliveGridDesc(const cudecompGridDescConfig_t& config) { + if (keepalive_grid_desc_guard_) return testing::AssertionSuccess(); + + cudecompGridDescConfig_t keepalive_config = config; + cudecompGridDesc_t grid_desc = nullptr; + const cudecompResult_t result = cudecompGridDescCreate(handle_, &grid_desc, &keepalive_config, nullptr); + keepalive_grid_desc_guard_ = std::make_unique(handle_, grid_desc); + return checkCudecompGlobal(active_comm_, result, __FILE__, __LINE__); + } + +private: + std::string backend_label_; + int active_ranks_ = 0; + MpiTestComm active_comm_; + cudecompHandle_t handle_ = nullptr; + cudecompHandleGuard handle_guard_; + std::unique_ptr keepalive_grid_desc_guard_; +}; + +std::unique_ptr shared_context; + +std::string requestedKeepaliveBackend() { + const char* value = std::getenv(kKeepaliveBackendEnv); + if (!value || value[0] == '\0') return {}; + + const std::string backend(value); + if (backend == "nccl" || backend == "nvshmem") return backend; + return {}; +} + +} // namespace + +testing::AssertionResult BackendTestContext::initialize(const MpiTestComm& world_comm, int active_ranks, + const char* backend_label, bool check_nccl, + const cudecompGridDescConfig_t& config, + TestSetupDecision* setup_decision) { + if (!setup_decision) { return testing::AssertionFailure() << "test setup decision argument cannot be null"; } + + *setup_decision = {}; + active_comm_ = nullptr; + handle_ = nullptr; + + const std::string keepalive_backend = requestedKeepaliveBackend(); + const bool use_shared_context = + !keepalive_backend.empty() && keepalive_backend == backend_label && active_ranks == world_comm.size(); + + if (use_shared_context) { + if (shared_context && !shared_context->compatibleWith(backend_label, active_ranks)) { + resetSharedBackendTestContext(); + } + + if (!shared_context) { + auto active_comm = MpiTestComm::split(world_comm, active_ranks); + if (!active_comm.valid()) { + *setup_decision = {true, false, "inactive rank for shared backend test context"}; + return testing::AssertionSuccess(); + } + + *setup_decision = initializeGpuForTest(active_comm, check_nccl); + if (setup_decision->skip || setup_decision->fail) return testing::AssertionSuccess(); + + cudecompHandle_t handle = nullptr; + const cudecompResult_t init_result = cudecompInit(&handle, active_comm.mpiComm()); + auto state = + std::make_unique(backend_label, active_ranks, std::move(active_comm), handle); + + testing::AssertionResult init_status = checkCudecompGlobal(state->comm(), init_result, __FILE__, __LINE__); + if (!init_status) return init_status; + + testing::AssertionResult keepalive_status = state->ensureKeepaliveGridDesc(config); + if (!keepalive_status) return keepalive_status; + + shared_context = std::move(state); + } + + active_comm_ = &shared_context->comm(); + handle_ = shared_context->handle(); + return testing::AssertionSuccess(); + } + + resetSharedBackendTestContext(); + + local_active_comm_ = MpiTestComm::split(world_comm, active_ranks); + if (!local_active_comm_.valid()) { + *setup_decision = {true, false, + std::string("inactive rank for ") + std::to_string(active_ranks) + "-rank " + backend_label + + " test case"}; + return testing::AssertionSuccess(); + } + + *setup_decision = initializeGpuForTest(local_active_comm_, check_nccl); + if (setup_decision->skip || setup_decision->fail) return testing::AssertionSuccess(); + + const cudecompResult_t init_result = cudecompInit(&local_handle_, local_active_comm_.mpiComm()); + local_handle_guard_ = std::make_unique(local_handle_); + testing::AssertionResult init_status = checkCudecompGlobal(local_active_comm_, init_result, __FILE__, __LINE__); + if (!init_status) return init_status; + + active_comm_ = &local_active_comm_; + handle_ = local_handle_; + return testing::AssertionSuccess(); +} + +void resetSharedBackendTestContext() { shared_context.reset(); } + +} // namespace cudecomp_test diff --git a/tests/ctest/backend_test_context.h b/tests/ctest/backend_test_context.h new file mode 100644 index 0000000..1e63646 --- /dev/null +++ b/tests/ctest/backend_test_context.h @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUDECOMP_TEST_BACKEND_TEST_CONTEXT_H +#define CUDECOMP_TEST_BACKEND_TEST_CONTEXT_H + +#include + +#include + +#include "cudecomp.h" +#include "gpu_test_utils.h" +#include "mpi_test_utils.h" +#include "test_utils.h" + +namespace cudecomp_test { + +class BackendTestContext { +public: + testing::AssertionResult initialize(const MpiTestComm& world_comm, int active_ranks, const char* backend_label, + bool check_nccl, const cudecompGridDescConfig_t& config, + TestSetupDecision* setup_decision); + + const MpiTestComm& comm() const { return *active_comm_; } + cudecompHandle_t handle() const { return handle_; } + +private: + MpiTestComm local_active_comm_; + cudecompHandle_t local_handle_ = nullptr; + std::unique_ptr local_handle_guard_; + const MpiTestComm* active_comm_ = nullptr; + cudecompHandle_t handle_ = nullptr; +}; + +void resetSharedBackendTestContext(); + +} // namespace cudecomp_test + +#endif diff --git a/tests/ctest/backend_utils.cc b/tests/ctest/backend_utils.cc new file mode 100644 index 0000000..19d2e6a --- /dev/null +++ b/tests/ctest/backend_utils.cc @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "backend_utils.h" + +#include "backend_config.h" + +namespace cudecomp_test { + +std::vector transposeBackends() { + std::vector backends = { + {CUDECOMP_TRANSPOSE_COMM_MPI_P2P, "mpi-p2p", "mpi"}, {CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL, "mpi-p2p-pl", "mpi"}, + {CUDECOMP_TRANSPOSE_COMM_MPI_A2A, "mpi-a2a", "mpi"}, {CUDECOMP_TRANSPOSE_COMM_NCCL, "nccl", "nccl"}, + {CUDECOMP_TRANSPOSE_COMM_NCCL_PL, "nccl-pl", "nccl"}, + }; + +#if CUDECOMP_TEST_ENABLE_NVSHMEM + backends.push_back({CUDECOMP_TRANSPOSE_COMM_NVSHMEM, "nvshmem", "nvshmem"}); + backends.push_back({CUDECOMP_TRANSPOSE_COMM_NVSHMEM_PL, "nvshmem-pl", "nvshmem"}); + backends.push_back({CUDECOMP_TRANSPOSE_COMM_NVSHMEM_SM, "nvshmem-sm", "nvshmem"}); +#endif + + return backends; +} + +std::vector haloBackends() { + std::vector backends = { + {CUDECOMP_HALO_COMM_MPI, "mpi", "mpi"}, + {CUDECOMP_HALO_COMM_MPI_BLOCKING, "mpi-blocking", "mpi"}, + {CUDECOMP_HALO_COMM_NCCL, "nccl", "nccl"}, + }; + +#if CUDECOMP_TEST_ENABLE_NVSHMEM + backends.push_back({CUDECOMP_HALO_COMM_NVSHMEM, "nvshmem", "nvshmem"}); + backends.push_back({CUDECOMP_HALO_COMM_NVSHMEM_BLOCKING, "nvshmem-blocking", "nvshmem"}); +#endif + + return backends; +} + +} // namespace cudecomp_test diff --git a/tests/ctest/backend_utils.h b/tests/ctest/backend_utils.h new file mode 100644 index 0000000..b532d3d --- /dev/null +++ b/tests/ctest/backend_utils.h @@ -0,0 +1,32 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUDECOMP_TEST_BACKEND_UTILS_H +#define CUDECOMP_TEST_BACKEND_UTILS_H + +#include + +#include "cudecomp.h" + +namespace cudecomp_test { + +struct TransposeBackend { + cudecompTransposeCommBackend_t backend; + const char* name; + const char* label; +}; + +struct HaloBackend { + cudecompHaloCommBackend_t backend; + const char* name; + const char* label; +}; + +std::vector transposeBackends(); +std::vector haloBackends(); + +} // namespace cudecomp_test + +#endif diff --git a/tests/ctest/fortran_api_test.f90 b/tests/ctest/fortran_api_test.f90 new file mode 100644 index 0000000..37ed0a8 --- /dev/null +++ b/tests/ctest/fortran_api_test.f90 @@ -0,0 +1,420 @@ +! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +! SPDX-License-Identifier: Apache-2.0 + +program cudecomp_fortran_api_test + use, intrinsic :: iso_fortran_env, only: real32, real64 + use cudafor + use cudecomp + use mpi + + implicit none + + integer, parameter :: api_test_ranks = 4 + integer, parameter :: k_gdims(3) = [9, 10, 11] + integer, parameter :: k_gdims_dist(3) = [8, 9, 10] + integer, parameter :: k_pdims(2) = [2, 2] + integer, parameter :: k_halo_extents(3) = [1, 2, 1] + integer, parameter :: k_padding(3) = [1, 0, 2] + + integer :: rank = -1 + integer :: nranks = 0 + integer :: ierr = 0 + integer :: local_comm = MPI_COMM_NULL + integer :: local_rank = 0 + integer :: failures = 0 + integer :: global_failures = 0 + logical :: handle_initialized = .false. + type(cudecompHandle) :: handle + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr) + + if (nranks /= api_test_ranks) then + call record_failure("Fortran API test requires exactly 4 MPI ranks") + else + call initialize_gpu() + endif + + if (failures == 0) then + call expect_success(cudecompInit(handle, MPI_COMM_WORLD), "cudecompInit") + handle_initialized = failures == 0 + endif + + if (handle_initialized) then + call test_default_values() + call test_grid_descriptor_contracts(handle) + call test_descriptor_queries(handle) + call test_workspace_and_shifted_rank(handle) + call test_dtype_sizes_and_strings() + call test_typed_malloc_free(handle) + call expect_success(cudecompFinalize(handle), "cudecompFinalize") + endif + + call MPI_Allreduce(failures, global_failures, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) + if (rank == 0 .and. global_failures == 0) then + write(*, '("Fortran API test passed")') + elseif (rank == 0) then + write(*, '("Fortran API test failed with ", i0, " rank-local failure(s)")') global_failures + endif + + if (local_comm /= MPI_COMM_NULL) call MPI_Comm_free(local_comm, ierr) + call MPI_Finalize(ierr) + if (global_failures /= 0) call exit(1) + +contains + + subroutine initialize_gpu() + integer :: status + integer :: num_devices + + call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr) + call MPI_Comm_rank(local_comm, local_rank, ierr) + + status = cudaGetDeviceCount(num_devices) + if (status /= cudaSuccess) then + call record_failure("cudaGetDeviceCount failed") + return + endif + if (num_devices <= 0) then + call record_failure("Fortran API test requires at least one visible CUDA device") + return + endif + + status = cudaSetDevice(mod(local_rank, num_devices)) + if (status /= cudaSuccess) call record_failure("cudaSetDevice failed") + end subroutine initialize_gpu + + subroutine test_default_values() + type(cudecompGridDescConfig) :: config + type(cudecompGridDescAutotuneOptions) :: options + + call expect_success(cudecompGridDescConfigSetDefaults(config), "cudecompGridDescConfigSetDefaults") + call expect_equal_int(config%rank_order, CUDECOMP_RANK_ORDER_DEFAULT, "default rank order") + call expect_equal_int(config%transpose_comm_backend, CUDECOMP_TRANSPOSE_COMM_MPI_P2P, & + "default transpose backend") + call expect_equal_int(config%halo_comm_backend, CUDECOMP_HALO_COMM_MPI, "default halo backend") + call expect_int_array(config%pdims, [0, 0], "default pdims") + call expect_int_array(config%gdims, [0, 0, 0], "default gdims") + call expect_int_array(config%gdims_dist, [0, 0, 0], "default gdims_dist") + call expect_int_array(reshape(config%transpose_mem_order, [9]), [-1, -1, -1, -1, -1, -1, -1, -1, -1], & + "default transpose_mem_order") + if (any(config%transpose_axis_contiguous)) call record_failure("default axis-contiguous flags are not false") + + call expect_success(cudecompGridDescAutotuneOptionsSetDefaults(options), & + "cudecompGridDescAutotuneOptionsSetDefaults") + call expect_equal_int(options%n_warmup_trials, 3, "default warmup trials") + call expect_equal_int(options%n_trials, 5, "default trials") + call expect_equal_int(options%grid_mode, CUDECOMP_AUTOTUNE_GRID_TRANSPOSE, "default grid mode") + call expect_equal_int(options%dtype, CUDECOMP_DOUBLE, "default autotune dtype") + call expect_equal_int(options%halo_axis, 1, "default Fortran halo axis") + if (.not. options%allow_uneven_decompositions) call record_failure("default uneven decompositions flag is false") + if (options%autotune_transpose_backend) call record_failure("default transpose backend autotune flag is true") + if (options%autotune_halo_backend) call record_failure("default halo backend autotune flag is true") + end subroutine test_default_values + + subroutine test_grid_descriptor_contracts(handle) + type(cudecompHandle) :: handle + type(cudecompGridDescConfig) :: config + type(cudecompGridDescConfig) :: expected_config + type(cudecompGridDescConfig) :: queried_config + type(cudecompGridDescAutotuneOptions) :: options + type(cudecompGridDesc) :: grid_desc + integer :: res + + call setup_explicit_config(config) + expected_config = config + res = cudecompGridDescCreate(handle, grid_desc, config) + call expect_success(res, "cudecompGridDescCreate without options") + call expect_config_equal(config, expected_config, "config restored after create without options") + if (res == CUDECOMP_RESULT_SUCCESS) then + call expect_success(cudecompGetGridDescConfig(handle, grid_desc, queried_config), & + "cudecompGetGridDescConfig without options") + call expect_config_equal(queried_config, expected_config, "queried config without options") + call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor without options") + endif + + call setup_explicit_config(config) + expected_config = config + call expect_success(cudecompGridDescAutotuneOptionsSetDefaults(options), & + "set defaults before create with options") + options%n_warmup_trials = 0 + options%n_trials = 1 + options%dtype = CUDECOMP_FLOAT + options%disable_nccl_backends = .true. + options%disable_nvshmem_backends = .true. + options%halo_axis = 3 + options%halo_extents = k_halo_extents + options%halo_periods = [.false., .true., .false.] + options%halo_padding = k_padding + + res = cudecompGridDescCreate(handle, grid_desc, config, options) + call expect_success(res, "cudecompGridDescCreate with options") + call expect_config_equal(config, expected_config, "config restored after create with options") + call expect_equal_int(options%halo_axis, 3, "options halo_axis restored after create") + if (res == CUDECOMP_RESULT_SUCCESS) then + call expect_success(cudecompGetGridDescConfig(handle, grid_desc, queried_config), & + "cudecompGetGridDescConfig with options") + call expect_config_equal(queried_config, expected_config, "queried config with options") + call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor with options") + endif + end subroutine test_grid_descriptor_contracts + + subroutine test_descriptor_queries(handle) + type(cudecompHandle) :: handle + type(cudecompGridDescConfig) :: config + type(cudecompGridDesc) :: grid_desc + type(cudecompPencilInfo) :: pinfo + integer :: res + + call setup_distributed_config(config) + res = cudecompGridDescCreate(handle, grid_desc, config) + call expect_success(res, "create descriptor for pencil queries") + if (res /= CUDECOMP_RESULT_SUCCESS) return + + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, 1), & + "cudecompGetPencilInfo without optional arrays") + call expect_axis1_pencil_without_halo(pinfo) + + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, 2, k_halo_extents, k_padding), & + "cudecompGetPencilInfo with halo and padding") + call expect_int_array(pinfo%order, [1, 2, 3], "Fortran pencil order is one-based") + call expect_int_array(pinfo%halo_extents, k_halo_extents, "explicit pencil halo extents") + call expect_int_array(pinfo%padding, k_padding, "explicit pencil padding") + if (minval(pinfo%lo) < 1) call record_failure("Fortran pencil lower bounds are not one-based") + if (minval(pinfo%hi) < 1) call record_failure("Fortran pencil upper bounds are not one-based") + + call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor for pencil queries") + end subroutine test_descriptor_queries + + subroutine test_workspace_and_shifted_rank(handle) + type(cudecompHandle) :: handle + type(cudecompGridDescConfig) :: config + type(cudecompGridDesc) :: grid_desc + integer :: res + integer(8) :: workspace_size + integer :: shifted_rank + integer, parameter :: expected_shifted_ranks(api_test_ranks) = [2, 3, -1, -1] + + call setup_distributed_config(config) + res = cudecompGridDescCreate(handle, grid_desc, config) + call expect_success(res, "create descriptor for workspace queries") + if (res /= CUDECOMP_RESULT_SUCCESS) return + + workspace_size = -1_8 + call expect_success(cudecompGetTransposeWorkspaceSize(handle, grid_desc, workspace_size), & + "cudecompGetTransposeWorkspaceSize") + if (workspace_size < 0_8) call record_failure("transpose workspace size is negative") + + workspace_size = -1_8 + call expect_success(cudecompGetHaloWorkspaceSize(handle, grid_desc, 2, k_halo_extents, workspace_size), & + "cudecompGetHaloWorkspaceSize") + if (workspace_size < 0_8) call record_failure("halo workspace size is negative") + + shifted_rank = -2 + call expect_success(cudecompGetShiftedRank(handle, grid_desc, 1, 2, 1, .false., shifted_rank), & + "cudecompGetShiftedRank") + call expect_equal_int(shifted_rank, expected_shifted_ranks(rank + 1), "one-based shifted rank query") + + call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor for workspace queries") + end subroutine test_workspace_and_shifted_rank + + subroutine test_dtype_sizes_and_strings() + integer(8) :: dtype_size + + call expect_success(cudecompGetDataTypeSize(CUDECOMP_FLOAT, dtype_size), "float dtype size query") + call expect_equal_integer8(dtype_size, 4_8, "float dtype size") + call expect_success(cudecompGetDataTypeSize(CUDECOMP_DOUBLE, dtype_size), "double dtype size query") + call expect_equal_integer8(dtype_size, 8_8, "double dtype size") + call expect_success(cudecompGetDataTypeSize(CUDECOMP_FLOAT_COMPLEX, dtype_size), "float complex dtype size query") + call expect_equal_integer8(dtype_size, 8_8, "float complex dtype size") + call expect_success(cudecompGetDataTypeSize(CUDECOMP_DOUBLE_COMPLEX, dtype_size), & + "double complex dtype size query") + call expect_equal_integer8(dtype_size, 16_8, "double complex dtype size") + + call expect_string(cudecompTransposeCommBackendToString(CUDECOMP_TRANSPOSE_COMM_MPI_P2P), "MPI_P2P", & + "transpose backend string") + call expect_string(cudecompHaloCommBackendToString(CUDECOMP_HALO_COMM_MPI_BLOCKING), "MPI (blocking)", & + "halo backend string") + end subroutine test_dtype_sizes_and_strings + + subroutine test_typed_malloc_free(handle) + type(cudecompHandle) :: handle + type(cudecompGridDescConfig) :: config + type(cudecompGridDesc) :: grid_desc + real(real32), pointer, device, contiguous :: r4(:) + real(real64), pointer, device, contiguous :: r8(:) + complex(real32), pointer, device, contiguous :: c4(:) + complex(real64), pointer, device, contiguous :: c8(:) + integer :: res + + nullify(r4) + nullify(r8) + nullify(c4) + nullify(c8) + + call setup_distributed_config(config) + res = cudecompGridDescCreate(handle, grid_desc, config) + call expect_success(res, "create descriptor for typed allocation") + if (res /= CUDECOMP_RESULT_SUCCESS) return + + call expect_success(cudecompMalloc(handle, grid_desc, r4, 4_8), "cudecompMalloc real32") + if (.not. associated(r4)) call record_failure("real32 device pointer is not associated") + call expect_success(cudecompFree(handle, grid_desc, r4), "cudecompFree real32") + + call expect_success(cudecompMalloc(handle, grid_desc, r8, 4_8), "cudecompMalloc real64") + if (.not. associated(r8)) call record_failure("real64 device pointer is not associated") + call expect_success(cudecompFree(handle, grid_desc, r8), "cudecompFree real64") + + call expect_success(cudecompMalloc(handle, grid_desc, c4, 4_8), "cudecompMalloc complex32") + if (.not. associated(c4)) call record_failure("complex32 device pointer is not associated") + call expect_success(cudecompFree(handle, grid_desc, c4), "cudecompFree complex32") + + call expect_success(cudecompMalloc(handle, grid_desc, c8, 4_8), "cudecompMalloc complex64") + if (.not. associated(c8)) call record_failure("complex64 device pointer is not associated") + call expect_success(cudecompFree(handle, grid_desc, c8), "cudecompFree complex64") + + call expect_success(cudecompGridDescDestroy(handle, grid_desc), "destroy descriptor for typed allocation") + end subroutine test_typed_malloc_free + + subroutine setup_distributed_config(config) + type(cudecompGridDescConfig) :: config + + call expect_success(cudecompGridDescConfigSetDefaults(config), "set distributed config defaults") + config%gdims = k_gdims + config%pdims = k_pdims + end subroutine setup_distributed_config + + subroutine setup_explicit_config(config) + type(cudecompGridDescConfig) :: config + + call setup_distributed_config(config) + config%gdims_dist = k_gdims_dist + config%rank_order = CUDECOMP_RANK_ORDER_COL_MAJOR + config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_A2A + config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI_BLOCKING + config%transpose_axis_contiguous = [.true., .false., .true.] + config%transpose_mem_order(:, 1) = [3, 2, 1] + config%transpose_mem_order(:, 2) = [1, 3, 2] + config%transpose_mem_order(:, 3) = [2, 1, 3] + end subroutine setup_explicit_config + + subroutine expect_axis1_pencil_without_halo(pinfo) + type(cudecompPencilInfo) :: pinfo + integer :: expected_shape(3) + integer :: expected_lo(3) + integer :: expected_hi(3) + integer(8) :: expected_size + + select case (rank) + case (0) + expected_shape = [9, 5, 6] + expected_lo = [1, 1, 1] + expected_hi = [9, 5, 6] + case (1) + expected_shape = [9, 5, 5] + expected_lo = [1, 1, 7] + expected_hi = [9, 5, 11] + case (2) + expected_shape = [9, 5, 6] + expected_lo = [1, 6, 1] + expected_hi = [9, 10, 6] + case default + expected_shape = [9, 5, 5] + expected_lo = [1, 6, 7] + expected_hi = [9, 10, 11] + end select + + expected_size = product(expected_shape) + call expect_int_array(pinfo%shape, expected_shape, "axis-1 shape without halo") + call expect_int_array(pinfo%lo, expected_lo, "axis-1 lo without halo") + call expect_int_array(pinfo%hi, expected_hi, "axis-1 hi without halo") + call expect_int_array(pinfo%order, [1, 2, 3], "axis-1 order without halo") + call expect_int_array(pinfo%halo_extents, [0, 0, 0], "axis-1 omitted halo defaults") + call expect_int_array(pinfo%padding, [0, 0, 0], "axis-1 omitted padding defaults") + call expect_equal_integer8(pinfo%size, expected_size, "axis-1 size without halo") + end subroutine expect_axis1_pencil_without_halo + + subroutine expect_config_equal(actual, expected, context) + type(cudecompGridDescConfig) :: actual + type(cudecompGridDescConfig) :: expected + character(len=*) :: context + + call expect_int_array(actual%gdims, expected%gdims, trim(context)//" gdims") + call expect_int_array(actual%gdims_dist, expected%gdims_dist, trim(context)//" gdims_dist") + call expect_int_array(actual%pdims, expected%pdims, trim(context)//" pdims") + call expect_equal_int(actual%rank_order, expected%rank_order, trim(context)//" rank_order") + call expect_equal_int(actual%transpose_comm_backend, expected%transpose_comm_backend, & + trim(context)//" transpose backend") + call expect_equal_int(actual%halo_comm_backend, expected%halo_comm_backend, trim(context)//" halo backend") + if (any(actual%transpose_axis_contiguous .neqv. expected%transpose_axis_contiguous)) then + call record_failure(trim(context)//" axis-contiguous flags mismatch") + endif + call expect_int_array(reshape(actual%transpose_mem_order, [9]), reshape(expected%transpose_mem_order, [9]), & + trim(context)//" transpose_mem_order") + end subroutine expect_config_equal + + subroutine expect_success(result, context) + integer :: result + character(len=*) :: context + + if (result /= CUDECOMP_RESULT_SUCCESS) then + write(*, *) "rank", rank, ": FAIL", trim(context), "returned", result + failures = failures + 1 + endif + end subroutine expect_success + + subroutine expect_equal_int(actual, expected, context) + integer :: actual + integer :: expected + character(len=*) :: context + + if (actual /= expected) then + write(*, *) "rank", rank, ": FAIL", trim(context), "expected", expected, "actual", actual + failures = failures + 1 + endif + end subroutine expect_equal_int + + subroutine expect_equal_integer8(actual, expected, context) + integer(8) :: actual + integer(8) :: expected + character(len=*) :: context + + if (actual /= expected) then + write(*, *) "rank", rank, ": FAIL", trim(context), "expected", expected, "actual", actual + failures = failures + 1 + endif + end subroutine expect_equal_integer8 + + subroutine expect_int_array(actual, expected, context) + integer :: actual(:) + integer :: expected(:) + character(len=*) :: context + + if (size(actual) /= size(expected)) then + call record_failure(trim(context)//" array sizes differ") + elseif (any(actual /= expected)) then + write(*, *) "rank", rank, ": FAIL", trim(context), "expected", expected, "actual", actual + failures = failures + 1 + endif + end subroutine expect_int_array + + subroutine expect_string(actual, expected, context) + character(len=*) :: actual + character(len=*) :: expected + character(len=*) :: context + + if (actual /= expected) then + write(*, *) "rank", rank, ": FAIL", trim(context), "expected", trim(expected), "actual", trim(actual) + failures = failures + 1 + endif + end subroutine expect_string + + subroutine record_failure(context) + character(len=*) :: context + + write(*, *) "rank", rank, ": FAIL", trim(context) + failures = failures + 1 + end subroutine record_failure + +end program cudecomp_fortran_api_test diff --git a/tests/ctest/fortran_halo_case.inc b/tests/ctest/fortran_halo_case.inc new file mode 100644 index 0000000..9583b56 --- /dev/null +++ b/tests/ctest/fortran_halo_case.inc @@ -0,0 +1,233 @@ +! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +! SPDX-License-Identifier: Apache-2.0 + +subroutine RUN_HALO_CASE() + implicit none + + ! Pass explicit padding while using a non-default decomposition layout. Each + ! scenario below runs X, Y, and Z halo updates for all update dimensions. + call RUN_HALO_SCENARIO("custom_layout_padded_halo", k_halo_extents, k_explicit_padding, k_halo_periods, & + k_mixed_mem_order, k_default_axis_contiguous, CUDECOMP_RANK_ORDER_COL_MAJOR, .true.) + ! Omit the Fortran-only optional padding argument and verify the default + ! unpadded pencil shape still updates halos correctly. + call RUN_HALO_SCENARIO("default_padding", k_halo_extents, k_zero_extents, k_halo_periods, k_default_mem_order, & + k_default_axis_contiguous, CUDECOMP_RANK_ORDER_DEFAULT, .false.) +end subroutine RUN_HALO_CASE + +subroutine RUN_HALO_SCENARIO(scenario, halo_extents, padding, halo_periods, mem_order, axis_contiguous, rank_order, & + use_optional_padding) + implicit none + + character(len=*), intent(in) :: scenario + integer, intent(in) :: halo_extents(3) + integer, intent(in) :: padding(3) + logical, intent(in) :: halo_periods(3) + integer, intent(in) :: mem_order(3, 3) + logical, intent(in) :: axis_contiguous(3) + integer, intent(in) :: rank_order + logical, intent(in) :: use_optional_padding + integer :: axis + + do axis = 1, 3 + call RUN_HALO_AXIS(scenario, axis, halo_extents, padding, halo_periods, mem_order, axis_contiguous, rank_order, & + use_optional_padding) + enddo +end subroutine RUN_HALO_SCENARIO + +subroutine RUN_HALO_AXIS(scenario, axis, halo_extents, padding, halo_periods, mem_order, axis_contiguous, rank_order, & + use_optional_padding) + implicit none + + character(len=*), intent(in) :: scenario + integer, intent(in) :: axis + integer, intent(in) :: halo_extents(3) + integer, intent(in) :: padding(3) + logical, intent(in) :: halo_periods(3) + integer, intent(in) :: mem_order(3, 3) + logical, intent(in) :: axis_contiguous(3) + integer, intent(in) :: rank_order + logical, intent(in) :: use_optional_padding + type(cudecompGridDescConfig) :: config + type(cudecompGridDesc) :: grid_desc + type(cudecompPencilInfo) :: pinfo + integer(int64) :: workspace_num_elements + integer :: dim + integer :: res + logical :: grid_desc_created + logical :: work_allocated + ARRTYPE, allocatable :: initial(:) + ARRTYPE, allocatable :: expected(:) + ARRTYPE, allocatable :: actual(:) + ARRTYPE, allocatable, device, target :: data_d(:) + ARRTYPE, pointer, device, contiguous :: work_d(:) + + grid_desc_created = .false. + work_allocated = .false. + nullify(work_d) + + call setup_halo_config(config, axis_contiguous, mem_order, rank_order) + res = cudecompGridDescCreate(handle, grid_desc, config) + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompGridDescCreate") + if (res /= CUDECOMP_RESULT_SUCCESS) return + grid_desc_created = .true. + + if (use_optional_padding) then + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents, padding), & + DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" pencil info") + else + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents), & + DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" pencil info") + endif + call expect_success(cudecompGetHaloWorkspaceSize(handle, grid_desc, axis, halo_extents, workspace_num_elements), & + DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" halo workspace size") + + allocate(data_d(pinfo%size)) + call INITIALIZE_HALO_PENCIL(initial, pinfo) + call INITIALIZE_HALO_REFERENCE(expected, pinfo, halo_periods) + + res = cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements) + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompMalloc workspace") + if (res == CUDECOMP_RESULT_SUCCESS) work_allocated = .true. + + if (work_allocated) then + data_d = UNSET_HALO_VALUE() + data_d(1:pinfo%size) = initial + work_d = 0 + + do dim = 1, 3 + select case (axis) + case (1) + if (use_optional_padding) then + res = cudecompUpdateHalosX(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, & + dim, pinfo%padding) + else + res = cudecompUpdateHalosX(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, dim) + endif + case (2) + if (use_optional_padding) then + res = cudecompUpdateHalosY(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, & + dim, pinfo%padding) + else + res = cudecompUpdateHalosY(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, dim) + endif + case (3) + if (use_optional_padding) then + res = cudecompUpdateHalosZ(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, & + dim, pinfo%padding) + else + res = cudecompUpdateHalosZ(handle, grid_desc, data_d, work_d, DTYPE, pinfo%halo_extents, halo_periods, dim) + endif + end select + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompUpdateHalos") + enddo + + allocate(actual(pinfo%size)) + actual = data_d(1:pinfo%size) + call EXPECT_HALO_PENCIL_MATCH(expected, actual, pinfo, DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)) + deallocate(actual) + endif + + if (work_allocated) call expect_success(cudecompFree(handle, grid_desc, work_d), & + DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)//" cudecompFree") + if (grid_desc_created) call expect_success(cudecompGridDescDestroy(handle, grid_desc), & + DTYPE_NAME//" "//trim(scenario)//" axis "//axis_name(axis)// & + " cudecompGridDescDestroy") +end subroutine RUN_HALO_AXIS + +subroutine INITIALIZE_HALO_PENCIL(values, pinfo) + implicit none + + ARRTYPE, allocatable, intent(out) :: values(:) + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64) :: i + integer(int64) :: local(3) + integer(int64) :: global(3) + + allocate(values(pinfo%size)) + values = UNSET_HALO_VALUE() + + do i = 1_int64, pinfo%size + call local_coordinate(i, pinfo, local) + if (.not. is_internal_coordinate(pinfo, local)) cycle + + call global_coordinate(pinfo, local, global) + values(i) = HALO_PENCIL_VALUE(global_linear_index(global)) + enddo +end subroutine INITIALIZE_HALO_PENCIL + +subroutine INITIALIZE_HALO_REFERENCE(values, pinfo, halo_periods) + implicit none + + ARRTYPE, allocatable, intent(out) :: values(:) + type(cudecompPencilInfo), intent(in) :: pinfo + logical, intent(in) :: halo_periods(3) + integer(int64) :: i + integer(int64) :: local(3) + integer(int64) :: global(3) + integer :: dim + logical :: unset + + allocate(values(pinfo%size)) + values = UNSET_HALO_VALUE() + + do i = 1_int64, pinfo%size + call local_coordinate(i, pinfo, local) + call global_coordinate(pinfo, local, global) + unset = is_padding_coordinate(pinfo, local) + + do dim = 1, 3 + if (global(dim) >= 1_int64 .and. global(dim) <= int(k_gdims(dim), int64)) cycle + + if (halo_periods(dim)) then + global(dim) = wrap_index(global(dim), int(k_gdims(dim), int64)) + else + unset = .true. + endif + enddo + + if (.not. unset) values(i) = HALO_PENCIL_VALUE(global_linear_index(global)) + enddo +end subroutine INITIALIZE_HALO_REFERENCE + +function HALO_PENCIL_VALUE(linear_index) result(value) + implicit none + + integer(int64), intent(in) :: linear_index + ARRTYPE :: value + + value = linear_index +end function HALO_PENCIL_VALUE + +function UNSET_HALO_VALUE() result(value) + implicit none + + ARRTYPE :: value + + value = -1 +end function UNSET_HALO_VALUE + +subroutine EXPECT_HALO_PENCIL_MATCH(expected, actual, pinfo, context) + implicit none + + ARRTYPE, intent(in) :: expected(:) + ARRTYPE, intent(in) :: actual(:) + type(cudecompPencilInfo), intent(in) :: pinfo + character(len=*), intent(in) :: context + integer(int64) :: i + integer(int64) :: local(3) + + if (size(expected, kind=int64) /= pinfo%size .or. size(actual, kind=int64) /= pinfo%size) then + call record_failure(trim(context)//" result size mismatch") + return + endif + + do i = 1_int64, pinfo%size + if (expected(i) == actual(i)) cycle + + call local_coordinate(i, pinfo, local) + write(*, *) "rank", rank, ": FAIL", trim(context), "mismatch at local coordinate", local, & + "expected", expected(i), "actual", actual(i) + failures = failures + 1 + return + enddo +end subroutine EXPECT_HALO_PENCIL_MATCH diff --git a/tests/ctest/fortran_halo_tests.f90 b/tests/ctest/fortran_halo_tests.f90 new file mode 100644 index 0000000..92adcf4 --- /dev/null +++ b/tests/ctest/fortran_halo_tests.f90 @@ -0,0 +1,346 @@ +! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +! SPDX-License-Identifier: Apache-2.0 + +module cudecomp_fortran_halo_tests + use, intrinsic :: iso_fortran_env, only: int64, real32, real64 + use cudafor + use cudecomp + use mpi + + implicit none + + ! Standalone CTest fixture for the Fortran halo API. The included fixture body + ! is instantiated once per dtype so the axis and optional-padding matrix stays + ! identical across real and complex coverage. + integer, parameter :: halo_test_ranks = 4 + integer, parameter :: k_gdims(3) = [9, 10, 11] + integer, parameter :: k_gdims_dist(3) = [8, 8, 8] + integer, parameter :: k_pdims(2) = [2, 2] + integer, parameter :: k_halo_extents(3) = [1, 1, 1] + integer, parameter :: k_zero_extents(3) = [0, 0, 0] + integer, parameter :: k_explicit_padding(3) = [1, 0, 2] + logical, parameter :: k_halo_periods(3) = [.true., .false., .true.] + logical, parameter :: k_default_axis_contiguous(3) = [.false., .false., .false.] + integer, parameter :: k_default_mem_order(3, 3) = reshape([-1, -1, -1, -1, -1, -1, -1, -1, -1], [3, 3]) + integer, parameter :: k_mixed_mem_order(3, 3) = reshape([3, 2, 1, 3, 2, 1, 3, 2, 1], [3, 3]) + + integer :: rank = -1 + integer :: nranks = 0 + integer :: ierr = 0 + integer :: local_comm = MPI_COMM_NULL + integer :: local_rank = 0 + integer :: failures = 0 + integer :: global_failures = 0 + logical :: handle_initialized = .false. + type(cudecompHandle) :: handle + +contains + + subroutine run_all_tests() + implicit none + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr) + + if (nranks /= halo_test_ranks) then + call record_failure("Fortran halo test requires exactly 4 MPI ranks") + else + call initialize_gpu() + endif + + if (failures == 0) then + call expect_success(cudecompInit(handle, MPI_COMM_WORLD), "cudecompInit") + handle_initialized = failures == 0 + endif + + if (handle_initialized) then + call run_halo_r32() + call run_halo_r64() + call run_halo_c32() + call run_halo_c64() + call expect_success(cudecompFinalize(handle), "cudecompFinalize") + endif + + call MPI_Allreduce(failures, global_failures, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) + if (rank == 0 .and. global_failures == 0) then + write(*, '("Fortran halo test passed")') + elseif (rank == 0) then + write(*, '("Fortran halo test failed with ", i0, " rank-local failure(s)")') global_failures + endif + + if (local_comm /= MPI_COMM_NULL) call MPI_Comm_free(local_comm, ierr) + call MPI_Finalize(ierr) + if (global_failures /= 0) call exit(1) + end subroutine run_all_tests + + subroutine initialize_gpu() + implicit none + + integer :: status + integer :: num_devices + + call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr) + call MPI_Comm_rank(local_comm, local_rank, ierr) + + status = cudaGetDeviceCount(num_devices) + if (status /= cudaSuccess) then + call record_failure("cudaGetDeviceCount failed") + return + endif + if (num_devices <= 0) then + call record_failure("Fortran halo test requires at least one visible CUDA device") + return + endif + + status = cudaSetDevice(mod(local_rank, num_devices)) + if (status /= cudaSuccess) call record_failure("cudaSetDevice failed") + end subroutine initialize_gpu + + subroutine setup_halo_config(config, axis_contiguous, mem_order, rank_order) + implicit none + + type(cudecompGridDescConfig), intent(out) :: config + logical, intent(in) :: axis_contiguous(3) + integer, intent(in) :: mem_order(3, 3) + integer, intent(in) :: rank_order + + call expect_success(cudecompGridDescConfigSetDefaults(config), "cudecompGridDescConfigSetDefaults") + config%gdims = k_gdims + config%gdims_dist = k_gdims_dist + config%pdims = k_pdims + config%rank_order = rank_order + config%halo_comm_backend = CUDECOMP_HALO_COMM_MPI + config%transpose_axis_contiguous = axis_contiguous + config%transpose_mem_order = mem_order + end subroutine setup_halo_config + + function axis_name(axis) result(name) + implicit none + + integer, intent(in) :: axis + character(len=1) :: name + + select case (axis) + case (1) + name = "X" + case (2) + name = "Y" + case (3) + name = "Z" + case default + name = "?" + end select + end function axis_name + + function pencil_lower(pinfo, dim) result(lower) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer, intent(in) :: dim + integer(int64) :: lower + + lower = int(pinfo%lo(dim), int64) - int(pinfo%halo_extents(pinfo%order(dim)), int64) + end function pencil_lower + + subroutine local_coordinate(linear_index, pinfo, local) + implicit none + + integer(int64), intent(in) :: linear_index + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(out) :: local(3) + integer(int64) :: offset + integer(int64) :: shape_1 + integer(int64) :: shape_2 + + offset = linear_index - 1_int64 + shape_1 = int(pinfo%shape(1), int64) + shape_2 = int(pinfo%shape(2), int64) + local(1) = pencil_lower(pinfo, 1) + modulo(offset, shape_1) + local(2) = pencil_lower(pinfo, 2) + modulo(offset / shape_1, shape_2) + local(3) = pencil_lower(pinfo, 3) + offset / (shape_1 * shape_2) + end subroutine local_coordinate + + function is_internal_coordinate(pinfo, local) result(is_internal) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(in) :: local(3) + logical :: is_internal + + is_internal = local(1) >= int(pinfo%lo(1), int64) .and. local(1) <= int(pinfo%hi(1), int64) .and. & + local(2) >= int(pinfo%lo(2), int64) .and. local(2) <= int(pinfo%hi(2), int64) .and. & + local(3) >= int(pinfo%lo(3), int64) .and. local(3) <= int(pinfo%hi(3), int64) + end function is_internal_coordinate + + function is_padding_coordinate(pinfo, local) result(is_padding) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(in) :: local(3) + logical :: is_padding + + is_padding = local(1) > int(pinfo%hi(1), int64) + int(pinfo%halo_extents(pinfo%order(1)), int64) .or. & + local(2) > int(pinfo%hi(2), int64) + int(pinfo%halo_extents(pinfo%order(2)), int64) .or. & + local(3) > int(pinfo%hi(3), int64) + int(pinfo%halo_extents(pinfo%order(3)), int64) + end function is_padding_coordinate + + subroutine global_coordinate(pinfo, local, global) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(in) :: local(3) + integer(int64), intent(out) :: global(3) + integer :: dim + + do dim = 1, 3 + global(pinfo%order(dim)) = local(dim) + enddo + end subroutine global_coordinate + + function wrap_index(index, size) result(wrapped) + implicit none + + integer(int64), intent(in) :: index + integer(int64), intent(in) :: size + integer(int64) :: wrapped + + wrapped = modulo(index - 1_int64, size) + 1_int64 + end function wrap_index + + function global_linear_index(global) result(linear_index) + implicit none + + integer(int64), intent(in) :: global(3) + integer(int64) :: linear_index + + linear_index = global(1) + int(k_gdims(1), int64) * & + ((global(2) - 1_int64) + (global(3) - 1_int64) * int(k_gdims(2), int64)) + end function global_linear_index + + subroutine expect_success(result, context) + implicit none + + integer, intent(in) :: result + character(len=*), intent(in) :: context + + if (result /= CUDECOMP_RESULT_SUCCESS) then + write(*, *) "rank", rank, ": FAIL", trim(context), "returned", result + failures = failures + 1 + endif + end subroutine expect_success + + subroutine record_failure(context) + implicit none + + character(len=*), intent(in) :: context + + write(*, *) "rank", rank, ": FAIL", trim(context) + failures = failures + 1 + end subroutine record_failure + +#define ARRTYPE real(real32) +#define DTYPE CUDECOMP_FLOAT +#define DTYPE_NAME "R32" +#define RUN_HALO_CASE run_halo_r32 +#define RUN_HALO_SCENARIO run_halo_scenario_r32 +#define RUN_HALO_AXIS run_halo_axis_r32 +#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_r32 +#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_r32 +#define HALO_PENCIL_VALUE halo_pencil_value_r32 +#define UNSET_HALO_VALUE unset_halo_value_r32 +#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_r32 +#include "fortran_halo_case.inc" +#undef EXPECT_HALO_PENCIL_MATCH +#undef UNSET_HALO_VALUE +#undef HALO_PENCIL_VALUE +#undef INITIALIZE_HALO_REFERENCE +#undef INITIALIZE_HALO_PENCIL +#undef RUN_HALO_AXIS +#undef RUN_HALO_SCENARIO +#undef RUN_HALO_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +#define ARRTYPE real(real64) +#define DTYPE CUDECOMP_DOUBLE +#define DTYPE_NAME "R64" +#define RUN_HALO_CASE run_halo_r64 +#define RUN_HALO_SCENARIO run_halo_scenario_r64 +#define RUN_HALO_AXIS run_halo_axis_r64 +#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_r64 +#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_r64 +#define HALO_PENCIL_VALUE halo_pencil_value_r64 +#define UNSET_HALO_VALUE unset_halo_value_r64 +#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_r64 +#include "fortran_halo_case.inc" +#undef EXPECT_HALO_PENCIL_MATCH +#undef UNSET_HALO_VALUE +#undef HALO_PENCIL_VALUE +#undef INITIALIZE_HALO_REFERENCE +#undef INITIALIZE_HALO_PENCIL +#undef RUN_HALO_AXIS +#undef RUN_HALO_SCENARIO +#undef RUN_HALO_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +#define ARRTYPE complex(real32) +#define DTYPE CUDECOMP_FLOAT_COMPLEX +#define DTYPE_NAME "C32" +#define RUN_HALO_CASE run_halo_c32 +#define RUN_HALO_SCENARIO run_halo_scenario_c32 +#define RUN_HALO_AXIS run_halo_axis_c32 +#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_c32 +#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_c32 +#define HALO_PENCIL_VALUE halo_pencil_value_c32 +#define UNSET_HALO_VALUE unset_halo_value_c32 +#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_c32 +#include "fortran_halo_case.inc" +#undef EXPECT_HALO_PENCIL_MATCH +#undef UNSET_HALO_VALUE +#undef HALO_PENCIL_VALUE +#undef INITIALIZE_HALO_REFERENCE +#undef INITIALIZE_HALO_PENCIL +#undef RUN_HALO_AXIS +#undef RUN_HALO_SCENARIO +#undef RUN_HALO_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +#define ARRTYPE complex(real64) +#define DTYPE CUDECOMP_DOUBLE_COMPLEX +#define DTYPE_NAME "C64" +#define RUN_HALO_CASE run_halo_c64 +#define RUN_HALO_SCENARIO run_halo_scenario_c64 +#define RUN_HALO_AXIS run_halo_axis_c64 +#define INITIALIZE_HALO_PENCIL initialize_halo_pencil_c64 +#define INITIALIZE_HALO_REFERENCE initialize_halo_reference_c64 +#define HALO_PENCIL_VALUE halo_pencil_value_c64 +#define UNSET_HALO_VALUE unset_halo_value_c64 +#define EXPECT_HALO_PENCIL_MATCH expect_halo_pencil_match_c64 +#include "fortran_halo_case.inc" +#undef EXPECT_HALO_PENCIL_MATCH +#undef UNSET_HALO_VALUE +#undef HALO_PENCIL_VALUE +#undef INITIALIZE_HALO_REFERENCE +#undef INITIALIZE_HALO_PENCIL +#undef RUN_HALO_AXIS +#undef RUN_HALO_SCENARIO +#undef RUN_HALO_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +end module cudecomp_fortran_halo_tests + +program cudecomp_fortran_halo_test + use cudecomp_fortran_halo_tests + + implicit none + + call run_all_tests() +end program cudecomp_fortran_halo_test diff --git a/tests/ctest/fortran_transpose_case.inc b/tests/ctest/fortran_transpose_case.inc new file mode 100644 index 0000000..1fae212 --- /dev/null +++ b/tests/ctest/fortran_transpose_case.inc @@ -0,0 +1,267 @@ +! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +! SPDX-License-Identifier: Apache-2.0 + +subroutine RUN_TRANSPOSE_CASE() + implicit none + + ! Exercise non-default layout, out-of-place transposes, and explicit + ! halo/padding optional arguments for every supported dtype. + call RUN_TRANSPOSE_SCENARIO("custom_layout_padded_halo", k_explicit_halo_extents, k_explicit_halo_extents, & + k_explicit_halo_extents, k_explicit_padding_x, k_explicit_padding_y, & + k_explicit_padding_z, k_mixed_mem_order, k_default_axis_contiguous, & + CUDECOMP_RANK_ORDER_COL_MAJOR, .true., .true.) + ! Omit the Fortran-only optional halo/padding arguments and use the default + ! layout path with in-place transposes. + call RUN_TRANSPOSE_SCENARIO("default_layout_in_place", k_zero_extents, k_zero_extents, k_zero_extents, & + k_zero_extents, k_zero_extents, k_zero_extents, k_default_mem_order, & + k_default_axis_contiguous, CUDECOMP_RANK_ORDER_DEFAULT, .false., .false.) +#ifdef RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE + ! Keep axis-contiguous coverage small because it is a layout sanity check, not + ! a dtype-specific numerical path. + call RUN_TRANSPOSE_SCENARIO("axis_contiguous_in_place", k_zero_extents, k_zero_extents, k_zero_extents, & + k_zero_extents, k_zero_extents, k_zero_extents, k_default_mem_order, & + k_all_axis_contiguous, CUDECOMP_RANK_ORDER_DEFAULT, .false., .false.) +#endif +end subroutine RUN_TRANSPOSE_CASE + +subroutine RUN_TRANSPOSE_SCENARIO(scenario, halo_extents_x, halo_extents_y, halo_extents_z, padding_x, padding_y, & + padding_z, mem_order, axis_contiguous, rank_order, out_of_place, & + use_optional_args) + implicit none + + character(len=*), intent(in) :: scenario + integer, intent(in) :: halo_extents_x(3) + integer, intent(in) :: halo_extents_y(3) + integer, intent(in) :: halo_extents_z(3) + integer, intent(in) :: padding_x(3) + integer, intent(in) :: padding_y(3) + integer, intent(in) :: padding_z(3) + integer, intent(in) :: mem_order(3, 3) + logical, intent(in) :: axis_contiguous(3) + integer, intent(in) :: rank_order + logical, intent(in) :: out_of_place + logical, intent(in) :: use_optional_args + type(cudecompGridDescConfig) :: config + type(cudecompGridDesc) :: grid_desc + type(cudecompPencilInfo) :: pinfo_x + type(cudecompPencilInfo) :: pinfo_y + type(cudecompPencilInfo) :: pinfo_z + integer(int64) :: data_num_elements + integer(int64) :: workspace_num_elements + integer :: res + logical :: grid_desc_created + logical :: work_allocated + ARRTYPE, allocatable :: xref(:) + ARRTYPE, allocatable :: yref(:) + ARRTYPE, allocatable :: zref(:) + ARRTYPE, allocatable :: actual(:) + ARRTYPE, allocatable, device, target :: data_a_d(:) + ARRTYPE, allocatable, device, target :: data_b_d(:) + ARRTYPE, pointer, device, contiguous :: input_d(:) + ARRTYPE, pointer, device, contiguous :: output_d(:) + ARRTYPE, pointer, device, contiguous :: work_d(:) + + grid_desc_created = .false. + work_allocated = .false. + nullify(input_d) + nullify(output_d) + nullify(work_d) + + call setup_transpose_config(config, axis_contiguous, mem_order, rank_order) + res = cudecompGridDescCreate(handle, grid_desc, config) + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompGridDescCreate") + if (res /= CUDECOMP_RESULT_SUCCESS) return + grid_desc_created = .true. + + if (use_optional_args) then + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1, halo_extents_x, padding_x), & + DTYPE_NAME//" "//trim(scenario)//" X pencil info") + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2, halo_extents_y, padding_y), & + DTYPE_NAME//" "//trim(scenario)//" Y pencil info") + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3, halo_extents_z, padding_z), & + DTYPE_NAME//" "//trim(scenario)//" Z pencil info") + else + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1), & + DTYPE_NAME//" "//trim(scenario)//" X pencil info") + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2), & + DTYPE_NAME//" "//trim(scenario)//" Y pencil info") + call expect_success(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3), & + DTYPE_NAME//" "//trim(scenario)//" Z pencil info") + endif + call expect_success(cudecompGetTransposeWorkspaceSize(handle, grid_desc, workspace_num_elements), & + DTYPE_NAME//" "//trim(scenario)//" transpose workspace size") + + data_num_elements = max(pinfo_x%size, max(pinfo_y%size, pinfo_z%size)) + allocate(data_a_d(data_num_elements)) + if (out_of_place) allocate(data_b_d(data_num_elements)) + + call INITIALIZE_TRANSPOSE_PENCIL(xref, pinfo_x) + call INITIALIZE_TRANSPOSE_PENCIL(yref, pinfo_y) + call INITIALIZE_TRANSPOSE_PENCIL(zref, pinfo_z) + + res = cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements) + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompMalloc workspace") + if (res == CUDECOMP_RESULT_SUCCESS) work_allocated = .true. + + if (work_allocated) then + data_a_d = UNSET_TRANSPOSE_VALUE() + if (out_of_place) data_b_d = UNSET_TRANSPOSE_VALUE() + data_a_d(1:pinfo_x%size) = xref + input_d => data_a_d + if (out_of_place) then + output_d => data_b_d + else + output_d => data_a_d + endif + work_d = 0 + if (use_optional_args) then + res = cudecompTransposeXToY(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_x%halo_extents, & + pinfo_y%halo_extents, pinfo_x%padding, pinfo_y%padding) + else + res = cudecompTransposeXToY(handle, grid_desc, input_d, output_d, work_d, DTYPE) + endif + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeXToY") + if (res == CUDECOMP_RESULT_SUCCESS) then + allocate(actual(pinfo_y%size)) + actual = output_d(1:pinfo_y%size) + call EXPECT_TRANSPOSE_PENCIL_MATCH(yref, actual, pinfo_y, DTYPE_NAME//" "//trim(scenario)//" XToY") + deallocate(actual) + endif + + if (out_of_place) then + input_d => data_b_d + output_d => data_a_d + else + input_d => data_a_d + output_d => data_a_d + endif + work_d = 0 + if (use_optional_args) then + res = cudecompTransposeYToZ(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_y%halo_extents, & + pinfo_z%halo_extents, pinfo_y%padding, pinfo_z%padding) + else + res = cudecompTransposeYToZ(handle, grid_desc, input_d, output_d, work_d, DTYPE) + endif + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeYToZ") + if (res == CUDECOMP_RESULT_SUCCESS) then + allocate(actual(pinfo_z%size)) + actual = output_d(1:pinfo_z%size) + call EXPECT_TRANSPOSE_PENCIL_MATCH(zref, actual, pinfo_z, DTYPE_NAME//" "//trim(scenario)//" YToZ") + deallocate(actual) + endif + + if (out_of_place) then + input_d => data_a_d + output_d => data_b_d + else + input_d => data_a_d + output_d => data_a_d + endif + work_d = 0 + if (use_optional_args) then + res = cudecompTransposeZToY(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_z%halo_extents, & + pinfo_y%halo_extents, pinfo_z%padding, pinfo_y%padding) + else + res = cudecompTransposeZToY(handle, grid_desc, input_d, output_d, work_d, DTYPE) + endif + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeZToY") + if (res == CUDECOMP_RESULT_SUCCESS) then + allocate(actual(pinfo_y%size)) + actual = output_d(1:pinfo_y%size) + call EXPECT_TRANSPOSE_PENCIL_MATCH(yref, actual, pinfo_y, DTYPE_NAME//" "//trim(scenario)//" ZToY") + deallocate(actual) + endif + + if (out_of_place) then + input_d => data_b_d + output_d => data_a_d + else + input_d => data_a_d + output_d => data_a_d + endif + work_d = 0 + if (use_optional_args) then + res = cudecompTransposeYToX(handle, grid_desc, input_d, output_d, work_d, DTYPE, pinfo_y%halo_extents, & + pinfo_x%halo_extents, pinfo_y%padding, pinfo_x%padding) + else + res = cudecompTransposeYToX(handle, grid_desc, input_d, output_d, work_d, DTYPE) + endif + call expect_success(res, DTYPE_NAME//" "//trim(scenario)//" cudecompTransposeYToX") + if (res == CUDECOMP_RESULT_SUCCESS) then + allocate(actual(pinfo_x%size)) + actual = output_d(1:pinfo_x%size) + call EXPECT_TRANSPOSE_PENCIL_MATCH(xref, actual, pinfo_x, DTYPE_NAME//" "//trim(scenario)//" YToX") + deallocate(actual) + endif + endif + + if (work_allocated) call expect_success(cudecompFree(handle, grid_desc, work_d), & + DTYPE_NAME//" "//trim(scenario)//" cudecompFree") + if (grid_desc_created) call expect_success(cudecompGridDescDestroy(handle, grid_desc), & + DTYPE_NAME//" "//trim(scenario)//" cudecompGridDescDestroy") +end subroutine RUN_TRANSPOSE_SCENARIO + +subroutine INITIALIZE_TRANSPOSE_PENCIL(values, pinfo) + implicit none + + ARRTYPE, allocatable, intent(out) :: values(:) + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64) :: i + integer(int64) :: local(3) + integer(int64) :: global(3) + + allocate(values(pinfo%size)) + values = UNSET_TRANSPOSE_VALUE() + + do i = 1_int64, pinfo%size + call local_coordinate(i, pinfo, local) + if (.not. is_internal_coordinate(pinfo, local)) cycle + + call global_coordinate(pinfo, local, global) + values(i) = TRANSPOSE_PENCIL_VALUE(global_linear_index(global)) + enddo +end subroutine INITIALIZE_TRANSPOSE_PENCIL + +function TRANSPOSE_PENCIL_VALUE(linear_index) result(value) + implicit none + + integer(int64), intent(in) :: linear_index + ARRTYPE :: value + + value = linear_index +end function TRANSPOSE_PENCIL_VALUE + +function UNSET_TRANSPOSE_VALUE() result(value) + implicit none + + ARRTYPE :: value + + value = -1 +end function UNSET_TRANSPOSE_VALUE + +subroutine EXPECT_TRANSPOSE_PENCIL_MATCH(expected, actual, pinfo, context) + implicit none + + ARRTYPE, intent(in) :: expected(:) + ARRTYPE, intent(in) :: actual(:) + type(cudecompPencilInfo), intent(in) :: pinfo + character(len=*), intent(in) :: context + integer(int64) :: i + integer(int64) :: local(3) + + if (size(expected, kind=int64) /= pinfo%size .or. size(actual, kind=int64) /= pinfo%size) then + call record_failure(trim(context)//" result size mismatch") + return + endif + + do i = 1_int64, pinfo%size + call local_coordinate(i, pinfo, local) + if (.not. is_internal_coordinate(pinfo, local)) cycle + if (expected(i) == actual(i)) cycle + + write(*, *) "rank", rank, ": FAIL", trim(context), "mismatch at local coordinate", local, & + "expected", expected(i), "actual", actual(i) + failures = failures + 1 + return + enddo +end subroutine EXPECT_TRANSPOSE_PENCIL_MATCH diff --git a/tests/ctest/fortran_transpose_tests.f90 b/tests/ctest/fortran_transpose_tests.f90 new file mode 100644 index 0000000..21fb4bb --- /dev/null +++ b/tests/ctest/fortran_transpose_tests.f90 @@ -0,0 +1,296 @@ +! SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +! SPDX-License-Identifier: Apache-2.0 + +module cudecomp_fortran_transpose_tests + use, intrinsic :: iso_fortran_env, only: int64, real32, real64 + use cudafor + use cudecomp + use mpi + + implicit none + + ! Standalone CTest fixture for the Fortran transpose API. The dtype-specific + ! cases are generated by including fortran_transpose_case.inc with different + ! preprocessor aliases below. + integer, parameter :: transpose_test_ranks = 4 + integer, parameter :: k_gdims(3) = [9, 10, 11] + integer, parameter :: k_gdims_dist(3) = [8, 8, 8] + integer, parameter :: k_pdims(2) = [2, 2] + integer, parameter :: k_zero_extents(3) = [0, 0, 0] + integer, parameter :: k_explicit_halo_extents(3) = [1, 1, 1] + integer, parameter :: k_explicit_padding_x(3) = [1, 0, 1] + integer, parameter :: k_explicit_padding_y(3) = [0, 1, 0] + integer, parameter :: k_explicit_padding_z(3) = [1, 0, 1] + logical, parameter :: k_default_axis_contiguous(3) = [.false., .false., .false.] + logical, parameter :: k_all_axis_contiguous(3) = [.true., .true., .true.] + integer, parameter :: k_default_mem_order(3, 3) = reshape([-1, -1, -1, -1, -1, -1, -1, -1, -1], [3, 3]) + integer, parameter :: k_mixed_mem_order(3, 3) = reshape([3, 2, 1, 1, 3, 2, 2, 1, 3], [3, 3]) + + integer :: rank = -1 + integer :: nranks = 0 + integer :: ierr = 0 + integer :: local_comm = MPI_COMM_NULL + integer :: local_rank = 0 + integer :: failures = 0 + integer :: global_failures = 0 + logical :: handle_initialized = .false. + type(cudecompHandle) :: handle + +contains + + subroutine run_all_tests() + implicit none + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr) + + if (nranks /= transpose_test_ranks) then + call record_failure("Fortran transpose test requires exactly 4 MPI ranks") + else + call initialize_gpu() + endif + + if (failures == 0) then + call expect_success(cudecompInit(handle, MPI_COMM_WORLD), "cudecompInit") + handle_initialized = failures == 0 + endif + + if (handle_initialized) then + call run_transpose_r32() + call run_transpose_r64() + call run_transpose_c32() + call run_transpose_c64() + call expect_success(cudecompFinalize(handle), "cudecompFinalize") + endif + + call MPI_Allreduce(failures, global_failures, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) + if (rank == 0 .and. global_failures == 0) then + write(*, '("Fortran transpose test passed")') + elseif (rank == 0) then + write(*, '("Fortran transpose test failed with ", i0, " rank-local failure(s)")') global_failures + endif + + if (local_comm /= MPI_COMM_NULL) call MPI_Comm_free(local_comm, ierr) + call MPI_Finalize(ierr) + if (global_failures /= 0) call exit(1) + end subroutine run_all_tests + + subroutine initialize_gpu() + implicit none + + integer :: status + integer :: num_devices + + call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr) + call MPI_Comm_rank(local_comm, local_rank, ierr) + + status = cudaGetDeviceCount(num_devices) + if (status /= cudaSuccess) then + call record_failure("cudaGetDeviceCount failed") + return + endif + if (num_devices <= 0) then + call record_failure("Fortran transpose test requires at least one visible CUDA device") + return + endif + + status = cudaSetDevice(mod(local_rank, num_devices)) + if (status /= cudaSuccess) call record_failure("cudaSetDevice failed") + end subroutine initialize_gpu + + subroutine setup_transpose_config(config, axis_contiguous, mem_order, rank_order) + implicit none + + type(cudecompGridDescConfig), intent(out) :: config + logical, intent(in) :: axis_contiguous(3) + integer, intent(in) :: mem_order(3, 3) + integer, intent(in) :: rank_order + + call expect_success(cudecompGridDescConfigSetDefaults(config), "cudecompGridDescConfigSetDefaults") + config%gdims = k_gdims + config%gdims_dist = k_gdims_dist + config%pdims = k_pdims + config%rank_order = rank_order + config%transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P + config%transpose_axis_contiguous = axis_contiguous + config%transpose_mem_order = mem_order + end subroutine setup_transpose_config + + function pencil_lower(pinfo, dim) result(lower) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer, intent(in) :: dim + integer(int64) :: lower + + lower = int(pinfo%lo(dim), int64) - int(pinfo%halo_extents(pinfo%order(dim)), int64) + end function pencil_lower + + subroutine local_coordinate(linear_index, pinfo, local) + implicit none + + integer(int64), intent(in) :: linear_index + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(out) :: local(3) + integer(int64) :: offset + integer(int64) :: shape_1 + integer(int64) :: shape_2 + + offset = linear_index - 1_int64 + shape_1 = int(pinfo%shape(1), int64) + shape_2 = int(pinfo%shape(2), int64) + local(1) = pencil_lower(pinfo, 1) + modulo(offset, shape_1) + local(2) = pencil_lower(pinfo, 2) + modulo(offset / shape_1, shape_2) + local(3) = pencil_lower(pinfo, 3) + offset / (shape_1 * shape_2) + end subroutine local_coordinate + + function is_internal_coordinate(pinfo, local) result(is_internal) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(in) :: local(3) + logical :: is_internal + + is_internal = local(1) >= int(pinfo%lo(1), int64) .and. local(1) <= int(pinfo%hi(1), int64) .and. & + local(2) >= int(pinfo%lo(2), int64) .and. local(2) <= int(pinfo%hi(2), int64) .and. & + local(3) >= int(pinfo%lo(3), int64) .and. local(3) <= int(pinfo%hi(3), int64) + end function is_internal_coordinate + + subroutine global_coordinate(pinfo, local, global) + implicit none + + type(cudecompPencilInfo), intent(in) :: pinfo + integer(int64), intent(in) :: local(3) + integer(int64), intent(out) :: global(3) + integer :: dim + + do dim = 1, 3 + global(pinfo%order(dim)) = local(dim) + enddo + end subroutine global_coordinate + + function global_linear_index(global) result(linear_index) + implicit none + + integer(int64), intent(in) :: global(3) + integer(int64) :: linear_index + + linear_index = global(1) + int(k_gdims(1), int64) * & + ((global(2) - 1_int64) + (global(3) - 1_int64) * int(k_gdims(2), int64)) + end function global_linear_index + + subroutine expect_success(result, context) + implicit none + + integer, intent(in) :: result + character(len=*), intent(in) :: context + + if (result /= CUDECOMP_RESULT_SUCCESS) then + write(*, *) "rank", rank, ": FAIL", trim(context), "returned", result + failures = failures + 1 + endif + end subroutine expect_success + + subroutine record_failure(context) + implicit none + + character(len=*), intent(in) :: context + + write(*, *) "rank", rank, ": FAIL", trim(context) + failures = failures + 1 + end subroutine record_failure + +#define ARRTYPE real(real32) +#define DTYPE CUDECOMP_FLOAT +#define DTYPE_NAME "R32" +#define RUN_TRANSPOSE_CASE run_transpose_r32 +#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_r32 +#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_r32 +#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_r32 +#define UNSET_TRANSPOSE_VALUE unset_transpose_value_r32 +#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_r32 +#define RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE +#include "fortran_transpose_case.inc" +#undef RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE +#undef EXPECT_TRANSPOSE_PENCIL_MATCH +#undef UNSET_TRANSPOSE_VALUE +#undef TRANSPOSE_PENCIL_VALUE +#undef INITIALIZE_TRANSPOSE_PENCIL +#undef RUN_TRANSPOSE_SCENARIO +#undef RUN_TRANSPOSE_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +#define ARRTYPE real(real64) +#define DTYPE CUDECOMP_DOUBLE +#define DTYPE_NAME "R64" +#define RUN_TRANSPOSE_CASE run_transpose_r64 +#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_r64 +#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_r64 +#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_r64 +#define UNSET_TRANSPOSE_VALUE unset_transpose_value_r64 +#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_r64 +#include "fortran_transpose_case.inc" +#undef EXPECT_TRANSPOSE_PENCIL_MATCH +#undef UNSET_TRANSPOSE_VALUE +#undef TRANSPOSE_PENCIL_VALUE +#undef INITIALIZE_TRANSPOSE_PENCIL +#undef RUN_TRANSPOSE_SCENARIO +#undef RUN_TRANSPOSE_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +#define ARRTYPE complex(real32) +#define DTYPE CUDECOMP_FLOAT_COMPLEX +#define DTYPE_NAME "C32" +#define RUN_TRANSPOSE_CASE run_transpose_c32 +#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_c32 +#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_c32 +#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_c32 +#define UNSET_TRANSPOSE_VALUE unset_transpose_value_c32 +#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_c32 +#define RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE +#include "fortran_transpose_case.inc" +#undef RUN_TRANSPOSE_AXIS_CONTIGUOUS_CASE +#undef EXPECT_TRANSPOSE_PENCIL_MATCH +#undef UNSET_TRANSPOSE_VALUE +#undef TRANSPOSE_PENCIL_VALUE +#undef INITIALIZE_TRANSPOSE_PENCIL +#undef RUN_TRANSPOSE_SCENARIO +#undef RUN_TRANSPOSE_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +#define ARRTYPE complex(real64) +#define DTYPE CUDECOMP_DOUBLE_COMPLEX +#define DTYPE_NAME "C64" +#define RUN_TRANSPOSE_CASE run_transpose_c64 +#define RUN_TRANSPOSE_SCENARIO run_transpose_scenario_c64 +#define INITIALIZE_TRANSPOSE_PENCIL initialize_transpose_pencil_c64 +#define TRANSPOSE_PENCIL_VALUE transpose_pencil_value_c64 +#define UNSET_TRANSPOSE_VALUE unset_transpose_value_c64 +#define EXPECT_TRANSPOSE_PENCIL_MATCH expect_transpose_pencil_match_c64 +#include "fortran_transpose_case.inc" +#undef EXPECT_TRANSPOSE_PENCIL_MATCH +#undef UNSET_TRANSPOSE_VALUE +#undef TRANSPOSE_PENCIL_VALUE +#undef INITIALIZE_TRANSPOSE_PENCIL +#undef RUN_TRANSPOSE_SCENARIO +#undef RUN_TRANSPOSE_CASE +#undef DTYPE_NAME +#undef DTYPE +#undef ARRTYPE + +end module cudecomp_fortran_transpose_tests + +program cudecomp_fortran_transpose_test + use cudecomp_fortran_transpose_tests + + implicit none + + call run_all_tests() +end program cudecomp_fortran_transpose_test diff --git a/tests/ctest/gpu_test_utils.cc b/tests/ctest/gpu_test_utils.cc new file mode 100644 index 0000000..fa314c3 --- /dev/null +++ b/tests/ctest/gpu_test_utils.cc @@ -0,0 +1,148 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include +#include + +#include "gpu_test_utils.h" + +namespace cudecomp_test { +namespace { + +bool envFlagEnabled(const char* name) { + const char* value = std::getenv(name); + if (!value || value[0] == '\0') return false; + + const std::string flag(value); + return flag != "0" && flag != "false" && flag != "False" && flag != "FALSE"; +} + +bool pathExists(const std::string& path) { + struct stat info; + return stat(path.c_str(), &info) == 0; +} + +std::string mpsPipeDirectory() { + const char* pipe_dir = std::getenv("CUDA_MPS_PIPE_DIRECTORY"); + if (pipe_dir && pipe_dir[0] != '\0') return pipe_dir; + return "/tmp/nvidia-mps"; +} + +bool ranksShareGpu(int local_ranks, int visible_devices) { + return visible_devices > 0 && local_ranks > visible_devices; +} + +TestSetupDecision queryCudaDeviceCount(int& count) { + cudaError_t status = cudaGetDeviceCount(&count); + if (status == cudaSuccess) return {}; + + count = 0; + return {false, true, std::string("unable to query CUDA device count: ") + cudaGetErrorString(status)}; +} + +GpuTestRuntime queryGpuTestRuntime() { + GpuTestRuntime runtime; + runtime.mps_active = pathExists(mpsPipeDirectory() + "/nvidia-cuda-mps-control.pid"); + if (ncclGetVersion(&runtime.nccl_version) != ncclSuccess) { runtime.nccl_version = 0; } + runtime.nccl_multi_rank_gpu_enabled = envFlagEnabled("NCCL_MULTI_RANK_GPU_ENABLE"); + return runtime; +} + +TestSetupDecision checkGpuSetupGlobal(const MpiTestComm& comm, const TestSetupDecision& local_decision) { + const int local_state = local_decision.fail ? 2 : local_decision.skip ? 1 : 0; + int global_state = 0; + MPI_Allreduce(&local_state, &global_state, 1, MPI_INT, MPI_MAX, comm.mpiComm()); + + if (global_state == 0) return {}; + if (local_decision.skip || local_decision.fail) return local_decision; + if (global_state == 2) return {false, true, "GPU setup failed on another rank"}; + return {true, false, "GPU setup skipped on another rank"}; +} + +TestSetupDecision checkGpuTestRequirementsLocal(const MpiTestComm& comm) { + int device_count = 0; + const TestSetupDecision device_count_decision = queryCudaDeviceCount(device_count); + if (device_count_decision.skip || device_count_decision.fail) return device_count_decision; + + if (device_count <= 0) { return {false, true, "GPU tests require at least one visible CUDA device"}; } + + const int local_ranks = comm.localSize(); + if (ranksShareGpu(local_ranks, device_count)) { + if (!mpsActive()) { + return {false, true, + std::to_string(local_ranks) + " local ranks require GPU sharing with only " + + std::to_string(device_count) + + " visible CUDA device(s); enable CUDA MPS or provide at least one visible GPU per local rank"}; + } + } + + const int device = comm.localRank() % device_count; + cudaError_t status = cudaSetDevice(device); + if (status != cudaSuccess) { + return {false, true, "unable to set CUDA device " + std::to_string(device) + ": " + cudaGetErrorString(status)}; + } + + return {}; +} + +TestSetupDecision initializeGpuForTestLocal(const MpiTestComm& comm, bool check_nccl) { + TestSetupDecision decision = checkGpuTestRequirementsLocal(comm); + if (decision.skip || decision.fail) return decision; + + if (check_nccl) { + int device_count = 0; + decision = queryCudaDeviceCount(device_count); + if (decision.skip || decision.fail) return decision; + + if (ranksShareGpu(comm.localSize(), device_count)) { + if (!ncclSupportsMultiRankPerGpu()) { + return {true, false, + "NCCL multi-rank-per-GPU testing with MPS requires NCCL 2.30 or newer; runtime reports NCCL version " + + std::to_string(ncclVersion())}; + } + + if (!ncclMultiRankGpuEnabled()) { + return {true, false, "NCCL multi-rank-per-GPU testing with MPS requires NCCL_MULTI_RANK_GPU_ENABLE=1"}; + } + } + } + + return {}; +} + +} // namespace + +void initializeGpuTestRuntime() { (void)gpuTestRuntime(); } + +const GpuTestRuntime& gpuTestRuntime() { + static const GpuTestRuntime runtime = queryGpuTestRuntime(); + return runtime; +} + +bool mpsActive() { return gpuTestRuntime().mps_active; } + +int ncclVersion() { return gpuTestRuntime().nccl_version; } + +bool ncclSupportsMultiRankPerGpu() { return ncclVersion() >= 23000; } + +bool ncclMultiRankGpuEnabled() { return gpuTestRuntime().nccl_multi_rank_gpu_enabled; } + +TestSetupDecision checkGpuTestRequirements(const MpiTestComm& comm) { + if (!comm.valid()) { return {true, false, "inactive MPI communicator"}; } + + return checkGpuSetupGlobal(comm, checkGpuTestRequirementsLocal(comm)); +} + +TestSetupDecision initializeGpuForTest(const MpiTestComm& comm, bool check_nccl) { + if (!comm.valid()) { return {true, false, "inactive MPI communicator"}; } + + return checkGpuSetupGlobal(comm, initializeGpuForTestLocal(comm, check_nccl)); +} + +} // namespace cudecomp_test diff --git a/tests/ctest/gpu_test_utils.h b/tests/ctest/gpu_test_utils.h new file mode 100644 index 0000000..cd3645e --- /dev/null +++ b/tests/ctest/gpu_test_utils.h @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUDECOMP_TEST_GPU_TEST_UTILS_H +#define CUDECOMP_TEST_GPU_TEST_UTILS_H + +#include + +#include "mpi_test_utils.h" + +namespace cudecomp_test { + +struct TestSetupDecision { + bool skip = false; + bool fail = false; + std::string reason; +}; + +struct GpuTestRuntime { + bool mps_active = false; + int nccl_version = 0; + bool nccl_multi_rank_gpu_enabled = false; +}; + +void initializeGpuTestRuntime(); +const GpuTestRuntime& gpuTestRuntime(); +bool mpsActive(); +int ncclVersion(); +bool ncclSupportsMultiRankPerGpu(); +bool ncclMultiRankGpuEnabled(); +TestSetupDecision checkGpuTestRequirements(const MpiTestComm& comm); +TestSetupDecision initializeGpuForTest(const MpiTestComm& comm, bool check_nccl = false); + +} // namespace cudecomp_test + +#endif diff --git a/tests/ctest/halo_tests.cc b/tests/ctest/halo_tests.cc new file mode 100644 index 0000000..e645709 --- /dev/null +++ b/tests/ctest/halo_tests.cc @@ -0,0 +1,395 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "cudecomp.h" + +#include "backend_test_context.h" +#include "backend_utils.h" +#include "gpu_test_utils.h" +#include "mpi_test_utils.h" +#include "test_utils.h" + +namespace { + +constexpr std::array kBaselineGdims{9, 10, 11}; +constexpr std::array kZeroExtents{0, 0, 0}; +constexpr std::array kBaselineHaloExtents{1, 3, 2}; +constexpr std::array kNonzeroPadding{1, 2, 1}; +constexpr std::array kDefaultAxisContiguous{false, false, false}; +constexpr std::array kAllAxisContiguous{true, true, true}; +constexpr std::array, 3> kDefaultMemOrder{{{{-1, -1, -1}}, {{-1, -1, -1}}, {{-1, -1, -1}}}}; +constexpr std::array kPeriodicHalos{true, true, true}; +constexpr std::array kNonPeriodicHalos{false, false, false}; + +struct HaloCase { + cudecomp_test::HaloBackend backend; + const char* scenario; + int axis; + std::array gdims; + std::array pdims; + cudecompDataType_t dtype; + std::array axis_contiguous; + std::array, 3> mem_order; + std::array halo_extents; + std::array halo_periods; + std::array padding; + cudecompRankOrder_t rank_order; +}; + +const char* axisName(int axis) { + switch (axis) { + case 0: return "X"; + case 1: return "Y"; + case 2: return "Z"; + } + return "UnknownAxis"; +} + +std::string sanitizeParamName(const std::string& value) { + std::string result; + for (char ch : value) { + if (std::isalnum(static_cast(ch))) { + result.push_back(ch); + } else { + result.push_back('_'); + } + } + return result; +} + +const char* dtypeName(cudecompDataType_t dtype) { + switch (dtype) { + case CUDECOMP_FLOAT: return "R32"; + case CUDECOMP_FLOAT_COMPLEX: return "C32"; + case CUDECOMP_DOUBLE: return "R64"; + case CUDECOMP_DOUBLE_COMPLEX: return "C64"; + } + return "UnknownDtype"; +} + +std::string paramName(const testing::TestParamInfo& info) { + const auto& test_case = info.param; + return sanitizeParamName(test_case.scenario) + "_Axis" + axisName(test_case.axis) + "_" + + sanitizeParamName(test_case.backend.name) + "_" + dtypeName(test_case.dtype) + "_P" + + std::to_string(test_case.pdims[0]) + "x" + std::to_string(test_case.pdims[1]); +} + +HaloCase makeCase(cudecomp_test::HaloBackend backend, const char* scenario, int axis, + std::array gdims = kBaselineGdims, std::array pdims = {2, 2}, + cudecompDataType_t dtype = CUDECOMP_FLOAT, + std::array axis_contiguous = kDefaultAxisContiguous, + std::array, 3> mem_order = kDefaultMemOrder, + std::array halo_extents = kBaselineHaloExtents, + std::array halo_periods = kPeriodicHalos, std::array padding = kZeroExtents, + cudecompRankOrder_t rank_order = CUDECOMP_RANK_ORDER_DEFAULT) { + return {backend, scenario, axis, gdims, pdims, dtype, + axis_contiguous, mem_order, halo_extents, halo_periods, padding, rank_order}; +} + +void appendBaselineCases(std::vector& cases, const cudecomp_test::HaloBackend& backend) { + // Baseline cases are the common halo sweep: all axis-aligned pencil layouts, default and all-axis-contiguous storage, + // periodic and non-periodic boundary behavior, nonuniform halo extents, and single-precision real/complex data. + struct LayoutCase { + const char* periodic_scenario; + const char* non_periodic_scenario; + std::array axis_contiguous; + }; + + for (const auto& layout : {LayoutCase{"BaselineDefaultLayoutPeriodicBoundaries", + "BaselineDefaultLayoutNonPeriodicBoundaries", kDefaultAxisContiguous}, + LayoutCase{"BaselineAxisContiguousPeriodicBoundaries", + "BaselineAxisContiguousNonPeriodicBoundaries", kAllAxisContiguous}}) { + for (int axis = 0; axis < 3; ++axis) { + for (const auto dtype : {CUDECOMP_FLOAT, CUDECOMP_FLOAT_COMPLEX}) { + cases.push_back(makeCase(backend, layout.periodic_scenario, axis, kBaselineGdims, {2, 2}, dtype, + layout.axis_contiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos)); + cases.push_back(makeCase(backend, layout.non_periodic_scenario, axis, kBaselineGdims, {2, 2}, dtype, + layout.axis_contiguous, kDefaultMemOrder, kBaselineHaloExtents, kNonPeriodicHalos)); + } + } + } +} + +void appendCoverageCases(std::vector& cases, const cudecomp_test::HaloBackend& backend) { + // Coverage cases target halo paths not guaranteed by the baseline sweep. They stay in the MPI collection because + // these behaviors are shared halo logic rather than backend-specific communication coverage. + cases.push_back(makeCase(backend, "NonzeroPadding", 0, kBaselineGdims, {2, 2}, CUDECOMP_FLOAT, kDefaultAxisContiguous, + kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos, kNonzeroPadding)); + + cases.push_back(makeCase(backend, "ColumnMajorRankOrder", 0, kBaselineGdims, {2, 2}, CUDECOMP_FLOAT, + kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos, kZeroExtents, + CUDECOMP_RANK_ORDER_COL_MAJOR)); + + cases.push_back(makeCase(backend, "InteriorNonPeriodicNeighbors", 0, kBaselineGdims, {3, 1}, CUDECOMP_FLOAT, + kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kNonPeriodicHalos)); + + cases.push_back(makeCase(backend, "DtypeWorkspacePadding", 0, kBaselineGdims, {2, 2}, CUDECOMP_DOUBLE, + kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos, + kNonzeroPadding)); + cases.push_back(makeCase(backend, "DtypeWorkspacePadding", 0, kBaselineGdims, {2, 2}, CUDECOMP_DOUBLE_COMPLEX, + kDefaultAxisContiguous, kDefaultMemOrder, kBaselineHaloExtents, kPeriodicHalos, + kNonzeroPadding)); +} + +std::vector haloCasesForLabel(const char* label) { + std::vector cases; + for (const auto& backend : cudecomp_test::haloBackends()) { + if (std::string(backend.label) != label) continue; + appendBaselineCases(cases, backend); + if (std::string(label) == "mpi") { appendCoverageCases(cases, backend); } + } + return cases; +} + +bool isInternal(const cudecompPencilInfo_t& pinfo, const std::array& local) { + return local[0] >= pinfo.halo_extents[pinfo.order[0]] && + local[0] < pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]] && + local[1] >= pinfo.halo_extents[pinfo.order[1]] && + local[1] < pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]] && + local[2] >= pinfo.halo_extents[pinfo.order[2]] && + local[2] < pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]]; +} + +bool isPadding(const cudecompPencilInfo_t& pinfo, const std::array& local) { + return local[0] >= pinfo.shape[0] - pinfo.padding[pinfo.order[0]] || + local[1] >= pinfo.shape[1] - pinfo.padding[pinfo.order[1]] || + local[2] >= pinfo.shape[2] - pinfo.padding[pinfo.order[2]]; +} + +std::array localCoordinate(int64_t index, const cudecompPencilInfo_t& pinfo) { + std::array local{}; + local[0] = index % pinfo.shape[0]; + local[1] = index / pinfo.shape[0] % pinfo.shape[1]; + local[2] = index / (pinfo.shape[0] * pinfo.shape[1]); + return local; +} + +std::array globalCoordinate(const cudecompPencilInfo_t& pinfo, const std::array& local) { + std::array global{}; + global[pinfo.order[0]] = local[0] + pinfo.lo[0] - pinfo.halo_extents[pinfo.order[0]]; + global[pinfo.order[1]] = local[1] + pinfo.lo[1] - pinfo.halo_extents[pinfo.order[1]]; + global[pinfo.order[2]] = local[2] + pinfo.lo[2] - pinfo.halo_extents[pinfo.order[2]]; + return global; +} + +int64_t wrapIndex(int64_t index, int64_t size) { + const int64_t wrapped = index % size; + return (wrapped < 0) ? wrapped + size : wrapped; +} + +template T unsetValue() { return static_cast(-1); } + +template <> std::complex unsetValue>() { return {-1.0f, 0.0f}; } + +template <> std::complex unsetValue>() { return {-1.0, 0.0}; } + +template T pencilValue(int64_t global_index) { return static_cast(global_index); } + +template <> std::complex pencilValue>(int64_t global_index) { + return {static_cast(global_index), -static_cast(global_index)}; +} + +template <> std::complex pencilValue>(int64_t global_index) { + return {static_cast(global_index), -static_cast(global_index)}; +} + +int64_t globalLinearIndex(const std::array& global, const std::array& gdims) { + return global[0] + gdims[0] * (global[1] + global[2] * gdims[1]); +} + +template +std::vector initializePencil(const cudecompPencilInfo_t& pinfo, const std::array& gdims) { + std::vector data(pinfo.size, unsetValue()); + + for (int64_t i = 0; i < pinfo.size; ++i) { + const auto local = localCoordinate(i, pinfo); + if (!isInternal(pinfo, local)) continue; + + const auto global = globalCoordinate(pinfo, local); + data[i] = pencilValue(globalLinearIndex(global, gdims)); + } + + return data; +} + +template +std::vector initializeReference(const cudecompPencilInfo_t& pinfo, const std::array& gdims, + const std::array& halo_periods) { + std::vector data(pinfo.size, unsetValue()); + + for (int64_t i = 0; i < pinfo.size; ++i) { + const auto local = localCoordinate(i, pinfo); + auto global = globalCoordinate(pinfo, local); + bool unset = isPadding(pinfo, local); + + for (int dim = 0; dim < 3; ++dim) { + if (global[dim] >= 0 && global[dim] < gdims[dim]) continue; + + if (halo_periods[dim]) { + global[dim] = wrapIndex(global[dim], gdims[dim]); + } else { + unset = true; + } + } + + if (!unset) { data[i] = pencilValue(globalLinearIndex(global, gdims)); } + } + + return data; +} + +template +testing::AssertionResult pencilMatches(const std::vector& expected, const std::vector& actual, + const cudecompPencilInfo_t& pinfo) { + if (expected.size() != actual.size()) { + return testing::AssertionFailure() << "size mismatch: expected " << expected.size() << ", got " << actual.size(); + } + + for (int64_t i = 0; i < pinfo.size; ++i) { + if (expected[i] == actual[i]) continue; + + const auto local = localCoordinate(i, pinfo); + return testing::AssertionFailure() << "mismatch at local index " << i << " coordinate (" << local[0] << ", " + << local[1] << ", " << local[2] << "): expected " << expected[i] << ", got " + << actual[i]; + } + + return testing::AssertionSuccess(); +} + +template +cudecompResult_t runHalo(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, int axis, T* input, void* work, + cudecompDataType_t dtype, const cudecompPencilInfo_t& pinfo, + const std::array& halo_periods, int dim) { + switch (axis) { + case 0: + return cudecompUpdateHalosX(handle, grid_desc, input, work, dtype, pinfo.halo_extents, halo_periods.data(), dim, + pinfo.padding, 0); + case 1: + return cudecompUpdateHalosY(handle, grid_desc, input, work, dtype, pinfo.halo_extents, halo_periods.data(), dim, + pinfo.padding, 0); + case 2: + return cudecompUpdateHalosZ(handle, grid_desc, input, work, dtype, pinfo.halo_extents, halo_periods.data(), dim, + pinfo.padding, 0); + } + return CUDECOMP_RESULT_INVALID_USAGE; +} + +} // namespace + +class HaloCorrectnessTest : public ::testing::TestWithParam {}; + +template void runHaloCase(const HaloCase& test_case) { + const int active_ranks = test_case.pdims[0] * test_case.pdims[1]; + const auto world_comm = cudecomp_test::MpiTestComm::world(); + + if (world_comm.size() < active_ranks) { + GTEST_SKIP() << "axis " << axisName(test_case.axis) << " halo case with pdims " << test_case.pdims[0] << "x" + << test_case.pdims[1] << " requires " << active_ranks << " ranks, launched with " << world_comm.size(); + } + + cudecompGridDescConfig_t config; + ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config)); + config.gdims[0] = test_case.gdims[0]; + config.gdims[1] = test_case.gdims[1]; + config.gdims[2] = test_case.gdims[2]; + config.pdims[0] = test_case.pdims[0]; + config.pdims[1] = test_case.pdims[1]; + config.rank_order = test_case.rank_order; + config.halo_comm_backend = test_case.backend.backend; + config.transpose_axis_contiguous[0] = test_case.axis_contiguous[0]; + config.transpose_axis_contiguous[1] = test_case.axis_contiguous[1]; + config.transpose_axis_contiguous[2] = test_case.axis_contiguous[2]; + for (int axis = 0; axis < 3; ++axis) { + for (int i = 0; i < 3; ++i) { + config.transpose_mem_order[axis][i] = test_case.mem_order[axis][i]; + } + } + + cudecomp_test::BackendTestContext test_context; + cudecomp_test::TestSetupDecision setup_decision; + ASSERT_TRUE(test_context.initialize(world_comm, active_ranks, test_case.backend.label, + std::string(test_case.backend.label) == "nccl", config, &setup_decision)); + ASSERT_FALSE(setup_decision.fail) << setup_decision.reason; + if (setup_decision.skip) { GTEST_SKIP() << setup_decision.reason; } + + const auto& active_comm = test_context.comm(); + cudecompHandle_t handle = test_context.handle(); + ASSERT_NE(handle, nullptr); + + cudecompGridDesc_t grid_desc = nullptr; + const cudecompResult_t grid_desc_create_result = cudecompGridDescCreate(handle, &grid_desc, &config, nullptr); + cudecomp_test::gridDescGuard grid_desc_guard(handle, grid_desc); + CHECK_CUDECOMP_GLOBAL(active_comm, grid_desc_create_result); + + cudecompPencilInfo_t pinfo; + CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetPencilInfo(handle, grid_desc, &pinfo, test_case.axis, + test_case.halo_extents.data(), test_case.padding.data())); + + int64_t workspace_num_elements = 0; + CHECK_CUDECOMP_GLOBAL(active_comm, + cudecompGetHaloWorkspaceSize(handle, grid_desc, test_case.axis, test_case.halo_extents.data(), + &workspace_num_elements)); + + int64_t dtype_size = 0; + CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetDataTypeSize(test_case.dtype, &dtype_size)); + + const auto initial = initializePencil(pinfo, test_case.gdims); + const auto expected = initializeReference(pinfo, test_case.gdims, test_case.halo_periods); + + T* data_d = nullptr; + const cudaError_t data_alloc_result = cudaMalloc(&data_d, pinfo.size * sizeof(*data_d)); + cudecomp_test::cudaBufferGuard data_buffer(data_d); + CHECK_CUDA_GLOBAL(active_comm, data_alloc_result); + + void* work_d = nullptr; + const cudecompResult_t work_alloc_result = + cudecompMalloc(handle, grid_desc, &work_d, workspace_num_elements * dtype_size); + cudecomp_test::cudecompBufferGuard work_buffer(handle, grid_desc, work_d); + CHECK_CUDECOMP_GLOBAL(active_comm, work_alloc_result); + + CHECK_CUDA_GLOBAL(active_comm, cudaMemset(data_d, 0, pinfo.size * sizeof(*data_d))); + CHECK_CUDA_GLOBAL(active_comm, + cudaMemcpy(data_d, initial.data(), initial.size() * sizeof(*data_d), cudaMemcpyHostToDevice)); + CHECK_CUDA_GLOBAL(active_comm, cudaMemset(work_d, 0, workspace_num_elements * dtype_size)); + + for (int dim = 0; dim < 3; ++dim) { + CHECK_CUDECOMP_GLOBAL(active_comm, runHalo(handle, grid_desc, test_case.axis, data_d, work_d, test_case.dtype, + pinfo, test_case.halo_periods, dim)); + } + + std::vector actual(expected.size(), unsetValue()); + CHECK_CUDA_GLOBAL(active_comm, + cudaMemcpy(actual.data(), data_d, actual.size() * sizeof(*data_d), cudaMemcpyDeviceToHost)); + EXPECT_TRUE(pencilMatches(expected, actual, pinfo)); +} + +TEST_P(HaloCorrectnessTest, UpdateHalos) { + const auto test_case = GetParam(); + switch (test_case.dtype) { + case CUDECOMP_FLOAT: runHaloCase(test_case); break; + case CUDECOMP_FLOAT_COMPLEX: runHaloCase>(test_case); break; + case CUDECOMP_DOUBLE: runHaloCase(test_case); break; + case CUDECOMP_DOUBLE_COMPLEX: runHaloCase>(test_case); break; + default: FAIL() << "unsupported test dtype " << test_case.dtype; + } +} + +INSTANTIATE_TEST_SUITE_P(MpiBackends, HaloCorrectnessTest, ::testing::ValuesIn(haloCasesForLabel("mpi")), paramName); +INSTANTIATE_TEST_SUITE_P(NcclBackends, HaloCorrectnessTest, ::testing::ValuesIn(haloCasesForLabel("nccl")), paramName); +INSTANTIATE_TEST_SUITE_P(NvshmemBackends, HaloCorrectnessTest, ::testing::ValuesIn(haloCasesForLabel("nvshmem")), + paramName); diff --git a/tests/ctest/mpi_test_main.cc b/tests/ctest/mpi_test_main.cc new file mode 100644 index 0000000..82f2822 --- /dev/null +++ b/tests/ctest/mpi_test_main.cc @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include + +#include "backend_test_context.h" +#include "gpu_test_utils.h" +#include "mpi_test_utils.h" + +namespace { + +class RankFailurePrinter : public ::testing::EmptyTestEventListener { +public: + explicit RankFailurePrinter(int rank) : rank_(rank) {} + + void OnTestPartResult(const ::testing::TestPartResult& result) override { + if (!result.failed()) return; + + std::cerr << "[rank " << rank_ << "] " << result.file_name() << ":" << result.line_number() << ": " + << result.summary() << "\n"; + } + +private: + int rank_; +}; + +bool gpuRequirementsSatisfied(const cudecomp_test::MpiTestComm& world_comm) { + const auto setup_decision = cudecomp_test::checkGpuTestRequirements(world_comm); + if (!setup_decision.fail) return true; + + std::cerr << "[rank " << world_comm.rank() << "] GPU test requirements are not satisfied: " << setup_decision.reason + << "\n"; + return false; +} + +} // namespace + +int main(int argc, char** argv) { + MPI_Init(&argc, &argv); + + int global_failure = 0; + + { + const auto world_comm = cudecomp_test::MpiTestComm::world(); + cudecomp_test::initializeGpuTestRuntime(); + + ::testing::InitGoogleTest(&argc, argv); + ::testing::UnitTest::GetInstance()->listeners().Append(new RankFailurePrinter(world_comm.rank())); + + if (world_comm.rank() == 0) { std::cout << "[cuDecomp test] MPI ranks: " << world_comm.size() << "\n"; } + + const int local_result = gpuRequirementsSatisfied(world_comm) ? RUN_ALL_TESTS() : 1; + cudecomp_test::resetSharedBackendTestContext(); + const int local_failure = local_result == 0 ? 0 : 1; + MPI_Allreduce(&local_failure, &global_failure, 1, MPI_INT, MPI_MAX, world_comm.mpiComm()); + } + + MPI_Finalize(); + return global_failure == 0 ? 0 : 1; +} diff --git a/tests/ctest/mpi_test_utils.cc b/tests/ctest/mpi_test_utils.cc new file mode 100644 index 0000000..af0fdcc --- /dev/null +++ b/tests/ctest/mpi_test_utils.cc @@ -0,0 +1,101 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "mpi_test_utils.h" + +namespace cudecomp_test { + +MpiTestComm::MpiTestComm(MPI_Comm comm) { + if (comm == MPI_COMM_NULL) return; + + MPI_Comm_dup(comm, &comm_); + MPI_Comm_rank(comm_, &rank_); + MPI_Comm_size(comm_, &size_); + MPI_Comm_split_type(comm_, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm_); + MPI_Comm_rank(local_comm_, &local_rank_); + MPI_Comm_size(local_comm_, &local_size_); +} + +MpiTestComm::MpiTestComm(MpiTestComm&& other) noexcept + : comm_(other.comm_), local_comm_(other.local_comm_), rank_(other.rank_), size_(other.size_), + local_rank_(other.local_rank_), local_size_(other.local_size_) { + other.comm_ = MPI_COMM_NULL; + other.local_comm_ = MPI_COMM_NULL; + other.rank_ = -1; + other.size_ = 0; + other.local_rank_ = -1; + other.local_size_ = 0; +} + +MpiTestComm& MpiTestComm::operator=(MpiTestComm&& other) noexcept { + if (this != &other) { + reset(); + comm_ = other.comm_; + local_comm_ = other.local_comm_; + rank_ = other.rank_; + size_ = other.size_; + local_rank_ = other.local_rank_; + local_size_ = other.local_size_; + + other.comm_ = MPI_COMM_NULL; + other.local_comm_ = MPI_COMM_NULL; + other.rank_ = -1; + other.size_ = 0; + other.local_rank_ = -1; + other.local_size_ = 0; + } + return *this; +} + +MpiTestComm::~MpiTestComm() { reset(); } + +MpiTestComm MpiTestComm::world() { return fromComm(MPI_COMM_WORLD); } + +MpiTestComm MpiTestComm::split(const MpiTestComm& parent_comm, int requested_ranks) { + const bool valid_request = requested_ranks > 0 && requested_ranks <= parent_comm.size(); + const bool active = valid_request && parent_comm.rank() < requested_ranks; + + MPI_Comm comm = MPI_COMM_NULL; + MPI_Comm_split(parent_comm.mpiComm(), active ? 0 : MPI_UNDEFINED, parent_comm.rank(), &comm); + + MpiTestComm result = fromComm(comm); + if (comm != MPI_COMM_NULL) { MPI_Comm_free(&comm); } + return result; +} + +MpiTestComm MpiTestComm::fromComm(MPI_Comm comm) { return MpiTestComm(comm); } + +MPI_Comm MpiTestComm::mpiComm() const { return comm_; } + +MPI_Comm MpiTestComm::localComm() const { return local_comm_; } + +bool MpiTestComm::valid() const { return comm_ != MPI_COMM_NULL; } + +int MpiTestComm::rank() const { return rank_; } + +int MpiTestComm::size() const { return size_; } + +int MpiTestComm::localRank() const { return local_rank_; } + +int MpiTestComm::localSize() const { return local_size_; } + +void MpiTestComm::reset() { + int finalized = 0; + MPI_Finalized(&finalized); + + if (!finalized) { + if (local_comm_ != MPI_COMM_NULL) { MPI_Comm_free(&local_comm_); } + if (comm_ != MPI_COMM_NULL) { MPI_Comm_free(&comm_); } + } + + local_comm_ = MPI_COMM_NULL; + comm_ = MPI_COMM_NULL; + rank_ = -1; + size_ = 0; + local_rank_ = -1; + local_size_ = 0; +} + +} // namespace cudecomp_test diff --git a/tests/ctest/mpi_test_utils.h b/tests/ctest/mpi_test_utils.h new file mode 100644 index 0000000..209a24b --- /dev/null +++ b/tests/ctest/mpi_test_utils.h @@ -0,0 +1,48 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUDECOMP_TEST_MPI_TEST_UTILS_H +#define CUDECOMP_TEST_MPI_TEST_UTILS_H + +#include + +namespace cudecomp_test { + +class MpiTestComm { +public: + MpiTestComm() = default; + MpiTestComm(const MpiTestComm&) = delete; + MpiTestComm& operator=(const MpiTestComm&) = delete; + MpiTestComm(MpiTestComm&& other) noexcept; + MpiTestComm& operator=(MpiTestComm&& other) noexcept; + ~MpiTestComm(); + + static MpiTestComm world(); + static MpiTestComm split(const MpiTestComm& parent_comm, int requested_ranks); + static MpiTestComm fromComm(MPI_Comm comm); + + MPI_Comm mpiComm() const; + MPI_Comm localComm() const; + bool valid() const; + int rank() const; + int size() const; + int localRank() const; + int localSize() const; + void reset(); + +private: + explicit MpiTestComm(MPI_Comm comm); + + MPI_Comm comm_ = MPI_COMM_NULL; + MPI_Comm local_comm_ = MPI_COMM_NULL; + int rank_ = -1; + int size_ = 0; + int local_rank_ = -1; + int local_size_ = 0; +}; + +} // namespace cudecomp_test + +#endif diff --git a/tests/ctest/test_utils.cc b/tests/ctest/test_utils.cc new file mode 100644 index 0000000..f57947f --- /dev/null +++ b/tests/ctest/test_utils.cc @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "test_utils.h" + +#include + +namespace cudecomp_test { +namespace { + +std::string sourceLocation(const char* file, int line) { return std::string(file) + ":" + std::to_string(line); } + +testing::AssertionResult checkLocalFailureGlobal(const MpiTestComm& comm, bool local_success, const char* error_kind, + const std::string& local_error, const char* file, int line) { + const int local_failure = local_success ? 0 : 1; + int global_failure = 0; + const int reduce_result = MPI_Allreduce(&local_failure, &global_failure, 1, MPI_INT, MPI_MAX, comm.mpiComm()); + if (reduce_result != MPI_SUCCESS) { + return testing::AssertionFailure() << "MPI_Allreduce failed in global check at " << sourceLocation(file, line); + } + + if (global_failure == 0) return testing::AssertionSuccess(); + if (!local_success) { + return testing::AssertionFailure() << error_kind << " failed at " << sourceLocation(file, line) << " with " + << local_error; + } + return testing::AssertionFailure() << error_kind << " failed at " << sourceLocation(file, line) << " on another rank"; +} + +} // namespace + +cudecompHandleGuard::cudecompHandleGuard(cudecompHandle_t handle) : handle_(handle) {} + +cudecompHandleGuard::~cudecompHandleGuard() { + if (handle_) { (void)cudecompFinalize(handle_); } +} + +gridDescGuard::gridDescGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc) + : handle_(handle), grid_desc_(grid_desc) {} + +gridDescGuard::~gridDescGuard() { + if (grid_desc_) { (void)cudecompGridDescDestroy(handle_, grid_desc_); } +} + +cudaBufferGuard::cudaBufferGuard(void* ptr) : ptr_(ptr) {} + +cudaBufferGuard::~cudaBufferGuard() { reset(); } + +void cudaBufferGuard::reset(void* ptr) { + if (ptr_) { (void)cudaFree(ptr_); } + ptr_ = ptr; +} + +cudecompBufferGuard::cudecompBufferGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* ptr) + : handle_(handle), grid_desc_(grid_desc), ptr_(ptr) {} + +cudecompBufferGuard::~cudecompBufferGuard() { + if (ptr_) { (void)cudecompFree(handle_, grid_desc_, ptr_); } +} + +void cudecompBufferGuard::release() noexcept { ptr_ = nullptr; } + +testing::AssertionResult checkCudaGlobal(const MpiTestComm& comm, cudaError_t result, const char* file, int line) { + return checkLocalFailureGlobal(comm, result == cudaSuccess, "CUDA", cudaGetErrorString(result), file, line); +} + +testing::AssertionResult checkCudecompGlobal(const MpiTestComm& comm, cudecompResult_t result, const char* file, + int line) { + return checkLocalFailureGlobal(comm, result == CUDECOMP_RESULT_SUCCESS, "cuDecomp", std::to_string(result), file, + line); +} + +testing::AssertionResult checkMpiGlobal(const MpiTestComm& comm, int result, const char* file, int line) { + char error_string[MPI_MAX_ERROR_STRING] = {}; + int error_string_len = 0; + if (result != MPI_SUCCESS) { MPI_Error_string(result, error_string, &error_string_len); } + return checkLocalFailureGlobal(comm, result == MPI_SUCCESS, "MPI", error_string, file, line); +} + +} // namespace cudecomp_test diff --git a/tests/ctest/test_utils.h b/tests/ctest/test_utils.h new file mode 100644 index 0000000..e910f50 --- /dev/null +++ b/tests/ctest/test_utils.h @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CUDECOMP_TEST_UTILS_H +#define CUDECOMP_TEST_UTILS_H + +#include + +#include +#include + +#include "cudecomp.h" +#include "mpi_test_utils.h" + +namespace cudecomp_test { + +class cudecompHandleGuard { +public: + explicit cudecompHandleGuard(cudecompHandle_t handle = nullptr); + cudecompHandleGuard(const cudecompHandleGuard&) = delete; + cudecompHandleGuard& operator=(const cudecompHandleGuard&) = delete; + ~cudecompHandleGuard(); + +private: + cudecompHandle_t handle_ = nullptr; +}; + +class gridDescGuard { +public: + gridDescGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc); + gridDescGuard(const gridDescGuard&) = delete; + gridDescGuard& operator=(const gridDescGuard&) = delete; + ~gridDescGuard(); + +private: + cudecompHandle_t handle_ = nullptr; + cudecompGridDesc_t grid_desc_ = nullptr; +}; + +class cudaBufferGuard { +public: + explicit cudaBufferGuard(void* ptr = nullptr); + cudaBufferGuard(const cudaBufferGuard&) = delete; + cudaBufferGuard& operator=(const cudaBufferGuard&) = delete; + ~cudaBufferGuard(); + + void reset(void* ptr = nullptr); + +private: + void* ptr_ = nullptr; +}; + +class cudecompBufferGuard { +public: + cudecompBufferGuard(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* ptr); + cudecompBufferGuard(const cudecompBufferGuard&) = delete; + cudecompBufferGuard& operator=(const cudecompBufferGuard&) = delete; + ~cudecompBufferGuard(); + + void release() noexcept; + +private: + cudecompHandle_t handle_ = nullptr; + cudecompGridDesc_t grid_desc_ = nullptr; + void* ptr_ = nullptr; +}; + +testing::AssertionResult checkCudaGlobal(const MpiTestComm& comm, cudaError_t result, const char* file, int line); +testing::AssertionResult checkCudecompGlobal(const MpiTestComm& comm, cudecompResult_t result, const char* file, + int line); +testing::AssertionResult checkMpiGlobal(const MpiTestComm& comm, int result, const char* file, int line); + +} // namespace cudecomp_test + +#define CHECK_CUDA_GLOBAL(comm, call) ASSERT_TRUE(::cudecomp_test::checkCudaGlobal((comm), (call), __FILE__, __LINE__)) +#define CHECK_CUDECOMP_GLOBAL(comm, call) \ + ASSERT_TRUE(::cudecomp_test::checkCudecompGlobal((comm), (call), __FILE__, __LINE__)) +#define CHECK_MPI_GLOBAL(comm, call) ASSERT_TRUE(::cudecomp_test::checkMpiGlobal((comm), (call), __FILE__, __LINE__)) + +#endif diff --git a/tests/ctest/transpose_tests.cc b/tests/ctest/transpose_tests.cc new file mode 100644 index 0000000..e97382e --- /dev/null +++ b/tests/ctest/transpose_tests.cc @@ -0,0 +1,671 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "cudecomp.h" +#include "internal/common.h" + +#include "backend_test_context.h" +#include "backend_utils.h" +#include "gpu_test_utils.h" +#include "mpi_test_utils.h" +#include "test_utils.h" + +namespace { + +enum class TransposeOperation { XToY, YToX, YToZ, ZToY }; + +constexpr std::array kTransposeOperations{TransposeOperation::XToY, TransposeOperation::YToX, + TransposeOperation::YToZ, TransposeOperation::ZToY}; +constexpr std::array kBaselineGdims{9, 10, 11}; +constexpr std::array kZeroExtents{0, 0, 0}; +constexpr std::array kInputHaloExtents{1, 2, 1}; +constexpr std::array kOutputHaloExtents{2, 1, 1}; +constexpr std::array kInputPadding{1, 1, 2}; +constexpr std::array kOutputPadding{2, 1, 1}; +constexpr std::array kDefaultAxisContiguous{false, false, false}; +constexpr std::array kAllAxisContiguous{true, true, true}; +constexpr std::array, 3> kDefaultMemOrder{{{{-1, -1, -1}}, {{-1, -1, -1}}, {{-1, -1, -1}}}}; + +struct TransposeCase { + cudecomp_test::TransposeBackend backend; + const char* scenario; + TransposeOperation operation; + std::array gdims; + std::array pdims; + cudecompDataType_t dtype; + bool out_of_place; + std::array axis_contiguous; + std::array, 3> mem_order; + std::array input_halo_extents; + std::array output_halo_extents; + std::array input_padding; + std::array output_padding; + cudecompRankOrder_t rank_order; + std::vector synthetic_host_groups; +}; + +const char* operationName(TransposeOperation operation) { + switch (operation) { + case TransposeOperation::XToY: return "XToY"; + case TransposeOperation::YToX: return "YToX"; + case TransposeOperation::YToZ: return "YToZ"; + case TransposeOperation::ZToY: return "ZToY"; + } + return "Unknown"; +} + +int inputAxis(TransposeOperation operation) { + switch (operation) { + case TransposeOperation::XToY: return 0; + case TransposeOperation::YToX: return 1; + case TransposeOperation::YToZ: return 1; + case TransposeOperation::ZToY: return 2; + } + return 0; +} + +int outputAxis(TransposeOperation operation) { + switch (operation) { + case TransposeOperation::XToY: return 1; + case TransposeOperation::YToX: return 0; + case TransposeOperation::YToZ: return 2; + case TransposeOperation::ZToY: return 1; + } + return 0; +} + +std::string sanitizeParamName(const std::string& value) { + std::string result; + for (char ch : value) { + if (std::isalnum(static_cast(ch))) { + result.push_back(ch); + } else { + result.push_back('_'); + } + } + return result; +} + +const char* dtypeName(cudecompDataType_t dtype) { + switch (dtype) { + case CUDECOMP_FLOAT: return "R32"; + case CUDECOMP_FLOAT_COMPLEX: return "C32"; + case CUDECOMP_DOUBLE: return "R64"; + case CUDECOMP_DOUBLE_COMPLEX: return "C64"; + } + return "UnknownDtype"; +} + +std::string paramName(const testing::TestParamInfo& info) { + const auto& test_case = info.param; + return sanitizeParamName(test_case.scenario) + "_" + operationName(test_case.operation) + "_" + + sanitizeParamName(test_case.backend.name) + "_" + dtypeName(test_case.dtype) + "_P" + + std::to_string(test_case.pdims[0]) + "x" + std::to_string(test_case.pdims[1]) + "_" + + (test_case.out_of_place ? "OutOfPlace" : "InPlace"); +} + +TransposeCase makeCase(cudecomp_test::TransposeBackend backend, const char* scenario, TransposeOperation operation, + std::array gdims = kBaselineGdims, std::array pdims = {2, 2}, + cudecompDataType_t dtype = CUDECOMP_FLOAT, bool out_of_place = false, + std::array axis_contiguous = kDefaultAxisContiguous, + std::array, 3> mem_order = kDefaultMemOrder, + std::array input_halo_extents = kZeroExtents, + std::array output_halo_extents = kZeroExtents, + std::array input_padding = kZeroExtents, + std::array output_padding = kZeroExtents, + cudecompRankOrder_t rank_order = CUDECOMP_RANK_ORDER_DEFAULT) { + return {backend, + scenario, + operation, + gdims, + pdims, + dtype, + out_of_place, + axis_contiguous, + mem_order, + input_halo_extents, + output_halo_extents, + input_padding, + output_padding, + rank_order, + {}}; +} + +TransposeCase withSyntheticHostGroups(TransposeCase test_case, std::vector synthetic_host_groups) { + test_case.synthetic_host_groups = std::move(synthetic_host_groups); + return test_case; +} + +TransposeCase withHaloAndPadding(TransposeCase test_case) { + test_case.input_halo_extents = kInputHaloExtents; + test_case.output_halo_extents = kOutputHaloExtents; + test_case.input_padding = kInputPadding; + test_case.output_padding = kOutputPadding; + return test_case; +} + +void appendBaselineCases(std::vector& cases, const cudecomp_test::TransposeBackend& backend) { + // Baseline cases are the common transpose sweep: all direct transpose operations, in-place and out-of-place, default + // layout and all-axis-contiguous layout, and single-precision real/complex data. MPI also keeps P1x1 coverage for + // the single-rank local special cases. NCCL and NVSHMEM skip P1x1 because it bypasses backend communication; for + // NVSHMEM, it also avoids changing the PE count across init/finalize cycles inside one CTest process set. + struct LayoutCase { + const char* scenario; + std::array axis_contiguous; + }; + + for (const auto& layout : {LayoutCase{"BaselineDefaultLayout", kDefaultAxisContiguous}, + LayoutCase{"BaselineAxisContiguous", kAllAxisContiguous}}) { + for (const auto pdims : {std::array{1, 1}, std::array{2, 2}}) { + if (pdims == std::array{1, 1} && std::string(backend.label) != "mpi") continue; + + for (const auto dtype : {CUDECOMP_FLOAT, CUDECOMP_FLOAT_COMPLEX}) { + for (const auto operation : kTransposeOperations) { + for (const bool out_of_place : {false, true}) { + cases.push_back(makeCase(backend, layout.scenario, operation, kBaselineGdims, pdims, dtype, out_of_place, + layout.axis_contiguous)); + } + } + } + } + } +} + +void appendCoverageCases(std::vector& cases, const cudecomp_test::TransposeBackend& backend) { + // Coverage cases select explicit memory orders, nonzero halo/padding, dtypes, rank order, and rank counts to reach + // transpose paths not guaranteed by the baseline sweep. These are not inherently MPI-only, but running them in the + // MPI collection keeps NCCL/NVSHMEM focused on backend baseline coverage while still exercising shared transpose + // logic. + constexpr std::array, 3> unpack_mem_order{{{{0, 1, 2}}, {{0, 1, 2}}, {{0, 1, 2}}}}; + constexpr std::array, 3> transpose_unpack_mem_order{{{{0, 2, 1}}, {{0, 1, 2}}, {{0, 1, 2}}}}; + // These last-axis choices force the multi-rank split transpose/unpack path for each direct operation. + constexpr std::array, 3> split_unpack_mem_order{{{{0, 1, 2}}, {{0, 2, 1}}, {{1, 2, 0}}}}; + + cases.push_back( + withHaloAndPadding(makeCase(backend, "ExplicitMemOrderUnpack", TransposeOperation::XToY, kBaselineGdims, {2, 2}, + CUDECOMP_FLOAT, true, kDefaultAxisContiguous, unpack_mem_order))); + cases.push_back( + withHaloAndPadding(makeCase(backend, "ExplicitMemOrderTransposeUnpack", TransposeOperation::XToY, kBaselineGdims, + {2, 2}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, transpose_unpack_mem_order))); + for (const auto operation : kTransposeOperations) { + cases.push_back( + withHaloAndPadding(makeCase(backend, "ExplicitMemOrderSplitUnpack", operation, kBaselineGdims, {2, 2}, + CUDECOMP_FLOAT, true, kDefaultAxisContiguous, split_unpack_mem_order))); + } + + cases.push_back(makeCase(backend, "ColumnMajorRankOrder", TransposeOperation::XToY, kBaselineGdims, {2, 2}, + CUDECOMP_FLOAT, false, kDefaultAxisContiguous, kDefaultMemOrder, kZeroExtents, kZeroExtents, + kZeroExtents, kZeroExtents, CUDECOMP_RANK_ORDER_COL_MAJOR)); + + cases.push_back( + withHaloAndPadding(makeCase(backend, "NonPowerOfTwoCommunicator", TransposeOperation::XToY, kBaselineGdims, + {3, 1}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, unpack_mem_order))); + + cases.push_back( + withHaloAndPadding(makeCase(backend, "DtypeWorkspacePadding", TransposeOperation::XToY, kBaselineGdims, {2, 2}, + CUDECOMP_DOUBLE, true, kDefaultAxisContiguous, split_unpack_mem_order))); + cases.push_back( + withHaloAndPadding(makeCase(backend, "DtypeWorkspacePadding", TransposeOperation::YToZ, kBaselineGdims, {2, 2}, + CUDECOMP_DOUBLE_COMPLEX, true, kDefaultAxisContiguous, split_unpack_mem_order))); + + if (backend.backend == CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL) { + // Pipelined MPI has additional paths for direct offset handling and inter-group scheduling. The synthetic host + // groups below make a single-node test look like a multi-host communicator so those inter-group paths are covered. + constexpr std::array inter_group_pdims{3, 1}; + const std::vector inter_group_hosts{0, 1, 2}; + constexpr std::array, 3> transpose_pack_offset_mem_order{ + {{{1, 0, 2}}, {{1, 2, 0}}, {{0, 1, 2}}}}; + constexpr std::array, 3> direct_transpose_pack_offset_mem_order{ + {{{1, 0, 2}}, {{2, 1, 0}}, {{0, 1, 2}}}}; + constexpr std::array, 3> direct_transpose_unpack_offset_mem_order{ + {{{1, 0, 2}}, {{0, 1, 2}}, {{0, 1, 2}}}}; + + cases.push_back(withHaloAndPadding(makeCase(backend, "ExplicitMemOrderTransposePackOffset", + TransposeOperation::XToY, kBaselineGdims, {2, 2}, CUDECOMP_FLOAT, true, + kDefaultAxisContiguous, transpose_pack_offset_mem_order))); + cases.push_back(withHaloAndPadding(makeCase(backend, "DirectTransposePackOffset", TransposeOperation::XToY, + kBaselineGdims, {1, 1}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, + direct_transpose_pack_offset_mem_order))); + cases.push_back(withHaloAndPadding(makeCase(backend, "DirectTransposeUnpackOffset", TransposeOperation::XToY, + kBaselineGdims, {1, 1}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, + direct_transpose_unpack_offset_mem_order))); + + cases.push_back(withSyntheticHostGroups( + withHaloAndPadding(makeCase(backend, "SyntheticInterGroupUnpack", TransposeOperation::XToY, kBaselineGdims, + inter_group_pdims, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, unpack_mem_order)), + inter_group_hosts)); + cases.push_back( + withSyntheticHostGroups(withHaloAndPadding(makeCase(backend, "SyntheticInterGroupTransposeUnpack", + TransposeOperation::XToY, kBaselineGdims, inter_group_pdims, + CUDECOMP_FLOAT, true, kDefaultAxisContiguous, + transpose_unpack_mem_order)), + inter_group_hosts)); + cases.push_back( + withSyntheticHostGroups(withHaloAndPadding(makeCase(backend, "SyntheticInterGroupSplitUnpack", + TransposeOperation::XToY, kBaselineGdims, inter_group_pdims, + CUDECOMP_FLOAT, true, kDefaultAxisContiguous, + split_unpack_mem_order)), + inter_group_hosts)); + } +} + +std::vector transposeCasesForLabel(const char* label) { + std::vector cases; + for (const auto& backend : cudecomp_test::transposeBackends()) { + if (std::string(backend.label) != label) continue; + + appendBaselineCases(cases, backend); + if (std::string(label) == "mpi") { appendCoverageCases(cases, backend); } + } + return cases; +} + +std::vector cudaGraphTransposeCases() { + std::vector cases; + for (const auto& backend : cudecomp_test::transposeBackends()) { + if (backend.backend != CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL) continue; + + constexpr std::array, 3> pack_mem_order{{{{0, 1, 2}}, {{0, 1, 2}}, {{0, 1, 2}}}}; + constexpr std::array, 3> transpose_pack_mem_order{{{{0, 1, 2}}, {{1, 2, 0}}, {{0, 1, 2}}}}; + + cases.push_back(withHaloAndPadding(makeCase(backend, "CudaGraphsPack", TransposeOperation::XToY, kBaselineGdims, + {2, 2}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, pack_mem_order))); + cases.push_back( + withHaloAndPadding(makeCase(backend, "CudaGraphsTransposePack", TransposeOperation::XToY, kBaselineGdims, + {2, 2}, CUDECOMP_FLOAT, true, kDefaultAxisContiguous, transpose_pack_mem_order))); + } + return cases; +} + +std::vector ncclUserBufferRegistrationCases() { + std::vector cases; + for (const auto& backend : cudecomp_test::transposeBackends()) { + if (backend.backend != CUDECOMP_TRANSPOSE_COMM_NCCL) continue; + cases.push_back(makeCase(backend, "UserBufferRegistration", TransposeOperation::XToY, kBaselineGdims, {2, 2}, + CUDECOMP_FLOAT, true)); + } + return cases; +} + +bool isInternal(const cudecompPencilInfo_t& pinfo, const std::array& local) { + return local[0] >= pinfo.halo_extents[pinfo.order[0]] && + local[0] < pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]] && + local[1] >= pinfo.halo_extents[pinfo.order[1]] && + local[1] < pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]] && + local[2] >= pinfo.halo_extents[pinfo.order[2]] && + local[2] < pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]]; +} + +template T pencilValue(int64_t global_index) { return static_cast(global_index); } + +template <> std::complex pencilValue>(int64_t global_index) { + return {static_cast(global_index), -static_cast(global_index)}; +} + +template <> std::complex pencilValue>(int64_t global_index) { + return {static_cast(global_index), -static_cast(global_index)}; +} + +template +std::vector initializePencil(const cudecompPencilInfo_t& pinfo, const std::array& gdims) { + std::vector data(pinfo.size, T{-1}); + + for (int64_t i = 0; i < pinfo.size; ++i) { + std::array local{}; + local[0] = i % pinfo.shape[0]; + local[1] = i / pinfo.shape[0] % pinfo.shape[1]; + local[2] = i / (pinfo.shape[0] * pinfo.shape[1]); + + if (!isInternal(pinfo, local)) continue; + + std::array global{}; + global[pinfo.order[0]] = local[0] + pinfo.lo[0] - pinfo.halo_extents[pinfo.order[0]]; + global[pinfo.order[1]] = local[1] + pinfo.lo[1] - pinfo.halo_extents[pinfo.order[1]]; + global[pinfo.order[2]] = local[2] + pinfo.lo[2] - pinfo.halo_extents[pinfo.order[2]]; + + data[i] = pencilValue(global[0] + gdims[0] * (global[1] + global[2] * gdims[1])); + } + + return data; +} + +template +testing::AssertionResult pencilMatches(const std::vector& expected, const std::vector& actual, + const cudecompPencilInfo_t& pinfo) { + if (expected.size() != actual.size()) { + return testing::AssertionFailure() << "size mismatch: expected " << expected.size() << ", got " << actual.size(); + } + + for (int64_t i = 0; i < pinfo.size; ++i) { + if (expected[i] == actual[i]) continue; + + std::array local{}; + local[0] = i % pinfo.shape[0]; + local[1] = i / pinfo.shape[0] % pinfo.shape[1]; + local[2] = i / (pinfo.shape[0] * pinfo.shape[1]); + + if (!isInternal(pinfo, local)) continue; + return testing::AssertionFailure() << "mismatch at local index " << i << " coordinate (" << local[0] << ", " + << local[1] << ", " << local[2] << "): expected " << expected[i] << ", got " + << actual[i]; + } + + return testing::AssertionSuccess(); +} + +template +cudecompResult_t runTranspose(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, TransposeOperation operation, + T* input, T* output, void* work, cudecompDataType_t dtype, + const cudecompPencilInfo_t& input_info, const cudecompPencilInfo_t& output_info) { + switch (operation) { + case TransposeOperation::XToY: + return cudecompTransposeXToY(handle, grid_desc, input, output, work, dtype, input_info.halo_extents, + output_info.halo_extents, input_info.padding, output_info.padding, 0); + case TransposeOperation::YToX: + return cudecompTransposeYToX(handle, grid_desc, input, output, work, dtype, input_info.halo_extents, + output_info.halo_extents, input_info.padding, output_info.padding, 0); + case TransposeOperation::YToZ: + return cudecompTransposeYToZ(handle, grid_desc, input, output, work, dtype, input_info.halo_extents, + output_info.halo_extents, input_info.padding, output_info.padding, 0); + case TransposeOperation::ZToY: + return cudecompTransposeZToY(handle, grid_desc, input, output, work, dtype, input_info.halo_extents, + output_info.halo_extents, input_info.padding, output_info.padding, 0); + } + return CUDECOMP_RESULT_INVALID_USAGE; +} + +template +void runAndVerifyTranspose(const cudecomp_test::MpiTestComm& active_comm, cudecompHandle_t handle, + cudecompGridDesc_t grid_desc, const TransposeCase& test_case, T* input_d, T* output_d, + void* work_d, int64_t data_num_elements, int64_t workspace_num_elements, int64_t dtype_size, + const std::vector& input_ref, const std::vector& output_ref, + const cudecompPencilInfo_t& input_info, const cudecompPencilInfo_t& output_info) { + CHECK_CUDA_GLOBAL(active_comm, cudaMemset(input_d, 0, data_num_elements * sizeof(*input_d))); + if (output_d != input_d) { + CHECK_CUDA_GLOBAL(active_comm, cudaMemset(output_d, 0, data_num_elements * sizeof(*output_d))); + } + CHECK_CUDA_GLOBAL(active_comm, + cudaMemcpy(input_d, input_ref.data(), input_ref.size() * sizeof(*input_d), cudaMemcpyHostToDevice)); + CHECK_CUDA_GLOBAL(active_comm, cudaMemset(work_d, 0, workspace_num_elements * dtype_size)); + + CHECK_CUDECOMP_GLOBAL(active_comm, runTranspose(handle, grid_desc, test_case.operation, input_d, output_d, work_d, + test_case.dtype, input_info, output_info)); + + std::vector output(output_ref.size(), T{}); + CHECK_CUDA_GLOBAL(active_comm, + cudaMemcpy(output.data(), output_d, output.size() * sizeof(*output_d), cudaMemcpyDeviceToHost)); + EXPECT_TRUE(pencilMatches(output_ref, output, output_info)); +} + +cudecomp::cudecompCommAxis communicationAxis(TransposeOperation operation) { + const int ax_a = inputAxis(operation); + const int ax_b = outputAxis(operation); + return (ax_a == 2 || ax_b == 2) ? cudecomp::CUDECOMP_COMM_ROW : cudecomp::CUDECOMP_COMM_COL; +} + +testing::AssertionResult applySyntheticHostnames(cudecompHandle_t handle, const std::vector& host_groups) { + // Inter-group transpose paths are normally selected from MPI processor names and topology metadata. Tests override + // the handle before grid descriptor creation so communicator setup observes deterministic host groupings on one node. + if (host_groups.empty()) return testing::AssertionSuccess(); + if (!handle) return testing::AssertionFailure() << "cannot apply synthetic hostnames to null cuDecomp handle"; + + if (host_groups.size() != handle->hostnames.size()) { + return testing::AssertionFailure() << "synthetic hostname group count " << host_groups.size() + << " does not match handle rank count " << handle->hostnames.size(); + } + + for (int rank = 0; rank < static_cast(host_groups.size()); ++rank) { + std::string hostname = "cudecomp-test-host-" + std::to_string(host_groups[rank]); + if (hostname.size() >= handle->hostnames[rank].size()) { + return testing::AssertionFailure() << "synthetic hostname '" << hostname << "' exceeds MPI processor name limit"; + } + handle->hostnames[rank].fill('\0'); + std::copy(hostname.begin(), hostname.end(), handle->hostnames[rank].begin()); + } + + // Force communicator setup to use the synthetic hostnames even on systems with MNNVL topology data. + handle->rank_to_mnnvl_info.clear(); + handle->rank_to_clique.clear(); + handle->rank_to_clique_rank = handle->rank_to_local_rank; + + return testing::AssertionSuccess(); +} + +testing::AssertionResult syntheticTopologyIsActive(cudecompGridDesc_t grid_desc, TransposeOperation operation) { + const auto comm_axis = communicationAxis(operation); + const auto& comm_info = + (comm_axis == cudecomp::CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info; + + if (comm_info.ngroups <= 1) { + return testing::AssertionFailure() << "synthetic topology did not create inter-group communicator: ngroups=" + << comm_info.ngroups << ", npergroup=" << comm_info.npergroup + << ", nranks=" << comm_info.nranks; + } + + return testing::AssertionSuccess(); +} + +} // namespace + +class TransposeCorrectnessTest : public ::testing::TestWithParam {}; +class CudaGraphTransposeCorrectnessTest : public ::testing::TestWithParam {}; +class NcclUserBufferRegistrationTest : public ::testing::TestWithParam {}; + +testing::AssertionResult ncclUserBufferRegistrationIsActive(cudecompHandle_t handle, void* buffer) { +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0) + if (!handle->nccl_enable_ubr) { return testing::AssertionFailure() << "NCCL user buffer registration is disabled"; } + + auto entry = handle->nccl_ubr_handles.find(buffer); + if (entry == handle->nccl_ubr_handles.end() || entry->second.empty()) { + return testing::AssertionFailure() << "NCCL user buffer registration handle was not recorded"; + } + + return testing::AssertionSuccess(); +#else + return testing::AssertionFailure() << "NCCL user buffer registration requires NCCL 2.19 or newer"; +#endif +} + +template +void runTransposeCase(const TransposeCase& test_case, bool check_cuda_graph_replay = false, + bool check_nccl_user_buffer_registration = false) { + const int active_ranks = test_case.pdims[0] * test_case.pdims[1]; + const auto world_comm = cudecomp_test::MpiTestComm::world(); + + if (world_comm.size() < active_ranks) { + GTEST_SKIP() << operationName(test_case.operation) << " with pdims " << test_case.pdims[0] << "x" + << test_case.pdims[1] << " requires " << active_ranks << " ranks, launched with " << world_comm.size(); + } + + cudecompGridDescConfig_t config; + ASSERT_EQ(CUDECOMP_RESULT_SUCCESS, cudecompGridDescConfigSetDefaults(&config)); + config.gdims[0] = test_case.gdims[0]; + config.gdims[1] = test_case.gdims[1]; + config.gdims[2] = test_case.gdims[2]; + config.pdims[0] = test_case.pdims[0]; + config.pdims[1] = test_case.pdims[1]; + config.rank_order = test_case.rank_order; + config.transpose_comm_backend = test_case.backend.backend; + config.transpose_axis_contiguous[0] = test_case.axis_contiguous[0]; + config.transpose_axis_contiguous[1] = test_case.axis_contiguous[1]; + config.transpose_axis_contiguous[2] = test_case.axis_contiguous[2]; + for (int axis = 0; axis < 3; ++axis) { + for (int i = 0; i < 3; ++i) { + config.transpose_mem_order[axis][i] = test_case.mem_order[axis][i]; + } + } + + cudecomp_test::BackendTestContext test_context; + cudecomp_test::TestSetupDecision setup_decision; + ASSERT_TRUE(test_context.initialize(world_comm, active_ranks, test_case.backend.label, + std::string(test_case.backend.label) == "nccl", config, &setup_decision)); + ASSERT_FALSE(setup_decision.fail) << setup_decision.reason; + if (setup_decision.skip) { GTEST_SKIP() << setup_decision.reason; } + + const auto& active_comm = test_context.comm(); + cudecompHandle_t handle = test_context.handle(); + ASSERT_NE(handle, nullptr); + ASSERT_TRUE(applySyntheticHostnames(handle, test_case.synthetic_host_groups)); + if (check_cuda_graph_replay) { ASSERT_TRUE(handle->cuda_graphs_enable); } + + cudecompGridDesc_t grid_desc = nullptr; + const cudecompResult_t grid_desc_create_result = cudecompGridDescCreate(handle, &grid_desc, &config, nullptr); + cudecomp_test::gridDescGuard grid_desc_guard(handle, grid_desc); + CHECK_CUDECOMP_GLOBAL(active_comm, grid_desc_create_result); + if (!test_case.synthetic_host_groups.empty()) { + ASSERT_TRUE(syntheticTopologyIsActive(grid_desc, test_case.operation)); + } + + cudecompPencilInfo_t input_info; + cudecompPencilInfo_t output_info; + CHECK_CUDECOMP_GLOBAL(active_comm, + cudecompGetPencilInfo(handle, grid_desc, &input_info, inputAxis(test_case.operation), + test_case.input_halo_extents.data(), test_case.input_padding.data())); + CHECK_CUDECOMP_GLOBAL(active_comm, + cudecompGetPencilInfo(handle, grid_desc, &output_info, outputAxis(test_case.operation), + test_case.output_halo_extents.data(), test_case.output_padding.data())); + + int64_t workspace_num_elements = 0; + CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetTransposeWorkspaceSize(handle, grid_desc, &workspace_num_elements)); + + int64_t dtype_size = 0; + CHECK_CUDECOMP_GLOBAL(active_comm, cudecompGetDataTypeSize(test_case.dtype, &dtype_size)); + + const auto input_ref = initializePencil(input_info, test_case.gdims); + const auto output_ref = initializePencil(output_info, test_case.gdims); + const int64_t data_num_elements = std::max(input_info.size, output_info.size); + + T* input_d = nullptr; + const cudaError_t input_alloc_result = cudaMalloc(&input_d, data_num_elements * sizeof(*input_d)); + cudecomp_test::cudaBufferGuard input_buffer(input_d); + CHECK_CUDA_GLOBAL(active_comm, input_alloc_result); + + if (check_cuda_graph_replay) { + ASSERT_TRUE(test_case.out_of_place); + + T* output_a_d = nullptr; + const cudaError_t output_a_alloc_result = cudaMalloc(&output_a_d, data_num_elements * sizeof(*output_a_d)); + cudecomp_test::cudaBufferGuard output_a_buffer(output_a_d); + CHECK_CUDA_GLOBAL(active_comm, output_a_alloc_result); + + T* output_b_d = nullptr; + const cudaError_t output_b_alloc_result = cudaMalloc(&output_b_d, data_num_elements * sizeof(*output_b_d)); + cudecomp_test::cudaBufferGuard output_b_buffer(output_b_d); + CHECK_CUDA_GLOBAL(active_comm, output_b_alloc_result); + + void* work_a_d = nullptr; + const cudecompResult_t work_a_alloc_result = + cudecompMalloc(handle, grid_desc, &work_a_d, workspace_num_elements * dtype_size); + cudecomp_test::cudecompBufferGuard work_a_buffer(handle, grid_desc, work_a_d); + CHECK_CUDECOMP_GLOBAL(active_comm, work_a_alloc_result); + + void* work_b_d = nullptr; + const cudecompResult_t work_b_alloc_result = + cudecompMalloc(handle, grid_desc, &work_b_d, workspace_num_elements * dtype_size); + cudecomp_test::cudecompBufferGuard work_b_buffer(handle, grid_desc, work_b_d); + CHECK_CUDECOMP_GLOBAL(active_comm, work_b_alloc_result); + + runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_a_d, work_a_d, data_num_elements, + workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info); + runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_a_d, work_a_d, data_num_elements, + workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info); + runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_b_d, work_b_d, data_num_elements, + workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info); + runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_b_d, work_b_d, data_num_elements, + workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info); + return; + } + + T* output_d = input_d; + cudecomp_test::cudaBufferGuard output_buffer; + if (test_case.out_of_place) { + T* allocated_output_d = nullptr; + const cudaError_t output_alloc_result = cudaMalloc(&allocated_output_d, data_num_elements * sizeof(*output_d)); + output_buffer.reset(allocated_output_d); + CHECK_CUDA_GLOBAL(active_comm, output_alloc_result); + output_d = allocated_output_d; + } + + void* work_d = nullptr; + const cudecompResult_t work_alloc_result = + cudecompMalloc(handle, grid_desc, &work_d, workspace_num_elements * dtype_size); + cudecomp_test::cudecompBufferGuard work_buffer(handle, grid_desc, work_d); + CHECK_CUDECOMP_GLOBAL(active_comm, work_alloc_result); + if (check_nccl_user_buffer_registration) { ASSERT_TRUE(ncclUserBufferRegistrationIsActive(handle, work_d)); } + + runAndVerifyTranspose(active_comm, handle, grid_desc, test_case, input_d, output_d, work_d, data_num_elements, + workspace_num_elements, dtype_size, input_ref, output_ref, input_info, output_info); + + if (check_nccl_user_buffer_registration) { + const cudecompResult_t work_free_result = cudecompFree(handle, grid_desc, work_d); + if (work_free_result == CUDECOMP_RESULT_SUCCESS) { work_buffer.release(); } + CHECK_CUDECOMP_GLOBAL(active_comm, work_free_result); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0) + EXPECT_EQ(handle->nccl_ubr_handles.count(work_d), 0); +#endif + } +} + +TEST_P(TransposeCorrectnessTest, DirectOperation) { + const auto test_case = GetParam(); + switch (test_case.dtype) { + case CUDECOMP_FLOAT: runTransposeCase(test_case); break; + case CUDECOMP_FLOAT_COMPLEX: runTransposeCase>(test_case); break; + case CUDECOMP_DOUBLE: runTransposeCase(test_case); break; + case CUDECOMP_DOUBLE_COMPLEX: runTransposeCase>(test_case); break; + default: FAIL() << "unsupported test dtype " << test_case.dtype; + } +} + +TEST_P(CudaGraphTransposeCorrectnessTest, CapturesAndReplaysPackingGraph) { + const auto test_case = GetParam(); + switch (test_case.dtype) { + case CUDECOMP_FLOAT: runTransposeCase(test_case, true); break; + case CUDECOMP_FLOAT_COMPLEX: runTransposeCase>(test_case, true); break; + case CUDECOMP_DOUBLE: runTransposeCase(test_case, true); break; + case CUDECOMP_DOUBLE_COMPLEX: runTransposeCase>(test_case, true); break; + default: FAIL() << "unsupported test dtype " << test_case.dtype; + } +} + +TEST_P(NcclUserBufferRegistrationTest, DirectOperation) { +#if NCCL_VERSION_CODE < NCCL_VERSION(2, 19, 0) + GTEST_SKIP() << "NCCL user buffer registration requires NCCL 2.19 or newer"; +#else + const auto test_case = GetParam(); + switch (test_case.dtype) { + case CUDECOMP_FLOAT: runTransposeCase(test_case, false, true); break; + case CUDECOMP_FLOAT_COMPLEX: runTransposeCase>(test_case, false, true); break; + case CUDECOMP_DOUBLE: runTransposeCase(test_case, false, true); break; + case CUDECOMP_DOUBLE_COMPLEX: runTransposeCase>(test_case, false, true); break; + default: FAIL() << "unsupported test dtype " << test_case.dtype; + } +#endif +} + +INSTANTIATE_TEST_SUITE_P(MpiBackends, TransposeCorrectnessTest, ::testing::ValuesIn(transposeCasesForLabel("mpi")), + paramName); +INSTANTIATE_TEST_SUITE_P(NcclBackends, TransposeCorrectnessTest, ::testing::ValuesIn(transposeCasesForLabel("nccl")), + paramName); +INSTANTIATE_TEST_SUITE_P(NvshmemBackends, TransposeCorrectnessTest, + ::testing::ValuesIn(transposeCasesForLabel("nvshmem")), paramName); +INSTANTIATE_TEST_SUITE_P(CudaGraphMpiBackends, CudaGraphTransposeCorrectnessTest, + ::testing::ValuesIn(cudaGraphTransposeCases()), paramName); +INSTANTIATE_TEST_SUITE_P(NcclUserBufferRegistration, NcclUserBufferRegistrationTest, + ::testing::ValuesIn(ncclUserBufferRegistrationCases()), paramName);