From f7f8ae941292267f737160d8fd2cac9b0844853b Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 10 Feb 2023 10:54:35 -0800 Subject: [PATCH 01/13] Initial CMake build files. Still a work in progress. --- CMakeLists.txt | 162 ++++++++++++++++++++ benchmark/CMakeLists.txt | 35 +++++ examples/cc/basic_usage/CMakeLists.txt | 38 +++++ examples/cc/taylor_green/CMakeLists.txt | 27 ++++ examples/fortran/basic_usage/CMakeLists.txt | 36 +++++ examples/fortran/poisson/CMakeLists.txt | 23 +++ tests/cc/CMakeLists.txt | 58 +++++++ tests/fortran/CMakeLists.txt | 59 +++++++ tests/fortran/halo_test.f90 | 47 ++++-- tests/fortran/transpose_test.f90 | 47 ++++-- 10 files changed, 504 insertions(+), 28 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 benchmark/CMakeLists.txt create mode 100644 examples/cc/basic_usage/CMakeLists.txt create mode 100644 examples/cc/taylor_green/CMakeLists.txt create mode 100644 examples/fortran/basic_usage/CMakeLists.txt create mode 100644 examples/fortran/poisson/CMakeLists.txt create mode 100644 tests/cc/CMakeLists.txt create mode 100644 tests/fortran/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..e10d63e --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,162 @@ +cmake_minimum_required(VERSION 3.16) +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) + +# User-defined build options +option(CUDECOMP_BUILD_FORTRAN "Build Fortran bindings" ON) +option(CUDECOMP_ENABLE_NVTX "Enable NVTX ranges" ON) +option(CUDECOMP_ENABLE_NVSHMEM "Enable NVSHMEM" OFF) +option(CUDECOMP_BUILD_EXTRAS "Build benchmark, examples, and tests" OFF) +set(CUDECOMP_CUDA_CC_LIST "60;70;80" CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") + +if (CUDECOMP_BUILD_FORTRAN) + set(LANGS CXX CUDA Fortran) +else() + set(LANGS CXX CUDA) +endif() + +project(cudecomp LANGUAGES ${LANGS}) + +# MPI +find_package(MPI REQUIRED) +# TODO: Check for MPICH to define 
`-DMPICH` flag + +# HPC SDK +if (CUDECOMP_ENABLE_NVSHMEM) + find_package(NVHPC REQUIRED COMPONENTS CUDA MATH NCCL NVSHMEM) +else() + find_package(NVHPC REQUIRED COMPONENTS CUDA MATH NCCL) +endif() + +# Set up required include directory flags +string(REPLACE "/lib64" "/include" NVHPC_CUDA_INCLUDE_DIR ${NVHPC_CUDA_LIBRARY_DIR}) +string(REPLACE "/lib64" "/include" NVHPC_CUFFT_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) +string(REPLACE "/lib64" "/include" NVHPC_CUTENSOR_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) +string(REPLACE "/lib" "/include" NVHPC_NCCL_INCLUDE_DIR ${NVHPC_NCCL_LIBRARY_DIR}) +if (CUDECOMP_ENABLE_NVSHMEM) + string(REPLACE "/lib" "/include" NVHPC_NVSHMEM_INCLUDE_DIR ${NVHPC_NVSHMEM_LIBRARY_DIR}) +endif() + +# Building cuDecomp shared lib +add_library(cudecomp SHARED) +set_target_properties(cudecomp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + +# Set NVCC flags for requested compute capability +if (CMAKE_VERSION VERSION_LESS 3.18) + foreach(CUDA_CC ${CUDECOMP_CUDA_CC_LIST}) + list(APPEND CUDA_CC_FLAGS -gencode=arch=compute_${CUDA_CC},code=sm_${CUDA_CC}) + endforeach() + target_compile_options(cudecomp PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_CC_FLAGS}>) +else() + set_target_properties(cudecomp PROPERTIES CUDA_ARCHITECTURES "${CUDECOMP_CUDA_CC_LIST}") +endif() +target_sources(cudecomp + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/autotune.cc + ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_kernels.cu + ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp.cc +) + +target_include_directories(cudecomp + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${MPI_CXX_INCLUDE_DIR} + ${NVHPC_CUDA_INCLUDE_DIR} + ${NVHPC_CUTENSOR_INCLUDE_DIR} + ${NVHPC_NCCL_INCLUDE_DIR} +) + +target_link_libraries(cudecomp PUBLIC NVHPC::CUDART) +target_link_libraries(cudecomp PUBLIC MPI::MPI_CXX) +target_link_libraries(cudecomp PRIVATE NVHPC::CUTENSOR) +target_link_libraries(cudecomp PRIVATE NVHPC::NCCL) + +if (CUDECOMP_ENABLE_NVTX) + target_compile_definitions(cudecomp PRIVATE CUDECOMP_ENABLE_NVTX) +endif() + +if 
(CUDECOMP_ENABLE_NVSHMEM) + target_compile_definitions(cudecomp PRIVATE CUDECOMP_ENABLE_NVSHMEM) + + # Get NVSHMEM version from header + file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) + string(REGEX MATCH "NVSHMEM_VENDOR_MAJOR_VERSION ([0-9]*)" _ ${NVSHMEM_VERSION_RAW}) + list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1}) + string(REGEX MATCH "NVSHMEM_VENDOR_MINOR_VERSION ([0-9]*)" _ ${NVSHMEM_VERSION_RAW}) + list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1}) + list(JOIN NVSHMEM_VERSION "." NVSHMEM_VERSION) + + if (NVSHMEM_VERSION VERSION_LESS "2.7") + # NVSHMEM versions before 2.7 will export NCCL symbols erroneously, need to define this flag + target_compile_definitions(cudecomp PRIVATE NVSHMEM_USE_NCCL) + endif() + + if (NVSHMEM_VERSION VERSION_LESS "2.5") + target_link_libraries(cudecomp PRIVATE ${NVHPC_NVSHMEM_LIBRARY_DIR}/libnvshmem.a) + else() + target_link_libraries(cudecomp PRIVATE -L${NVHPC_NVSHMEM_LIBRARY_DIR} -lnvshmem_host) + target_link_libraries(cudecomp PRIVATE ${NVHPC_NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a) + target_link_libraries(cudecomp PUBLIC -L${NVHPC_CUDA_LIBRARY_DIR}/stubs -lnvidia-ml) + endif() + target_link_libraries(cudecomp PUBLIC -L${NVHPC_CUDA_LIBRARY_DIR}/stubs -lcuda) + set_target_properties(cudecomp PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + set_target_properties(cudecomp PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) +endif() + +set_target_properties(cudecomp PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/cudecomp.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/cudecomp.h ${CMAKE_BINARY_DIR}/include/cudecomp.h) + +install( + TARGETS cudecomp + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/include +) + +# Building Fortran shared lib and module +if (CUDECOMP_BUILD_FORTRAN) + # Creating -gpu argument string for Fortran files + foreach(CUDA_CC ${CUDECOMP_CUDA_CC_LIST}) + list(APPEND CUF_GPU_ARG "cc${CUDA_CC}") + endforeach() + list(APPEND 
CUF_GPU_ARG "cuda${NVHPC_CUDA_VERSION}") + list(JOIN CUF_GPU_ARG "," CUF_GPU_ARG) + + #set(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/fortran_modules) + + add_library(cudecomp_fort SHARED) + set_target_properties(cudecomp_fort PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + set_target_properties(cudecomp_fort PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/include) + set_target_properties(cudecomp_fort PROPERTIES LINKER_LANGUAGE Fortran) + target_compile_options(cudecomp_fort PRIVATE $<$<COMPILE_LANGUAGE:Fortran>:-cpp -cuda -gpu=${CUF_GPU_ARG}>) + target_sources( + cudecomp_fort + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_m.cuf + ) + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/src/cudecomp_m.cuf PROPERTIES LANGUAGE Fortran) + + target_link_libraries(cudecomp_fort PUBLIC MPI::MPI_Fortran) + + install( + TARGETS cudecomp_fort + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + ) + # Install cuDecomp module + install(FILES ${CMAKE_BINARY_DIR}/include/cudecomp.mod DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +endif() + +if (CUDECOMP_BUILD_EXTRAS) + add_subdirectory(benchmark) + + add_subdirectory(tests/cc) + add_subdirectory(examples/cc/basic_usage) + add_subdirectory(examples/cc/taylor_green) + + if (CUDECOMP_BUILD_FORTRAN) + add_subdirectory(tests/fortran) + add_subdirectory(examples/fortran/basic_usage) + add_subdirectory(examples/fortran/poisson) + endif() +endif() + diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 0000000..4b449ff --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,35 @@ +set(benchmark_targets + benchmark_r2c + benchmark_c2c + benchmark_r2c_f + benchmark_c2c_f +) + +foreach(tgt ${benchmark_targets}) + add_executable(${tgt}) + if (CMAKE_VERSION VERSION_LESS 3.18) + target_compile_options(${tgt} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_CC_FLAGS}>) + else() + set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES "${CUDA_CC_LIST}") + endif() + target_sources(${tgt} + PRIVATE + benchmark.cu + ) + 
target_include_directories(${tgt} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${NVHPC_CUFFT_INCLUDE_DIE}) + target_link_libraries(${tgt} PRIVATE cudecomp) + target_link_libraries(${tgt} PRIVATE NVHPC::CUFFT) +endforeach() + +target_compile_definitions(benchmark_r2c PRIVATE R2C) +target_compile_definitions(benchmark_c2c PRIVATE C2C) +target_compile_definitions(benchmark_r2c_f PRIVATE R2C USE_FLOAT) +target_compile_definitions(benchmark_c2c_f PRIVATE R2C USE_FLOAT) + +install( + TARGETS ${benchmark_targets} + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/benchmark +) diff --git a/examples/cc/basic_usage/CMakeLists.txt b/examples/cc/basic_usage/CMakeLists.txt new file mode 100644 index 0000000..6365006 --- /dev/null +++ b/examples/cc/basic_usage/CMakeLists.txt @@ -0,0 +1,38 @@ +set(basic_usage_targets_cc + basic_usage_cc + basic_usage_autotune_cc +) + +add_executable(basic_usage_cc) + target_sources(basic_usage_cc + PRIVATE + basic_usage.cu +) + +add_executable(basic_usage_autotune_cc) +target_sources(basic_usage_autotune_cc + PRIVATE + basic_usage_autotune.cu +) + +foreach(tgt ${basic_usage_targets_cc}) + string(REPLACE "_cc" "" bin_name ${tgt}) + set_target_properties(${tgt} PROPERTIES OUTPUT_NAME ${bin_name}) + if (CMAKE_VERSION VERSION_LESS 3.18) + target_compile_options(${tgt} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_CC_FLAGS}>) + else() + set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES "${CUDA_CC_LIST}") + endif() + target_include_directories(${tgt} + PRIVATE + ${CMAKE_BINARY_DIR}/include + ${MPI_CXX_INCLUDE_DIRS} + ) + target_link_libraries(${tgt} PRIVATE MPI::MPI_CXX) + target_link_libraries(${tgt} PRIVATE cudecomp) +endforeach() + +install( + TARGETS ${basic_usage_targets_cc} + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/examples/cc/basic_usage +) diff --git a/examples/cc/taylor_green/CMakeLists.txt b/examples/cc/taylor_green/CMakeLists.txt new file mode 100644 index 0000000..acc7d22 --- /dev/null +++ b/examples/cc/taylor_green/CMakeLists.txt @@ -0,0 
+1,27 @@ +add_executable(tg) + +target_sources(tg + PRIVATE + tg.cu +) + +if (CMAKE_VERSION VERSION_LESS 3.18) + target_compile_options(tg PRIVATE $<$: ${CUDA_CC_FLAGS}>) +else() + set_target_properties(tg PROPERTIES CUDA_ARCHITECTURES "${CUDA_CC_LIST}") +endif() + +target_include_directories(tg + PRIVATE + ${CMAKE_BINARY_DIR}/include + ${MPI_CXX_INCLUDE_DIRS} + ${NVHPC_CUFFT_INCLUDE_DIR} +) +target_link_libraries(tg PRIVATE MPI::MPI_CXX) +target_link_libraries(tg PRIVATE NVHPC::CUFFT) +target_link_libraries(tg PRIVATE cudecomp) + +install( + TARGETS tg + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/examples/cc/taylor_green +) diff --git a/examples/fortran/basic_usage/CMakeLists.txt b/examples/fortran/basic_usage/CMakeLists.txt new file mode 100644 index 0000000..78a5bb4 --- /dev/null +++ b/examples/fortran/basic_usage/CMakeLists.txt @@ -0,0 +1,36 @@ +set(basic_usage_targets_f + basic_usage_f + basic_usage_autotune_f +) + +add_executable(basic_usage_f) +target_sources(basic_usage_f + PRIVATE + basic_usage.f90 +) + +add_executable(basic_usage_autotune_f) +target_sources(basic_usage_autotune_f + PRIVATE + basic_usage_autotune.f90 +) + +foreach(tgt ${basic_usage_targets_f}) + string(REPLACE "_f" "" bin_name ${tgt}) + set_target_properties(${tgt} PROPERTIES OUTPUT_NAME ${bin_name}) + target_include_directories(${tgt} + PRIVATE + ${CMAKE_BINARY_DIR}/include + ${MPI_Fortran_INCLUDE_DIRS} + ) + target_link_libraries(${tgt} PRIVATE MPI::MPI_Fortran) + target_link_libraries(${tgt} PRIVATE cudecomp) + target_link_libraries(${tgt} PRIVATE cudecomp_fort) + target_compile_options(${tgt} PRIVATE $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}>) + target_link_options(${tgt} PRIVATE $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}>) +endforeach() + +install( + TARGETS ${basic_usage_targets_f} + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/examples/fortran/basic_usage +) diff --git a/examples/fortran/poisson/CMakeLists.txt b/examples/fortran/poisson/CMakeLists.txt new file mode 100644 index 
0000000..2ec91d1 --- /dev/null +++ b/examples/fortran/poisson/CMakeLists.txt @@ -0,0 +1,23 @@ +add_executable(poisson) +target_sources(poisson + PRIVATE + poisson.f90 +) + +target_include_directories(poisson + PRIVATE + ${CMAKE_BINARY_DIR}/include + ${MPI_Fortran_INCLUDE_DIRS} + ${NVHPC_CUFFT_INCLUDE_DIR} +) +target_link_libraries(poisson PRIVATE MPI::MPI_Fortran) +target_link_libraries(poisson PRIVATE NVHPC::CUFFT) +target_link_libraries(poisson PRIVATE cudecomp) +target_link_libraries(poisson PRIVATE cudecomp_fort) +target_compile_options(poisson PRIVATE $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}>) +target_link_options(poisson PRIVATE $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}>) + +install( + TARGETS poisson + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/examples/fortran/poisson +) diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt new file mode 100644 index 0000000..44c474e --- /dev/null +++ b/tests/cc/CMakeLists.txt @@ -0,0 +1,58 @@ +set(transpose_test_targets_cc + transpose_test_R32_cc + transpose_test_R64_cc + transpose_test_C32_cc + transpose_test_C64_cc +) + +set(halo_test_targets_cc + halo_test_R32_cc + halo_test_R64_cc + halo_test_C32_cc + halo_test_C64_cc +) + +foreach(tgt ${transpose_test_targets_cc}) + add_executable(${tgt}) + target_sources(${tgt} + PRIVATE + transpose_test.cc + ) +endforeach() + +foreach(tgt ${halo_test_targets_cc}) + add_executable(${tgt}) + target_sources(${tgt} + PRIVATE + halo_test.cc + ) +endforeach() + +foreach(tgt ${transpose_test_targets_cc} ${halo_test_targets_cc}) + string(REPLACE "_cc" "" bin_name ${tgt}) + set_target_properties(${tgt} PROPERTIES OUTPUT_NAME ${bin_name}) + target_include_directories(${tgt} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${MPI_CXX_INCLUDE_DIRS} + ${NVHPC_CUDA_INCLUDE_DIR} + ) + target_link_libraries(${tgt} PRIVATE MPI::MPI_CXX) + target_link_libraries(${tgt} PRIVATE NVHPC::CUDART) + target_link_libraries(${tgt} PUBLIC cudecomp) +endforeach() + 
+target_compile_definitions(transpose_test_R32_cc PRIVATE R32) +target_compile_definitions(transpose_test_R64_cc PRIVATE R64) +target_compile_definitions(transpose_test_C32_cc PRIVATE C32) +target_compile_definitions(transpose_test_C64_cc PRIVATE C64) + +target_compile_definitions(halo_test_R32_cc PRIVATE R32) +target_compile_definitions(halo_test_R64_cc PRIVATE R64) +target_compile_definitions(halo_test_C32_cc PRIVATE C32) +target_compile_definitions(halo_test_C64_cc PRIVATE C64) + +install( + TARGETS ${transpose_test_targets_cc} ${halo_test_targets_cc} + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/cc +) diff --git a/tests/fortran/CMakeLists.txt b/tests/fortran/CMakeLists.txt new file mode 100644 index 0000000..ef820ef --- /dev/null +++ b/tests/fortran/CMakeLists.txt @@ -0,0 +1,59 @@ +set(transpose_test_targets_f + transpose_test_R32_f + transpose_test_R64_f + transpose_test_C32_f + transpose_test_C64_f +) + +set(halo_test_targets_f + halo_test_R32_f + halo_test_R64_f + halo_test_C32_f + halo_test_C64_f +) + +foreach(tgt ${transpose_test_targets_f}) + add_executable(${tgt}) + target_sources(${tgt} + PRIVATE + transpose_test.f90 + ) +endforeach() + +foreach(tgt ${halo_test_targets_f}) + add_executable(${tgt}) + target_sources(${tgt} + PRIVATE + halo_test.f90 + ) +endforeach() + +foreach(tgt ${transpose_test_targets_f} ${halo_test_targets_f}) + string(REPLACE "_f" "" bin_name ${tgt}) + set_target_properties(${tgt} PROPERTIES OUTPUT_NAME ${bin_name}) + target_include_directories(${tgt} + PRIVATE + ${CMAKE_BINARY_DIR}/include + ${MPI_Fortran_INCLUDE_DIRS} + ) + target_link_libraries(${tgt} PRIVATE MPI::MPI_Fortran) + target_link_libraries(${tgt} PUBLIC cudecomp) + target_link_libraries(${tgt} PUBLIC cudecomp_fort) + target_compile_options(${tgt} PRIVATE $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}>) + target_link_options(${tgt} PRIVATE $<$:-cpp -cuda -gpu=${CUF_GPU_ARG}>) +endforeach() + +target_compile_definitions(transpose_test_R32_f PUBLIC R32) 
+target_compile_definitions(transpose_test_R64_f PUBLIC R64) +target_compile_definitions(transpose_test_C32_f PUBLIC C32) +target_compile_definitions(transpose_test_C64_f PUBLIC C64) + +target_compile_definitions(halo_test_R32_f PUBLIC R32) +target_compile_definitions(halo_test_R64_f PUBLIC R64) +target_compile_definitions(halo_test_C32_f PUBLIC C32) +target_compile_definitions(halo_test_C64_f PUBLIC C64) + +install( + TARGETS ${transpose_test_targets_f} ${halo_test_targets_f} + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/fortran +) diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90 index fc6d0a4..3f2f63e 100644 --- a/tests/fortran/halo_test.f90 +++ b/tests/fortran/halo_test.f90 @@ -26,28 +26,33 @@ ! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#if defined(R32) +#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) +#define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1) + +#ifdef R32 #define ARRTYPE real(real32) #define DTYPE CUDECOMP_FLOAT -#define MODNAME halo_CUDECOMP_FLOAT_mod -#elif defined(R64) +module halo_CUDECOMP_FLOAT_mod +#endif + +#ifdef R64 #define ARRTYPE real(real64) #define DTYPE CUDECOMP_DOUBLE -#define MODNAME halo_CUDECOMP_DOUBLE_mod -#elif defined(C32) +module halo_CUDECOMP_DOUBLE_mod +#endif + +#ifdef C32 #define ARRTYPE complex(real32) #define DTYPE CUDECOMP_FLOAT_COMPLEX -#define MODNAME halo_CUDECOMP_FLOAT_COMPLEX_mod -#elif defined(C64) +module halo_CUDECOMP_FLOAT_COMPLEX_mod +#endif + +#ifdef C64 #define ARRTYPE complex(real64) #define DTYPE CUDECOMP_DOUBLE_COMPLEX -#define MODNAME halo_CUDECOMP_DOUBLE_COMPLEX_mod +module halo_CUDECOMP_DOUBLE_COMPLEX_mod #endif -#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) -#define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1) - -module MODNAME use, intrinsic :: iso_fortran_env, only: real32, real64 
contains function compare_pencils(ref, res, pinfo) result(mismatch) @@ -160,7 +165,7 @@ subroutine flat_copy(src, dst, count) dst(1:count) = src(1:count) end subroutine flat_copy -end module MODNAME +end module program main use cudafor @@ -168,7 +173,21 @@ program main use cudecomp use, intrinsic :: iso_fortran_env, only: real32, real64 - use MODNAME +#ifdef R32 + use halo_CUDECOMP_FLOAT_mod +#endif + +#ifdef R64 + use halo_CUDECOMP_DOUBLE_mod +#endif + +#ifdef C32 + use halo_CUDECOMP_FLOAT_COMPLEX_mod +#endif + +#ifdef C64 + use halo_CUDECOMP_DOUBLE_COMPLEX_mod +#endif implicit none diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index 6f1f709..4e42a6a 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -26,28 +26,33 @@ ! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#if defined(R32) +#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) +#define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1) + +#ifdef R32 #define ARRTYPE real(real32) #define DTYPE CUDECOMP_FLOAT -#define MODNAME transpose_CUDECOMP_FLOAT_mod -#elif defined(R64) +module transpose_CUDECOMP_FLOAT_mod +#endif + +#ifdef R64 #define ARRTYPE real(real64) #define DTYPE CUDECOMP_DOUBLE -#define MODNAME transpose_CUDECOMP_DOUBLE_mod -#elif defined(C32) +module transpose_CUDECOMP_DOUBLE_mod +#endif + +#ifdef C32 #define ARRTYPE complex(real32) #define DTYPE CUDECOMP_FLOAT_COMPLEX -#define MODNAME transpose_CUDECOMP_FLOAT_COMPLEX_mod -#elif defined(C64) +module transpose_CUDECOMP_FLOAT_COMPLEX_mod +#endif + +#ifdef C64 #define ARRTYPE complex(real64) #define DTYPE CUDECOMP_DOUBLE_COMPLEX -#define MODNAME transpose_CUDECOMP_DOUBLE_COMPLEX_mod +module transpose_CUDECOMP_DOUBLE_COMPLEX_mod #endif -#define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) -#define CHECK_CUDA_EXIT(f) if (f /= 
cudaSuccess) call exit(1) - -module MODNAME use, intrinsic :: iso_fortran_env, only: real32, real64 contains function compare_pencils(ref, res, pinfo) result(mismatch) @@ -109,7 +114,7 @@ subroutine flat_copy(src, dst, count) dst(1:count) = src(1:count) end subroutine flat_copy -end module MODNAME +end module program main use cudafor @@ -117,7 +122,21 @@ program main use cudecomp use, intrinsic :: iso_fortran_env, only: real32, real64 - use MODNAME +#ifdef R32 + use transpose_CUDECOMP_FLOAT_mod +#endif + +#ifdef R64 + use transpose_CUDECOMP_DOUBLE_mod +#endif + +#ifdef C32 + use transpose_CUDECOMP_FLOAT_COMPLEX_mod +#endif + +#ifdef C64 + use transpose_CUDECOMP_DOUBLE_COMPLEX_mod +#endif implicit none From b168a801804063b4e3f9e2b63f323e92c4beccc5 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 10 Feb 2023 11:25:08 -0800 Subject: [PATCH 02/13] Fix typo. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10d63e..b73f2aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,7 +61,7 @@ target_sources(cudecomp target_include_directories(cudecomp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include - ${MPI_CXX_INCLUDE_DIR} + ${MPI_CXX_INCLUDE_DIRS} ${NVHPC_CUDA_INCLUDE_DIR} ${NVHPC_CUTENSOR_INCLUDE_DIR} ${NVHPC_NCCL_INCLUDE_DIR} From a44bdd51025f8d432031abe160030929b06a6c7d Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 10 Feb 2023 17:36:08 -0800 Subject: [PATCH 03/13] Update defines. 
--- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b73f2aa..867391f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,11 +73,11 @@ target_link_libraries(cudecomp PRIVATE NVHPC::CUTENSOR) target_link_libraries(cudecomp PRIVATE NVHPC::NCCL) if (CUDECOMP_ENABLE_NVTX) - target_compile_definitions(cudecomp PRIVATE CUDECOMP_ENABLE_NVTX) + target_compile_definitions(cudecomp PRIVATE ENABLE_NVTX) endif() if (CUDECOMP_ENABLE_NVSHMEM) - target_compile_definitions(cudecomp PRIVATE CUDECOMP_ENABLE_NVSHMEM) + target_compile_definitions(cudecomp PRIVATE ENABLE_NVSHMEM) # Get NVSHMEM version from header file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) From 44f9b57dc7eb3ad1bc6856809deee5657ddf1a25 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 13 Feb 2023 15:56:08 -0800 Subject: [PATCH 04/13] Add NVSHMEM include path. --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 867391f..ceb13d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,10 @@ endif() if (CUDECOMP_ENABLE_NVSHMEM) target_compile_definitions(cudecomp PRIVATE ENABLE_NVSHMEM) + target_include_directories(cudecomp + PRIVATE + ${NVHPC_NVSHMEM_INCLUDE_DIR} + ) # Get NVSHMEM version from header file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) From 234e91b3a088c3c71841f8d9ab4126ce57848b8d Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 9 Jan 2024 16:52:35 -0800 Subject: [PATCH 05/13] Refresh CMake build files. Add modifications for Cray/HPE build environments. 
--- CMakeLists.txt | 59 ++++++++++++++++++++++-- benchmark/CMakeLists.txt | 6 ++- cmake/test_mpi_f2c.f90 | 61 +++++++++++++++++++++++++ examples/cc/basic_usage/CMakeLists.txt | 3 +- examples/cc/taylor_green/CMakeLists.txt | 3 +- 5 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 cmake/test_mpi_f2c.f90 diff --git a/CMakeLists.txt b/CMakeLists.txt index ceb13d2..393c024 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,28 @@ option(CUDECOMP_BUILD_FORTRAN "Build Fortran bindings" ON) option(CUDECOMP_ENABLE_NVTX "Enable NVTX ranges" ON) option(CUDECOMP_ENABLE_NVSHMEM "Enable NVSHMEM" OFF) option(CUDECOMP_BUILD_EXTRAS "Build benchmark, examples, and tests" OFF) -set(CUDECOMP_CUDA_CC_LIST "60;70;80" CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") +set(CUDECOMP_CUDA_CC_LIST "70;80;90" CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") + +# Use NVHPC compilers by default +set(CMAKE_CXX_COMPILER "nvc++") +set(CMAKE_Fortran_COMPILER "nvfortran") + +# Locate and use NVHPC CMake configuration +find_program(NVHPC_CXX_BIN "nvc++") +string(REPLACE "compilers/bin/nvc++" "cmake" NVHPC_CMAKE_DIR ${NVHPC_CXX_BIN}) +set(CMAKE_PREFIX_PATH ${NVHPC_CMAKE_DIR}) + +# Detect if Cray compiler wrappers are available. If so, use them to get +# correct linking against CUDA-aware MPI libraries. +find_program(CRAY_FTN_BIN "ftn") +find_program(CRAY_CC_BIN "CC") + +if (CRAY_FTN_BIN) + set(CMAKE_Fortran_COMPILER ${CRAY_FTN_BIN}) +endif() +if (CRAY_CC_BIN) + set(CMAKE_CXX_COMPILER ${CRAY_CC_BIN}) +endif() if (CUDECOMP_BUILD_FORTRAN) set(LANGS CXX CUDA Fortran) @@ -20,6 +41,20 @@ project(cudecomp LANGUAGES ${LANGS}) # MPI find_package(MPI REQUIRED) + +# FindMPI does not populate include variables when in a Cray environment. +# Extract MPI includes from CC wrapper. 
+if (CRAY_CC_BIN) + execute_process( + COMMAND ${CRAY_CC_BIN} --cray-print-opts=cflags + OUTPUT_VARIABLE CRAY_CC_INCLUDES_RAW + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + string(REPLACE "-I" "" CRAY_CC_INCLUDES_RAW ${CRAY_CC_INCLUDES_RAW}) + string(REPLACE " " ";" MPI_CXX_INCLUDE_DIRS ${CRAY_CC_INCLUDES_RAW}) +endif() + # TODO: Check for MPICH to define `-DMPICH` flag # HPC SDK @@ -29,7 +64,7 @@ else() find_package(NVHPC REQUIRED COMPONENTS CUDA MATH NCCL) endif() -# Set up required include directory flags +# Set up required include directory flags, NVHPC CMake config only defined library directories string(REPLACE "/lib64" "/include" NVHPC_CUDA_INCLUDE_DIR ${NVHPC_CUDA_LIBRARY_DIR}) string(REPLACE "/lib64" "/include" NVHPC_CUFFT_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) string(REPLACE "/lib64" "/include" NVHPC_CUTENSOR_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) @@ -84,7 +119,11 @@ if (CUDECOMP_ENABLE_NVSHMEM) ) # Get NVSHMEM version from header - file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) + if (EXISTS ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h) + file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) + else() + file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/common/nvshmem_version.h NVSHMEM_VERSION_RAW) + endif() string(REGEX MATCH "NVSHMEM_VENDOR_MAJOR_VERSION ([0-9]*)" _ ${NVSHMEM_VERSION_RAW}) list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1}) string(REGEX MATCH "NVSHMEM_VENDOR_MINOR_VERSION ([0-9]*)" _ ${NVSHMEM_VERSION_RAW}) @@ -126,8 +165,6 @@ if (CUDECOMP_BUILD_FORTRAN) list(APPEND CUF_GPU_ARG "cuda${NVHPC_CUDA_VERSION}") list(JOIN CUF_GPU_ARG "," CUF_GPU_ARG) - #set(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/fortran_modules) - add_library(cudecomp_fort SHARED) set_target_properties(cudecomp_fort PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set_target_properties(cudecomp_fort PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_BINARY_DIR}/include) @@ -142,6 +179,18 @@ if (CUDECOMP_BUILD_FORTRAN) 
target_link_libraries(cudecomp_fort PUBLIC MPI::MPI_Fortran) + # Test for MPI_Comm_f2c/c2f + try_compile( + TEST_F2C_RESULT + ${CMAKE_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/test_mpi_f2c.f90 + LINK_LIBRARIES MPI::MPI_Fortran + ) + if (NOT TEST_F2C_RESULT) + message(STATUS "Could not link MPI_Comm_f2c. Setting -DMPICH.") + target_compile_definitions(cudecomp_fort PRIVATE MPICH) + endif() + install( TARGETS cudecomp_fort LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 4b449ff..a09ef40 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -10,7 +10,7 @@ foreach(tgt ${benchmark_targets}) if (CMAKE_VERSION VERSION_LESS 3.18) target_compile_options(${tgt} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_CC_FLAGS}>) else() - set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES "${CUDA_CC_LIST}") + set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES "${CUDECOMP_CUDA_CC_LIST}") endif() target_sources(${tgt} PRIVATE @@ -19,9 +19,11 @@ foreach(tgt ${benchmark_targets}) target_include_directories(${tgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${NVHPC_CUFFT_INCLUDE_DIE}) + ${NVHPC_CUFFT_INCLUDE_DIR} + ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(${tgt} PRIVATE cudecomp) target_link_libraries(${tgt} PRIVATE NVHPC::CUFFT) + set_target_properties(${tgt} PROPERTIES LINKER_LANGUAGE CXX) endforeach() target_compile_definitions(benchmark_r2c PRIVATE R2C) diff --git a/cmake/test_mpi_f2c.f90 b/cmake/test_mpi_f2c.f90 new file mode 100644 index 0000000..997fe60 --- /dev/null +++ b/cmake/test_mpi_f2c.f90 @@ -0,0 +1,61 @@ +! SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +! SPDX-License-Identifier: BSD-3-Clause +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions are met: +! +! 1. Redistributions of source code must retain the above copyright notice, this +! 
list of conditions and the following disclaimer. +! +! 2. Redistributions in binary form must reproduce the above copyright notice, +! this list of conditions and the following disclaimer in the documentation +! and/or other materials provided with the distribution. +! +! 3. Neither the name of the copyright holder nor the names of its +! contributors may be used to endorse or promote products derived from +! this software without specific prior written permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +! DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +! FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +! DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +! SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +! OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +! OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +module test_f2c + use iso_c_binding + implicit none + + type, bind(c) :: MPI_C_Comm + integer(c_int64_t) :: comm + end type MPI_C_Comm + + type, bind(c) :: MPI_F_Comm + integer(c_int) :: comm + end type MPI_F_Comm + + interface + function MPI_Comm_f2c(fcomm) bind(C,name='MPI_Comm_f2c') result(res) + import + type(MPI_F_Comm), value :: fcomm + type(MPI_C_Comm) :: res + end function MPI_Comm_f2c + end interface +end module + +program main + use mpi + use test_f2c + implicit none + + type(MPI_F_Comm) :: fcomm + type(MPI_C_Comm) :: ccomm + + fcomm%comm = MPI_COMM_WORLD + + ccomm = MPI_Comm_f2c(fcomm) +end program diff --git a/examples/cc/basic_usage/CMakeLists.txt b/examples/cc/basic_usage/CMakeLists.txt index 6365006..2a825ca 100644 --- a/examples/cc/basic_usage/CMakeLists.txt +++ b/examples/cc/basic_usage/CMakeLists.txt @@ -21,7 +21,7 @@ foreach(tgt ${basic_usage_targets_cc}) if (CMAKE_VERSION VERSION_LESS 3.18) target_compile_options(${tgt} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_CC_FLAGS}>) else() - set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES "${CUDA_CC_LIST}") + set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES "${CUDECOMP_CUDA_CC_LIST}") endif() target_include_directories(${tgt} PRIVATE @@ -30,6 +30,7 @@ foreach(tgt ${basic_usage_targets_cc}) ) target_link_libraries(${tgt} PRIVATE MPI::MPI_CXX) target_link_libraries(${tgt} PRIVATE cudecomp) + set_target_properties(${tgt} PROPERTIES LINKER_LANGUAGE CXX) endforeach() install( diff --git a/examples/cc/taylor_green/CMakeLists.txt b/examples/cc/taylor_green/CMakeLists.txt index acc7d22..0580c28 100644 --- a/examples/cc/taylor_green/CMakeLists.txt +++ b/examples/cc/taylor_green/CMakeLists.txt @@ -8,7 +8,7 @@ target_sources(tg if (CMAKE_VERSION VERSION_LESS 3.18) target_compile_options(tg PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: ${CUDA_CC_FLAGS}>) else() - set_target_properties(tg PROPERTIES CUDA_ARCHITECTURES "${CUDECOMP_CUDA_CC_LIST}") + set_target_properties(tg PROPERTIES CUDA_ARCHITECTURES "${CUDECOMP_CUDA_CC_LIST}") endif() 
target_include_directories(tg @@ -20,6 +20,7 @@ target_include_directories(tg target_link_libraries(tg PRIVATE MPI::MPI_CXX) target_link_libraries(tg PRIVATE NVHPC::CUFFT) target_link_libraries(tg PRIVATE cudecomp) +set_target_properties(tg PROPERTIES LINKER_LANGUAGE CXX) install( TARGETS tg From 02279ead2d335ef317251eb5109168f1ef8235b1 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 9 Jan 2024 18:14:00 -0800 Subject: [PATCH 06/13] Add NCCL and NVSHMEM overrides to CMake build. --- CMakeLists.txt | 69 +++++++++++++++++++++++++++++++---------- tests/cc/CMakeLists.txt | 1 + 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 393c024..de77949 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ option(CUDECOMP_ENABLE_NVTX "Enable NVTX ranges" ON) option(CUDECOMP_ENABLE_NVSHMEM "Enable NVSHMEM" OFF) option(CUDECOMP_BUILD_EXTRAS "Build benchmark, examples, and tests" OFF) set(CUDECOMP_CUDA_CC_LIST "70;80;90" CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") +set(CUDECOMP_NCCL_PATH CACHE STRING "Path to search for NCCL installation. Use to override NVHPC provided NCCL version.") +set(CUDECOMP_NVSHMEM_PATH CACHE STRING "Path to search for NVSHMEM installation. 
Use to override NVHPC provided NVSHMEM version.") # Use NVHPC compilers by default set(CMAKE_CXX_COMPILER "nvc++") @@ -58,21 +60,56 @@ endif() # TODO: Check for MPICH to define `-DMPICH` flag # HPC SDK -if (CUDECOMP_ENABLE_NVSHMEM) - find_package(NVHPC REQUIRED COMPONENTS CUDA MATH NCCL NVSHMEM) -else() - find_package(NVHPC REQUIRED COMPONENTS CUDA MATH NCCL) -endif() +find_package(NVHPC REQUIRED COMPONENTS CUDA MATH) # Set up required include directory flags, NVHPC CMake config only defined library directories string(REPLACE "/lib64" "/include" NVHPC_CUDA_INCLUDE_DIR ${NVHPC_CUDA_LIBRARY_DIR}) string(REPLACE "/lib64" "/include" NVHPC_CUFFT_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) string(REPLACE "/lib64" "/include" NVHPC_CUTENSOR_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) -string(REPLACE "/lib" "/include" NVHPC_NCCL_INCLUDE_DIR ${NVHPC_NCCL_LIBRARY_DIR}) + +# Get NCCL library (with optional override) +if (CUDECOMP_NCCL_PATH) + find_path(NCCL_INCLUDE_DIR REQUIRED + NAMES nccl.h + HINTS ${CUDECOMP_NCCL_PATH}/include + ) + + find_library(NCCL_LIBRARY REQUIRED + NAMES nccl + HINTS ${CUDECOMP_NCCL_PATH}/lib + ) +else() + find_package(NVHPC REQUIRED COMPONENTS NCCL) + find_library(NCCL_LIBRARY + NAMES nccl + HINTS ${NVHPC_NCCL_LIBRARY_DIR} + ) + string(REPLACE "/lib" "/include" NCCL_INCLUDE_DIR ${NVHPC_NCCL_LIBRARY_DIR}) +endif() + +message(STATUS "Using NCCL library: ${NCCL_LIBRARY}") + if (CUDECOMP_ENABLE_NVSHMEM) - string(REPLACE "/lib" "/include" NVHPC_NVSHMEM_INCLUDE_DIR ${NVHPC_NVSHMEM_LIBRARY_DIR}) + # Get NVSHMEM library (with optional override) + if (CUDECOMP_NVSHMEM_PATH) + find_path(NVSHMEM_INCLUDE_DIR REQUIRED + NAMES nvshmem.h + HINTS ${CUDECOMP_NVSHMEM_PATH}/include + ) + + find_path(NVSHMEM_LIBRARY_DIR REQUIRED + NAMES libnvshmem.a + HINTS ${CUDECOMP_NVSHMEM_PATH}/lib + ) + else() + find_package(NVHPC REQUIRED COMPONENTS NVSHMEM) + set(NVSHMEM_LIBRARY_DIR ${NVHPC_NVSHMEM_LIBRARY_DIR}) + string(REPLACE "/lib" "/include" NVSHMEM_INCLUDE_DIR 
${NVHPC_NVSHMEM_LIBRARY_DIR}) + endif() endif() +message(STATUS "Using NVSHMEM installation at: ${NVSHMEM_LIBRARY_DIR}") + # Building cuDecomp shared lib add_library(cudecomp SHARED) set_target_properties(cudecomp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) @@ -99,13 +136,13 @@ target_include_directories(cudecomp ${MPI_CXX_INCLUDE_DIRS} ${NVHPC_CUDA_INCLUDE_DIR} ${NVHPC_CUTENSOR_INCLUDE_DIR} - ${NVHPC_NCCL_INCLUDE_DIR} + ${NCCL_INCLUDE_DIR} ) target_link_libraries(cudecomp PUBLIC NVHPC::CUDART) target_link_libraries(cudecomp PUBLIC MPI::MPI_CXX) target_link_libraries(cudecomp PRIVATE NVHPC::CUTENSOR) -target_link_libraries(cudecomp PRIVATE NVHPC::NCCL) +target_link_libraries(cudecomp PRIVATE ${NCCL_LIBRARY}) if (CUDECOMP_ENABLE_NVTX) target_compile_definitions(cudecomp PRIVATE ENABLE_NVTX) @@ -115,14 +152,14 @@ if (CUDECOMP_ENABLE_NVSHMEM) target_compile_definitions(cudecomp PRIVATE ENABLE_NVSHMEM) target_include_directories(cudecomp PRIVATE - ${NVHPC_NVSHMEM_INCLUDE_DIR} + ${NVSHMEM_INCLUDE_DIR} ) # Get NVSHMEM version from header - if (EXISTS ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h) - file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) + if (EXISTS ${NVSHMEM_INCLUDE_DIR}/nvshmem_version.h) + file(READ ${NVSHMEM_INCLUDE_DIR}/nvshmem_version.h NVSHMEM_VERSION_RAW) else() - file(READ ${NVHPC_NVSHMEM_INCLUDE_DIR}/common/nvshmem_version.h NVSHMEM_VERSION_RAW) + file(READ ${NVSHMEM_INCLUDE_DIR}/common/nvshmem_version.h NVSHMEM_VERSION_RAW) endif() string(REGEX MATCH "NVSHMEM_VENDOR_MAJOR_VERSION ([0-9]*)" _ ${NVSHMEM_VERSION_RAW}) list(APPEND NVSHMEM_VERSION ${CMAKE_MATCH_1}) @@ -136,10 +173,10 @@ if (CUDECOMP_ENABLE_NVSHMEM) endif() if (NVSHMEM_VERSION VERSION_LESS "2.5") - target_link_libraries(cudecomp PRIVATE ${NVHPC_NVSHMEM_LIBRARY_DIR}/libnvshmem.a) + target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem.a) else() - target_link_libraries(cudecomp PRIVATE -L${NVHPC_NVSHMEM_LIBRARY_DIR} 
-lnvshmem_host) - target_link_libraries(cudecomp PRIVATE ${NVHPC_NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a) + target_link_libraries(cudecomp PRIVATE -L${NVSHMEM_LIBRARY_DIR} -lnvshmem_host) + target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a) target_link_libraries(cudecomp PUBLIC -L${NVHPC_CUDA_LIBRARY_DIR}/stubs -lnvidia-ml) endif() target_link_libraries(cudecomp PUBLIC -L${NVHPC_CUDA_LIBRARY_DIR}/stubs -lcuda) diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt index 44c474e..7ecacd1 100644 --- a/tests/cc/CMakeLists.txt +++ b/tests/cc/CMakeLists.txt @@ -35,6 +35,7 @@ foreach(tgt ${transpose_test_targets_cc} ${halo_test_targets_cc}) PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include ${MPI_CXX_INCLUDE_DIRS} + ${NVHPC_CUFFT_INCLUDE_DIR} ${NVHPC_CUDA_INCLUDE_DIR} ) target_link_libraries(${tgt} PRIVATE MPI::MPI_CXX) From ae8081d192df1285dc3c52bd94b44a283174eb2f Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 10 Jan 2024 09:30:27 -0800 Subject: [PATCH 07/13] Small updates. --- CMakeLists.txt | 7 ++++--- benchmark/CMakeLists.txt | 1 + tests/cc/CMakeLists.txt | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de77949..d3854cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,9 +106,10 @@ if (CUDECOMP_ENABLE_NVSHMEM) set(NVSHMEM_LIBRARY_DIR ${NVHPC_NVSHMEM_LIBRARY_DIR}) string(REPLACE "/lib" "/include" NVSHMEM_INCLUDE_DIR ${NVHPC_NVSHMEM_LIBRARY_DIR}) endif() -endif() -message(STATUS "Using NVSHMEM installation at: ${NVSHMEM_LIBRARY_DIR}") + message(STATUS "Using NVSHMEM installation at: ${NVSHMEM_LIBRARY_DIR}") + +endif() # Building cuDecomp shared lib add_library(cudecomp SHARED) @@ -224,7 +225,7 @@ if (CUDECOMP_BUILD_FORTRAN) LINK_LIBRARIES MPI::MPI_Fortran ) if (NOT TEST_F2C_RESULT) - message(STATUS "Could not link MPI_Comm_f2c. Setting -DMPICH.") + message(STATUS "Could not link MPI_Comm_f2c in Fortran module. 
Setting -DMPICH flag during module compilation.") target_compile_definitions(cudecomp_fort PRIVATE MPICH) endif() diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index a09ef40..d821144 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -20,6 +20,7 @@ foreach(tgt ${benchmark_targets}) PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include ${NVHPC_CUFFT_INCLUDE_DIR} + ${NCCL_INCLUDE_DIR} ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(${tgt} PRIVATE cudecomp) target_link_libraries(${tgt} PRIVATE NVHPC::CUFFT) diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt index 7ecacd1..a3ec659 100644 --- a/tests/cc/CMakeLists.txt +++ b/tests/cc/CMakeLists.txt @@ -35,6 +35,7 @@ foreach(tgt ${transpose_test_targets_cc} ${halo_test_targets_cc}) PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include ${MPI_CXX_INCLUDE_DIRS} + ${NCCL_INCLUDE_DIR} ${NVHPC_CUFFT_INCLUDE_DIR} ${NVHPC_CUDA_INCLUDE_DIR} ) From 3dd768e2fa9a885467cd8245eb93b927dc80372b Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 11 Jan 2024 10:50:46 -0800 Subject: [PATCH 08/13] Do not use Cray compiler wrappers. --- CMakeLists.txt | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d3854cd..e4e442f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,18 +21,6 @@ find_program(NVHPC_CXX_BIN "nvc++") string(REPLACE "compilers/bin/nvc++" "cmake" NVHPC_CMAKE_DIR ${NVHPC_CXX_BIN}) set(CMAKE_PREFIX_PATH ${NVHPC_CMAKE_DIR}) -# Detect if Cray compiler wrappers are available. If so, use them to get -# correct linking against CUDA-aware MPI libraries. 
-find_program(CRAY_FTN_BIN "ftn") -find_program(CRAY_CC_BIN "CC") - -if (CRAY_FTN_BIN) - set(CMAKE_Fortran_COMPILER ${CRAY_FTN_BIN}) -endif() -if (CRAY_CC_BIN) - set(CMAKE_CXX_COMPILER ${CRAY_CC_BIN}) -endif() - if (CUDECOMP_BUILD_FORTRAN) set(LANGS CXX CUDA Fortran) else() @@ -41,20 +29,34 @@ endif() project(cudecomp LANGUAGES ${LANGS}) +# Detect if Cray compiler wrappers are available to assess if in Cray environment. +# We do not use the Cray compiler wrappers directly for greater flexibility. +find_program(CRAY_CC_BIN "CC") + +if (CRAY_CC_BIN) + message(STATUS "Found Cray CC wrapper. Compiling for Cray programming environment.") +endif() + # MPI find_package(MPI REQUIRED) -# FindMPI does not populate include variables when in a Cray environment. -# Extract MPI includes from CC wrapper. if (CRAY_CC_BIN) - execute_process( - COMMAND ${CRAY_CC_BIN} --cray-print-opts=cflags - OUTPUT_VARIABLE CRAY_CC_INCLUDES_RAW - OUTPUT_STRIP_TRAILING_WHITESPACE + # FindMPI does not include Cray GTL (e.g. CUDA-aware) libs + # automatically in Cray environment. Locate it to include in linking. + string(REPLACE ":" ";" CRAY_LIB_PATHS $ENV{CRAY_LD_LIBRARY_PATH}) + find_library(CRAY_MPI_GTL_CUDA_LIBRARY REQUIRED + NAMES mpi_gtl_cuda + HINTS ${CRAY_LIB_PATHS} ) - string(REPLACE "-I" "" CRAY_CC_INCLUDES_RAW ${CRAY_CC_INCLUDES_RAW}) - string(REPLACE " " ";" MPI_CXX_INCLUDE_DIRS ${CRAY_CC_INCLUDES_RAW}) + # Cray GTL libs benefit from linking against gdrcopy, so also + # locating that library. 
+ find_library(GDRCOPY_LIBRARY REQUIRED + NAMES gdrapi + ) + + message(STATUS "Found Cray GTL library: " ${CRAY_MPI_GTL_CUDA_LIBRARY}) + message(STATUS "Found GDRCopy library: " ${GDRCOPY_LIBRARY}) endif() # TODO: Check for MPICH to define `-DMPICH` flag @@ -144,6 +146,11 @@ target_link_libraries(cudecomp PUBLIC NVHPC::CUDART) target_link_libraries(cudecomp PUBLIC MPI::MPI_CXX) target_link_libraries(cudecomp PRIVATE NVHPC::CUTENSOR) target_link_libraries(cudecomp PRIVATE ${NCCL_LIBRARY}) +if (CRAY_CC_BIN) + # In Cray environments, add links to GTL and GDRCopy libs for CUDA-aware support + target_link_libraries(cudecomp PRIVATE ${CRAY_MPI_GTL_CUDA_LIBRARY}) + target_link_libraries(cudecomp PRIVATE ${GDRCOPY_LIBRARY}) +endif() if (CUDECOMP_ENABLE_NVTX) target_compile_definitions(cudecomp PRIVATE ENABLE_NVTX) From 96bfeea802a96c99f1b466805e33c85fc17055b6 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 23 Jan 2024 00:30:05 +0100 Subject: [PATCH 09/13] Update libnvshmem_host link in CMake so it sets rpath. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4e442f..1071aff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,7 +183,7 @@ if (CUDECOMP_ENABLE_NVSHMEM) if (NVSHMEM_VERSION VERSION_LESS "2.5") target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem.a) else() - target_link_libraries(cudecomp PRIVATE -L${NVSHMEM_LIBRARY_DIR} -lnvshmem_host) + target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_host.so) target_link_libraries(cudecomp PRIVATE ${NVSHMEM_LIBRARY_DIR}/libnvshmem_device.a) target_link_libraries(cudecomp PUBLIC -L${NVHPC_CUDA_LIBRARY_DIR}/stubs -lnvidia-ml) endif() From d0728aeb78f931f87ad6f9f1a41fa9764b5b27e8 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 22 Jan 2024 16:09:44 -0800 Subject: [PATCH 10/13] Set default CMake build to RelWithDebInfo. 
--- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1071aff..079011c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.16) set(CMAKE_CXX_STANDARD 14) set(CMAKE_DISABLE_SOURCE_CHANGES ON) set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() # User-defined build options option(CUDECOMP_BUILD_FORTRAN "Build Fortran bindings" ON) From 7891c879c903bba252bc63a63438c214501770c8 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 22 Jan 2024 16:20:52 -0800 Subject: [PATCH 11/13] Update README.md with CMake build instructions. --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 15bb8cf..56fd357 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,9 @@ Please contact us or open a GitHub issue if you are interested in using this lib ## Build +### Method 1: Makefile with Configuration file To build the library, you must first create a configuration file to point the installed to dependent library paths and enable/disable features. - See the default [`nvhpcsdk.conf`](configs/nvhpcsdk.conf) for an example of settings to build the library using the [NVHPC SDK compilers and libraries](https://developer.nvidia.com/hpc-sdk). +See the default [`nvhpcsdk.conf`](configs/nvhpcsdk.conf) for an example of settings to build the library using the [NVHPC SDK compilers and libraries](https://developer.nvidia.com/hpc-sdk). The [`configs/`](configs) directory also contains several sample build configuration files for a number of GPU compute clusters, like Perlmutter, Summit, and Marconi 100. With this configuration file created, you can build the library using the command @@ -26,6 +27,21 @@ $ make -j CONFIGFILE= The library will be compiled and installed in a newly created `build/` directory. +### Method 2: CMake +We also enable builds using CMake. 
A CMake build of the library without additional examples/tests can be completed using the following commands +```shell +$ mkdir build +$ cd build +$ cmake .. +$ make -j +``` +There are several build variables available to configure the CMake build which can be found at the top of the main [`CMakeLists.txt`](CMakeLists.txt) file. As an example, +to configure the build to compile additional examples and enable NVSHMEM backends, you can run the following CMake command +```shell +$ cmake -DCUDECOMP_BUILD_EXTRAS=1 -DCUDECOMP_ENABLE_NVSHMEM=1 .. +``` + + ### Dependencies We strongly recommend building this library using NVHPC SDK compilers and libraries, as the SDK contains all required dependencies for this library and is the focus of our testing. Fortran features are only supported using NVHPC SDK compilers. From 1caa84cd119e995f46f97820c7499f89c47cfa40 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 22 Jan 2024 16:24:06 -0800 Subject: [PATCH 12/13] Update README.md. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 56fd357..d9585a1 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ $ cd build $ cmake .. $ make -j ``` -There are several build variables available to configure the CMake build which can be found at the top of the main [`CMakeLists.txt`](CMakeLists.txt) file. As an example, +There are several build variables available to configure the CMake build which can be found at the top of the project [`CMakeLists.txt`](CMakeLists.txt) file. As an example, to configure the build to compile additional examples and enable NVSHMEM backends, you can run the following CMake command ```shell $ cmake -DCUDECOMP_BUILD_EXTRAS=1 -DCUDECOMP_ENABLE_NVSHMEM=1 .. From 9f47cd3a8c98bafb4fba9d3ba1f5a3b91219a33d Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 25 Jan 2024 13:34:22 -0800 Subject: [PATCH 13/13] Update NCCL and NVSHMEM path naming to be consistent with Makefile build configuration. 
--- CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 079011c..735e1fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,8 +12,8 @@ option(CUDECOMP_ENABLE_NVTX "Enable NVTX ranges" ON) option(CUDECOMP_ENABLE_NVSHMEM "Enable NVSHMEM" OFF) option(CUDECOMP_BUILD_EXTRAS "Build benchmark, examples, and tests" OFF) set(CUDECOMP_CUDA_CC_LIST "70;80;90" CACHE STRING "List of CUDA compute capabilities to build cuDecomp for.") -set(CUDECOMP_NCCL_PATH CACHE STRING "Path to search for NCCL installation. Use to override NVHPC provided NCCL version.") -set(CUDECOMP_NVSHMEM_PATH CACHE STRING "Path to search for NVSHMEM installation. Use to override NVHPC provided NVSHMEM version.") +set(CUDECOMP_NCCL_HOME CACHE STRING "Path to search for NCCL installation. Use to override NVHPC provided NCCL version.") +set(CUDECOMP_NVSHMEM_HOME CACHE STRING "Path to search for NVSHMEM installation. Use to override NVHPC provided NVSHMEM version.") # Use NVHPC compilers by default set(CMAKE_CXX_COMPILER "nvc++") @@ -73,15 +73,15 @@ string(REPLACE "/lib64" "/include" NVHPC_CUFFT_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_ string(REPLACE "/lib64" "/include" NVHPC_CUTENSOR_INCLUDE_DIR ${NVHPC_MATH_LIBRARY_DIR}) # Get NCCL library (with optional override) -if (CUDECOMP_NCCL_PATH) +if (CUDECOMP_NCCL_HOME) find_path(NCCL_INCLUDE_DIR REQUIRED NAMES nccl.h - HINTS ${CUDECOMP_NCCL_PATH}/include + HINTS ${CUDECOMP_NCCL_HOME}/include ) find_library(NCCL_LIBRARY REQUIRED NAMES nccl - HINTS ${CUDECOMP_NCCL_PATH}/lib + HINTS ${CUDECOMP_NCCL_HOME}/lib ) else() find_package(NVHPC REQUIRED COMPONENTS NCCL) @@ -96,15 +96,15 @@ message(STATUS "Using NCCL library: ${NCCL_LIBRARY}") if (CUDECOMP_ENABLE_NVSHMEM) # Get NVSHMEM library (with optional override) - if (CUDECOMP_NVSHMEM_PATH) + if (CUDECOMP_NVSHMEM_HOME) find_path(NVSHMEM_INCLUDE_DIR REQUIRED NAMES nvshmem.h - HINTS ${CUDECOMP_NVSHMEM_PATH}/include + HINTS 
${CUDECOMP_NVSHMEM_HOME}/include ) find_path(NVSHMEM_LIBRARY_DIR REQUIRED NAMES libnvshmem.a - HINTS ${CUDECOMP_NVSHMEM_PATH}/lib + HINTS ${CUDECOMP_NVSHMEM_HOME}/lib ) else() find_package(NVHPC REQUIRED COMPONENTS NVSHMEM)