From 29903e64b2d550d60a9aa97754559f2f811e9cb9 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 20 Feb 2025 15:00:13 -0800 Subject: [PATCH 1/8] Begin work on adding padding argument support. Transposes updated. --- benchmark/benchmark.cu | 20 +++-- examples/cc/basic_usage/basic_usage.cu | 14 ++-- .../cc/basic_usage/basic_usage_autotune.cu | 14 ++-- examples/cc/taylor_green/tg.cu | 16 ++-- include/cudecomp.h | 47 +++++++++--- include/internal/halo.h | 2 +- include/internal/transpose.h | 53 ++++++++----- src/autotune.cc | 24 +++--- src/cudecomp.cc | 69 ++++++++++------- src/cudecomp_m.cuf | 76 +++++++++++++++---- tests/cc/halo_test.cc | 2 +- tests/cc/transpose_test.cc | 61 +++++++++++---- tests/fortran/transpose_test.f90 | 40 +++++++--- 13 files changed, 298 insertions(+), 140 deletions(-) diff --git a/benchmark/benchmark.cu b/benchmark/benchmark.cu index d2871b4..8e9391b 100644 --- a/benchmark/benchmark.cu +++ b/benchmark/benchmark.cu @@ -267,20 +267,20 @@ int main(int argc, char** argv) { #ifdef R2C // Get x-pencil information (real) cudecompPencilInfo_t pinfo_x_r; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr, nullptr)); #endif // Get x-pencil information (complex) cudecompPencilInfo_t pinfo_x_c; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr, nullptr)); // Get y-pencil information (complex) cudecompPencilInfo_t pinfo_y_c; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr, nullptr)); // Get z-pencil information (complex) cudecompPencilInfo_t pinfo_z_c; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr, nullptr)); // Get workspace size int64_t num_elements_work_c; @@ -508,7 +508,8 @@ int main(int argc, char** argv) { if (!slab_xyz) { CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc_c, input, output, work_c_d, - get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0)); + get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, + nullptr, nullptr, 0)); } if (!slab_xy && !slab_xyz) { @@ -530,7 +531,8 @@ int main(int argc, char** argv) { // For y-z slab case, no need to perform yz transposes or z-axis FFT if (!slab_yz && !slab_xyz) { CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc_c, input, output, work_c_d, - get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0)); + get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, + nullptr, nullptr, 0)); } if (!slab_yz && !slab_xyz) { @@ -545,7 +547,8 @@ int main(int argc, char** argv) { if (!slab_yz && !slab_xyz) { CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc_c, input, output, work_c_d, - get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0)); + get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, + nullptr, nullptr, 0)); } if (!slab_xy && !slab_xyz) { @@ -566,7 +569,8 @@ int main(int argc, char** argv) { if (!slab_xyz) { CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc_c, input, output, work_c_d, - get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, 0)); + get_cudecomp_datatype(complex_t(0)), nullptr, nullptr, + nullptr, nullptr, 0)); } #ifdef R2C CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2r_x, output, output_r, CUFFT_INVERSE)); diff --git a/examples/cc/basic_usage/basic_usage.cu b/examples/cc/basic_usage/basic_usage.cu index e91062b..c151012 100644 --- a/examples/cc/basic_usage/basic_usage.cu +++ b/examples/cc/basic_usage/basic_usage.cu @@ -157,15 +157,15 @@ int main(int argc, char** argv) { // Get X-pencil information (with halo elements) cudecompPencilInfo_t pinfo_x; int32_t halo_extents_x[3]{1, 1, 1}; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x, nullptr)); // Get Y-pencil information cudecompPencilInfo_t pinfo_y; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr)); // Get Z-pencil information cudecompPencilInfo_t pinfo_z; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr)); // Allocate pencil memory int64_t data_num_elements = std::max(std::max(pinfo_x.size, pinfo_y.size), pinfo_z.size); @@ -263,19 +263,19 @@ int main(int argc, char** argv) { // Transpose from X-pencils to Y-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, nullptr, 0)); + pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0)); // Transpose from Y-pencils to Z-pencils. CHECK_CUDECOMP_EXIT( - cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0)); + cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0)); // Transpose from Z-pencils to Y-pencils. CHECK_CUDECOMP_EXIT( - cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0)); + cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0)); // Transpose from Y-pencils to X-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, - nullptr, pinfo_x.halo_extents, 0)); + nullptr, pinfo_x.halo_extents, nullptr, nullptr, 0)); // Updating halos diff --git a/examples/cc/basic_usage/basic_usage_autotune.cu b/examples/cc/basic_usage/basic_usage_autotune.cu index 3fc10ab..bb5e47d 100644 --- a/examples/cc/basic_usage/basic_usage_autotune.cu +++ b/examples/cc/basic_usage/basic_usage_autotune.cu @@ -190,15 +190,15 @@ int main(int argc, char** argv) { // Get X-pencil information (with halo elements). cudecompPencilInfo_t pinfo_x; int32_t halo_extents_x[3]{1, 1, 1}; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x, nullptr)); // Get Y-pencil information cudecompPencilInfo_t pinfo_y; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr)); // Get Z-pencil information cudecompPencilInfo_t pinfo_z; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr)); // Allocate pencil memory int64_t data_num_elements = std::max(std::max(pinfo_x.size, pinfo_y.size), pinfo_z.size); @@ -241,19 +241,19 @@ int main(int argc, char** argv) { // Transpose from X-pencils to Y-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, nullptr, 0)); + pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0)); // Transpose from Y-pencils to Z-pencils. CHECK_CUDECOMP_EXIT( - cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0)); + cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0)); // Transpose from Z-pencils to Y-pencils. CHECK_CUDECOMP_EXIT( - cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, 0)); + cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0)); // Transpose from Y-pencils to X-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, CUDECOMP_DOUBLE, - nullptr, pinfo_x.halo_extents, 0)); + nullptr, pinfo_x.halo_extents, nullptr, nullptr, 0)); // Updating halos diff --git a/examples/cc/taylor_green/tg.cu b/examples/cc/taylor_green/tg.cu index 564267f..dd9c195 100644 --- a/examples/cc/taylor_green/tg.cu +++ b/examples/cc/taylor_green/tg.cu @@ -381,16 +381,16 @@ public: cudecompGridDescCreate(handle, &grid_desc_r, &config, nullptr); // Get x-pencil information (real) - cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr); + cudecompGetPencilInfo(handle, grid_desc_r, &pinfo_x_r, 0, nullptr, nullptr); // Get x-pencil information (complex) - cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr); + cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_x_c, 0, nullptr, nullptr); // Get y-pencil information (complex) - cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr); + cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_y_c, 1, nullptr, nullptr); // Get z-pencil information (complex) - cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr); + cudecompGetPencilInfo(handle, grid_desc_c, &pinfo_z_c, 2, nullptr, nullptr); // Get workspace size (only complex workspace required) int64_t num_elements_work_c; @@ -665,10 +665,10 @@ private: for (int i = 0; i < 3; ++i) { CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_r2c_x, U_r[i], U_c[i], CUFFT_FORWARD)); cudecompTransposeXToY(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr, - nullptr, 0); + nullptr, nullptr, nullptr, 0); CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_y, U_c[i], U_c[i], CUFFT_FORWARD)); cudecompTransposeYToZ(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr, - nullptr, 0); + nullptr, nullptr, nullptr, 0); CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_z, U_c[i], U_c[i], CUFFT_FORWARD)); } } @@ -677,10 +677,10 @@ private: for (int i = 0; i < 3; ++i) { CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_z, U_c[i], U_c[i], CUFFT_INVERSE)); cudecompTransposeZToY(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr, - nullptr, 0); + nullptr, nullptr, nullptr, 0); CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2c_y, U_c[i], U_c[i], CUFFT_INVERSE)); cudecompTransposeYToX(handle, grid_desc_c, U_c[i], U_c[i], work_c, get_cudecomp_datatype(complex_t(0)), nullptr, - nullptr, 0); + nullptr, nullptr, nullptr, 0); CHECK_CUFFT_EXIT(cufftXtExec(cufft_plan_c2r_x, U_c[i], U_r[i], CUFFT_INVERSE)); } } diff --git a/include/cudecomp.h b/include/cudecomp.h index 5e931d1..83ea503 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -187,12 +187,13 @@ typedef struct { * @brief A data structure containing geometry information about a pencil data buffer. */ typedef struct { - int32_t shape[3]; ///< pencil shape (in local order, including halo elements) - int32_t lo[3]; ///< lower bound coordinates (in local order, excluding halo elements) - int32_t hi[3]; ///< upper bound coordinates (in local order, excluding halo elements) + int32_t shape[3]; ///< pencil shape (in local order, including halo and padding elements) + int32_t lo[3]; ///< lower bound coordinates (in local order, excluding halo and padding elements) + int32_t hi[3]; ///< upper bound coordinates (in local order, excluding halo and padding elements) int32_t order[3]; ///< data layout order (e.g. 2,1,0 means memory is ordered Z,Y,X) int32_t halo_extents[3]; ///< halo extents by dimension (in global order) - int64_t size; ///< number of elements in pencil (including halo elements) + int32_t padding[3]; ///< padding by dimension (in global order) + int64_t size; ///< number of elements in pencil (including halo and padding elements) } cudecompPencilInfo_t; // cuDecomp initialization/finalization functions @@ -280,11 +281,15 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto * i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th * global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, * one element on each side). If no halo regions are necessary, a NULL pointer can be provided in place of this array. + * @param[in] padding An array of three integers to define padding of the pencil, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If no padding is + * necesary, a NULL pointer can be provided in place of this array. * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, - cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[]); + cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[], + const int32_t padding[]); /** * @brief Queries the required transpose workspace size, in elements, for a provided grid descriptor. @@ -423,13 +428,19 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided. * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo * regions, a NULL pointer can be provided. + * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. + * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer + * can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream); + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream); /** * @brief Function to transpose data from Y-axis aligned pencils to a Z-axis aligned pencils. @@ -446,13 +457,19 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided. * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo * regions, a NULL pointer can be provided. + * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. + * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer + * can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream); + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream); /** * @brief Function to transpose data from Z-axis aligned pencils to a Y-axis aligned pencils. @@ -469,13 +486,19 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided. * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo * regions, a NULL pointer can be provided. + * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. + * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer + * can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream); + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream); /** * @brief Function to transpose data from Y-axis aligned pencils to a X-axis aligned pencils. @@ -492,13 +515,19 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc * elements, one element on each side). If the input has no halo regions, a NULL pointer can be provided. * @param[in] output_halo_extents Similar to input_halo_extents, but for the output data. If the output has no halo * regions, a NULL pointer can be provided. + * @param[in] input_padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. + * @param[in] output_padding Similar to input_padding, but for the output data. If the output has no padding, a NULL pointer + * can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream); + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream); // Halo functions /** diff --git a/include/internal/halo.h b/include/internal/halo.h index 8cd6aa3..2a922d3 100644 --- a/include/internal/halo.h +++ b/include/internal/halo.h @@ -55,7 +55,7 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG // Get pencil info cudecompPencilInfo_t pinfo_h; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr)); // Get global ordered shapes auto shape_g_h = getShapeG(pinfo_h); diff --git a/include/internal/transpose.h b/include/internal/transpose.h index bfc402e..610e8ac 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -163,7 +163,8 @@ static void localPermute(const cudecompHandle_t handle, const std::array static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, - const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) { + const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr, + const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) { std::array input_halo_extents{}; std::array output_halo_extents{}; @@ -171,11 +172,16 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (output_halo_extents_ptr) std::copy(output_halo_extents_ptr, output_halo_extents_ptr + 3, output_halo_extents.begin()); + std::array input_padding{}; + std::array output_padding{}; + if (input_padding_ptr) std::copy(input_padding_ptr, input_padding_ptr + 3, input_padding.begin()); + if (output_padding_ptr) std::copy(output_padding_ptr, output_padding_ptr + 3, output_padding.begin()); + bool fwd = dir > 0; bool inplace = (input == output); - bool input_has_halos = anyNonzeros(input_halo_extents); - bool output_has_halos = anyNonzeros(output_halo_extents); + bool input_has_halos_padding = anyNonzeros(input_halo_extents) || anyNonzeros(input_padding); + bool output_has_halos_padding = anyNonzeros(output_halo_extents) || anyNonzeros(output_padding); bool pipelined = isTransposeCommPipelined(grid_desc->config.transpose_comm_backend); int memcpy_limit = pipelined ? 1 : CUDECOMP_BATCHED_D2D_3D_PARAM_CAPACITY; @@ -206,11 +212,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c // Get pencil info cudecompPencilInfo_t pinfo_a, pinfo_a_h; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a, ax_a, nullptr)); - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a_h, ax_a, input_halo_extents.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a, ax_a, nullptr, nullptr)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_a_h, ax_a, input_halo_extents.data(), input_padding.data())); cudecompPencilInfo_t pinfo_b, pinfo_b_h; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b, ax_b, nullptr)); - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b, ax_b, nullptr, nullptr)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_b_h, ax_b, output_halo_extents.data(), output_padding.data())); // Check if input and output orders are the same bool orders_equal = true; @@ -218,10 +224,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c if (pinfo_a.order[i] != pinfo_b.order[i]) orders_equal = false; } - // Check if input and output halo extents are the same - bool halos_equal = true; + // Check if input and output halo extents and padding are the same + bool halos_padding_equal = true; for (int i = 0; i < 3; ++i) { - if (input_halo_extents[i] != output_halo_extents[i]) halos_equal = false; + if (input_halo_extents[i] != output_halo_extents[i]) halos_padding_equal = false; + if (input_padding[i] != output_padding[i]) halos_padding_equal = false; } // Get global ordered shapes @@ -248,7 +255,7 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c // Special cases for single rank communicators if (orders_equal) { if (inplace) { - if (halos_equal) { + if (halos_padding_equal) { // Single rank, in place, Pack -> Unpack: No transpose necessary. return; } @@ -295,11 +302,11 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c } if (enable) { - if (pinfo_a.order[2] == ax_a && !input_has_halos) { + if (pinfo_a.order[2] == ax_a && !input_has_halos_padding) { // Input is already packed for all to all, skip pack o1 = input; o2 = work; - } else if (pinfo_a.order[2] == ax_b && orders_equal && !output_has_halos) { + } else if (pinfo_a.order[2] == ax_b && orders_equal && !output_has_halos_padding) { // Output of all to all is in correct orientation, skip unpack o2 = output; } @@ -728,40 +735,44 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c template void cudecompTransposeXToY(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, - const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) { + const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr, + const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) { nvtx::rangePush("cudecompTransposeXToY"); cudecompTranspose_(0, 1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr, - stream); + input_padding_ptr, output_padding_ptr, stream); nvtx::rangePop(); } template void cudecompTransposeYToZ(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, - const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) { + const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr, + const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) { nvtx::rangePush("cudecompTransposeYToZ"); cudecompTranspose_(1, 1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr, - stream); + input_padding_ptr, output_padding_ptr, stream); nvtx::rangePop(); } template void cudecompTransposeZToY(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, - const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) { + const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr, + const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) { nvtx::rangePush("cudecompTransposeZToY"); cudecompTranspose_(2, -1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr, - stream); + input_padding_ptr, output_padding_ptr, stream); nvtx::rangePop(); } template void cudecompTransposeYToX(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* output, T* work, const int32_t input_halo_extents_ptr[] = nullptr, - const int32_t output_halo_extents_ptr[] = nullptr, cudaStream_t stream = 0) { + const int32_t output_halo_extents_ptr[] = nullptr, const int32_t input_padding_ptr[] = nullptr, + const int32_t output_padding_ptr[] = nullptr, cudaStream_t stream = 0) { nvtx::rangePush("cudecompTransposeYToX"); cudecompTranspose_(1, -1, handle, grid_desc, input, output, work, input_halo_extents_ptr, output_halo_extents_ptr, - stream); + input_padding_ptr, output_padding_ptr, stream); nvtx::rangePop(); } diff --git a/src/autotune.cc b/src/autotune.cc index 5248768..9df20b3 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -154,9 +154,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1]; cudecompPencilInfo_t pinfo_x, pinfo_y, pinfo_z; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, nullptr)); - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr)); - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, nullptr, nullptr)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr)); // Skip any decompositions with empty pencils if (grid_desc->config.pdims[0] > std::min(grid_desc->config.gdims_dist[0], grid_desc->config.gdims_dist[1]) || @@ -268,22 +268,22 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d if (options->transpose_op_weights[0] != 0.0) { CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, data, options->transpose_use_inplace_buffers[0] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } if (options->transpose_op_weights[1] != 0.0) { CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, data, options->transpose_use_inplace_buffers[1] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } if (options->transpose_op_weights[2] != 0.0) { CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, data, options->transpose_use_inplace_buffers[2] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } if (options->transpose_op_weights[3] != 0.0) { CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, data, options->transpose_use_inplace_buffers[3] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } } @@ -303,25 +303,25 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d if (options->transpose_op_weights[0] != 0.0) { CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, data, options->transpose_use_inplace_buffers[0] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } CHECK_CUDA(cudaEventRecord(events[1], 0)); if (options->transpose_op_weights[1] != 0.0) { CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, data, options->transpose_use_inplace_buffers[1] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } CHECK_CUDA(cudaEventRecord(events[2], 0)); if (options->transpose_op_weights[2] != 0.0) { CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, data, options->transpose_use_inplace_buffers[2] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } CHECK_CUDA(cudaEventRecord(events[3], 0)); if (options->transpose_op_weights[3] != 0.0) { CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, data, options->transpose_use_inplace_buffers[3] ? data : data2, w, - options->dtype, nullptr, nullptr, 0)); + options->dtype, nullptr, nullptr, nullptr, nullptr, 0)); } CHECK_CUDA(cudaEventRecord(events[4], 0)); CHECK_CUDA(cudaDeviceSynchronize()); @@ -536,7 +536,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, grid_desc->pidx[1] = handle->rank % grid_desc->config.pdims[1]; cudecompPencilInfo_t pinfo; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, options->halo_axis, options->halo_extents, nullptr)); // Skip any decompositions with empty pencils if (std::max(grid_desc->config.pdims[0], grid_desc->config.pdims[1]) > diff --git a/src/cudecomp.cc b/src/cudecomp.cc index c1d5bbc..8af3643 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -774,7 +774,8 @@ cudecompResult_t cudecompGridDescAutotuneOptionsSetDefaults(cudecompGridDescAuto } cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, - cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[]) { + cudecompPencilInfo_t* pencil_info, int32_t axis, const int32_t halo_extents[], + const int32_t padding[]) { using namespace cudecomp; try { checkHandle(handle); @@ -817,6 +818,13 @@ cudecompResult_t cudecompGetPencilInfo(cudecompHandle_t handle, cudecompGridDesc } else { pencil_info->halo_extents[i] = 0; } + + if (padding) { + pencil_info->shape[ord] += padding[i]; + pencil_info->padding[i] = padding[i]; + } else { + pencil_info->padding[i] = 0; + } pencil_info->size *= pencil_info->shape[ord]; } @@ -888,7 +896,7 @@ cudecompResult_t cudecompGetHaloWorkspaceSize(cudecompHandle_t handle, cudecompG if (!halo_extents) { THROW_INVALID_USAGE("halo_extents argument cannot be null."); } if (!workspace_size) { THROW_INVALID_USAGE("workspace_size argument cannot be null."); } cudecompPencilInfo_t pinfo; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, halo_extents)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, halo_extents, nullptr)); auto shape_g = getShapeG(pinfo); size_t halo_size_x = 4 * shape_g[1] * shape_g[2] * pinfo.halo_extents[0]; size_t halo_size_y = 4 * shape_g[0] * shape_g[2] * pinfo.halo_extents[1]; @@ -1114,7 +1122,7 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes } cudecompPencilInfo_t pinfo; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, nullptr)); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, axis, nullptr, nullptr)); int count = 0; for (int i = 0; i < 3; ++i) { @@ -1144,7 +1152,8 @@ cudecompResult_t cudecompGetShiftedRank(cudecompHandle_t handle, cudecompGridDes cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream) { + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1156,23 +1165,25 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeXToY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE: cudecompTransposeXToY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompTransposeXToY(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, output_halo_extents, - stream); + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompTransposeXToY(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, - output_halo_extents, stream); + output_halo_extents, input_padding, output_padding, stream); break; } } catch (const cudecomp::BaseException& e) { @@ -1184,7 +1195,8 @@ cudecompResult_t cudecompTransposeXToY(cudecompHandle_t handle, cudecompGridDesc cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream) { + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1196,26 +1208,27 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE: cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, output_halo_extents, - stream); + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompTransposeYToZ(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, - output_halo_extents, stream); + output_halo_extents, input_padding, output_padding, stream); break; } - } catch (const cudecomp::BaseException& e) { std::cerr << e.what(); return e.getResult(); @@ -1225,7 +1238,8 @@ cudecompResult_t cudecompTransposeYToZ(cudecompHandle_t handle, cudecompGridDesc cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream) { + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1237,26 +1251,27 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeZToY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE: cudecompTransposeZToY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompTransposeZToY(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, output_halo_extents, - stream); + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompTransposeZToY(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, - output_halo_extents, stream); + output_halo_extents, input_padding, output_padding, stream); break; } - } catch (const cudecomp::BaseException& e) { std::cerr << e.what(); return e.getResult(); @@ -1266,7 +1281,8 @@ cudecompResult_t cudecompTransposeZToY(cudecompHandle_t handle, cudecompGridDesc cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* output, void* work, cudecompDataType_t dtype, const int32_t input_halo_extents[], - const int32_t output_halo_extents[], cudaStream_t stream) { + const int32_t output_halo_extents[], const int32_t input_padding[], + const int32_t output_padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1278,26 +1294,27 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc switch (dtype) { case CUDECOMP_FLOAT: cudecompTransposeYToX(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE: cudecompTransposeYToX(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(output), - reinterpret_cast(work), input_halo_extents, output_halo_extents, stream); + reinterpret_cast(work), input_halo_extents, output_halo_extents, + input_padding, output_padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompTransposeYToX(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, output_halo_extents, - stream); + input_padding, output_padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompTransposeYToX(handle, grid_desc, reinterpret_cast*>(input), reinterpret_cast*>(output), reinterpret_cast*>(work), input_halo_extents, - output_halo_extents, stream); + output_halo_extents, input_padding, output_padding, stream); break; } - } catch (const cudecomp::BaseException& e) { std::cerr << e.what(); return e.getResult(); diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf index a9d913b..d96022e 100644 --- a/src/cudecomp_m.cuf +++ b/src/cudecomp_m.cuf @@ -157,6 +157,7 @@ module cudecomp integer(c_int32_t) :: hi(3) ! upper bound coordinates (in local order, excluding halo elements) integer(c_int32_t) :: order(3) ! data layout order (e.g. 3,2,1 means memory is ordered z,y,x) integer(c_int32_t) :: halo_extents(3) ! halo extents by dimension (always in x,y,z order) + integer(c_int32_t) :: padding(3) ! padding by dimension (always in x,y,z order) integer(c_int64_t) :: size ! number of elements in pencil (including halo elements) end type cudecompPencilInfo @@ -258,13 +259,14 @@ module cudecomp ! General functions interface - function cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis, halo_extents) & + function cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis, halo_extents, padding) & bind(C, name="cudecompGetPencilInfo") result(res) import type(cudecompHandle), value :: handle type(cudecompGridDesc), value :: grid_desc integer(c_int32_t), value :: axis integer(c_int32_t) :: halo_extents(3) + integer(c_int32_t) :: padding(3) type(cudecompPencilInfo) :: pencil_info integer(c_int) :: res end function cudecompGetPencilInfoC @@ -380,7 +382,8 @@ module cudecomp ! Transpose functions interface function cudecompTransposeXToY_C(handle, grid_desc, input, output, work, dtype, & - input_halo_extents, output_halo_extents, stream) & + input_halo_extents, output_halo_extents, input_padding, & + output_padding, stream) & bind(C, name="cudecompTransposeXToY") result(res) import type(cudecompHandle), value :: handle @@ -389,6 +392,7 @@ module cudecomp real(c_float), device :: input(*), output(*), work(*) integer(c_int), value :: dtype integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3) + integer(c_int32_t) :: input_padding(3), output_padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompTransposeXToY_C @@ -396,7 +400,8 @@ module cudecomp interface function cudecompTransposeYToZ_C(handle, grid_desc, input, output, work, dtype, & - input_halo_extents, output_halo_extents, stream) & + input_halo_extents, output_halo_extents, input_padding, & + output_padding, stream) & bind(C, name="cudecompTransposeYToZ") result(res) import type(cudecompHandle), value :: handle @@ -405,6 +410,7 @@ module cudecomp real(c_float), device :: input(*), output(*), work(*) integer(c_int), value :: dtype integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3) + integer(c_int32_t) :: input_padding(3), output_padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompTransposeYToZ_C @@ -412,7 +418,8 @@ module cudecomp interface function cudecompTransposeZToY_C(handle, grid_desc, input, output, work, dtype, & - input_halo_extents, output_halo_extents, stream) & + input_halo_extents, output_halo_extents, input_padding, & + output_padding, stream) & bind(C, name="cudecompTransposeZToY") result(res) import type(cudecompHandle), value :: handle @@ -421,6 +428,7 @@ module cudecomp real(c_float), device :: input(*), output(*), work(*) integer(c_int), value :: dtype integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3) + integer(c_int32_t) :: input_padding(3), output_padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompTransposeZToY_C @@ -428,7 +436,8 @@ module cudecomp interface function cudecompTransposeYToX_C(handle, grid_desc, input, output, work, dtype, & - input_halo_extents, output_halo_extents, stream) & + input_halo_extents, output_halo_extents, input_padding, & + output_padding, stream) & bind(C, name="cudecompTransposeYToX") result(res) import type(cudecompHandle), value :: handle @@ -437,6 +446,7 @@ module cudecomp real(c_float), device :: input(*), output(*), work(*) integer(c_int), value :: dtype integer(c_int32_t) :: input_halo_extents(3), output_halo_extents(3) + integer(c_int32_t) :: input_padding(3), output_padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompTransposeYToX_C @@ -593,20 +603,24 @@ contains end function cudecompGridDescCreate ! General functions - function cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis, halo_extents) result(res) + function cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis, halo_extents, padding) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc integer :: axis ! unit offset, so x/y/z = 1/2/3 integer, optional:: halo_extents(3) + integer, optional:: padding(3) type(cudecompPencilInfo) :: pencil_info ! res%order is unit offset, x/y/z = 1/2/3 integer(c_int) :: res integer :: halo_extents_(3) + integer :: padding_(3) halo_extents_(:) = [0, 0, 0] + padding_(:) = [0, 0, 0] if (present(halo_extents)) halo_extents_ = halo_extents - res = cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis - 1, halo_extents_) + if (present(padding)) padding_ = padding + res = cudecompGetPencilInfoC(handle, grid_desc, pencil_info, axis - 1, halo_extents_, padding_) ! Update entries for Fortran indexing pencil_info%order = pencil_info%order + 1 pencil_info%lo = pencil_info%lo + 1 @@ -786,7 +800,7 @@ contains ! Transpose functions function cudecompTransposeXToY(handle, grid_desc, & input, output, work, dtype, input_halo_extents, output_halo_extents, & - stream) result(res) + input_padding, output_padding, stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -796,26 +810,34 @@ contains integer(cuda_stream_kind), optional :: stream integer, optional :: input_halo_extents(3) integer, optional :: output_halo_extents(3) + integer, optional :: input_padding(3) + integer, optional :: output_padding(3) integer(c_int) :: res integer(cuda_stream_kind) :: stream_ integer :: input_halo_extents_(3) integer :: output_halo_extents_(3) + integer :: input_padding_(3) + integer :: output_padding_(3) stream_ = 0 input_halo_extents_(:) = [0, 0, 0] output_halo_extents_(:) = [0, 0, 0] + input_padding_(:) = [0, 0, 0] + output_padding_(:) = [0, 0, 0] if (present(stream)) stream_ = stream if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents + if (present(input_padding)) input_padding_ = input_padding + if (present(output_padding)) output_padding_ = output_padding res = cudecompTransposeXToY_C(handle, grid_desc, & input, output, work, dtype, input_halo_extents_, output_halo_extents_, & - stream_) + input_padding_, output_padding_, stream_) end function cudecompTransposeXToY function cudecompTransposeYToZ(handle, grid_desc, & input, output, work, dtype, input_halo_extents, output_halo_extents, & - stream) result(res) + input_padding, output_padding, stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -825,26 +847,34 @@ contains integer(cuda_stream_kind), optional :: stream integer, optional :: input_halo_extents(3) integer, optional :: output_halo_extents(3) + integer, optional :: input_padding(3) + integer, optional :: output_padding(3) integer(c_int) :: res integer(cuda_stream_kind) :: stream_ integer :: input_halo_extents_(3) integer :: output_halo_extents_(3) + integer :: input_padding_(3) + integer :: output_padding_(3) stream_ = 0 input_halo_extents_(:) = [0, 0, 0] output_halo_extents_(:) = [0, 0, 0] + input_padding_(:) = [0, 0, 0] + output_padding_(:) = [0, 0, 0] if (present(stream)) stream_ = stream if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents + if (present(input_padding)) input_padding_ = input_padding + if (present(output_padding)) output_padding_ = output_padding res = cudecompTransposeYToZ_C(handle, grid_desc, & input, output, work, dtype, input_halo_extents_, output_halo_extents_, & - stream_) + input_padding_, output_padding_, stream_) end function cudecompTransposeYToZ function cudecompTransposeZToY(handle, grid_desc, & input, output, work, dtype, input_halo_extents, output_halo_extents, & - stream) result(res) + input_padding, output_padding, stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -854,26 +884,34 @@ contains integer(cuda_stream_kind), optional :: stream integer, optional :: input_halo_extents(3) integer, optional :: output_halo_extents(3) + integer, optional :: input_padding(3) + integer, optional :: output_padding(3) integer(c_int) :: res integer(cuda_stream_kind) :: stream_ integer :: input_halo_extents_(3) integer :: output_halo_extents_(3) + integer :: input_padding_(3) + integer :: output_padding_(3) stream_ = 0 input_halo_extents_(:) = [0, 0, 0] output_halo_extents_(:) = [0, 0, 0] + input_padding_(:) = [0, 0, 0] + output_padding_(:) = [0, 0, 0] if (present(stream)) stream_ = stream if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents + if (present(input_padding)) input_padding_ = input_padding + if (present(output_padding)) output_padding_ = output_padding res = cudecompTransposeZToY_C(handle, grid_desc, & input, output, work, dtype, input_halo_extents_, output_halo_extents_, & - stream_) + input_padding_, output_padding_, stream_) end function cudecompTransposeZToY function cudecompTransposeYToX(handle, grid_desc, & input, output, work, dtype, input_halo_extents, output_halo_extents, & - stream) result(res) + input_padding, output_padding ,stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -883,21 +921,29 @@ contains integer(cuda_stream_kind), optional :: stream integer, optional :: input_halo_extents(3) integer, optional :: output_halo_extents(3) + integer, optional :: input_padding(3) + integer, optional :: output_padding(3) integer(c_int) :: res integer(cuda_stream_kind) :: stream_ integer :: input_halo_extents_(3) integer :: output_halo_extents_(3) + integer :: input_padding_(3) + integer :: output_padding_(3) stream_ = 0 input_halo_extents_(:) = [0, 0, 0] output_halo_extents_(:) = [0, 0, 0] + input_padding_(:) = [0, 0, 0] + output_padding_(:) = [0, 0, 0] if (present(stream)) stream_ = stream if (present(input_halo_extents)) input_halo_extents_ = input_halo_extents if (present(output_halo_extents)) output_halo_extents_ = output_halo_extents + if (present(input_padding)) input_padding_ = input_padding + if (present(output_padding)) output_padding_ = output_padding res = cudecompTransposeYToX_C(handle, grid_desc, & input, output, work, dtype, input_halo_extents_, output_halo_extents_, & - stream_) + input_padding_, output_padding_, stream_) end function cudecompTransposeYToX ! Halo functions diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc index 10c69cb..86e175c 100644 --- a/tests/cc/halo_test.cc +++ b/tests/cc/halo_test.cc @@ -401,7 +401,7 @@ static int run_test(const std::string& arguments, bool silent) { // Get pencil information cudecompPencilInfo_t pinfo; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data())); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data(), nullptr)); // Get workspace size int64_t workspace_num_elements; diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc index 1595199..88fe76a 100644 --- a/tests/cc/transpose_test.cc +++ b/tests/cc/transpose_test.cc @@ -123,9 +123,9 @@ static bool compare_pencils(const std::vector& ref, const std::vector= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]]) && - lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]]) && - lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]])) { + if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]]) && + lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]]) && + lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]])) { return false; } } @@ -150,9 +150,9 @@ static void initialize_pencil(std::vector& ref, const cudecompPencilInfo int64_t gi = gx[0] + gdims[0] * (gx[1] + gx[2] * gdims[1]); // Only set values inside internal region - if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]]) && - lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]]) && - lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]])) { + if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]]) && + lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]]) && + lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]])) { ref[i] = gi; } else { ref[i] = -1; @@ -198,6 +198,12 @@ static void usage(const char* pname) { "\t\tY-pencil halo_extents setting. (default: 0 0 0) \n" "\t--hez\n" "\t\tZ-pencil halo_extents setting. (default: 0 0 0) \n" + "\t--pdx\n" + "\t\tX-pencil padding setting. (default: 0 0 0) \n" + "\t--pdy\n" + "\t\tY-pencil padding setting. (default: 0 0 0) \n" + "\t--pdz\n" + "\t\tZ-pencil padding setting. (default: 0 0 0) \n" "\t--mem_order\n" "\t\ttranspose_mem_order setting. (default: unset) \n" "\t-m|--use-managed-memory\n" @@ -220,6 +226,9 @@ struct transposeTestArgs { std::array halo_extents_x{}; std::array halo_extents_y{}; std::array halo_extents_z{}; + std::array padding_x{}; + std::array padding_y{}; + std::array padding_z{}; bool out_of_place = false; bool use_managed_memory = false; std::array mem_order{-1, -1, -1, -1, -1, -1, -1, -1, -1}; @@ -248,6 +257,9 @@ static transposeTestArgs parse_arguments(const std::string& arguments) { {"hex", required_argument, 0, '7'}, {"hey", required_argument, 0, '8'}, {"hez", required_argument, 0, '9'}, + {"pdx", required_argument, 0, '&'}, + {"pdy", required_argument, 0, '*'}, + {"pdz", required_argument, 0, '('}, {"mem_order", required_argument, 0, 'q'}, {"out-of-place", no_argument, 0, 'o'}, {"use-managed-memory", no_argument, 0, 'm'}, @@ -255,7 +267,7 @@ static transposeTestArgs parse_arguments(const std::string& arguments) { {0, 0, 0, 0}}; int option_index = 0; - int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:q:omh", long_options, &option_index); + int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:&:*:(:q:omh", long_options, &option_index); if (ch == -1) break; switch (ch) { @@ -297,6 +309,27 @@ static transposeTestArgs parse_arguments(const std::string& arguments) { optind++; } break; + case '&': + optind--; + for (int i = 0; i < 3; ++i) { + args.padding_x[i] = atoi(argv[optind]); + optind++; + } + break; + case '*': + optind--; + for (int i = 0; i < 3; ++i) { + args.padding_y[i] = atoi(argv[optind]); + optind++; + } + break; + case '(': + optind--; + for (int i = 0; i < 3; ++i) { + args.padding_z[i] = atoi(argv[optind]); + optind++; + } + break; case 'o': args.out_of_place = true; break; case 'm': args.use_managed_memory = true; break; case 'q': @@ -389,15 +422,15 @@ static int run_test(const std::string& arguments, bool silent) { // Get x-pencil information cudecompPencilInfo_t pinfo_x; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, args.halo_extents_x.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, args.halo_extents_x.data(), args.padding_x.data())); // Get y-pencil information cudecompPencilInfo_t pinfo_y; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data(), args.padding_x.data())); // Get z-pencil information cudecompPencilInfo_t pinfo_z; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data(), args.padding_x.data())); // Get workspace size int64_t workspace_num_elements; @@ -449,7 +482,7 @@ static int run_test(const std::string& arguments, bool silent) { CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size)); CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)), - pinfo_x.halo_extents, pinfo_y.halo_extents, 0)); + pinfo_x.halo_extents, pinfo_y.halo_extents, pinfo_x.padding, pinfo_y.padding, 0)); CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*output), cudaMemcpyDeviceToHost)); if (!compare_pencils(yref, data, pinfo_y)) { fprintf(stderr, "FAILED cudecompTransposeXToY\n"); @@ -460,7 +493,7 @@ static int run_test(const std::string& arguments, bool silent) { CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size)); CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)), - pinfo_y.halo_extents, pinfo_z.halo_extents, 0)); + pinfo_y.halo_extents, pinfo_z.halo_extents, pinfo_y.padding, pinfo_z.padding, 0)); CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*data_d), cudaMemcpyDeviceToHost)); if (!compare_pencils(zref, data, pinfo_z)) { fprintf(stderr, "FAILED cudecompTransposeYToZ\n"); @@ -471,7 +504,7 @@ static int run_test(const std::string& arguments, bool silent) { CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size)); CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)), - pinfo_z.halo_extents, pinfo_y.halo_extents, 0)); + pinfo_z.halo_extents, pinfo_y.halo_extents, pinfo_z.padding, pinfo_y.padding, 0)); CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*data_d), cudaMemcpyDeviceToHost)); if (!compare_pencils(yref, data, pinfo_y)) { fprintf(stderr, "FAILED cudecompTransposeZToY\n"); @@ -482,7 +515,7 @@ static int run_test(const std::string& arguments, bool silent) { CHECK_CUDA(cudaMemset(work_d, 0, workspace_num_elements * dtype_size)); CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, input, output, work_d, get_cudecomp_datatype(real_t(0)), - pinfo_y.halo_extents, pinfo_x.halo_extents, 0)); + pinfo_y.halo_extents, pinfo_x.halo_extents, pinfo_y.padding, pinfo_x.padding, 0)); CHECK_CUDA(cudaMemcpy(data.data(), output, data.size() * sizeof(*data_d), cudaMemcpyDeviceToHost)); if (!compare_pencils(xref, data, pinfo_x)) { fprintf(stderr, "FAILED cudecompTransposeYToX\n"); diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index c36c7b8..9b18d15 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -69,12 +69,12 @@ module transpose_CUDECOMP_DOUBLE_COMPLEX_mod function compare_pencils(ref, res, pinfo) result(mismatch) implicit none type(cudecompPencilInfo) :: pinfo - ARRTYPE :: ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))) - ARRTYPE :: res(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))) + ARRTYPE :: ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), & + pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), & + pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3))) + ARRTYPE :: res(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), & + pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), & + pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3))) logical :: mismatch mismatch = any(ref(pinfo%lo(1): pinfo%hi(1), pinfo%lo(2): pinfo%hi(2), pinfo%lo(3): pinfo%hi(3)) /= & @@ -90,10 +90,10 @@ subroutine initialize_pencil(ref, pinfo, gdims) integer :: gdims(3) integer :: gx(3) - ! Allocate reference pencil with halo regions - allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))) + ! Allocate reference pencil with halo and padding regions + allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), & + pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), & + pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3)))) ref = -1 @@ -175,6 +175,7 @@ function run_test(arguments, silent) result(res) logical :: axis_contiguous(3) integer :: gdims_dist(3) integer :: halo_extents_x(3), halo_extents_y(3), halo_extents_z(3) + integer :: padding_x(3), padding_y(3), padding_z(3) integer :: mem_order(3, 3) logical :: out_of_place, use_managed_memory integer :: pr, pc @@ -307,6 +308,24 @@ function run_test(arguments, silent) result(res) read(arg, *) halo_extents_z(j) enddo skip_count = 3 + case('--pdx') + do j = 1, 3 + call get_command_argument(i+j, arg) + read(arg, *) padding_x(j) + enddo + skip_count = 3 + case('--pdy') + do j = 1, 3 + call get_command_argument(i+j, arg) + read(arg, *) padding_y(j) + enddo + skip_count = 3 + case('--pdz') + do j = 1, 3 + call get_command_argument(i+j, arg) + read(arg, *) padding_z(j) + enddo + skip_count = 3 case('--mem_order') l = 1 do j = 1, 3 @@ -570,7 +589,6 @@ program main call read_testfile(testfile, testcases) endif - nfailed = 0 allocate(failed_cases(size(testcases))) From ab1add29a8a855ead99a5372a48b2f727222fe4d Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Fri, 21 Feb 2025 12:20:05 -0800 Subject: [PATCH 2/8] Add padding support to halo implementation and API. --- examples/cc/basic_usage/basic_usage.cu | 6 +- .../cc/basic_usage/basic_usage_autotune.cu | 6 +- include/cudecomp.h | 15 +++- include/internal/common.h | 4 + include/internal/halo.h | 78 +++++++++++-------- include/internal/transpose.h | 4 - src/autotune.cc | 12 +-- src/cudecomp.cc | 30 +++---- src/cudecomp_m.cuf | 33 +++++--- tests/cc/halo_test.cc | 35 +++++---- tests/fortran/halo_test.f90 | 46 ++++++----- 11 files changed, 161 insertions(+), 108 deletions(-) diff --git a/examples/cc/basic_usage/basic_usage.cu b/examples/cc/basic_usage/basic_usage.cu index c151012..9201285 100644 --- a/examples/cc/basic_usage/basic_usage.cu +++ b/examples/cc/basic_usage/basic_usage.cu @@ -284,15 +284,15 @@ int main(int argc, char** argv) { // Update X-pencil halos in X direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, halo_periods, 0, 0)); + pinfo_x.halo_extents, halo_periods, 0, nullptr, 0)); // Update X-pencil halos in Y direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, halo_periods, 1, 0)); + pinfo_x.halo_extents, halo_periods, 1, nullptr, 0)); // Update X-pencil halos in Z direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, halo_periods, 2, 0)); + pinfo_x.halo_extents, halo_periods, 2, nullptr, 0)); // Cleanup resources free(data); diff --git a/examples/cc/basic_usage/basic_usage_autotune.cu b/examples/cc/basic_usage/basic_usage_autotune.cu index bb5e47d..a61b039 100644 --- a/examples/cc/basic_usage/basic_usage_autotune.cu +++ b/examples/cc/basic_usage/basic_usage_autotune.cu @@ -262,15 +262,15 @@ int main(int argc, char** argv) { // Update X-pencil halos in X direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, halo_periods, 0, 0)); + pinfo_x.halo_extents, halo_periods, 0, nullptr, 0)); // Update X-pencil halos in Y direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, halo_periods, 1, 0)); + pinfo_x.halo_extents, halo_periods, 1, nullptr, 0)); // Update X-pencil halos in Z direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, - pinfo_x.halo_extents, halo_periods, 2, 0)); + pinfo_x.halo_extents, halo_periods, 2, nullptr, 0)); // Cleanup resources free(data); diff --git a/include/cudecomp.h b/include/cudecomp.h index 83ea503..82f47ee 100644 --- a/include/cudecomp.h +++ b/include/cudecomp.h @@ -546,13 +546,16 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc * If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. A NULL * pointer can be provided if none of the domain axes are periodic. * @param[in] dim Which pencil dimension (global indexed) to perform the halo update + * @param[in] padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work, cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[], - int32_t dim, cudaStream_t stream); + int32_t dim, const int32_t padding[], cudaStream_t stream); /** * @brief Function to perform halo communication of Y-axis aligned pencil data @@ -569,13 +572,16 @@ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_ * @param[in] halo_periods An array of three booleans to define halo periodicity of the input data, in global order. * If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. * @param[in] dim Which pencil dimension (global indexed) to perform the halo update + * @param[in] padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work, cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[], - int32_t dim, cudaStream_t stream); + int32_t dim, const int32_t padding[], cudaStream_t stream); /** * @brief Function to perform halo communication of Z-axis aligned pencil data @@ -592,13 +598,16 @@ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_ * @param[in] halo_periods An array of three booleans to define halo periodicity of the input data, in global order. * If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. * @param[in] dim Which pencil dimension (global indexed) to perform the halo update + * @param[in] padding An array of three integers to define padding of the input data, in global order. The i-th entry + * in this array should contain the number of elements to treat as padding in the i-th global domain axis. If the input has + * no padding, a NULL pointer can be provided. * @param[in] stream CUDA stream to enqueue GPU operations into * * @return CUDECOMP_RESULT_SUCCESS on success or error code on failure. */ cudecompResult_t cudecompUpdateHalosZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work, cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[], - int32_t dim, cudaStream_t stream); + int32_t dim, const int32_t padding[], cudaStream_t stream); #ifdef __cplusplus } diff --git a/include/internal/common.h b/include/internal/common.h index c971a38..9f899f0 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -329,6 +329,10 @@ static inline std::vector getSplits(int64_t N, int nchunks, int pad) { return splits; } +template static inline bool anyNonzeros(const std::array& arr) { + return (arr[0] != T(0) || arr[1] != T(0) || arr[2] != T(0)); +} + // Assigns an integer ID to every unique value in a vector template std::unordered_map getUniqueIds(const std::vector& v) { diff --git a/include/internal/halo.h b/include/internal/halo.h index 2a922d3..a636286 100644 --- a/include/internal/halo.h +++ b/include/internal/halo.h @@ -46,19 +46,25 @@ namespace cudecomp { template void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work, - const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim, + const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], + int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) { std::array halo_extents{}; if (halo_extents_ptr) std::copy(halo_extents_ptr, halo_extents_ptr + 3, halo_extents.begin()); std::array halo_periods{}; if (halo_periods_ptr) std::copy(halo_periods_ptr, halo_periods_ptr + 3, halo_periods.begin()); + std::array padding{}; + if (padding_ptr) std::copy(padding_ptr, padding_ptr + 3, padding.begin()); // Get pencil info cudecompPencilInfo_t pinfo_h; CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h, ax, halo_extents.data(), nullptr)); + cudecompPencilInfo_t pinfo_h_p; // with padding + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_h_p, ax, halo_extents.data(), padding.data())); // Get global ordered shapes auto shape_g_h = getShapeG(pinfo_h); + auto shape_g_h_p = getShapeG(pinfo_h_p); // Get neighbors std::array neighbors; @@ -120,9 +126,13 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG } bool managed = isManagedPointer(input); + bool input_has_padding = anyNonzeros(padding); + if (c == 2 && - (haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend) || - (managed && haloBackendRequiresMpi(grid_desc->config.halo_comm_backend)))) { + (input_has_padding || + haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend) || + (managed && haloBackendRequiresMpi(grid_desc->config.halo_comm_backend)))) { + // For padded input, always stage to work space. // For managed memory, always stage to work space if using MPI. // For any memory, always stage to workspace if using NVSHMEM. // Can revisit for NVSHMEM if input is NVSHMEM allocated. @@ -136,21 +146,21 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG std::array lx{}; // Left - lx[dim] = shape_g_h[dim] - 2 * halo_extents[dim]; - memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h, lx); - memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h, {0, 0, 0}); + lx[dim] = shape_g_h_p[dim] - 2 * halo_extents[dim] - padding[dim]; + memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h_p, lx); + memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h_p, {0, 0, 0}); // Right lx[dim] = halo_extents[dim]; - memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h, lx); - lx[dim] = shape_g_h[dim] - halo_extents[dim]; - memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h, lx); + memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h_p, lx); + lx[dim] = shape_g_h_p[dim] - halo_extents[dim] - padding[dim]; + memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h_p, lx); for (int i = 0; i < 2; ++i) { - memcpy_params.src_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1]; - memcpy_params.src_strides[1][i] = pinfo_h.shape[0]; - memcpy_params.dest_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1]; - memcpy_params.dest_strides[1][i] = pinfo_h.shape[0]; + memcpy_params.src_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1]; + memcpy_params.src_strides[1][i] = pinfo_h_p.shape[0]; + memcpy_params.dest_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1]; + memcpy_params.dest_strides[1][i] = pinfo_h_p.shape[0]; memcpy_params.extents[0][i] = (dim == pinfo_h.order[2]) ? halo_extents[dim] : pinfo_h.shape[2]; memcpy_params.extents[1][i] = (dim == pinfo_h.order[1]) ? halo_extents[dim] : pinfo_h.shape[1]; memcpy_params.extents[2][i] = (dim == pinfo_h.order[0]) ? halo_extents[dim] : pinfo_h.shape[0]; @@ -172,17 +182,17 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG // Pack // Left lx[dim] = halo_extents[dim]; - memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h, lx); + memcpy_params.src[0] = input + getPencilPtrOffset(pinfo_h_p, lx); memcpy_params.dest[0] = send_buff; // Right - lx[dim] = shape_g_h[dim] - 2 * halo_extents[dim]; - memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h, lx); + lx[dim] = shape_g_h_p[dim] - 2 * halo_extents[dim] - padding[dim]; + memcpy_params.src[1] = input + getPencilPtrOffset(pinfo_h_p, lx); memcpy_params.dest[1] = send_buff + halo_size; for (int i = 0; i < 2; ++i) { - memcpy_params.src_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1]; - memcpy_params.src_strides[1][i] = pinfo_h.shape[0]; + memcpy_params.src_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1]; + memcpy_params.src_strides[1][i] = pinfo_h_p.shape[0]; memcpy_params.dest_strides[1][i] = (dim == pinfo_h.order[0]) ? halo_extents[dim] : pinfo_h.shape[0]; memcpy_params.dest_strides[0][i] = memcpy_params.dest_strides[1][i] * ((dim == pinfo_h.order[1]) ? halo_extents[dim] : pinfo_h.shape[1]); @@ -203,16 +213,16 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG // Unpack // Left memcpy_params.src[0] = recv_buff; - memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h, {0, 0, 0}); + memcpy_params.dest[0] = input + getPencilPtrOffset(pinfo_h_p, {0, 0, 0}); // Right memcpy_params.src[1] = recv_buff + halo_size; - lx[dim] = shape_g_h[dim] - halo_extents[dim]; - memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h, lx); + lx[dim] = shape_g_h_p[dim] - halo_extents[dim] - padding[dim]; + memcpy_params.dest[1] = input + getPencilPtrOffset(pinfo_h_p, lx); for (int i = 0; i < 2; ++i) { - memcpy_params.dest_strides[0][i] = pinfo_h.shape[0] * pinfo_h.shape[1]; - memcpy_params.dest_strides[1][i] = pinfo_h.shape[0]; + memcpy_params.dest_strides[0][i] = pinfo_h_p.shape[0] * pinfo_h_p.shape[1]; + memcpy_params.dest_strides[1][i] = pinfo_h_p.shape[0]; memcpy_params.src_strides[1][i] = (dim == pinfo_h.order[0]) ? halo_extents[dim] : pinfo_h.shape[0]; memcpy_params.src_strides[0][i] = memcpy_params.src_strides[1][i] * ((dim == pinfo_h.order[1]) ? halo_extents[dim] : pinfo_h.shape[1]); @@ -250,9 +260,9 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG send_offsets[0] = getPencilPtrOffset(pinfo_h, lx); recv_offsets[0] = getPencilPtrOffset(pinfo_h, {0, 0, 0}); // Right - lx[dim] = shape_g_h[dim] - 2 * halo_extents[dim]; + lx[dim] = shape_g_h_p[dim] - 2 * halo_extents[dim]; send_offsets[1] = getPencilPtrOffset(pinfo_h, lx); - lx[dim] = shape_g_h[dim] - halo_extents[dim]; + lx[dim] = shape_g_h_p[dim] - halo_extents[dim]; recv_offsets[1] = getPencilPtrOffset(pinfo_h, lx); cudecompSendRecvPair(handle, grid_desc, neighbors, input, counts, send_offsets, input, counts, recv_offsets, @@ -263,34 +273,34 @@ void cudecompUpdateHalos_(int ax, const cudecompHandle_t handle, const cudecompG template void cudecompUpdateHalosX(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work, - const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim, - cudaStream_t stream) { + const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], + int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) { std::stringstream os; os << "cudecompUpdateHalosX_" << dim; nvtx::rangePush(os.str()); - cudecompUpdateHalos_(0, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, stream); + cudecompUpdateHalos_(0, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, padding_ptr, stream); nvtx::rangePop(); } template void cudecompUpdateHalosY(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work, - const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim, - cudaStream_t stream) { + const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], + int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) { std::stringstream os; os << "cudecompUpdateHalosY_" << dim; nvtx::rangePush(os.str()); - cudecompUpdateHalos_(1, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, stream); + cudecompUpdateHalos_(1, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, padding_ptr, stream); nvtx::rangePop(); } template void cudecompUpdateHalosZ(const cudecompHandle_t handle, const cudecompGridDesc_t grid_desc, T* input, T* work, - const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], int32_t dim, - cudaStream_t stream) { + const int32_t halo_extents_ptr[], const bool halo_periods_ptr[], + int32_t dim, const int32_t padding_ptr[], cudaStream_t stream) { std::stringstream os; os << "cudecompUpdateHalosZ_" << dim; nvtx::rangePush(os.str()); - cudecompUpdateHalos_(2, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, stream); + cudecompUpdateHalos_(2, handle, grid_desc, input, work, halo_extents_ptr, halo_periods_ptr, dim, padding_ptr, stream); nvtx::rangePop(); } diff --git a/include/internal/transpose.h b/include/internal/transpose.h index 610e8ac..8c67d4d 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -54,10 +54,6 @@ static inline bool isTransposeCommPipelined(cudecompTransposeCommBackend_t commT commType == CUDECOMP_TRANSPOSE_COMM_MPI_P2P_PL); } -template static inline bool anyNonzeros(const std::array& arr) { - return (arr[0] != T(0) || arr[1] != T(0) || arr[2] != T(0)); -} - #if CUTENSOR_MAJOR >= 2 static inline cutensorDataType_t getCutensorDataType(float) { return CUTENSOR_R_32F; } static inline cutensorDataType_t getCutensorDataType(double) { return CUTENSOR_R_64F; } diff --git a/src/autotune.cc b/src/autotune.cc index 9df20b3..d3a0ccb 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -648,15 +648,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, switch (options->halo_axis) { case 0: CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents, - options->halo_periods, dim, 0)); + options->halo_periods, dim, nullptr, 0)); break; case 1: CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents, - options->halo_periods, dim, 0)); + options->halo_periods, dim, nullptr, 0)); break; case 2: CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents, - options->halo_periods, dim, 0)); + options->halo_periods, dim, nullptr, 0)); break; } } @@ -672,15 +672,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, switch (options->halo_axis) { case 0: CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents, - options->halo_periods, dim, 0)); + options->halo_periods, dim, nullptr, 0)); break; case 1: CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents, - options->halo_periods, dim, 0)); + options->halo_periods, dim, nullptr, 0)); break; case 2: CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, d, w, options->dtype, pinfo.halo_extents, - options->halo_periods, dim, 0)); + options->halo_periods, dim, nullptr, 0)); break; } } diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 8af3643..7ffb7d9 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -1324,7 +1324,7 @@ cudecompResult_t cudecompTransposeYToX(cudecompHandle_t handle, cudecompGridDesc cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work, cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[], - int32_t dim, cudaStream_t stream) { + int32_t dim, const int32_t padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1343,19 +1343,19 @@ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_ switch (dtype) { case CUDECOMP_FLOAT: cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(work), - halo_extents, halo_periods, dim, stream); + halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_DOUBLE: cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(work), - halo_extents, halo_periods, dim, stream); + halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast*>(input), - reinterpret_cast*>(work), halo_extents, halo_periods, dim, stream); + reinterpret_cast*>(work), halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompUpdateHalosX(handle, grid_desc, reinterpret_cast*>(input), - reinterpret_cast*>(work), halo_extents, halo_periods, dim, + reinterpret_cast*>(work), halo_extents, halo_periods, dim, padding, stream); break; } @@ -1369,7 +1369,7 @@ cudecompResult_t cudecompUpdateHalosX(cudecompHandle_t handle, cudecompGridDesc_ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work, cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[], - int32_t dim, cudaStream_t stream) { + int32_t dim, const int32_t padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1388,19 +1388,19 @@ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_ switch (dtype) { case CUDECOMP_FLOAT: cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(work), - halo_extents, halo_periods, dim, stream); + halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_DOUBLE: cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(work), - halo_extents, halo_periods, dim, stream); + halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast*>(input), - reinterpret_cast*>(work), halo_extents, halo_periods, dim, stream); + reinterpret_cast*>(work), halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompUpdateHalosY(handle, grid_desc, reinterpret_cast*>(input), - reinterpret_cast*>(work), halo_extents, halo_periods, dim, + reinterpret_cast*>(work), halo_extents, halo_periods, dim, padding, stream); break; } @@ -1414,7 +1414,7 @@ cudecompResult_t cudecompUpdateHalosY(cudecompHandle_t handle, cudecompGridDesc_ cudecompResult_t cudecompUpdateHalosZ(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* input, void* work, cudecompDataType_t dtype, const int32_t halo_extents[], const bool halo_periods[], - int32_t dim, cudaStream_t stream) { + int32_t dim, const int32_t padding[], cudaStream_t stream) { using namespace cudecomp; try { checkHandle(handle); @@ -1433,19 +1433,19 @@ cudecompResult_t cudecompUpdateHalosZ(cudecompHandle_t handle, cudecompGridDesc_ switch (dtype) { case CUDECOMP_FLOAT: cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(work), - halo_extents, halo_periods, dim, stream); + halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_DOUBLE: cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast(input), reinterpret_cast(work), - halo_extents, halo_periods, dim, stream); + halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_FLOAT_COMPLEX: cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast*>(input), - reinterpret_cast*>(work), halo_extents, halo_periods, dim, stream); + reinterpret_cast*>(work), halo_extents, halo_periods, dim, padding, stream); break; case CUDECOMP_DOUBLE_COMPLEX: cudecompUpdateHalosZ(handle, grid_desc, reinterpret_cast*>(input), - reinterpret_cast*>(work), halo_extents, halo_periods, dim, + reinterpret_cast*>(work), halo_extents, halo_periods, dim, padding, stream); break; } diff --git a/src/cudecomp_m.cuf b/src/cudecomp_m.cuf index d96022e..55945e6 100644 --- a/src/cudecomp_m.cuf +++ b/src/cudecomp_m.cuf @@ -455,7 +455,7 @@ module cudecomp ! Halo functions interface function cudecompUpdateHalosX_C(handle, grid_desc, input, work, dtype, & - halo_extents, halo_periods, dim, stream) & + halo_extents, halo_periods, dim, padding, stream) & bind(C, name="cudecompUpdateHalosX") result(res) import type(cudecompHandle), value :: handle @@ -466,6 +466,7 @@ module cudecomp integer(c_int32_t) :: halo_extents(3) logical(c_bool) :: halo_periods(3) integer(c_int32_t), value :: dim + integer(c_int32_t) :: padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompUpdateHalosX_C @@ -473,7 +474,7 @@ module cudecomp interface function cudecompUpdateHalosY_C(handle, grid_desc, input, work, dtype, & - halo_extents, halo_periods, dim, stream) & + halo_extents, halo_periods, dim, padding, stream) & bind(C, name="cudecompUpdateHalosY") result(res) import type(cudecompHandle), value :: handle @@ -484,6 +485,7 @@ module cudecomp integer(c_int32_t) :: halo_extents(3) logical(c_bool) :: halo_periods(3) integer(c_int32_t), value :: dim + integer(c_int32_t) :: padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompUpdateHalosY_C @@ -491,7 +493,7 @@ module cudecomp interface function cudecompUpdateHalosZ_C(handle, grid_desc, input, work, dtype, & - halo_extents, halo_periods, dim, stream) & + halo_extents, halo_periods, dim, padding, stream) & bind(C, name="cudecompUpdateHalosZ") result(res) import type(cudecompHandle), value :: handle @@ -502,6 +504,7 @@ module cudecomp integer(c_int32_t) :: halo_extents(3) logical(c_bool) :: halo_periods(3) integer(c_int32_t), value :: dim + integer(c_int32_t) :: padding(3) integer(cuda_stream_kind), value :: stream integer(c_int) :: res end function cudecompUpdateHalosZ_C @@ -949,7 +952,7 @@ contains ! Halo functions function cudecompUpdateHalosX(handle, grid_desc, & input, work, dtype, halo_extents, halo_periods, & - dim, stream) result(res) + dim, padding, stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -959,24 +962,28 @@ contains integer :: halo_extents(3) logical :: halo_periods(3) integer :: dim + integer, optional :: padding(3) integer(cuda_stream_kind), optional :: stream integer(c_int) :: res integer(cuda_stream_kind) :: stream_ logical(c_bool) :: halo_periods_c(3) + integer :: padding_(3) halo_periods_c(:) = halo_periods stream_ = 0 + padding_ = [0, 0, 0] if (present(stream)) stream_ = stream + if (present(padding)) padding_ = padding res = cudecompUpdateHalosX_C(handle, grid_desc, & input, work, dtype, halo_extents, halo_periods_c, & - dim - 1, stream_) + dim - 1, padding_, stream_) end function cudecompUpdateHalosX function cudecompUpdateHalosY(handle, grid_desc, & input, work, dtype, halo_extents, halo_periods, & - dim, stream) result(res) + dim, padding, stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -986,24 +993,28 @@ contains integer :: halo_extents(3) logical :: halo_periods(3) integer :: dim + integer, optional :: padding(3) integer(cuda_stream_kind), optional :: stream integer(c_int) :: res integer(cuda_stream_kind) :: stream_ logical(c_bool) :: halo_periods_c(3) + integer :: padding_(3) halo_periods_c(:) = halo_periods stream_ = 0 + padding_ = [0, 0, 0] if (present(stream)) stream_ = stream + if (present(padding)) padding_ = padding res = cudecompUpdateHalosY_C(handle, grid_desc, & input, work, dtype, halo_extents, halo_periods_c, & - dim - 1, stream_) + dim - 1, padding_, stream_) end function cudecompUpdateHalosY function cudecompUpdateHalosZ(handle, grid_desc, & input, work, dtype, halo_extents, halo_periods, & - dim, stream) result(res) + dim, padding, stream) result(res) implicit none type(cudecompHandle) :: handle type(cudecompGridDesc) :: grid_desc @@ -1013,19 +1024,23 @@ contains integer :: halo_extents(3) logical :: halo_periods(3) integer :: dim + integer, optional :: padding(3) integer(cuda_stream_kind), optional :: stream integer(c_int) :: res integer(cuda_stream_kind) :: stream_ logical(c_bool) :: halo_periods_c(3) + integer :: padding_(3) halo_periods_c(:) = halo_periods stream_ = 0 + padding_ = [0, 0, 0] if (present(stream)) stream_ = stream + if (present(padding)) padding_ = padding res = cudecompUpdateHalosZ_C(handle, grid_desc, & input, work, dtype, halo_extents, halo_periods_c, & - dim - 1, stream_) + dim - 1, padding, stream_) end function cudecompUpdateHalosZ ! Helper function to copy string diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc index 86e175c..2a231be 100644 --- a/tests/cc/halo_test.cc +++ b/tests/cc/halo_test.cc @@ -115,9 +115,7 @@ static std::vector read_testfile(const std::string& filename) { static bool compare_pencils(const std::vector& ref, const std::vector& res, const cudecompPencilInfo_t& pinfo) { for (int64_t i = 0; i < ref.size(); ++i) { - if (ref[i] != real_t(-1)) { - if (ref[i] != res[i]) return false; - } + if (ref[i] != res[i]) return false; } return true; } @@ -138,9 +136,9 @@ static void initialize_pencil(std::vector& ref, const cudecompPencilInfo int64_t gi = gx[0] + gdims[0] * (gx[1] + gx[2] * gdims[1]); // Only set values inside internal region - if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]]) && - lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]]) && - lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]])) { + if (lx[0] >= pinfo.halo_extents[pinfo.order[0]] && lx[0] < (pinfo.shape[0] - pinfo.halo_extents[pinfo.order[0]] - pinfo.padding[pinfo.order[0]]) && + lx[1] >= pinfo.halo_extents[pinfo.order[1]] && lx[1] < (pinfo.shape[1] - pinfo.halo_extents[pinfo.order[1]] - pinfo.padding[pinfo.order[1]]) && + lx[2] >= pinfo.halo_extents[pinfo.order[2]] && lx[2] < (pinfo.shape[2] - pinfo.halo_extents[pinfo.order[2]] - pinfo.padding[pinfo.order[2]])) { ref[i] = gi; } else { ref[i] = -1; @@ -176,6 +174,12 @@ static void initialize_reference(std::vector& ref, const cudecompPencilI } } } + // Also mark any padded elements for unset value (-1) + if (lx[0] >= pinfo.shape[0] - pinfo.padding[pinfo.order[0]] || + lx[1] >= pinfo.shape[1] - pinfo.padding[pinfo.order[1]] || + lx[2] >= pinfo.shape[2] - pinfo.padding[pinfo.order[2]]) { + unset = true; + } int64_t gi = (unset) ? -1 : gx[0] + gdims[0] * (gx[1] + gx[2] * gdims[1]); ref[i] = gi; @@ -247,6 +251,7 @@ struct haloTestArgs { std::array gdims_dist{}; std::array halo_extents{1, 1, 1}; std::array halo_periods{true, true, true}; + std::array padding{}; int axis = 0; bool use_managed_memory = false; std::array mem_order{-1, -1, -1, -1, -1, -1, -1, -1, -1}; @@ -271,12 +276,13 @@ static haloTestArgs parse_arguments(const std::string& arguments) { {"hex", required_argument, 0, '7'}, {"hey", required_argument, 0, '8'}, {"hez", required_argument, 0, '9'}, {"hpx", required_argument, 0, 'e'}, {"hpy", required_argument, 0, 'f'}, {"hpz", required_argument, 0, 'g'}, - {"ax", required_argument, 0, 'a'}, {"use-managed-memory", no_argument, 0, 'm'}, - {"mem_order", required_argument, 0, 'q'}, + {"pdx", required_argument, 0, '&'}, {"pdy", required_argument, 0, '*'}, + {"pdz", required_argument, 0, '*'}, {"ax", required_argument, 0, 'a'}, + {"use-managed-memory", no_argument, 0, 'm'}, {"mem_order", required_argument, 0, 'q'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0}}; int option_index = 0; - int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:e:f:g:a:q:mh", long_options, &option_index); + int ch = getopt_long(argc, argv, "x:y:z:b:r:c:1:2:3:4:7:8:9:e:f:g:a:q:&:*:(:mh", long_options, &option_index); if (ch == -1) break; switch (ch) { @@ -303,6 +309,9 @@ static haloTestArgs parse_arguments(const std::string& arguments) { case 'e': args.halo_periods[0] = atoi(optarg); break; case 'f': args.halo_periods[1] = atoi(optarg); break; case 'g': args.halo_periods[2] = atoi(optarg); break; + case '&': args.padding[0] = atoi(optarg); break; + case '*': args.padding[1] = atoi(optarg); break; + case '(': args.padding[2] = atoi(optarg); break; case 'a': args.axis = atoi(optarg); break; case 'm': args.use_managed_memory = true; break; case 'q': @@ -401,7 +410,7 @@ static int run_test(const std::string& arguments, bool silent) { // Get pencil information cudecompPencilInfo_t pinfo; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data(), nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo, args.axis, args.halo_extents.data(), args.padding.data())); // Get workspace size int64_t workspace_num_elements; @@ -441,15 +450,15 @@ static int run_test(const std::string& arguments, bool silent) { switch (args.axis) { case 0: CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, input, work_d, get_cudecomp_datatype(real_t(0)), - pinfo.halo_extents, args.halo_periods.data(), i, 0)); + pinfo.halo_extents, args.halo_periods.data(), i, pinfo.padding, 0)); break; case 1: CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, input, work_d, get_cudecomp_datatype(real_t(0)), - pinfo.halo_extents, args.halo_periods.data(), i, 0)); + pinfo.halo_extents, args.halo_periods.data(), i, pinfo.padding, 0)); break; case 2: CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, input, work_d, get_cudecomp_datatype(real_t(0)), - pinfo.halo_extents, args.halo_periods.data(), i, 0)); + pinfo.halo_extents, args.halo_periods.data(), i, pinfo.padding, 0)); break; } } diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90 index 3f001f4..041398b 100644 --- a/tests/fortran/halo_test.f90 +++ b/tests/fortran/halo_test.f90 @@ -70,12 +70,8 @@ function compare_pencils(ref, res, pinfo) result(mismatch) use cudecomp implicit none type(cudecompPencilInfo) :: pinfo - ARRTYPE :: ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))) - ARRTYPE :: res(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3))) + ARRTYPE :: ref(pinfo%shape(1), pinfo%shape(2), pinfo%shape(3)) + ARRTYPE :: res(pinfo%shape(1), pinfo%shape(2), pinfo%shape(3)) logical :: mismatch mismatch = any(ref /= res) @@ -91,10 +87,10 @@ subroutine initialize_pencil(ref, pinfo, gdims) integer :: gdims(3) integer :: gx(3) - ! Allocate reference pencil with halo regions - allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))) + ! Allocate reference pencil with halo and padding regions + allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), & + pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), & + pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3)))) ref = -1 @@ -127,10 +123,10 @@ subroutine initialize_reference(ref, pinfo, gdims, halo_periods) logical :: halo_periods(3) logical :: unset - ! Allocate reference pencil with halo regions - allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)), & - pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)), & - pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)))) + ! Allocate reference pencil with halo and padding regions + allocate(ref(pinfo%lo(1) - pinfo%halo_extents(pinfo%order(1)): pinfo%hi(1) + pinfo%halo_extents(pinfo%order(1)) + pinfo%padding(pinfo%order(1)), & + pinfo%lo(2) - pinfo%halo_extents(pinfo%order(2)): pinfo%hi(2) + pinfo%halo_extents(pinfo%order(2)) + pinfo%padding(pinfo%order(2)), & + pinfo%lo(3) - pinfo%halo_extents(pinfo%order(3)): pinfo%hi(3) + pinfo%halo_extents(pinfo%order(3)) + pinfo%padding(pinfo%order(3)))) ref = -1 @@ -229,6 +225,7 @@ function run_test(arguments, silent) result(res) integer :: gdims_dist(3) integer :: halo_extents(3) logical :: halo_periods(3) + integer :: padding(3) integer :: mem_order(3, 3) logical :: use_managed_memory integer :: pr, pc @@ -282,6 +279,7 @@ function run_test(arguments, silent) result(res) gdims_dist(:) = 0 halo_extents(:) = 1 halo_periods(:) = .true. + padding(:) = 0 mem_order(:, :) = -1 axis = 1 use_managed_memory = .false. @@ -369,6 +367,18 @@ function run_test(arguments, silent) result(res) read(arg, *) iarg halo_periods(3) = iarg skip_count = 1 + case('--pdx') + read(args(i+1), *) arg + read(arg, *) padding(1) + skip_count = 1 + case('--pdy') + read(args(i+1), *) arg + read(arg, *) padding(2) + skip_count = 1 + case('--pdz') + read(args(i+1), *) arg + read(arg, *) padding(3) + skip_count = 1 case('--ax') read(args(i+1), *) arg read(arg, *) axis @@ -437,7 +447,7 @@ function run_test(arguments, silent) result(res) endif ! Get pencil information - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents)) + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo, axis, halo_extents, padding)) ! Get workspace size CHECK_CUDECOMP(cudecompGetHaloWorkspaceSize(handle, grid_desc, axis, halo_extents, workspace_num_elements)) @@ -477,11 +487,11 @@ function run_test(arguments, silent) result(res) do i = 1, 3 select case(axis) case(1) - CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i)) + CHECK_CUDECOMP(cudecompUpdateHalosX(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i, padding)) case(2) - CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i)) + CHECK_CUDECOMP(cudecompUpdateHalosY(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i, padding)) case(3) - CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i)) + CHECK_CUDECOMP(cudecompUpdateHalosZ(handle, grid_desc, input, work_d, dtype, pinfo%halo_extents, halo_periods, i, padding)) end select end do From 1d3b65522c3dee40b1b74900304e5bd56554878c Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 10 Mar 2025 11:54:59 -0700 Subject: [PATCH 3/8] Fixes after rebase. --- tests/cc/transpose_test.cc | 4 ++-- tests/fortran/transpose_test.f90 | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc index 88fe76a..0e1ddc8 100644 --- a/tests/cc/transpose_test.cc +++ b/tests/cc/transpose_test.cc @@ -426,11 +426,11 @@ static int run_test(const std::string& arguments, bool silent) { // Get y-pencil information cudecompPencilInfo_t pinfo_y; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data(), args.padding_x.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, args.halo_extents_y.data(), args.padding_y.data())); // Get z-pencil information cudecompPencilInfo_t pinfo_z; - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data(), args.padding_x.data())); + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, args.halo_extents_z.data(), args.padding_z.data())); // Get workspace size int64_t workspace_num_elements; diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index 9b18d15..908042d 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -233,6 +233,9 @@ function run_test(arguments, silent) result(res) halo_extents_x(:) = 0 halo_extents_y(:) = 0 halo_extents_z(:) = 0 + padding_x(:) = 0 + padding_y(:) = 0 + padding_z(:) = 0 mem_order(:,:) = -1 out_of_place = .false. use_managed_memory = .false. @@ -310,19 +313,19 @@ function run_test(arguments, silent) result(res) skip_count = 3 case('--pdx') do j = 1, 3 - call get_command_argument(i+j, arg) + read(args(i+j), *) arg read(arg, *) padding_x(j) enddo skip_count = 3 case('--pdy') do j = 1, 3 - call get_command_argument(i+j, arg) + read(args(i+j), *) arg read(arg, *) padding_y(j) enddo skip_count = 3 case('--pdz') do j = 1, 3 - call get_command_argument(i+j, arg) + read(args(i+j), *) arg read(arg, *) padding_z(j) enddo skip_count = 3 @@ -389,13 +392,13 @@ function run_test(arguments, silent) result(res) endif ! Get x-pencil information - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1, halo_extents_x)) + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_x, 1, halo_extents_x, padding_x)) ! Get y-pencil information - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2, halo_extents_y)) + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_y, 2, halo_extents_y, padding_y)) ! Get z-pencil information - CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3, halo_extents_z)) + CHECK_CUDECOMP(cudecompGetPencilInfo(handle, grid_desc, pinfo_z, 3, halo_extents_z, padding_z)) ! Get workspace size CHECK_CUDECOMP(cudecompGetTransposeWorkspaceSize(handle, grid_desc, workspace_num_elements)) @@ -444,7 +447,7 @@ function run_test(arguments, silent) result(res) endif work_d = 0 - CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, input, output, work_d, dtype, pinfo_x%halo_extents, pinfo_y%halo_extents)) + CHECK_CUDECOMP(cudecompTransposeXToY(handle, grid_desc, input, output, work_d, dtype, pinfo_x%halo_extents, pinfo_y%halo_extents, pinfo_x%padding, pinfo_y%padding)) data = output if (compare_pencils(yref, data, pinfo_y)) then print*, "FAILED cudecompTranposeXToY" @@ -463,7 +466,7 @@ function run_test(arguments, silent) result(res) endif work_d = 0 - CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_z%halo_extents)) + CHECK_CUDECOMP(cudecompTransposeYToZ(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_z%halo_extents, pinfo_y%padding, pinfo_z%padding)) data = output if (compare_pencils(zref, data, pinfo_z)) then print*, "FAILED cudecompTranposeYToZ" @@ -482,7 +485,7 @@ function run_test(arguments, silent) result(res) endif work_d = 0 - CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, input, output, work_d, dtype, pinfo_z%halo_extents, pinfo_y%halo_extents)) + CHECK_CUDECOMP(cudecompTransposeZToY(handle, grid_desc, input, output, work_d, dtype, pinfo_z%halo_extents, pinfo_y%halo_extents, pinfo_z%padding, pinfo_y%padding)) data = output if (compare_pencils(yref, data, pinfo_y)) then print*, "FAILED cudecompTranposeZToY" @@ -501,7 +504,7 @@ function run_test(arguments, silent) result(res) endif work_d = 0 - CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_x%halo_extents)) + CHECK_CUDECOMP(cudecompTransposeYToX(handle, grid_desc, input, output, work_d, dtype, pinfo_y%halo_extents, pinfo_x%halo_extents, pinfo_y%padding, pinfo_x%padding)) data = output if (compare_pencils(xref, data, pinfo_x)) then print*, "FAILED cudecompTranposeYToX" From 0261a29656f8881ecc6e30276932aa3a811321c5 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 10 Mar 2025 15:39:34 -0700 Subject: [PATCH 4/8] Adding padding test cases with some additional test reorganization/optimization. --- tests/test_config.yaml | 187 +++++++++++++++++++++++++++++++++++++++-- tests/test_runner.py | 48 +++++++++-- 2 files changed, 225 insertions(+), 10 deletions(-) diff --git a/tests/test_config.yaml b/tests/test_config.yaml index abef7f2..293e86f 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -4,7 +4,8 @@ base: &base NVSHMEM_DISABLE_NCCL: 1 dtypes: ['R32', 'R64', 'C32', 'C64'] - + apply_skips: None + use_single_pdim: False transpose_test_base: &transpose_test_base <<: *base @@ -13,7 +14,8 @@ transpose_test_base: &transpose_test_base args : ['backend', 'gx', 'gy', 'gz', 'gd', - 'hex' ,'hey', 'hez'] + 'hex' ,'hey', 'hez', + 'pdx', 'pdy', 'pdz'] backend: [1, 2, 3, 4, 5, 6, 7] @@ -27,6 +29,10 @@ transpose_test_base: &transpose_test_base hey: ["0 0 0"] hez: ["0 0 0"] + pdx: ["0 0 0"] + pdy: ["0 0 0"] + pdz: ["0 0 0"] + out_of_place: [True, False] managed_memory : [True, False] run_autotuning : False @@ -36,11 +42,51 @@ transpose_test_base: &transpose_test_base transpose_test: &transpose_test <<: *transpose_test_base +transpose_test_halo: &transpose_test_halo + <<: *transpose_test_base + + backend: [1, 2] # Limit testing to one non-pipelined and one pipelined backend + dtypes: ['R32'] # Limit testing to one dtype + + hex: ["0 0 0", "1 1 1"] + hey: ["0 0 0", "1 1 1"] + hez: ["0 0 0", "1 1 1"] + apply_skips: 'transpose' + +transpose_test_padding: &transpose_test_padding + <<: *transpose_test_base + + backend: [1, 2] # Limit testing to one non-pipelined and one pipelined backend + dtypes: ['R32'] # Limit testing to one dtype + + pdx: ["0 0 0", "1 1 1"] + pdy: ["0 0 0", "1 1 1"] + pdz: ["0 0 0", "1 1 1"] + apply_skips: 'transpose' + +transpose_test_gdimdist: &transpose_test_gdimdist + <<: *transpose_test_base + backend: [1] # Limit testing to one backend + dtypes: ['R32'] # Limit testing to one dtype + + gd: ["16 16 16"] + apply_skips: 'transpose' + +transpose_test_mix: &transpose_test_mix + <<: *transpose_test_base + + backend: [1, 2] # Limit testing to one non-pipelined and one pipelined backend + dtypes: ['R32'] # Limit testing to one dtype + hex: ["0 0 0", "1 1 1"] hey: ["0 0 0", "1 1 1"] hez: ["0 0 0", "1 1 1"] - gd: ["0 0 0", "16 16 16"] + pdx: ["0 0 0", "1 1 1"] + pdy: ["0 0 0", "1 1 1"] + pdz: ["0 0 0", "1 1 1"] + + apply_skips: 'transpose_mix' transpose_test_ac: &transpose_test_ac <<: *transpose_test_base @@ -48,7 +94,8 @@ transpose_test_ac: &transpose_test_ac 'acx', 'acy', 'acz', 'gx', 'gy', 'gz', 'gd', - 'hex' ,'hey', 'hez'] + 'hex' ,'hey', 'hez', + 'pdx', 'pdy', 'pdz'] backend: [1] # Limit this testing to one backend dtypes: ['R32'] # Limit to one data type @@ -61,6 +108,22 @@ transpose_test_cc: <<: *transpose_test executable_prefix: 'cc/transpose_test' +transpose_test_halo_cc: + <<: *transpose_test_halo + executable_prefix: 'cc/transpose_test' + +transpose_test_padding_cc: + <<: *transpose_test_padding + executable_prefix: 'cc/transpose_test' + +transpose_test_gdimdist_cc: + <<: *transpose_test_gdimdist + executable_prefix: 'cc/transpose_test' + +transpose_test_mix_cc: + <<: *transpose_test_mix + executable_prefix: 'cc/transpose_test' + transpose_test_ac_cc: <<: *transpose_test_ac executable_prefix: 'cc/transpose_test' @@ -70,6 +133,26 @@ transpose_test_fortran: executable_prefix: 'fortran/transpose_test' fortran_indexing: true +transpose_test_halo_fortran: + <<: *transpose_test_halo + executable_prefix: 'fortran/transpose_test' + fortran_indexing: true + +transpose_test_padding_fortran: + <<: *transpose_test_padding + executable_prefix: 'fortran/transpose_test' + fortran_indexing: true + +transpose_test_gdimdist_fortran: + <<: *transpose_test_gdimdist + executable_prefix: 'fortran/transpose_test' + fortran_indexing: true + +transpose_test_mix_fortran: + <<: *transpose_test_mix + executable_prefix: 'fortran/transpose_test' + fortran_indexing: true + transpose_test_ac_fortran: <<: *transpose_test_ac executable_prefix: 'fortran/transpose_test' @@ -83,6 +166,7 @@ halo_test_base: &halo_test_base 'gd', 'hex' ,'hey', 'hez', 'hpx', 'hpy', 'hpz', + 'pdx', 'pdy', 'pdz', 'ax'] backend: [1, 2, 3, 4, 5] @@ -101,6 +185,10 @@ halo_test_base: &halo_test_base hpy: [1] hpz: [1] + pdx: [0] + pdy: [0] + pdz: [0] + ax: [0, 1, 2] managed_memory : [True, False] @@ -112,6 +200,13 @@ halo_test_base: &halo_test_base halo_test: &halo_test <<: *halo_test_base + +halo_test_halomix: &halo_test_halomix + <<: *halo_test_base + + backend: [1] # Limit testing to one backend + dtypes: ['R32'] # Limit testing to one dtype + hex: [0, 1] hey: [0, 1] hez: [0, 1] @@ -119,8 +214,49 @@ halo_test: &halo_test hpx: [0, 1] hpy: [0, 1] hpz: [0, 1] + apply_skips: 'halo' - gd: ["0 0 0", "16 16 16"] +halo_test_padding: &halo_test_padding + <<: *halo_test_base + + backend: [1] # Limit testing to one backend + dtypes: ['R32'] # Limit testing to one dtype + + pdx: [0, 1] + pdy: [0, 1] + pdz: [0, 1] + + apply_skips: 'halo_padding' + +halo_test_gdimdist: &halo_test_gdimdist + <<: *halo_test_base + + backend: [1] # Limit testing to one backend + dtypes: ['R32'] # Limit testing to one dtype + + gd: ["16 16 16"] + + apply_skips: 'halo' + +halo_test_mix: &halo_test_mix + <<: *halo_test_base + + backend: [1] # Limit testing to one backend + dtypes: ['R32'] # Limit testing to one dtype + + pdx: [0, 1] + pdy: [0, 1] + pdz: [0, 1] + + hex: [0, 1] + hey: [0, 1] + hez: [0, 1] + + hpx: [0, 1] + hpy: [0, 1] + hpz: [0, 1] + + apply_skips: 'halo_padding' halo_test_ac: &halo_test_ac <<: *halo_test_base @@ -130,6 +266,7 @@ halo_test_ac: &halo_test_ac 'gd', 'hex' ,'hey', 'hez', 'hpx', 'hpy', 'hpz', + 'pdx', 'pdy', 'pdz', 'ax'] backend: [1] # Limit this testing to one backend @@ -143,6 +280,22 @@ halo_test_cc: <<: *halo_test executable_prefix: 'cc/halo_test' +halo_test_halomix_cc: + <<: *halo_test_halomix + executable_prefix: 'cc/halo_test' + +halo_test_padding_cc: + <<: *halo_test_padding + executable_prefix: 'cc/halo_test' + +halo_test_gdimdist_cc: + <<: *halo_test_gdimdist + executable_prefix: 'cc/halo_test' + +halo_test_mix_cc: + <<: *halo_test_mix + executable_prefix: 'cc/halo_test' + halo_test_ac_cc: <<: *halo_test_ac executable_prefix: 'cc/halo_test' @@ -153,6 +306,30 @@ halo_test_fortran: executable_prefix: 'fortran/halo_test' fortran_indexing: true +halo_test_halomix_fortran: + <<: *halo_test_halomix + ax: [1, 2, 3] + executable_prefix: 'fortran/halo_test' + fortran_indexing: true + +halo_test_padding_fortran: + <<: *halo_test_padding + ax: [1, 2, 3] + executable_prefix: 'fortran/halo_test' + fortran_indexing: true + +halo_test_gdimdist_fortran: + <<: *halo_test_gdimdist + ax: [1, 2, 3] + executable_prefix: 'fortran/halo_test' + fortran_indexing: true + +halo_test_mix_fortran: + <<: *halo_test_mix + ax: [1, 2, 3] + executable_prefix: 'fortran/halo_test' + fortran_indexing: true + halo_test_ac_fortran: <<: *halo_test_ac ax: [1, 2, 3] diff --git a/tests/test_runner.py b/tests/test_runner.py index 44b2b03..e988709 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -25,10 +25,36 @@ def load_yaml_config(yaml_file, config_name): config = yaml.safe_load(f)[config_name] return config -def should_skip_case(arg_dict): +def should_skip_case(arg_dict, key): skip = False - try: + if key == 'transpose': + # Skip cases with all halo extents, padding, and gdimdist as zero + if ((arg_dict["hex"] == "0 0 0" and arg_dict["hey"] == "0 0 0" and arg_dict["hez"] == "0 0 0") and + (arg_dict["pdx"] == "0 0 0" and arg_dict["pdy"] == "0 0 0" and arg_dict["pdz"] == "0 0 0") and + (arg_dict["gd"] == "0 0 0")): + skip = True + + # Skip cases where halo extents in X and Z are unequal as these cases are redundant + if (arg_dict["hex"] != arg_dict["hez"]): + skip = True + + # Skip cases where padding in X and Z are unequal as these cases are redundant + if (arg_dict["pdx"] != arg_dict["pdz"]): + skip = True + + elif key == 'transpose_mix': + skip = should_skip_case(arg_dict, 'transpose') + + # Skip cases where all halo extents are zero + if (arg_dict["hex"] == "0 0 0" and arg_dict["hey"] == "0 0 0" and arg_dict["hez"] == "0 0 0"): + skip = True + + # Skip cases where all padding is zero + if (arg_dict["pdx"] == "0 0 0" and arg_dict["pdy"] == "0 0 0" and arg_dict["pdz"] == "0 0 0"): + skip = True + + elif key == 'halo': # No need to test periodic flags if halo extent is zero if arg_dict["hex"] == 0 and arg_dict["hpx"] == 1: skip = True @@ -40,8 +66,14 @@ def should_skip_case(arg_dict): # Skip cases with all halo extents as zero if arg_dict["hex"] == 0 and arg_dict["hey"] == 0 and arg_dict["hez"] == 0: skip = True - except: - pass + + elif key == 'halo_padding': + skip = should_skip_case(arg_dict, 'halo') + + # Skip cases where all padding is zero + if (arg_dict["pdx"] == 0 and arg_dict["pdy"] == 0 and arg_dict["pdz"] == 0): + skip = True + return skip @@ -61,6 +93,11 @@ def generate_command_lines(config, args): cmds = [] prs = get_factors(args.ngpu) + if len(prs) > 3: + prs = [prs[0], prs[len(pr) // 2], prs[-1]] + if config['use_single_pdim']: + prs = [prs[min(len(prs) + 1, 1)]] + pcs = [args.ngpu // x for x in prs] if config['run_autotuning']: prs = [0] + prs @@ -81,7 +118,8 @@ def generate_command_lines(config, args): continue # Check additional skip conditions - if should_skip_case(arg_dict): continue + if (config['apply_skips']): + if should_skip_case(arg_dict, config['apply_skips']): continue extra_flags = [] extra_flags.append(['-m' if x else '' for x in config['managed_memory']]) From 534f2cf84b6a9e7030ca71aa3c5fa3bff5038d62 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 10 Mar 2025 15:54:56 -0700 Subject: [PATCH 5/8] Fix in test_runner.py --- tests/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_runner.py b/tests/test_runner.py index e988709..e641b92 100644 --- a/tests/test_runner.py +++ b/tests/test_runner.py @@ -94,7 +94,7 @@ def generate_command_lines(config, args): cmds = [] prs = get_factors(args.ngpu) if len(prs) > 3: - prs = [prs[0], prs[len(pr) // 2], prs[-1]] + prs = [prs[0], prs[len(prs) // 2], prs[-1]] if config['use_single_pdim']: prs = [prs[min(len(prs) + 1, 1)]] From 9ad04d066f2a04040b0765e322834497e4070de9 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 11 Mar 2025 14:00:21 -0700 Subject: [PATCH 6/8] Use workspace caching in test implementation to improve efficiency. --- tests/cc/halo_test.cc | 30 +++++++++++++++++++++++++++--- tests/cc/transpose_test.cc | 31 ++++++++++++++++++++++++++++--- tests/fortran/halo_test.f90 | 21 +++++++++++++++++---- tests/fortran/transpose_test.f90 | 21 ++++++++++++++++++--- 4 files changed, 90 insertions(+), 13 deletions(-) diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc index 2a231be..1ece724 100644 --- a/tests/cc/halo_test.cc +++ b/tests/cc/halo_test.cc @@ -340,6 +340,7 @@ static haloTestArgs parse_arguments(const std::string& arguments) { int rank, nranks; cudecompHandle_t handle; std::unordered_map grid_desc_cache; +std::tuple workspace{-1, nullptr, 0}; // Cache a single grid descriptor per backend type. This keeps NCCL/NVSHMEM initialized between tests for // better throughput. @@ -437,8 +438,32 @@ static int run_test(const std::string& arguments, bool silent) { } int64_t dtype_size; CHECK_CUDECOMP(cudecompGetDataTypeSize(get_cudecomp_datatype(real_t(0)), &dtype_size)); - CHECK_CUDECOMP( - cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + + // Allocate workspace (reuse exising workspace if able) + if (std::get<0>(workspace) == static_cast(config.halo_comm_backend)){ + work_d = std::get<1>(workspace); + if (std::get<2>(workspace) < workspace_num_elements * dtype_size) { + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)); + CHECK_CUDECOMP( + cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + std::get<1>(workspace) = work_d; + std::get<2>(workspace) = workspace_num_elements * dtype_size; + } + } else if (std::get<0>(workspace) > 0 && std::get<0>(workspace) != static_cast(config.halo_comm_backend)) { + CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache[static_cast(std::get<0>(workspace))], + std::get<1>(workspace))); + CHECK_CUDECOMP( + cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + std::get<0>(workspace) = static_cast(config.halo_comm_backend); + std::get<1>(workspace) = work_d; + std::get<2>(workspace) = workspace_num_elements * dtype_size; + } else { + CHECK_CUDECOMP( + cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + std::get<0>(workspace) = static_cast(config.halo_comm_backend); + std::get<1>(workspace) = work_d; + std::get<2>(workspace) = workspace_num_elements * dtype_size; + } // Running correctness tests if (!silent && rank == 0) printf("running correctness tests...\n"); @@ -470,7 +495,6 @@ static int run_test(const std::string& arguments, bool silent) { } CHECK_CUDA(cudaFree(data_d)); - CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)); } catch (const std::exception& e) { return 1; } diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc index 0e1ddc8..6f3c19b 100644 --- a/tests/cc/transpose_test.cc +++ b/tests/cc/transpose_test.cc @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -358,6 +359,7 @@ static transposeTestArgs parse_arguments(const std::string& arguments) { int rank, nranks; cudecompHandle_t handle; std::unordered_map grid_desc_cache; +std::tuple workspace{-1, nullptr, 0}; // Cache a single grid descriptor per backend type. This keeps NCCL/NVSHMEM initialized between tests for // better throughput. @@ -458,8 +460,32 @@ static int run_test(const std::string& arguments, bool silent) { } int64_t dtype_size; CHECK_CUDECOMP(cudecompGetDataTypeSize(get_cudecomp_datatype(real_t(0)), &dtype_size)); - CHECK_CUDECOMP( - cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + + // Allocate workspace (reuse exising workspace if able) + if (std::get<0>(workspace) == static_cast(config.transpose_comm_backend)){ + work_d = std::get<1>(workspace); + if (std::get<2>(workspace) < workspace_num_elements * dtype_size) { + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)); + CHECK_CUDECOMP( + cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + std::get<1>(workspace) = work_d; + std::get<2>(workspace) = workspace_num_elements * dtype_size; + } + } else if (std::get<0>(workspace) > 0 && std::get<0>(workspace) != static_cast(config.transpose_comm_backend)) { + CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache[static_cast(std::get<0>(workspace))], + std::get<1>(workspace))); + CHECK_CUDECOMP( + cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + std::get<0>(workspace) = static_cast(config.transpose_comm_backend); + std::get<1>(workspace) = work_d; + std::get<2>(workspace) = workspace_num_elements * dtype_size; + } else { + CHECK_CUDECOMP( + cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_d), workspace_num_elements * dtype_size)); + std::get<0>(workspace) = static_cast(config.transpose_comm_backend); + std::get<1>(workspace) = work_d; + std::get<2>(workspace) = workspace_num_elements * dtype_size; + } real_t* data_2_d = nullptr; if (args.out_of_place) { @@ -524,7 +550,6 @@ static int run_test(const std::string& arguments, bool silent) { CHECK_CUDA(cudaFree(data_d)); if (data_2_d) CHECK_CUDA(cudaFree(data_2_d)); - CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)); } catch (const std::exception& e) { return 1; } diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90 index 041398b..589f455 100644 --- a/tests/fortran/halo_test.f90 +++ b/tests/fortran/halo_test.f90 @@ -64,6 +64,8 @@ module halo_CUDECOMP_DOUBLE_COMPLEX_mod integer :: rank, nranks type(cudecompGridDesc) :: grid_desc_cache(5) logical :: grid_desc_cache_set(5) = .false. + ARRTYPE, pointer, device, contiguous :: work_d(:) + integer :: work_backend = -1 contains function compare_pencils(ref, res, pinfo) result(mismatch) @@ -249,7 +251,6 @@ function run_test(arguments, silent) result(res) ARRTYPE, allocatable :: ref(:, :, :), init(:, :, :), data(:) ARRTYPE, allocatable, device, target:: data_d(:) ARRTYPE, allocatable, managed, target:: data_m(:) - ARRTYPE, pointer, device, contiguous :: work_d(:) ARRTYPE, pointer, device:: input(:) integer :: dtype = DTYPE @@ -465,7 +466,21 @@ function run_test(arguments, silent) result(res) else allocate(data_d(data_num_elements)) endif - CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + + ! Allocate workspace (reuse exising workspace if able) + if (work_backend == config%halo_comm_backend) then + if (size(work_d) < workspace_num_elements) then + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)) + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + endif + elseif (work_backend > 0 .and. work_backend /= config%halo_comm_backend) then + CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache(work_backend), work_d)) + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + work_backend = config%halo_comm_backend; + else + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + work_backend = config%halo_comm_backend; + endif ! Running correctness tests if (.not. silent .and. rank == 0) write(*,"('Running correctness tests using ', a, ' backend ...')") & @@ -508,8 +523,6 @@ function run_test(arguments, silent) result(res) deallocate(data_d) endif - CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)) - end function end module diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index 908042d..29b9cc0 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -64,6 +64,8 @@ module transpose_CUDECOMP_DOUBLE_COMPLEX_mod integer :: rank, nranks type(cudecompGridDesc) :: grid_desc_cache(7) logical :: grid_desc_cache_set(7) = .false. + ARRTYPE, pointer, device, contiguous :: work_d(:) + integer :: work_backend = -1 contains function compare_pencils(ref, res, pinfo) result(mismatch) @@ -198,7 +200,6 @@ function run_test(arguments, silent) result(res) ARRTYPE, allocatable :: xref(:, :, :), yref(:, :, :), zref(:, :, :), data(:) ARRTYPE, allocatable, device, target:: data_d(:), data_2_d(:) ARRTYPE, allocatable, managed, target:: data_m(:), data_2_m(:) - ARRTYPE, pointer, device, contiguous :: work_d(:) ARRTYPE, pointer, device:: input(:), output(:) integer :: dtype = DTYPE @@ -417,7 +418,22 @@ function run_test(arguments, silent) result(res) else allocate(data_d(data_num_elements)) endif - CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + + ! Allocate workspace (reuse exising workspace if able) + if (work_backend == config%transpose_comm_backend) then + if (size(work_d) < workspace_num_elements) then + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)) + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + endif + elseif (work_backend > 0 .and. work_backend /= config%transpose_comm_backend) then + CHECK_CUDECOMP(cudecompFree(handle, grid_desc_cache(work_backend), work_d)) + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + work_backend = config%transpose_comm_backend; + else + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, work_d, workspace_num_elements)) + work_backend = config%transpose_comm_backend; + endif + if (out_of_place) then if (use_managed_memory) then allocate(data_2_m(data_num_elements)) @@ -519,7 +535,6 @@ function run_test(arguments, silent) result(res) deallocate(data_d) if (out_of_place) deallocate(data_2_d) endif - CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_d)) end function run_test end module From 3a7ad3cd220e5abb3edd38a46b1c60517bcb8e21 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 11 Mar 2025 14:52:10 -0700 Subject: [PATCH 7/8] Update documentation. --- docs/api/f_api.rst | 58 ++++++++++++++++++++++++++++---------------- docs/basic_usage.rst | 38 ++++++++++++++++------------- 2 files changed, 58 insertions(+), 38 deletions(-) diff --git a/docs/api/f_api.rst b/docs/api/f_api.rst index 48df4c7..0fef5b4 100644 --- a/docs/api/f_api.rst +++ b/docs/api/f_api.rst @@ -94,12 +94,13 @@ ____________________ A data structure containing geometry information about a pencil data buffer. - :f integer shape(3): pencil shape (in local order, including halo elements) - :f integer lo(3): lower bound coordinates (in local order, excluding halo elements) - :f integer hi(3): upper bound coordinates (in local order, excluding halo elements) + :f integer shape(3): pencil shape (in local order, including halo and padding elements) + :f integer lo(3): lower bound coordinates (in local order, excluding halo and padding elements) + :f integer hi(3): upper bound coordinates (in local order, excluding halo and padding elements) :f integer order(3): data layout order (e.g. 3,2,1 means memory is ordered Z,Y,X) :f integer halo_extents(3): halo extents by dimension (in global order) - :f int64 size: number of elements in pencil (including halo elements) + :f integer padding(3): padding by dimension (in global order) + :f int64 size: number of elements in pencil (including halo and padding elements) Communication Backends --------------------------------- @@ -345,7 +346,7 @@ Helper Functions cudecompGetPencilInfo _____________________ -.. f:function:: cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis[, halo_extents]) +.. f:function:: cudecompGetPencilInfo(handle, grid_desc, pencil_info, axis[, halo_extents, padding]) Collects geometry information about assigned pencils, by domain axis. @@ -355,7 +356,8 @@ _____________________ :p cudecompGridDesc grid_desc [in]: A cuDecomp grid descriptor. :p cudecompPencilInfo pencil_info [out]: A cuDecomp pencil information structure. :p integer axis [in]: The domain axis the desired pencil is aligned with. - :p integer halo_extents(3) [in, optional]: An array of three integers to define halo region extents of the pencil, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). + :p integer halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the pencil, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). + :p integer padding(3) [in,optional]: An array of three integers to define padding of the pencil, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. ------ @@ -434,7 +436,7 @@ Transposition Functions cudecompTransposeXToY _____________________ -.. f:function:: cudecompTransposeXToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream]) +.. f:function:: cudecompTransposeXToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream]) Function to transpose data from X-axis aligned pencils to a Y-axis aligned pencils. @@ -447,8 +449,10 @@ _____________________ :p T work(*) [in]: Device array to use for transpose workspace. :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos. - :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer output_halo_extents(3) [in,optional]: Similar to :code:`input_halo_extents` but for the output data. If not provided, output data is assumed to have no halos. + :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. ------ @@ -458,7 +462,7 @@ _____________________ cudecompTransposeYtoZ _____________________ -.. f:function:: cudecompTransposeYToZ(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream]) +.. f:function:: cudecompTransposeYToZ(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream]) Function to transpose data from Y-axis aligned pencils to a Z-axis aligned pencils. @@ -472,7 +476,9 @@ _____________________ :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos. :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. ------ @@ -482,7 +488,7 @@ _____________________ cudecompTransposeZToY _____________________ -.. f:function:: cudecompTransposeZToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream]) +.. f:function:: cudecompTransposeZToY(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream]) Function to transpose data from Z-axis aligned pencils to a Y-axis aligned pencils. @@ -496,7 +502,9 @@ _____________________ :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos. :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. @@ -507,7 +515,7 @@ _____________________ cudecompTransposeYToX _____________________ -.. f:function:: cudecompTransposeYToX(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, stream]) +.. f:function:: cudecompTransposeYToX(handle, grid_desc, input, output, work, dtype[, input_halo_extents, output_halo_extents, input_padding, output_padding, stream]) Function to transpose data from Y-axis aligned pencils to a X-axis aligned pencils. @@ -521,7 +529,9 @@ _____________________ :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer input_halo_extents(3) [in,optional]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). If not provided, input data is assumed to have no halos. :p integer output_halo_extents(3) [in,optional]: Similar to :code:`intput_halo_extents` but for the output data. If not provided, output data is assumed to have no halos. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer input_padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer output_padding(3) [in,optional]: Similar to :code:`input_padding`, but for the output data. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. ------ @@ -534,7 +544,7 @@ Halo Exchange Functions cudecompUpdateHalosX ____________________ -.. f:function:: cudecompUpdateHalosX(handle, grid_desc, input, work, dtype, halo_extents, halo_periods[, stream]) +.. f:function:: cudecompUpdateHalosX(handle, grid_desc, input, work, dtype, halo_extents, halo_periods, dim[, padding, stream]) Function to perform halo communication of X-axis aligned pencil data. @@ -547,7 +557,9 @@ ____________________ :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer halo_extents(3) [in]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). :p logical halo_periods(3) [in]: An array of three boolean values to define halo periodicity of the input data, in global order. If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer dim [in]: Which pencil dimension (global indexed) to perform the halo update. + :p integer padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. ------ @@ -557,7 +569,7 @@ ____________________ cudecompUpdateHalosY ____________________ -.. f:function:: cudecompUpdateHalosY(handle, grid_desc, input, work, dtype, halo_extents, halo_periods[, stream]) +.. f:function:: cudecompUpdateHalosY(handle, grid_desc, input, work, dtype, halo_extents, halo_periods, dim[, padding, stream]) Function to perform halo communication of Y-axis aligned pencil data. @@ -570,7 +582,9 @@ ____________________ :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer halo_extents(3) [in]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). :p logical halo_periods(3) [in]: An array of three boolean values to define halo periodicity of the input data, in global order. If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer dim [in]: Which pencil dimension (global indexed) to perform the halo update. + :p integer padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. ------ @@ -580,7 +594,7 @@ ____________________ cudecompUpdateHalosZ ____________________ -.. f:function:: cudecompUpdateHalosZ(handle, grid_desc, input, work, dtype, halo_extents, halo_periods[, stream]) +.. f:function:: cudecompUpdateHalosZ(handle, grid_desc, input, work, dtype, halo_extents, halo_periods, dim[, padding, stream]) Function to perform halo communication of Z-axis aligned pencil data. @@ -593,5 +607,7 @@ ____________________ :p cudecompDataType dtype [in]: The :code:`cudecompDataType` to use for the operation. :p integer halo_extents(3) [in]: An array of three integers to define halo region extents of the input data, in global order. The i-th entry in this array should contain the number of halo elements (per direction) expected in the along the i-th global domain axis. Symmetric halos are assumed (e.g. a value of one in halo_extents means there are 2 halo elements, one element on each side). :p logical halo_periods(3) [in]: An array of three boolean values to define halo periodicity of the input data, in global order. If the i-th entry in this array is true, the domain is treated periodically along the i-th global domain axis. - :p integer(cuda_stream_kind) stream [in, optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. + :p integer dim [in]: Which pencil dimension (global indexed) to perform the halo update. + :p integer padding(3) [in,optional]: An array of three integers to define padding of the input data, in global order. The i-th entry in this array should contain the number of elements to treat as padding in the i-th global domain axis. + :p integer(cuda_stream_kind) stream [in,optional]: CUDA stream to enqueue GPU operations into. If not provided, operations are enqueued in the default stream. :r cudecompResult res: :code:`CUDECOMP_RESULT_SUCCESS` on success or error code on failure. diff --git a/docs/basic_usage.rst b/docs/basic_usage.rst index 365403f..abfe6e6 100644 --- a/docs/basic_usage.rst +++ b/docs/basic_usage.rst @@ -221,10 +221,9 @@ First, we can query basic information (i.e. metadata) about the pencil configura assigned to this process using the :ref:`cudecompGetPencilInfo-ref` function. This function returns a pencil info structure (:ref:`cudecompPencilInfo_t-ref`) that contains the shape, global lower and upper index bounds (:code:`lo` and :code:`hi`), size of the pencil, and an :code:`order` array to indicate the memory layout -that will be used (to handle permuted, `axis-contiguous` layouts). Additionally, there is a :code:`halo_extents` data -member that indicates the depth of halos for the pencil, by axis, if the argument was provided -to this function. This data member is a copy of the argument provided to the function -and is stored for convenience. +that will be used (to handle permuted layouts). Additionally, there are :code:`halo_extents` and :code:`padding` data +members that indicates halo and padding configurations for the pencil, by axis. This data member is a copy of the +argument provided to the function and is stored for convenience. It should be noted that these metadata structures are provided solely for users to interpret and access data from the data buffers used as input/output arguments to the different @@ -234,7 +233,8 @@ for pencil buffers, nor uses these pencil information structures as input argume In this example, we apply halo elements to the :math:`X`-pencils only. For the other pencils, we instead pass a :code:`nullptr` for the :code:`halo_extents` argument, which is equivalent to setting :code:`halo_extents = [0, 0, 0]` in C/C++. For Fortran, :code:`halo_extents` is optional -and defaults to no halo regions. +and defaults to no halo regions. Similarly, we pass a :code:`nullptr` for the :code:`padding` argument to specify +no padding for all pencils. For Fortran, :code:`padding` is optional, and defaults to no padding. .. tabs:: @@ -243,15 +243,15 @@ and defaults to no halo regions. // Get X-pencil information (with halo elements). cudecompPencilInfo_t pinfo_x; int32_t halo_extents_x[3]{1, 1, 1}; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_x, 0, halo_extents_x, nullptr)); // Get Y-pencil information cudecompPencilInfo_t pinfo_y; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_y, 1, nullptr, nullptr)); // Get Z-pencil information cudecompPencilInfo_t pinfo_z; - CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr)); + CHECK_CUDECOMP_EXIT(cudecompGetPencilInfo(handle, grid_desc, &pinfo_z, 2, nullptr, nullptr)); .. code-tab:: fortran @@ -572,8 +572,10 @@ Transposing the data -------------------- Now, we can use cuDecomp's transposition routines to transpose our data. In these calls, we are using the :code:`data_d` array as both input and output (in-place), but you can also use distinct input and output buffers for -out-of-place operations. For the transposes between :math:`Y`- and :math:`Z`-pencils, we can pass -null pointers to the halo extent arguments to the routines to ignore them in C/C++, or leave them unspecified in Fortran. +out-of-place operations. For the transposes between :math:`Y`- and :math:`Z`-pencils, we pass +null pointers to the halo extent arguments to the routines in C/C++, or leave them unspecified in Fortran. +For all transposes, we pass null pointers to the padding arguments to the routines to disable padding in C/C++, or leave +them unspecified in Fortran. .. tabs:: @@ -581,19 +583,19 @@ null pointers to the halo extent arguments to the routines to ignore them in C/C // Transpose from X-pencils to Y-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeXToY(handle, grid_desc, data_d, data_d, transpose_work_d, - CUDECOMP_DOUBLE, pinfo_x.halo_extents, nullptr, 0)); + CUDECOMP_DOUBLE, pinfo_x.halo_extents, nullptr, nullptr, nullptr, 0)); // Transpose from Y-pencils to Z-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeYToZ(handle, grid_desc, data_d, data_d, transpose_work_d, - CUDECOMP_DOUBLE, nullptr, nullptr, 0)); + CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0)); // Transpose from Z-pencils to Y-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeZToY(handle, grid_desc, data_d, data_d, transpose_work_d, - CUDECOMP_DOUBLE, nullptr, nullptr, 0)); + CUDECOMP_DOUBLE, nullptr, nullptr, nullptr, nullptr, 0)); // Transpose from Y-pencils to X-pencils. CHECK_CUDECOMP_EXIT(cudecompTransposeYToX(handle, grid_desc, data_d, data_d, transpose_work_d, - CUDECOMP_DOUBLE, nullptr, pinfo_x.halo_extents, 0)); + CUDECOMP_DOUBLE, nullptr, pinfo_x.halo_extents, nullptr, nullptr, 0)); .. code-tab:: fortran @@ -618,6 +620,8 @@ Updating halo regions In this example, we have halos for the :math:`X`-pencils only. We can use cuDecomp's halo update routines to update the halo regions of this pencil in the three domain directions. In this example, we set the :code:`halo_periods` argument to enable periodic halos along all directions. +We pass null pointers to the padding arguments to the routines to disable padding in C/C++, or leave +them unspecified in Fortran. .. tabs:: @@ -629,17 +633,17 @@ we set the :code:`halo_periods` argument to enable periodic halos along all dire // Update X-pencil halos in X direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, pinfo_x.halo_extents, halo_periods, - 0, 0)); + 0, nullptr, 0)); // Update X-pencil halos in Y direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, pinfo_x.halo_extents, halo_periods, - 1, 0)); + 1, nullptr, 0)); // Update X-pencil halos in Z direction CHECK_CUDECOMP_EXIT(cudecompUpdateHalosX(handle, grid_desc, data_d, halo_work_d, CUDECOMP_DOUBLE, pinfo_x.halo_extents, halo_periods, - 2, 0)); + 2, nullptr, 0)); .. code-tab:: fortran From a3805be878052f2106455811d6c5ca145e2616b0 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 17 Mar 2025 10:32:53 -0700 Subject: [PATCH 8/8] Remove unused CHECK_CUDA macros from Fortran tests. --- tests/fortran/halo_test.f90 | 1 - tests/fortran/transpose_test.f90 | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90 index 589f455..d64c681 100644 --- a/tests/fortran/halo_test.f90 +++ b/tests/fortran/halo_test.f90 @@ -29,7 +29,6 @@ #define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) #define CHECK_CUDECOMP(f) if (f /= CUDECOMP_RESULT_SUCCESS) then; res = 1; return; endif #define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1) -#define CHECK_CUDA(f) if (f /= cudaSuccess) thenl res = 1; return; endif #ifdef R32 #define ARRTYPE real(real32) diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index 29b9cc0..02810df 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -29,7 +29,6 @@ #define CHECK_CUDECOMP_EXIT(f) if (f /= CUDECOMP_RESULT_SUCCESS) call exit(1) #define CHECK_CUDECOMP(f) if (f /= CUDECOMP_RESULT_SUCCESS) then; res = 1; return; endif #define CHECK_CUDA_EXIT(f) if (f /= cudaSuccess) call exit(1) -#define CHECK_CUDA(f) if (f /= cudaSuccess) thenl res = 1; return; endif #ifdef R32 #define ARRTYPE real(real32)