From d29666505a7b0a9357a419e50cb2474fd559e98e Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 18:54:15 +0000
Subject: [PATCH 1/8] Revert "Add patched CG headers to rocm install path (#9)"

This reverts commit 1c69737e1a8a8ae5ed9d295937458d54a65f4702.
---
 docker/Dockerfile.rocm | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 7b80c3facbd4..3e850a5238b3 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,8 +168,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
     git checkout . && \
     git checkout master && \
-    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
-    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
-    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From f50fa7b98254adb7d205255180c27265579df52c Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 18:54:40 +0000
Subject: [PATCH 2/8] Revert "Update headers and include_dirs to enable
 transformer extension (#8)"

This reverts commit 7be71d322e2e4e33a1dc6b85044e23d4e54b0283.
---
 .../hip/hcc_detail/hip_cooperative_groups.h   | 362 ------------------
 .../hip_cooperative_groups_helper.h           | 183 ---------
 op_builder/transformer.py                     |   6 +-
 3 files changed, 1 insertion(+), 550 deletions(-)
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h

diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
deleted file mode 100644
index 20e7bb94b8ad..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- * @file hcc_detail/hip_cooperative_groups.h
- *
- * @brief Device side implementation of `Cooperative Group` feature.
- *
- * Defines new types and device API wrappers related to `Cooperative Group`
- * feature, which the programmer can directly use in his kernel(s) in order to
- * make use of this feature.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-
-//#if __cplusplus
-#if __cplusplus && defined(__clang__) && defined(__HIP__)
-#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
-#include
-
-namespace cooperative_groups {
-
-/** \brief The base type of all cooperative group types
- *
- *  \details Holds the key properties of a constructed cooperative group type
- *           object, like the group type, its size, etc
- */
-/*
-class thread_group {
-  protected:
-    uint32_t _type;  // thread_group type
-    uint32_t _size;  // total number of threads in the tread_group
-    uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
-                     // LSB represents lane 0, and MSB represents lane 63
-
-    // Construct a thread group, and set thread group type and other essential
-    // thread group properties. This generic thread group is directly constructed
-    // only when the group is supposed to contain only the calling the thread
-    // (throurh the API - `this_thread()`), and in all other cases, this thread
-    // group object is a sub-object of some other derived thread group object
-    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
-                                  uint64_t mask = (uint64_t)0) {
-      _type = type;
-      _size = size;
-      _mask = mask;
-    }
-
-  public:
-    // Total number of threads in the thread group, and this serves the purpose
-    // for all derived cooperative group types since their `size` is directly
-    // saved during the construction
-    __CG_QUALIFIER__ uint32_t size() const {
-      return _size;
-    }
-    // Rank of the calling thread within [0, size())
-    __CG_QUALIFIER__ uint32_t thread_rank() const;
-    // Is this cooperative group type valid?
-    __CG_QUALIFIER__ bool is_valid() const;
-    // synchronize the threads in the thread group
-    __CG_QUALIFIER__ void sync() const;
-};
-*/
-
-class thread_group {
-  protected:
-    bool _tiled_partition;  // this_thread_block() constructor sets to false
-    uint32_t _size;         // this_thread_block() constructor sets to size()
-    uint32_t local_rank;    // this_thread_block() constructor sets to thread_rank()
-    uint32_t _mask;
-    uint32_t _type;
-
-  public:
-    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t group_size,
-                                  uint64_t mask = (uint64_t)0) {
-      _type = type;
-      _size = group_size;
-      _mask = mask;
-      local_rank = internal::workgroup::thread_rank();
-    }
-
-    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent,
-                                          unsigned int tile_size) {
-      if ( (ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 ||
-          tile_size > 64 || parent.size() < tile_size)
-        _tiled_partition = false;
-        //xxx : abort
-      _tiled_partition = true;
-      _size = tile_size;
-      local_rank = parent.thread_rank() % tile_size;
-    }
-    __CG_QUALIFIER__ void sync() const;
-    __CG_QUALIFIER__ uint32_t size() const {
-      return _size;
-    }
-    __CG_QUALIFIER__ uint32_t thread_rank() const;
-    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const {
-      return (__shfl_down(var, delta, _size));
-    }
-    __CG_QUALIFIER__ float shfl_xor(float var, int mask) const {
-      return (__shfl_xor(var, mask, _size));
-    }
-    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const {
-      return (__shfl(var, src_lane, _size));
-    }
-    __CG_QUALIFIER__ bool is_valid() const;
-};
-
-/** \brief The multi-grid cooperative group type
- *
- *  \details Represents an inter-device cooperative group type where the
- *           participating threads within the group spans across multple
- *           devices, running the (same) kernel on these devices
- */
-class multi_grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
-
- protected:
-  // Construct mutli-grid thread group (through the API this_multi_grid())
-  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
-    : thread_group(internal::cg_multi_grid, size) { }
-
- public:
-  // Number of invocations participating in this multi-grid group. In other
-  // words, the number of GPUs
-  __CG_QUALIFIER__ uint32_t num_grids() {
-    return internal::multi_grid::num_grids();
-  }
-  // Rank of this invocation. In other words, an ID number within the range
-  // [0, num_grids()) of the GPU, this kernel is running on
-  __CG_QUALIFIER__ uint32_t grid_rank() {
-    return internal::multi_grid::grid_rank();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::multi_grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::multi_grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::multi_grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct multi-grid cooperative
- *         group type object - `multi_grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ multi_grid_group
-this_multi_grid() {
-  return multi_grid_group(internal::multi_grid::size());
-}
-
-/** \brief The grid cooperative group type
- *
- *  \details Represents an inter-workgroup cooperative group type where the
- *           participating threads within the group spans across multiple
- *           workgroups running the (same) kernel on the same device
- */
-class grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ grid_group this_grid();
-
- protected:
-  // Construct grid thread group (through the API this_grid())
-  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
-    : thread_group(internal::cg_grid, size) { }
-
- public:
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct grid cooperative group type
- *         object - `grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ grid_group
-this_grid() {
-  return grid_group(internal::grid::size());
-}
-
-/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
- *         type
- *
- *  \details Represents an intra-workgroup cooperative group type where the
- *           participating threads within the group are exctly the same threads
- *           which are participated in the currently executing `workgroup`
- */
-class thread_block : public thread_group {
-  // Only these friend functions are allowed to construct an object of this
-  // class and access its resources
-  friend __CG_QUALIFIER__ thread_block this_thread_block();
-
- protected:
-  // Construct a workgroup thread group (through the API this_thread_block())
-  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
-    : thread_group(internal::cg_workgroup, size) { }
-
- public:
-  // 3-dimensional block index within the grid
-  __CG_QUALIFIER__ dim3 group_index() {
-    return internal::workgroup::group_index();
-  }
-  // 3-dimensional thread index within the block
-  __CG_QUALIFIER__ dim3 thread_index() {
-    return internal::workgroup::thread_index();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::workgroup::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::workgroup::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::workgroup::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct workgroup cooperative
- *         group type object - `thread_block`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `thread_block`. Instead, he should construct it through this API
- *           function
- */
-__CG_QUALIFIER__ thread_block
-this_thread_block() {
-  return thread_block(internal::workgroup::size());
-}
-
-/**
- *  Implemenation of all publicly exposed base class APIs
- */
-__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->thread_rank());
-    }
-    case internal::cg_coalesced_tile: {
-      return local_rank;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return -1;
-    }
-  }
-}
-
-__CG_QUALIFIER__ bool thread_group::is_valid() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->is_valid());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->is_valid());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->is_valid());
-    }
-    case internal::cg_coalesced_tile: {
-      return _tiled_partition;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return false;
-    }
-  }
-}
-
-__CG_QUALIFIER__ void thread_group::sync() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      static_cast<const multi_grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_grid: {
-      static_cast<const grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_workgroup: {
-      static_cast<const thread_block*>(this)->sync();
-      break;
-    }
-    case internal::cg_coalesced_tile: {
-      if (!_tiled_partition)  // If in a tiled partition, this is a no-op
-        __syncthreads();
-      break;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-    }
-  }
-}
-
-/**
- *  Implemenation of publicly exposed `wrapper` APIs on top of basic cooperative
- *  group type APIs
- */
-template <typename CGTy>
-__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
-  return g.size();
-}
-
-template <typename CGTy>
-__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
-  return g.thread_rank();
-}
-
-template <typename CGTy>
-__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
-  return g.is_valid();
-}
-
-template <typename CGTy>
-__CG_QUALIFIER__ void sync(CGTy const &g) {
-  g.sync();
-}
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
deleted file mode 100644
index 7f8e69da11c3..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- * @file hcc_detail/hip_cooperative_groups_helper.h
- *
- * @brief Device side implementation of cooperative group feature.
- *
- * Defines helper constructs and APIs which aid the types and device API
- * wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-
-#if __cplusplus
-#include
-#include
-
-#if !defined(__align__)
-#define __align__(x) __attribute__((aligned(x)))
-#endif
-
-#if !defined(__CG_QUALIFIER__)
-#define __CG_QUALIFIER__ __device__ __forceinline__
-#endif
-
-#if !defined(__CG_STATIC_QUALIFIER__)
-#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
-#endif
-
-#if !defined(WAVEFRONT_SIZE)
-#define WAVEFRONT_SIZE 64
-#endif
-
-namespace cooperative_groups {
-
-namespace internal {
-
-/** \brief Enums representing different cooperative group types
- */
-typedef enum {
-  cg_invalid,
-  cg_multi_grid,
-  cg_grid,
-  cg_workgroup,
-  cg_coalesced_tile
-} group_type;
-
-/**
- *  Functionalities related to multi-grid cooperative group type
- */
-namespace multi_grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
-  return (uint32_t)__ockl_multi_grid_num_grids();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
-  return (uint32_t)__ockl_multi_grid_grid_rank();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)__ockl_multi_grid_size();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return (uint32_t)__ockl_multi_grid_thread_rank();
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_multi_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_multi_grid_sync();
-}
-
-} // namespace multi_grid
-
-/**
- *  Functionalities related to grid cooperative group type
- */
-namespace grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
-                    (hipBlockDim_y * hipGridDim_y) *
-                    (hipBlockDim_x * hipGridDim_x));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx =
-    (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
-               (hipBlockIdx_y * hipGridDim_x) +
-               (hipBlockIdx_x));
-
-  // Compute total number of threads being passed to reach current workgroup
-  // within grid
-  uint32_t num_threads_till_current_workgroup =
-    (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-
-  // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank =
-    (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-               (hipThreadIdx_y * hipBlockDim_x) +
-               (hipThreadIdx_x));
-
-  return (num_threads_till_current_workgroup + local_thread_rank);
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_grid_sync();
-}
-
-} // namespace grid
-
-/**
- *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
- *  cooperative group type
- */
-namespace workgroup {
-
-__CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
-               (uint32_t)hipBlockIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
-               (uint32_t)hipThreadIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-                     (hipThreadIdx_y * hipBlockDim_x) +
-                     (hipThreadIdx_x)));
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  // TODO(mahesha) any functionality need to be added here? I believe not
-  return true;
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __syncthreads();
-}
-
-} // namespace workgroup
-
-} // namespace internal
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 606d0be255ef..234fb616f0b3 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,11 +29,7 @@ def sources(self):
         ]
 
     def include_paths(self):
-        includes = ['csrc/includes']
-        if is_rocm_pytorch:
-            from torch.utils.cpp_extension import ROCM_HOME
-            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
-        return includes
+        return ['csrc/includes']
 
     def nvcc_args(self):
         args = [

From 2585f2918b831d68a3460bc1b47aad827a47d3d8 Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 19:04:37 +0000
Subject: [PATCH 3/8] Added back the required code from the commits,
 1c69737e1a8a8ae5ed9d295937458d54a65f4702 and
 7be71d322e2e4e33a1dc6b85044e23d4e54b0283

---
 docker/Dockerfile.rocm    | 2 +-
 op_builder/transformer.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 3e850a5238b3..5cbb0be580eb 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,6 +168,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
     git checkout . && \
     git checkout master && \
-    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 234fb616f0b3..606d0be255ef 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,7 +29,11 @@ def sources(self):
         ]
 
     def include_paths(self):
-        return ['csrc/includes']
+        includes = ['csrc/includes']
+        if is_rocm_pytorch:
+            from torch.utils.cpp_extension import ROCM_HOME
+            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
+        return includes
 
     def nvcc_args(self):
         args = [

From 0be96458a329b5df77d98e43d85f614b89fb388d Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 19:51:43 +0000
Subject: [PATCH 4/8] Revert "Cooperative Groups workaround for transformer
 kernels extension"

This reverts commit fbddd9316c3686a15e3d805cbececf25feb4aa69.
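
For context on what this revert restores: the workaround had swapped the statically sized cg::thread_block_tile for a hand-rolled, runtime-sized cg::thread_group built on the patched HIP headers above. A minimal sketch of the upstream idiom that comes back with this revert follows; the kernel and buffer names are illustrative, not taken from the patch.

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    // Reduce each 32-wide tile of a block; one result per tile (sketch only).
    __global__ void tile_sum(const float* in, float* out)
    {
        cg::thread_block b = cg::this_thread_block();
        // Tile width is a compile-time constant again; the workaround had to
        // carry it at runtime because HIP lacked thread_block_tile<N>.
        cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
        int idx = blockDim.x * blockIdx.x + threadIdx.x;
        float v = in[idx];
        // Butterfly reduction: every lane ends up with the tile total.
        for (int i = 1; i < 32; i <<= 1) v += g.shfl_xor(v, i);
        if (g.thread_rank() == 0) out[idx / 32] = v;
    }
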
---
 csrc/transformer/general_kernels.cu   |  5 +--
 csrc/transformer/normalize_kernels.cu | 64 +++++++--------------
 csrc/transformer/softmax_kernels.cu   | 16 ++-----
 3 files changed, 21 insertions(+), 64 deletions(-)

diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu
index 180e93ce4dde..7d318773f354 100644
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@@ -11,10 +11,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
-
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index c69c47ebf1c8..366e93724638 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -28,9 +28,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -128,9 +126,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -318,9 +314,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -416,9 +410,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -626,9 +618,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -695,9 +685,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -762,9 +750,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -864,9 +850,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1081,9 +1065,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1178,9 +1160,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1382,9 +1362,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1446,9 +1424,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1507,9 +1483,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1613,9 +1587,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1833,9 +1805,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1938,9 +1908,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index 18398148f559..be776b0c074d 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -20,9 +20,7 @@ __global__ void attn_softmax(float* vals,
     int block_width = blockStride * seq_length;
 
    cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -153,9 +151,7 @@ __global__ void attn_softmax(__half* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -449,9 +445,7 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length)
                           : MAX_THREAD_ITERATIONS);
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -526,9 +520,7 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
     }
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);

From f428da58c96b46afcc68952c19c078e7d4e3e738 Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 21:29:39 +0000
Subject: [PATCH 5/8] Added defined(__HIP_PLATFORM_HCC__) to kernels code

---
 csrc/lamb/fused_lamb_cuda_kernel.cu           |  2 +-
 csrc/quantization/quantizer.cu                |  8 ++++----
 csrc/transformer/gelu_kernels.cu              |  6 +++---
 csrc/transformer/inference/csrc/dequantize.cu |  2 +-
 csrc/transformer/inference/csrc/gelu.cu       |  6 +++---
 csrc/transformer/inference/csrc/normalize.cu  |  4 ++--
 csrc/transformer/inference/csrc/softmax.cu    |  2 +-
 csrc/transformer/normalize_kernels.cu         |  4 ++--
 csrc/transformer/softmax_kernels.cu           |  2 +-
 csrc/transformer/transform_kernels.cu         | 10 +++++-----
 10 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e12b2c8585b4..1a8cd6071f91 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -122,7 +122,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
 
     cg::sync(cta);
 #endif
 
-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
 
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu
index c48ae38969e3..f79c3ecb1e12 100644
--- a/csrc/quantization/quantizer.cu
+++ b/csrc/quantization/quantizer.cu
@@ -5,7 +5,7 @@ namespace cg = cooperative_groups;
 
 __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
                                    int num_bits,
                                    std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals,
 
 __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu
index 12048006266e..dbb8828ce977 100644
--- a/csrc/transformer/gelu_kernels.cu
+++ b/csrc/transformer/gelu_kernels.cu
@@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations)
 
 __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
                                 int row_stride,
                                 int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
                             int row_stride,
                             int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu
index ddf7a958822a..43d50f00c058 100644
--- a/csrc/transformer/inference/csrc/dequantize.cu
+++ b/csrc/transformer/inference/csrc/dequantize.cu
@@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
                                   unsigned groups,
                                   unsigned merge_count)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     unsigned merge_hidden = hidden_dim >> merge_count;
     unsigned quantization_stride = (merge_hidden * output_size) / groups;
diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index fc3faacc54e8..1737855f614d 100755
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -39,7 +39,7 @@ __global__ void fused_bias_gelu(__half* input,
                                 int total_count,
                                 int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -117,7 +117,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size)
 
 __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -195,7 +195,7 @@ __global__ void fused_bias_residual(__half* input,
                                     int total_count,
                                     int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* residual_cast = reinterpret_cast<const float2*>(residual);
diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu
index ecd73154f37f..dc0f6be01144 100755
--- a/csrc/transformer/inference/csrc/normalize.cu
+++ b/csrc/transformer/inference/csrc/normalize.cu
@@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
                                                float epsilon,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
                                           int row_stride,
                                           bool preLN)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
 
     cg::thread_block b = cg::this_thread_block();
diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu
index cee509965106..b347945df636 100644
--- a/csrc/transformer/inference/csrc/softmax.cu
+++ b/csrc/transformer/inference/csrc/softmax.cu
@@ -37,7 +37,7 @@ __global__ void attn_softmax_v2(__half* vals,
                                 int num_seq,
                                 float scale)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index 366e93724638..c9bc4a46ee5e 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -121,7 +121,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* means,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -404,7 +404,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* vars,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index be776b0c074d..a4d84c37dd3b 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -142,7 +142,7 @@ __global__ void attn_softmax(__half* vals,
                              int seq_length,
                              int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float partialSum[MAX_WARP_NUM];
 
     int warp_num = blockDim.x >> 5;
diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu
index 7d8a27eeeb43..b68d70f67ae1 100755
--- a/csrc/transformer/transform_kernels.cu
+++ b/csrc/transformer/transform_kernels.cu
@@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
                                        int heads,
                                        int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
                                                 int heads,
                                                 int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
                                            int seq_length,
                                            int heads)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
@@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
                                         int hidden_dim,
                                         int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * (seq_length / head_ext);
     int d1_stride = hidden_dim;
@@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
                                     int seq_length,
                                     int hidden_dim)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;

From ed2ee34e35e82ab27497859e2c36537fa2f43152 Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 21:29:45 +0000
Subject: [PATCH 6/8] Revert "Enable cooperative groups for ROCm"

This reverts commit 077638dabbd8a6183758f88352ba69fad53bf98a.
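
The guarded include and synchronization pattern that this revert strips out of csrc/lamb/fused_lamb_cuda_kernel.cu (and that the next patch partially re-adds) follows the sketch below. HIP_VERSION and __HIP_PLATFORM_HCC__ come from the HIP toolchain headers; the helper function name is hypothetical.

    #if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
    #include <hip/hip_cooperative_groups.h>
    #else
    #include <cooperative_groups.h>
    #endif
    namespace cg = cooperative_groups;

    __device__ void block_barrier(cg::thread_block& cta)
    {
    #if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
        cta.sync();     // member sync, available on ROCm releases after 3.05
    #else
        cg::sync(cta);  // free-function sync, as the plain CUDA path uses
    #endif
    }
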
---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 44 -----------------------------
 1 file changed, 44 deletions(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 1a8cd6071f91..f74746f93752 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,11 +14,7 @@
 #include
 //#include
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-#include <hip/hip_cooperative_groups.h>
-#else
 #include <cooperative_groups.h>
-#endif
 #include
 #include
@@ -82,11 +78,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 256];
         s_b[tid] = b_sum = b_sum + s_b[tid + 256];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 256) && (tid < 128)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 128];
         s_b[tid] = b_sum = b_sum + s_b[tid + 128];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 128) && (tid < 64)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 64];
         s_b[tid] = b_sum = b_sum + s_b[tid + 64];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
 
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
         s_b[tid] = b_sum = b_sum + s_b[tid + 32];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 32) && (tid < 16)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 16];
         s_b[tid] = b_sum = b_sum + s_b[tid + 16];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 16) && (tid < 8)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 8];
         s_b[tid] = b_sum = b_sum + s_b[tid + 8];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 8) && (tid < 4)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 4];
         s_b[tid] = b_sum = b_sum + s_b[tid + 4];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 4) && (tid < 2)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 2];
         s_b[tid] = b_sum = b_sum + s_b[tid + 2];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 2) && (tid < 1)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 1];
         s_b[tid] = b_sum = b_sum + s_b[tid + 1];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #endif

From 742fd6486d02b84332ac3a8c72c52a22c03661fa Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 21:31:59 +0000
Subject: [PATCH 7/8] Enable cooperative groups for ROCm

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index f74746f93752..a6c610ada499 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,7 +14,11 @@
 #include
 //#include
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include
 #include

From 81b744ef24a92ae2eb463458a075e512ac03918e Mon Sep 17 00:00:00 2001
From: rraminen
Date: Mon, 28 Jun 2021 23:02:35 +0000
Subject: [PATCH 8/8] hip cooperative groups functionality for coalesced_group
 in fused_lamb_cuda_kernel.cu is not implemented yet

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index a6c610ada499..10a17e98a13d 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -106,7 +106,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
 
     cg::sync(cta);
 
-#if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
+#if (__CUDA_ARCH__ >= 300)
 
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
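
The net effect of this last patch: the coalesced_threads() fast path in the LAMB reduction compiles only for CUDA again, since HIP's cooperative groups do not implement coalesced_group yet. A rough sketch of the resulting control flow, under the assumption that s_a holds at least 64 partial sums and blockDim.x >= 64 (function and variable names are illustrative, not from the patch):

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    template <typename T>
    __device__ T final_warp_reduce(T* s_a, unsigned int tid)
    {
        cg::thread_block cta = cg::this_thread_block();
        T a_sum = s_a[tid];
    #if (__CUDA_ARCH__ >= 300)
        // CUDA-only: shuffle within the coalesced warp, no shared-memory traffic.
        if (tid < 32) {
            cg::coalesced_group active = cg::coalesced_threads();
            a_sum += s_a[tid + 32];
            for (int offset = 16; offset > 0; offset >>= 1)
                a_sum += active.shfl_down(a_sum, offset);
        }
    #else
        // Portable fallback: keep folding through shared memory with block syncs.
        for (unsigned int s = 32; s > 0; s >>= 1) {
            if (tid < s) s_a[tid] = a_sum = a_sum + s_a[tid + s];
            cg::sync(cta);
        }
    #endif
        return a_sum;
    }
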