From d29666505a7b0a9357a419e50cb2474fd559e98e Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 18:54:15 +0000
Subject: [PATCH 1/8] Revert "Add patched CG headers to rocm install path (#9)"

This reverts commit 1c69737e1a8a8ae5ed9d295937458d54a65f4702.
---
 docker/Dockerfile.rocm | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 7b80c3facbd4..3e850a5238b3 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,8 +168,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
     git checkout . && \
     git checkout master && \
-    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
-    cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
-    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"

From f50fa7b98254adb7d205255180c27265579df52c Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 18:54:40 +0000
Subject: [PATCH 2/8] Revert "Update headers and include_dirs to enable
 transformer extension (#8)"

This reverts commit 7be71d322e2e4e33a1dc6b85044e23d4e54b0283.
---
 .../hip/hcc_detail/hip_cooperative_groups.h   | 362 ------------------
 .../hip_cooperative_groups_helper.h           | 183 ---------
 op_builder/transformer.py                     |   6 +-
 3 files changed, 1 insertion(+), 550 deletions(-)
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
 delete mode 100644 csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h

diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
deleted file mode 100644
index 20e7bb94b8ad..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- * @file hcc_detail/hip_cooperative_groups.h
- *
- * @brief Device side implementation of `Cooperative Group` feature.
- *
- * Defines new types and device API wrappers related to `Cooperative Group`
- * feature, which the programmer can directly use in his kernel(s) in order to
- * make use of this feature.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
-
-//#if __cplusplus
-#if __cplusplus && defined(__clang__) && defined(__HIP__)
-#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
-#include
-
-namespace cooperative_groups {
-
-/** \brief The base type of all cooperative group types
- *
- *  \details Holds the key properties of a constructed cooperative group type
- *           object, like the group type, its size, etc
- */
-/*
-class thread_group {
-  protected:
-    uint32_t _type;  // thread_group type
-    uint32_t _size;  // total number of threads in the tread_group
-    uint64_t _mask;  // Lanemask for coalesced and tiled partitioned group types,
-                     // LSB represents lane 0, and MSB represents lane 63
-
-    // Construct a thread group, and set thread group type and other essential
-    // thread group properties. This generic thread group is directly constructed
-    // only when the group is supposed to contain only the calling the thread
-    // (throurh the API - `this_thread()`), and in all other cases, this thread
-    // group object is a sub-object of some other derived thread group object
-    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
-                                  uint64_t mask = (uint64_t)0) {
-      _type = type;
-      _size = size;
-      _mask = mask;
-    }
-
-  public:
-    // Total number of threads in the thread group, and this serves the purpose
-    // for all derived cooperative group types since their `size` is directly
-    // saved during the construction
-    __CG_QUALIFIER__ uint32_t size() const {
-      return _size;
-    }
-    // Rank of the calling thread within [0, size())
-    __CG_QUALIFIER__ uint32_t thread_rank() const;
-    // Is this cooperative group type valid?
-    __CG_QUALIFIER__ bool is_valid() const;
-    // synchronize the threads in the thread group
-    __CG_QUALIFIER__ void sync() const;
-};
-*/
-
-class thread_group {
-  protected:
-    bool _tiled_partition;  // this_thread_block() constructor sets to false
-    uint32_t _size;         // this_thread_block() constructor sets to size()
-    uint32_t local_rank;    // this_thread_block() constructor sets to thread_rank()
-    uint32_t _mask;
-    uint32_t _type;
-
-  public:
-    __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t group_size,
-                                  uint64_t mask = (uint64_t)0) {
-      _type = type;
-      _size = group_size;
-      _mask = mask;
-      local_rank = internal::workgroup::thread_rank();
-    }
-
-    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent,
-                                          unsigned int tile_size) {
-      if ( (ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 ||
-          tile_size > 64 || parent.size() < tile_size)
-        _tiled_partition = false;
-        //xxx : abort
-      _tiled_partition = true;
-      _size = tile_size;
-      local_rank = parent.thread_rank() % tile_size;
-    }
-    __CG_QUALIFIER__ void sync() const;
-    __CG_QUALIFIER__ uint32_t size() const {
-      return _size;
-    }
-    __CG_QUALIFIER__ uint32_t thread_rank() const;
-    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const {
-      return (__shfl_down(var, delta, _size));
-    }
-    __CG_QUALIFIER__ float shfl_xor(float var, int mask) const {
-      return (__shfl_xor(var, mask, _size));
-    }
-    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const {
-      return (__shfl(var, src_lane, _size));
-    }
-    __CG_QUALIFIER__ bool is_valid() const;
-};
-
-/** \brief The multi-grid cooperative group type
- *
- *  \details Represents an inter-device cooperative group type where the
- *           participating threads within the group spans across multple
- *           devices, running the (same) kernel on these devices
- */
-class multi_grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
-
- protected:
-  // Construct mutli-grid thread group (through the API this_multi_grid())
-  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
-    : thread_group(internal::cg_multi_grid, size) { }
-
- public:
-  // Number of invocations participating in this multi-grid group. In other
-  // words, the number of GPUs
-  __CG_QUALIFIER__ uint32_t num_grids() {
-    return internal::multi_grid::num_grids();
-  }
-  // Rank of this invocation. In other words, an ID number within the range
-  // [0, num_grids()) of the GPU, this kernel is running on
-  __CG_QUALIFIER__ uint32_t grid_rank() {
-    return internal::multi_grid::grid_rank();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::multi_grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::multi_grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::multi_grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct multi-grid cooperative
- *         group type object - `multi_grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ multi_grid_group
-this_multi_grid() {
-  return multi_grid_group(internal::multi_grid::size());
-}
-
-/** \brief The grid cooperative group type
- *
- *  \details Represents an inter-workgroup cooperative group type where the
- *           participating threads within the group spans across multiple
- *           workgroups running the (same) kernel on the same device
- */
-class grid_group : public thread_group {
-  // Only these friend functions are allowed to construct an object of this class
-  // and access its resources
-  friend __CG_QUALIFIER__ grid_group this_grid();
-
- protected:
-  // Construct grid thread group (through the API this_grid())
-  explicit __CG_QUALIFIER__ grid_group(uint32_t size)
-    : thread_group(internal::cg_grid, size) { }
-
- public:
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::grid::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::grid::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::grid::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct grid cooperative group type
- *         object - `grid_group`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `multi_grid_group`. Instead, he should construct it through this
- *           API function
- */
-__CG_QUALIFIER__ grid_group
-this_grid() {
-  return grid_group(internal::grid::size());
-}
-
-/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
- *         type
- *
- *  \details Represents an intra-workgroup cooperative group type where the
- *           participating threads within the group are exctly the same threads
- *           which are participated in the currently executing `workgroup`
- */
-class thread_block : public thread_group {
-  // Only these friend functions are allowed to construct an object of this
-  // class and access its resources
-  friend __CG_QUALIFIER__ thread_block this_thread_block();
-
- protected:
-  // Construct a workgroup thread group (through the API this_thread_block())
-  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
-    : thread_group(internal::cg_workgroup, size) { }
-
- public:
-  // 3-dimensional block index within the grid
-  __CG_QUALIFIER__ dim3 group_index() {
-    return internal::workgroup::group_index();
-  }
-  // 3-dimensional thread index within the block
-  __CG_QUALIFIER__ dim3 thread_index() {
-    return internal::workgroup::thread_index();
-  }
-  __CG_QUALIFIER__ uint32_t thread_rank() const {
-    return internal::workgroup::thread_rank();
-  }
-  __CG_QUALIFIER__ bool is_valid() const {
-    return internal::workgroup::is_valid();
-  }
-  __CG_QUALIFIER__ void sync() const {
-    internal::workgroup::sync();
-  }
-};
-
-/** \brief User exposed API interface to construct workgroup cooperative
- *         group type object - `thread_block`
- *
- *  \details User is not allowed to directly construct an object of type
- *           `thread_block`. Instead, he should construct it through this API
- *           function
- */
-__CG_QUALIFIER__ thread_block
-this_thread_block() {
-  return thread_block(internal::workgroup::size());
-}
-
-/**
- *  Implemenation of all publicly exposed base class APIs
- */
-__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->thread_rank());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->thread_rank());
-    }
-    case internal::cg_coalesced_tile: {
-      return local_rank;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return -1;
-    }
-  }
-}
-
-__CG_QUALIFIER__ bool thread_group::is_valid() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      return (static_cast<const multi_grid_group*>(this)->is_valid());
-    }
-    case internal::cg_grid: {
-      return (static_cast<const grid_group*>(this)->is_valid());
-    }
-    case internal::cg_workgroup: {
-      return (static_cast<const thread_block*>(this)->is_valid());
-    }
-    case internal::cg_coalesced_tile: {
-      return _tiled_partition;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-      return false;
-    }
-  }
-}
-
-__CG_QUALIFIER__ void thread_group::sync() const {
-  switch (this->_type) {
-    case internal::cg_multi_grid: {
-      static_cast<const multi_grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_grid: {
-      static_cast<const grid_group*>(this)->sync();
-      break;
-    }
-    case internal::cg_workgroup: {
-      static_cast<const thread_block*>(this)->sync();
-      break;
-    }
-    case internal::cg_coalesced_tile: {
-      if (!_tiled_partition)  // If in a tiled partition, this is a no-op
-        __syncthreads();
-      break;
-    }
-    default: {
-      assert(false && "invalid cooperative group type");
-    }
-  }
-}
-
-/**
- *  Implemenation of publicly exposed `wrapper` APIs on top of basic cooperative
- *  group type APIs
- */
-template <typename CGTy>
-__CG_QUALIFIER__ uint32_t group_size(CGTy const &g) {
-  return g.size();
-}
-
-template <typename CGTy>
-__CG_QUALIFIER__ uint32_t thread_rank(CGTy const &g) {
-  return g.thread_rank();
-}
-
-template <typename CGTy>
-__CG_QUALIFIER__ bool is_valid(CGTy const &g) {
-  return g.is_valid();
-}
-
-template <typename CGTy>
-__CG_QUALIFIER__ void sync(CGTy const &g) {
-  g.sync();
-}
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
deleted file mode 100644
index 7f8e69da11c3..000000000000
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-/**
- * @file hcc_detail/hip_cooperative_groups_helper.h
- *
- * @brief Device side implementation of cooperative group feature.
- *
- * Defines helper constructs and APIs which aid the types and device API
- * wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
- */
-#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
-
-#if __cplusplus
-#include
-#include
-
-#if !defined(__align__)
-#define __align__(x) __attribute__((aligned(x)))
-#endif
-
-#if !defined(__CG_QUALIFIER__)
-#define __CG_QUALIFIER__ __device__ __forceinline__
-#endif
-
-#if !defined(__CG_STATIC_QUALIFIER__)
-#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
-#endif
-
-#if !defined(WAVEFRONT_SIZE)
-#define WAVEFRONT_SIZE 64
-#endif
-
-namespace cooperative_groups {
-
-namespace internal {
-
-/** \brief Enums representing different cooperative group types
- */
-typedef enum {
-  cg_invalid,
-  cg_multi_grid,
-  cg_grid,
-  cg_workgroup,
-  cg_coalesced_tile
-} group_type;
-
-/**
- *  Functionalities related to multi-grid cooperative group type
- */
-namespace multi_grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
-  return (uint32_t)__ockl_multi_grid_num_grids();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
-  return (uint32_t)__ockl_multi_grid_grid_rank();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)__ockl_multi_grid_size();
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return (uint32_t)__ockl_multi_grid_thread_rank();
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_multi_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_multi_grid_sync();
-}
-
-} // namespace multi_grid
-
-/**
- *  Functionalities related to grid cooperative group type
- */
-namespace grid {
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)((hipBlockDim_z * hipGridDim_z) *
-                    (hipBlockDim_y * hipGridDim_y) *
-                    (hipBlockDim_x * hipGridDim_x));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx =
-    (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
-               (hipBlockIdx_y * hipGridDim_x) +
-               (hipBlockIdx_x));
-
-  // Compute total number of threads being passed to reach current workgroup
-  // within grid
-  uint32_t num_threads_till_current_workgroup =
-    (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-
-  // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank =
-    (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-               (hipThreadIdx_y * hipBlockDim_x) +
-               (hipThreadIdx_x));
-
-  return (num_threads_till_current_workgroup + local_thread_rank);
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  return (bool)__ockl_grid_is_valid();
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __ockl_grid_sync();
-}
-
-} // namespace grid
-
-/**
- *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
- *  cooperative group type
- */
-namespace workgroup {
-
-__CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y,
-               (uint32_t)hipBlockIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y,
-               (uint32_t)hipThreadIdx_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t size() {
-  return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
-}
-
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
-                     (hipThreadIdx_y * hipBlockDim_x) +
-                     (hipThreadIdx_x)));
-}
-
-__CG_STATIC_QUALIFIER__ bool is_valid() {
-  // TODO(mahesha) any functionality need to be added here? I believe not
-  return true;
-}
-
-__CG_STATIC_QUALIFIER__ void sync() {
-  __syncthreads();
-}
-
-} // namespace workgroup
-
-} // namespace internal
-
-} // namespace cooperative_groups
-
-#endif // __cplusplus
-#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 606d0be255ef..234fb616f0b3 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,11 +29,7 @@ def sources(self):
         ]
 
     def include_paths(self):
-        includes = ['csrc/includes']
-        if is_rocm_pytorch:
-            from torch.utils.cpp_extension import ROCM_HOME
-            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
-        return includes
+        return ['csrc/includes']
 
     def nvcc_args(self):
         args = [

From 2585f2918b831d68a3460bc1b47aad827a47d3d8 Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 19:04:37 +0000
Subject: [PATCH 3/8] Added back the required code from the commits,
 1c69737e1a8a8ae5ed9d295937458d54a65f4702 and
 7be71d322e2e4e33a1dc6b85044e23d4e54b0283

---
 docker/Dockerfile.rocm    | 2 +-
 op_builder/transformer.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 3e850a5238b3..5cbb0be580eb 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -168,6 +168,6 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}
 RUN cd ${STAGE_DIR}/DeepSpeed && \
     git checkout . && \
     git checkout master && \
-    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+    DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
 RUN rm -rf ${STAGE_DIR}/DeepSpeed
 RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
diff --git a/op_builder/transformer.py b/op_builder/transformer.py
index 234fb616f0b3..606d0be255ef 100644
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@@ -29,7 +29,11 @@ def sources(self):
         ]
 
     def include_paths(self):
-        return ['csrc/includes']
+        includes = ['csrc/includes']
+        if is_rocm_pytorch:
+            from torch.utils.cpp_extension import ROCM_HOME
+            includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)]
+        return includes
 
     def nvcc_args(self):
         args = [

From 0be96458a329b5df77d98e43d85f614b89fb388d Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 19:51:43 +0000
Subject: [PATCH 4/8] Revert "Cooperative Groups workaround for transformer
 kernels extension"

This reverts commit fbddd9316c3686a15e3d805cbececf25feb4aa69.
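
For context on what this revert restores: the workaround had swapped the statically sized cg::thread_block_tile for a hand-rolled, runtime-sized cg::thread_group built on the patched HIP headers above. A minimal sketch of the upstream idiom that comes back with this revert follows; the kernel and buffer names are illustrative, not taken from the patch.

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    // Reduce each 32-wide tile of a block; one result per tile (sketch only).
    __global__ void tile_sum(const float* in, float* out)
    {
        cg::thread_block b = cg::this_thread_block();
        // Tile width is a compile-time constant again; the workaround had to
        // carry it at runtime because HIP lacked thread_block_tile<N>.
        cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
        int idx = blockDim.x * blockIdx.x + threadIdx.x;
        float v = in[idx];
        // Butterfly reduction: every lane ends up with the tile total.
        for (int i = 1; i < 32; i <<= 1) v += g.shfl_xor(v, i);
        if (g.thread_rank() == 0) out[idx / 32] = v;
    }
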
---
 csrc/transformer/general_kernels.cu   |  5 +--
 csrc/transformer/normalize_kernels.cu | 64 +++++++--------------
 csrc/transformer/softmax_kernels.cu   | 16 ++-----
 3 files changed, 21 insertions(+), 64 deletions(-)

diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu
index 180e93ce4dde..7d318773f354 100644
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@@ -11,10 +11,7 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
     __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
-
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index c69c47ebf1c8..366e93724638 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -28,9 +28,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -128,9 +126,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -318,9 +314,7 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -416,9 +410,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
-    g.tiled_partition(b, 32);
+    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -626,9 +618,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -695,9 +685,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -762,9 +750,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -864,9 +850,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1081,9 +1065,7 @@ __global__ void LayerNormBackward2(const float* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1178,9 +1160,7 @@ __global__ void LayerNormBackward2(const __half* out_grad,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1382,9 +1362,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1446,9 +1424,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
-    g.tiled_partition(b, TILE_DIM);
+    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
 
     int idx = blockDim.x * blockIdx.x + threadIdx.x;
     int offset = threadIdx.y * width + idx;
@@ -1507,9 +1483,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1613,9 +1587,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1833,9 +1805,7 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -1938,9 +1908,7 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
     int iterations = row_stride / iteration_stride;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index 18398148f559..be776b0c074d 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -20,9 +20,7 @@ __global__ void attn_softmax(float* vals,
     int block_width = blockStride * seq_length;
 
    cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -153,9 +151,7 @@ __global__ void attn_softmax(__half* vals,
     int block_width = blockStride * seq_length;
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int batch = blockIdx.x;
     int row = blockIdx.y;
@@ -449,9 +445,7 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_length)
                           : MAX_THREAD_ITERATIONS);
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
-    g.tiled_partition(b, tbSize);
+    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
 
     int row = blockIdx.x;
     int id = threadIdx.x;
@@ -526,9 +520,7 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
     }
 
     cg::thread_block b = cg::this_thread_block();
-    //cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
-    g.tiled_partition(b, WARP_SIZE);
+    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
 
     for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);

From f428da58c96b46afcc68952c19c078e7d4e3e738 Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 21:29:39 +0000
Subject: [PATCH 5/8] Added defined(__HIP_PLATFORM_HCC__) to kernels code

---
 csrc/lamb/fused_lamb_cuda_kernel.cu           |  2 +-
 csrc/quantization/quantizer.cu                |  8 ++++----
 csrc/transformer/gelu_kernels.cu              |  6 +++---
 csrc/transformer/inference/csrc/dequantize.cu |  2 +-
 csrc/transformer/inference/csrc/gelu.cu       |  6 +++---
 csrc/transformer/inference/csrc/normalize.cu  |  4 ++--
 csrc/transformer/inference/csrc/softmax.cu    |  2 +-
 csrc/transformer/normalize_kernels.cu         |  4 ++--
 csrc/transformer/softmax_kernels.cu           |  2 +-
 csrc/transformer/transform_kernels.cu         | 10 +++++-----
 10 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e12b2c8585b4..1a8cd6071f91 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -122,7 +122,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
 
     cg::sync(cta);
 #endif
 
-#if (__CUDA_ARCH__ >= 300)
+#if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
 
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu
index c48ae38969e3..f79c3ecb1e12 100644
--- a/csrc/quantization/quantizer.cu
+++ b/csrc/quantization/quantizer.cu
@@ -5,7 +5,7 @@ namespace cg = cooperative_groups;
 
 __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -206,7 +206,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
                                    int num_bits,
                                    std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -484,7 +484,7 @@ template void launch_sr_qunatize_kernel(__half* vals,
 
 __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -729,7 +729,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
                                         int num_bits,
                                         std::pair<uint64_t, uint64_t> seed)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu
index 12048006266e..dbb8828ce977 100644
--- a/csrc/transformer/gelu_kernels.cu
+++ b/csrc/transformer/gelu_kernels.cu
@@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations)
 
 __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
                                 int row_stride,
                                 int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
@@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
                             int row_stride,
                             int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int row = blockIdx.x;
     int id = threadIdx.x;
     int loop_stride = blockDim.x;
diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu
index ddf7a958822a..43d50f00c058 100644
--- a/csrc/transformer/inference/csrc/dequantize.cu
+++ b/csrc/transformer/inference/csrc/dequantize.cu
@@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
                                   unsigned groups,
                                   unsigned merge_count)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     unsigned merge_hidden = hidden_dim >> merge_count;
     unsigned quantization_stride = (merge_hidden * output_size) / groups;
diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index fc3faacc54e8..1737855f614d 100755
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -39,7 +39,7 @@ __global__ void fused_bias_gelu(__half* input,
                                 int total_count,
                                 int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -117,7 +117,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count, int hidden_size)
 
 __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@@ -195,7 +195,7 @@ __global__ void fused_bias_residual(__half* input,
                                     int total_count,
                                     int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     float2* input_cast = reinterpret_cast<float2*>(input);
     const float2* residual_cast = reinterpret_cast<const float2*>(residual);
diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu
index ecd73154f37f..dc0f6be01144 100755
--- a/csrc/transformer/inference/csrc/normalize.cu
+++ b/csrc/transformer/inference/csrc/normalize.cu
@@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
                                                float epsilon,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
                                           int row_stride,
                                           bool preLN)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
 
     cg::thread_block b = cg::this_thread_block();
diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu
index cee509965106..b347945df636 100644
--- a/csrc/transformer/inference/csrc/softmax.cu
+++ b/csrc/transformer/inference/csrc/softmax.cu
@@ -37,7 +37,7 @@ __global__ void attn_softmax_v2(__half* vals,
                                 int num_seq,
                                 float scale)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu
index 366e93724638..c9bc4a46ee5e 100644
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@@ -121,7 +121,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* means,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
@@ -404,7 +404,7 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                                __half* vars,
                                                int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     int iteration_stride = blockDim.x;
     int iterations = row_stride / iteration_stride;
 
diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
index be776b0c074d..a4d84c37dd3b 100644
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@@ -142,7 +142,7 @@ __global__ void attn_softmax(__half* vals,
                              int seq_length,
                              int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float partialSum[MAX_WARP_NUM];
 
     int warp_num = blockDim.x >> 5;
diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu
index 7d8a27eeeb43..b68d70f67ae1 100755
--- a/csrc/transformer/transform_kernels.cu
+++ b/csrc/transformer/transform_kernels.cu
@@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
                                        int heads,
                                        int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
                                                 int heads,
                                                 int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * seq_length;
     int d1_stride = hidden_dim;
@@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
                                            int seq_length,
                                            int heads)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;
@@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
                                         int hidden_dim,
                                         int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
 
     int d0_stride = hidden_dim * (seq_length / head_ext);
     int d1_stride = hidden_dim;
@@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
                                     int seq_length,
                                     int hidden_dim)
 {
-#if __CUDA_ARCH__ >= 700
+#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__)
     __shared__ float4 in_data[3072];
 
     int d0_stride = hidden_dim * seq_length;

From ed2ee34e35e82ab27497859e2c36537fa2f43152 Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 21:29:45 +0000
Subject: [PATCH 6/8] Revert "Enable cooperative groups for ROCm"

This reverts commit 077638dabbd8a6183758f88352ba69fad53bf98a.
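
The guarded include and synchronization pattern that this revert strips out of csrc/lamb/fused_lamb_cuda_kernel.cu (and that the next patch partially re-adds) follows the sketch below. HIP_VERSION and __HIP_PLATFORM_HCC__ come from the HIP toolchain headers; the helper function name is hypothetical.

    #if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
    #include <hip/hip_cooperative_groups.h>
    #else
    #include <cooperative_groups.h>
    #endif
    namespace cg = cooperative_groups;

    __device__ void block_barrier(cg::thread_block& cta)
    {
    #if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
        cta.sync();     // member sync, available on ROCm releases after 3.05
    #else
        cg::sync(cta);  // free-function sync, as the plain CUDA path uses
    #endif
    }
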
---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 44 -----------------------------
 1 file changed, 44 deletions(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 1a8cd6071f91..f74746f93752 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,11 +14,7 @@
 #include
 //#include
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-#include <hip/hip_cooperative_groups.h>
-#else
 #include <cooperative_groups.h>
-#endif
 #include
 #include
@@ -82,11 +78,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
     T a_sum = s_a[tid];
     T b_sum = s_b[tid];
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     // do reduction in shared mem
     if ((blockSize >= 512) && (tid < 256)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 256];
         s_b[tid] = b_sum = b_sum + s_b[tid + 256];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 256) && (tid < 128)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 128];
         s_b[tid] = b_sum = b_sum + s_b[tid + 128];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 128) && (tid < 64)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 64];
         s_b[tid] = b_sum = b_sum + s_b[tid + 64];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
 
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
         s_b[tid] = b_sum = b_sum + s_b[tid + 32];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 32) && (tid < 16)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 16];
         s_b[tid] = b_sum = b_sum + s_b[tid + 16];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 16) && (tid < 8)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 8];
         s_b[tid] = b_sum = b_sum + s_b[tid + 8];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 8) && (tid < 4)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 4];
         s_b[tid] = b_sum = b_sum + s_b[tid + 4];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 4) && (tid < 2)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 2];
         s_b[tid] = b_sum = b_sum + s_b[tid + 2];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
     if ((blockSize >= 2) && (tid < 1)) {
         s_a[tid] = a_sum = a_sum + s_a[tid + 1];
         s_b[tid] = b_sum = b_sum + s_b[tid + 1];
     }
 
-#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
-    cta.sync();
-#else
     cg::sync(cta);
-#endif
 
 #endif

From 742fd6486d02b84332ac3a8c72c52a22c03661fa Mon Sep 17 00:00:00 2001
From: rraminen
Date: Wed, 23 Jun 2021 21:31:59 +0000
Subject: [PATCH 7/8] Enable cooperative groups for ROCm

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index f74746f93752..a6c610ada499 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -14,7 +14,11 @@
 #include
 //#include
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include
 #include

From 81b744ef24a92ae2eb463458a075e512ac03918e Mon Sep 17 00:00:00 2001
From: rraminen
Date: Mon, 28 Jun 2021 23:02:35 +0000
Subject: [PATCH 8/8] hip cooperative groups functionality for coalesced_group
 in fused_lamb_cuda_kernel.cu is not implemented yet

---
 csrc/lamb/fused_lamb_cuda_kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index a6c610ada499..10a17e98a13d 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -106,7 +106,7 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
 
     cg::sync(cta);
 
-#if (__CUDA_ARCH__ >= 300) || defined(__HIP_PLATFORM_HCC__)
+#if (__CUDA_ARCH__ >= 300)
 
     if (tid < 32) {
         cg::coalesced_group active = cg::coalesced_threads();
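
The net effect of this last patch: the coalesced_threads() fast path in the LAMB reduction compiles only for CUDA again, since HIP's cooperative groups do not implement coalesced_group yet. A rough sketch of the resulting control flow, under the assumption that s_a holds at least 64 partial sums and blockDim.x >= 64 (function and variable names are illustrative, not from the patch):

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    template <typename T>
    __device__ T final_warp_reduce(T* s_a, unsigned int tid)
    {
        cg::thread_block cta = cg::this_thread_block();
        T a_sum = s_a[tid];
    #if (__CUDA_ARCH__ >= 300)
        // CUDA-only: shuffle within the coalesced warp, no shared-memory traffic.
        if (tid < 32) {
            cg::coalesced_group active = cg::coalesced_threads();
            a_sum += s_a[tid + 32];
            for (int offset = 16; offset > 0; offset >>= 1)
                a_sum += active.shfl_down(a_sum, offset);
        }
    #else
        // Portable fallback: keep folding through shared memory with block syncs.
        for (unsigned int s = 32; s > 0; s >>= 1) {
            if (tid < s) s_a[tid] = a_sum = a_sum + s_a[tid + s];
            cg::sync(cta);
        }
    #endif
        return a_sum;
    }
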