From bc380a9702b1f43c53e22faeb7d4c8838d37f028 Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Sun, 21 Apr 2024 14:43:05 -0400 Subject: [PATCH 1/3] refactor for reduce, reduction, replicate, reshape and reverse --- lib/kernels/src/hip/reduce_kernels.cpp | 108 ++++++++++------------ lib/kernels/src/hip/reduction_kernels.cpp | 8 +- lib/kernels/src/hip/replicate_kernels.cpp | 31 ++++--- lib/kernels/src/hip/reshape_kernels.cpp | 21 +++-- lib/kernels/src/hip/reverse_kernels.cpp | 36 ++++---- 5 files changed, 96 insertions(+), 108 deletions(-) diff --git a/lib/kernels/src/hip/reduce_kernels.cpp b/lib/kernels/src/hip/reduce_kernels.cpp index b12b62e224..5cc6a1cdfd 100644 --- a/lib/kernels/src/hip/reduce_kernels.cpp +++ b/lib/kernels/src/hip/reduce_kernels.cpp @@ -13,88 +13,76 @@ * limitations under the License. */ +#include "device.h" #include "kernels/reduce_kernels.h" -#include "kernels/hip_helper.h" #include namespace FlexFlow { -// declare Legion names -using Legion::coord_t; -using Legion::Domain; +namespace Kernels { +namespace Reduce { + +ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, + OperatorType const &op_type, + size_t const &reduction_size, + ArrayShape input_shape, + ArrayShape output_shape) { + ffTensorDescriptor_t inputTensor + ffTensorDescriptor_t outputTensor; + ffReduceTensorDescriptor_t reduceDesc; -ReducePerDeviceState::ReducePerDeviceState(FFHandler handler, - Reduce const *rd, - Domain const &input_domain) - : op_type(rd->op_type), PerDeviceOpState(handler) { - checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); - cudnnReduceTensorOp_t reduce_op; - switch (rd->op_type) { - case OP_REDUCE_SUM: - reduce_op = CUDNN_REDUCE_TENSOR_ADD; - break; - case OP_REDUCE_MEAN: - reduce_op = CUDNN_REDUCE_TENSOR_AVG; - break; - default: - assert(false); - } - checkCUDNN(miopenSetReduceTensorDescriptor(reduceDesc, - MIOPEN_REDUCE_TENSOR_ADD, - miopenFloat, - MIOPEN_PROPAGATE_NAN, - MIOPEN_REDUCE_TENSOR_NO_INDICES, - MIOPEN_32BIT_INDICES)); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_domain)); - Domain output_domain = input_domain; - for (size_t i = 0; i < rd->num_axes; i++) { - assert(input_domain.dim > rd->axes[i]); - output_domain.rect_data[rd->axes[i] + output_domain.dim] = - output_domain.rect_data[rd->axes[i]]; - } - assert(output_domain.get_volume() % input_domain.get_volume() == 0); - reduction_size = input_domain.get_volume() / output_domain.get_volume(); - assert(reduction_size > 0); - checkCUDNN(cudnnSetTensorDescriptorFromDomain(outputTensor, output_domain)); -} + checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); -ReducePerDeviceState::~ReducePerDeviceState(void) { - checkCUDNN(miopenDestroyReduceTensorDescriptor(reduceDesc)); - checkCUDNN(miopenDestroyTensorDescriptor(inputTensor)); - checkCUDNN(miopenDestroyTensorDescriptor(outputTensor)); -} + checkCUDNN(miopenSetTensorDescriptor(inputTensor, + miopenFloat, + input_shape.dims.size(), + input_shape.dims.data(), + input_shape.strides.data())); + checkCUDNN(miopenSetTensorDescriptor(outputTensor, + miopenFloat, + output_shape.dims.size(), + output_shape.dims.data(), + output_shape.strides.data())); -namespace Kernels { -namespace Reduce { + ReducePerDeviceState per_device = { + handle, + inputTensor, + outputTensor, + reduceDesc, + op_type, + reduction_size + }; + return per_device; +} void forward_kernel(hipStream_t stream, - ReducePerDeviceState const *m, + ReducePerDeviceState const &m, float const *input_ptr, float *output_ptr) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + checkCUDNN(miopenSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; - checkCUDNN(miopenReduceTensor(m->handle.dnn, - m->reduceDesc, + checkCUDNN(miopenReduceTensor(m.handle.dnn, + m.reduceDesc, nullptr /*indices*/, 0 /*indicesSizeInBytes*/, - m->handle.workSpace, - m->handle.workSpaceSize, + m.handle.workSpace, + m.handle.workSpaceSize, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, &beta, - m->outputTensor, + m.outputTensor, output_ptr)); }; void backward_kernel(hipStream_t stream, - ReducePerDeviceState const *m, + ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr) { - checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + checkCUDNN(miopenSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; - switch (m->op_type) { + switch (m.op_type) { case OP_REDUCE_SUM: alpha = 1.0f; break; @@ -106,16 +94,16 @@ void backward_kernel(hipStream_t stream, default: assert(false); } - checkCUDNN(miopenOpTensor(m->handle.dnn, + checkCUDNN(miopenOpTensor(m.handle.dnn, miopenTensorOpAdd, &alpha, - m->inputTensor, + m.inputTensor, input_grad_ptr, &alpha, - m->outputTensor, + m.outputTensor, output_grad_ptr, &beta, - m->inputTensor, + m.inputTensor, input_grad_ptr)); } diff --git a/lib/kernels/src/hip/reduction_kernels.cpp b/lib/kernels/src/hip/reduction_kernels.cpp index ec5ebeebd6..0b8666e2e0 100644 --- a/lib/kernels/src/hip/reduction_kernels.cpp +++ b/lib/kernels/src/hip/reduction_kernels.cpp @@ -37,7 +37,7 @@ __global__ void reduction_forward_kernel(T const *input_ptr, template struct ForwardKernel { - void operator()(cudaStream_t stream, + void operator()(hipStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, size_t num_replicas) { @@ -57,7 +57,7 @@ struct ForwardKernel { template struct BackwardKernel { - void operator()(cudaStream_t stream, + void operator()(hipStream_t stream, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { checkCUDA(hipMemcpyAsync(input.get(), @@ -73,13 +73,13 @@ void forward_kernel(hipStream_t stream, GenericTensorAccessorW const &output, size_t num_replicas) { DataTypeDispatch1{}( - input->data_type, stream, input, output, num_replicas); + input.data_type, stream, input, output, num_replicas); } void backward_kernel(hipStream_t stream, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input->data_type, stream, input, output); + DataTypeDispatch1{}(input.data_type, stream, input, output); } } // namespace Reduction diff --git a/lib/kernels/src/hip/replicate_kernels.cpp b/lib/kernels/src/hip/replicate_kernels.cpp index ef58a9ae31..472c3a6ad6 100644 --- a/lib/kernels/src/hip/replicate_kernels.cpp +++ b/lib/kernels/src/hip/replicate_kernels.cpp @@ -13,14 +13,27 @@ * limitations under the License. */ +#include "device.h" +#include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" -#include "kernels/hip_helper.h" #include namespace FlexFlow { namespace Kernels { namespace Replicate { +template +__global__ void replicate_backward_kernel(T const *input_ptr, + T *output_ptr, + size_t num_elements, + size_t num_replicas) { + CUDA_KERNEL_LOOP(i, num_elements) { + for (size_t j = 0; j < num_replicas; j++) { + output_ptr[i] += input_ptr[i + j * num_elements]; + } + } +} + template struct ForwardKernel { void operator()(hipStream_t stream, @@ -54,22 +67,10 @@ struct BackwardKernel { } } -template -__global__ void replicate_backward_kernel(T const *input_ptr, - T *output_ptr, - size_t num_elements, - size_t num_replicas) { - CUDA_KERNEL_LOOP(i, num_elements) { - for (size_t j = 0; j < num_replicas; j++) { - output_ptr[i] += input_ptr[i + j * num_elements]; - } - } -} - void forward_kernel(hipStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input->data_type, stream, input, output); + DataTypeDispatch1{}(input.data_type, stream, input, output); } void backward_kernel(hipStream_t stream, @@ -77,7 +78,7 @@ void backward_kernel(hipStream_t stream, GenericTensorAccessorR const &output, size_t num_replicas) { DataTypeDispatch1{}( - input->data_type, stream, input, output, num_replicas); + input.data_type, stream, input, output, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/hip/reshape_kernels.cpp b/lib/kernels/src/hip/reshape_kernels.cpp index 5afe9a3ce6..460e96b29c 100644 --- a/lib/kernels/src/hip/reshape_kernels.cpp +++ b/lib/kernels/src/hip/reshape_kernels.cpp @@ -13,19 +13,20 @@ * limitations under the License. */ -#include "kernels/reshape_kernels.h" +#include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/hip_helper.h" +#include "kernels/reshape_kernels.h" #include namespace FlexFlow { -ReshapePerDeviceState::ReshapePerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) {} - namespace Kernels { namespace Reshape { +ReshapePerDeviceState init_kernel(DataType data_type) { + return ReshapePerDeviceState{data_type}; +} + template struct ForwardKernel { void operator()(hipStream_t stream, @@ -42,7 +43,7 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(hipStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { float alpha = 1.0f; @@ -59,17 +60,17 @@ struct BackwardKernel { } void forward_kernel(hipStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, m, input, output); } void backward_kernel(hipStream_t stream, - ReshapePerDeviceState const *m, + ReshapePerDeviceState const &m, GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(m->data_type, stream, m, input, output); + DataTypeDispatch1{}(m.data_type, stream, m, input, output); } } // namespace Reshape diff --git a/lib/kernels/src/hip/reverse_kernels.cpp b/lib/kernels/src/hip/reverse_kernels.cpp index b72ce73b9f..111789cd9f 100644 --- a/lib/kernels/src/hip/reverse_kernels.cpp +++ b/lib/kernels/src/hip/reverse_kernels.cpp @@ -13,17 +13,31 @@ * limitations under the License. */ +#include "device.h" #include "kernels/reverse_kernels.h" -#include "kernels/hip_helper.h" #include namespace FlexFlow { -// declare Legion names -using Legion::coord_t; namespace Kernels { namespace Reverse { +__global__ void reverse_forward_kernel(float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size) { + CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t blk_idx = i / (reverse_dim_size * in_blk_size); + i = i - blk_idx * (reverse_dim_size * in_blk_size); + coord_t reverse_dim_idx = i / in_blk_size; + i = i - reverse_dim_idx * in_blk_size; + coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; + out_ptr[i] = in_ptr[in_idx]; + } +} + void forward_kernel(hipStream_t stream, float const *in_ptr, float *out_ptr, @@ -64,22 +78,6 @@ void backward_kernel(hipStream_t stream, in_blk_size); } -__global__ void reverse_forward_kernel(float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size) { - CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { - coord_t blk_idx = i / (reverse_dim_size * in_blk_size); - i = i - blk_idx * (reverse_dim_size * in_blk_size); - coord_t reverse_dim_idx = i / in_blk_size; - i = i - reverse_dim_idx * in_blk_size; - coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + - (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; - } -} - } // namespace Reverse } // namespace Kernels } // namespace FlexFlow From f05d2ff78cf107f3c03c430047f6d12125ad4a3d Mon Sep 17 00:00:00 2001 From: Qinghan Chen Date: Mon, 3 Jun 2024 18:56:42 -0400 Subject: [PATCH 2/3] fix --- lib/kernels/src/hip/reduce_kernels.cpp | 37 ++++++++--------------- lib/kernels/src/hip/replicate_kernels.cpp | 6 ++-- lib/kernels/src/hip/reshape_kernels.cpp | 8 ++--- lib/kernels/src/hip/reverse_kernels.cpp | 2 +- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/lib/kernels/src/hip/reduce_kernels.cpp b/lib/kernels/src/hip/reduce_kernels.cpp index 5cc6a1cdfd..d38ea1bf80 100644 --- a/lib/kernels/src/hip/reduce_kernels.cpp +++ b/lib/kernels/src/hip/reduce_kernels.cpp @@ -13,8 +13,8 @@ * limitations under the License. */ -#include "device.h" #include "kernels/reduce_kernels.h" +#include "device.h" #include namespace FlexFlow { @@ -24,10 +24,9 @@ namespace Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, OperatorType const &op_type, size_t const &reduction_size, - ArrayShape input_shape, - ArrayShape output_shape) { - ffTensorDescriptor_t inputTensor - ffTensorDescriptor_t outputTensor; + ArrayShape const &input_shape, + ArrayShape const &output_shape) { + ffTensorDescriptor_t inputTensor ffTensorDescriptor_t outputTensor; ffReduceTensorDescriptor_t reduceDesc; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); @@ -46,13 +45,7 @@ ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, output_shape.strides.data())); ReducePerDeviceState per_device = { - handle, - inputTensor, - outputTensor, - reduceDesc, - op_type, - reduction_size - }; + handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size}; return per_device; } @@ -89,22 +82,18 @@ void backward_kernel(hipStream_t stream, case OP_REDUCE_MEAN: // When the output is the average of multiple input elements // we need to scale the gradients by 1.0 / reduction_size - alpha = 1.0f / m->reduction_size; + alpha = 1.0f / m.reduction_size; break; default: assert(false); } - checkCUDNN(miopenOpTensor(m.handle.dnn, - miopenTensorOpAdd, - &alpha, - m.inputTensor, - input_grad_ptr, - &alpha, - m.outputTensor, - output_grad_ptr, - &beta, - m.inputTensor, - input_grad_ptr)); + checkCUDNN(hipdnnAddTensor(m.handle.dnn, + &alpha, + m.outputTensor, + output_grad_ptr, + &beta, + m.inputTensor, + input_grad_ptr)); } } // namespace Reduce diff --git a/lib/kernels/src/hip/replicate_kernels.cpp b/lib/kernels/src/hip/replicate_kernels.cpp index 472c3a6ad6..9a5fc813c3 100644 --- a/lib/kernels/src/hip/replicate_kernels.cpp +++ b/lib/kernels/src/hip/replicate_kernels.cpp @@ -13,9 +13,9 @@ * limitations under the License. */ +#include "kernels/replicate_kernels.h" #include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/replicate_kernels.h" #include namespace FlexFlow { @@ -42,7 +42,7 @@ struct ForwardKernel { checkCUDA(hipMemcpyAsync(input.get(), output.get(), - input.shape.num_elements() * sizeof(T), + input.shape.num_elements() * size_of_datatype(T), hipMemcpyDeviceToDevice, stream)); } @@ -55,7 +55,7 @@ struct BackwardKernel { GenericTensorAccessorR const &output, size_t num_replicas) { size_t total_elements = input.shape.num_elements() * num_replicas; - hipLaunchKernelGGL(HIP_KERNEL_NAME(replicate_backward_kernel), + hipLaunchKernelGGL(HIP_KERNEL_NAME(replicate_backward_kernel>), GET_BLOCKS(total_elements), CUDA_NUM_THREADS, 0, diff --git a/lib/kernels/src/hip/reshape_kernels.cpp b/lib/kernels/src/hip/reshape_kernels.cpp index 460e96b29c..941495c0fd 100644 --- a/lib/kernels/src/hip/reshape_kernels.cpp +++ b/lib/kernels/src/hip/reshape_kernels.cpp @@ -13,9 +13,9 @@ * limitations under the License. */ +#include "kernels/reshape_kernels.h" #include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/reshape_kernels.h" #include namespace FlexFlow { @@ -34,7 +34,7 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(hipMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * sizeof(T), + input.shape.num_elements() * size_of_datatype(T), hipMemcpyDeviceToDevice, stream)); } @@ -47,7 +47,7 @@ struct BackwardKernel { GenericTensorAccessorW const &input, GenericTensorAccessorR const &output) { float alpha = 1.0f; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_add_with_scale), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_add_with_scale>), GET_BLOCKS(input.shape.num_elements()), CUDA_NUM_THREADS, 0, @@ -55,7 +55,7 @@ struct BackwardKernel { input.get(), output.get(), input.shape.num_elements(), - (T)alpha); + static_cast> alpha); } } diff --git a/lib/kernels/src/hip/reverse_kernels.cpp b/lib/kernels/src/hip/reverse_kernels.cpp index 111789cd9f..03e97245bf 100644 --- a/lib/kernels/src/hip/reverse_kernels.cpp +++ b/lib/kernels/src/hip/reverse_kernels.cpp @@ -13,8 +13,8 @@ * limitations under the License. */ -#include "device.h" #include "kernels/reverse_kernels.h" +#include "device.h" #include namespace FlexFlow { From a2a3dd8fac2330a34a04f7ea31ebf0e91d4811d9 Mon Sep 17 00:00:00 2001 From: Qinghan Chen Date: Tue, 4 Jun 2024 22:35:10 -0400 Subject: [PATCH 3/3] fix miopenOpTensor --- lib/kernels/src/hip/reduce_kernels.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/lib/kernels/src/hip/reduce_kernels.cpp b/lib/kernels/src/hip/reduce_kernels.cpp index d38ea1bf80..468543dd5b 100644 --- a/lib/kernels/src/hip/reduce_kernels.cpp +++ b/lib/kernels/src/hip/reduce_kernels.cpp @@ -87,13 +87,17 @@ void backward_kernel(hipStream_t stream, default: assert(false); } - checkCUDNN(hipdnnAddTensor(m.handle.dnn, - &alpha, - m.outputTensor, - output_grad_ptr, - &beta, - m.inputTensor, - input_grad_ptr)); + checkCUDNN(miopenOpTensor(m.handle.dnn, + miopenTensorOpAdd, + &alpha, + m.inputTensor, + input_grad_ptr, + &alpha, + m.outputTensor, + output_grad_ptr, + &beta, + m.inputTensor, + input_grad_ptr)); } } // namespace Reduce