From 060648cae34bf8f63ee7a1cf74ba66e2dc434190 Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Mon, 15 Apr 2024 16:07:17 -0400 Subject: [PATCH 1/5] hip -e --- .../src/hip/element_binary_kernels.cpp | 201 ++++++-- lib/kernels/src/hip/element_unary_kernels.cpp | 310 ++++++------ lib/kernels/src/hip/embedding_kernels.cpp | 455 +++++++++++------- 3 files changed, 608 insertions(+), 358 deletions(-) diff --git a/lib/kernels/src/hip/element_binary_kernels.cpp b/lib/kernels/src/hip/element_binary_kernels.cpp index 5d29c27837..b02da92280 100644 --- a/lib/kernels/src/hip/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/element_binary_kernels.cpp @@ -44,16 +44,16 @@ ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, switch (op_type) { case Op::EW_ADD: case Op::EW_SUB: - mode = miopenTensorOpAdd; + mode = MIOPEN_OP_TENSOR_ADD; break; case Op::EW_MUL: - mode = miopenTensorOpMul; + mode = MIOPEN_OP_TENSOR_MUL; break; case Op::EW_MAX: - mode = miopenTensorOpMax; + mode = MIOPEN_OP_TENSOR_MAX; break; case Op::EW_MIN: - mode = miopenTensorOpMin; + mode = MIOPEN_OP_TENSOR_MIN; break; default: assert(false); @@ -151,6 +151,24 @@ __global__ void elewise_binary_backward_kernel(coord_t volume, beta * in2_grad[i]; break; } + case Op::EW_MAX: { + lhs_grad[i] = (lhs[i] >= rhs[i]) + ? alpha * out_grad[i] + beta * lhs_grad[i] + : beta * lhs_grad[i]; + rhs_grad[i] = (rhs[i] >= lhs[i]) + ? alpha * out_grad[i] + beta * rhs_grad[i] + : beta * rhs_grad[i]; + break; + } + case Op::EW_MIN: { + lhs_grad[i] = (lhs[i] <= rhs[i]) + ? alpha * out_grad[i] + beta * lhs_grad[i] + : beta * lhs_grad[i]; + rhs_grad[i] = (rhs[i] <= lhs[i]) + ? alpha * out_grad[i] + beta * rhs_grad[i] + : beta * rhs_grad[i]; + break; + } default: assert(false); } @@ -158,7 +176,7 @@ __global__ void elewise_binary_backward_kernel(coord_t volume, } void forward_kernel(hipStream_t stream, - ElementBinaryPerDeviceState const *m, + ElementBinaryPerDeviceState const &m, float const *lhs_ptr, float const *rhs_ptr, float *out_ptr, @@ -175,6 +193,8 @@ void forward_kernel(hipStream_t stream, break; case Op::EW_ADD: case Op::EW_MUL: + case Op::EW_MAX: + case Op::EW_MIN: break; default: assert(false); @@ -183,29 +203,62 @@ void forward_kernel(hipStream_t stream, // cudnnOpTensor if (broadcast_inputLHS) { // currently only handle add and sub - assert(op_type == Op::EW_SUB || op_type == Op::EW_ADD); + assert(op_type == Op::EW_SUB || op_type == Op::EW_ADD || op_type == Op::EW_MUL); + if (op_type == Op::EW_SUB || op_type == Op::EW_ADD) { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &beta, + m.outputTensor, + out_ptr, + &alpha1, + m.inputLHSTensor, + lhs_ptr, + &beta, + m.outputTensor, + out_ptr)); + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &beta, + m.outputTensor, + out_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &alpha1, + m.outputTensor, + out_ptr)); + } else if (op_type == Op::EW_MUL) { + checkCUDNN(cudnnSetOpTensorDescriptor(m.opDesc, + CUDNN_OP_TENSOR_MUL, + CUDNN_DATA_FLOAT, + CUDNN_NOT_PROPAGATE_NAN)); checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &beta, - m.outputTensor, - out_ptr, - &alpha1, - m.inputLHSTensor, - lhs_ptr, - &beta, - m.outputTensor, - out_ptr)); + m.opDesc, + &alpha1, + m.inputLHSTensor, + lhs_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &beta, + m.outputTensor, + out_ptr)); + checkCUDNN(cudnnSetOpTensorDescriptor(m.opDesc, + CUDNN_OP_TENSOR_ADD, + CUDNN_DATA_FLOAT, + CUDNN_NOT_PROPAGATE_NAN)); + checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &beta, - m.outputTensor, - out_ptr, - 
&alpha2, - m.inputRHSTensor, - rhs_ptr, - &alpha1, - m.outputTensor, - out_ptr)); + m.opDesc, + &beta, + m.outputTensor, + out_ptr, + &alpha1, + m.inputLHSTensor, + lhs_ptr, + &beta, + m.outputTensor, + out_ptr)); } else { checkCUDNN(miopenOpTensor(handle.dnn, m.opDesc, @@ -236,9 +289,9 @@ void backward_kernel(hipStream_t stream, checkCUDNN(miopenSetStream(handle.dnn, stream)); if (m.op_type == Op::EW_ADD || m.op_type == Op::EW_SUB) { - float alpha = 1.0f, alpha2 = 0.0f, beta = 1.0f; + float alpha = 1.0f, beta = 1.0f; if (lhs_grad_ptr != nullptr) { - if (m.broadcast_input1) { + if (broadcast_inputLHS) { checkCUDNN(miopenReduceTensor(handle.dnn, m.reduceAddDesc, nullptr /*indices*/, @@ -269,7 +322,7 @@ void backward_kernel(hipStream_t stream, alpha = -1.0f; } if (rhs_grad_ptr != nullptr) { - if (m.broadcast_input2) { + if (broadcast_inputRHS) { checkCUDNN(miopenReduceTensor(handle.dnn, m.reduceAddDesc, nullptr /*indices*/, @@ -297,9 +350,34 @@ void backward_kernel(hipStream_t stream, } } } else if (m.op_type == Op::EW_MUL) { - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f; + float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f, zero = 0.0f; if (lhs_grad_ptr != nullptr) { - checkCUDNN(miopenOpTensor(handle.dnn, + if (broadcast_inputLHS){ + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha1, + m.outputTensor, + out_grad_ptr, + &zero, + m.inputLHSTensor, + lhs_grad_ptr)); + }else{ + checkCUDNN(miopenOpTensor(handle.dnn, m.opDesc, &alpha1, m.outputTensor, @@ -310,21 +388,74 @@ void backward_kernel(hipStream_t stream, &beta, m.inputLHSTensor, lhs_grad_ptr)); + } } if (rhs_grad_ptr != nullptr) { - checkCUDNN(miopenOpTensor(handle.dnn, + if (broadcast_inputRHS){ + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputLHSTensor, + lhs_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha1, + m.outputTensor, + out_grad_ptr, + &zero, + m.inputRHSTensor, + rhs_grad_ptr)); + }else{ + checkCUDNN(miopenOpTensor(handle.dnn, m.opDesc, &alpha1, m.outputTensor, out_grad_ptr, &alpha2, - m.inputRHSTensor, + m.inputLHSTensor, lhs_ptr, &beta, - m.inputLHSTensor, + m.inputRHSTensor, rhs_grad_ptr)); + } } - } else { + } else if (op_type == Op::EW_MIN || op_type == Op::EW_MAX) { + float alpha = 1.0f, beta = 1.0f; + miopenDataType_t data_type; + int n; + int dims[MAX_TENSOR_DIM]; + int strides[MAX_TENSOR_DIM]; + checkCUDNN(miopenGetTensorDescriptorSize(m.outputTensor, &n)); + size_t volume = 1; + for (int i = 0; i < n; i++) { + volume *= dims[i]; + } + // launch hip kernel + hipLaunchKernelGGL(elewise_binary_backward_kernel, + dim3((volume + 255) / 256), + dim3(256), + 0, + stream, + volume, + alpha, + beta, + op_type, + out_grad_ptr, + lhs_ptr, + rhs_ptr, + lhs_grad_ptr, + rhs_grad_ptr); + else { assert(false && "Unsupported ElementWise Binary Type"); } } diff --git a/lib/kernels/src/hip/element_unary_kernels.cpp b/lib/kernels/src/hip/element_unary_kernels.cpp index e79ef57592..a13cd5a73b 100644 --- a/lib/kernels/src/hip/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/element_unary_kernels.cpp @@ 
-13,180 +13,79 @@ * limitations under the License. */ -#include "kernels/element_unary_kernels.h" +#include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/hip_helper.h" +#include "kernels/element_unary_kernels.h" +#include "op-attrs/get_op_type.h" +#include #include namespace FlexFlow { namespace Kernels { namespace ElementUnary { +using coord_t = long long; + +static bool use_cudnn(OperatorType op_type) { + switch (op_type) { + case Op::RELU: + case Op::SIGMOID: + case Op::TANH: + case Op::ELU: + return true; + default: + return false; + } +} + +template +T get_scalar(ElementUnaryUnifiedAttrs const &attrs) { + if (std::holds_alternative(attrs)) { + return (T)std::get(attrs).scalar; + } else { + T dummy_scalar; + return dummy_scalar; + } +} + ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, - ElementUnaryAttrs const &attrs) { - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t outputTensor; - miopenActivationDescriptor_t actiDesc; - miopenActivationMode_t mode; + ElementUnaryUnifiedAttrs const &attrs) { + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t outputTensor; + ffActivationDescriptor_t actiDesc; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); - if (use_cudnn(attrs.op_type)) { - switch (attrs.op_type) { - case OP_SIGMOID: - mode = miopenActivationLOGISTIC; + Op op_type = get_op_type(attrs); + + if (use_cudnn(op_type)) { + miopenActivationMode_t mode; + switch (op_type) { + case Op::SIGMOID: + mode = CUDNN_ACTIVATION_SIGMOID; break; - case OP_RELU: - mode = miopenActivationRELU; + case Op::RELU: + mode = CUDNN_ACTIVATION_RELU; break; - case OP_TANH: - mode = miopenActivationTANH; + case Op::TANH: + mode = CUDNN_ACTIVATION_TANH; break; - case OP_ELU: - mode = miopenActivationELU; + case Op::ELU: + mode = CUDNN_ACTIVATION_ELU; break; default: assert(false); } - checkCUDNN(miopenSetActivationDescriptor(actiDesc, mode, 0.0, 0.0, 0.0)); - checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); - // input_domain == output_domain - checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); - } - - ElementUnaryPerDeviceState per_device_state = { - inputTensor, outputTensor, actiDesc}; - - return per_device_state; -} - -bool use_cudnn(OperatorType type) { - if (type == OP_RELU) { - return true; - } - if (type == OP_SIGMOID) { - return true; - } - if (type == OP_TANH) { - return true; - } - if (type == OP_ELU) { - return true; - } - return false; -} - -template -struct ForwardKernel { - void operator()(ffStream_t stream, - ElementUnaryPerDeviceState const &m, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDNN(miopenSetStream(handle.dnn, stream)); - if (use_cudnn(attrs.op_type)) { - float alpha = 1.0f, beta = 0.0f; - checkCUDNN(miopenActivationForward(handle.dnn, - m.actiDesc, - &alpha, - m.inputTensor, - input.get(), - &beta, - m.outputTensor, - output.get())); - } else { - size_t num_elements = input.shape.num_elements(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(elewise_unary_forward_kernel), - GET_BLOCKS(num_elements), - CUDA_NUM_THREADS, - 0, - stream, - num_elements, - (T)attrs.scalar, - attrs.op_type, - input.get(), - output.get()); - } } -} - -template -struct BackwardKernel { - void operator()(ffStream_t stream, - 
ElementUnaryPerDeviceState const &m, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &input_grad, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &output_grad) { - checkCUDNN(miopenSetStream(handle.dnn, stream)); + checkCUDNN(miopenSetActivationDescriptor(actiDesc, mode, 0.0, 0.0, 0.0)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_shape)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain(outputTensor, output_shape)); + return {inputTensor, outputTensor, actiDesc}; + } - if (use_cudnn(attrs.op_type)) { - float alpha = 1.0f; - float beta = 0.0f; - checkCUDNN(miopenActivationBackward(handle.dnn, - m.actiDesc, - &alpha, - m.outputTensor, - output.get(), - m.outputTensor, - output_grad.get()), - m.inputTensor, - input.get(), - &beta, - m.inputTensor, - input_grad.get()); - } else { - size_t num_elements = input.shape.num_elements(); - hipLaunchKernelGGL(HIP_KERNEL_NAME(elewise_unary_backward_kernel), - GET_BLOCKS(num_elements), - CUDA_NUM_THREADS, - 0, - stream, - num_elements, - attrs.scalar, - attrs.op_type, - output.get(), - output_grad.get(), - input.get(), - input_grad.get()); - } - } -} void forward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}( - input.data_type, stream, m, attrs, handle, input, output); -} - -void backward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { - DataTypeDispatch1{}(input.data_type, - stream, - m, - attrs, - handle, - input, - input_grad, - output, - output_grad); -} template __global__ void elewise_unary_forward_kernel( @@ -309,6 +208,115 @@ __global__ void elewise_unary_backward_kernel(coord_t volume, } } +template +struct ForwardKernel { + void operator()(ffStream_t stream, + ElementUnaryPerDeviceState const &m, + ElementUnaryUnifiedAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) const { + checkCUDNN(miopenSetStream(handle.dnn, stream)); + Op op_type = get_op_type(attrs); + if (use_cudnn(op_type)) { + float alpha = 1.0f, beta = 0.0f; + checkCUDNN(miopenActivationForward(handle.dnn, + m.actiDesc, + &alpha, + m.inputTensor, + input.get(), + &beta, + m.outputTensor, + output.get())); + } else { + size_t num_elements = input.shape.num_elements(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(elewise_unary_forward_kernel), + GET_BLOCKS(num_elements), + CUDA_NUM_THREADS, + 0, + stream, + num_elements, + (T)m->scalar, + m->op_type, + input.get(), + output.get()); + } + } +}; + +template +struct BackwardKernel { + void operator()(ffStream_t stream, + ElementUnaryPerDeviceState const &m, + ElementUnaryUnifiedAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad) { + checkCUDNN(miopenSetStream(handle.dnn, stream)); + + Op op_type = get_op_type(attrs); + if (use_cudnn(op_type)) { + float alpha = 1.0f; + 
float beta = 0.0f;
+      checkCUDNN(miopenActivationBackward(handle.dnn,
+                                          m.actiDesc,
+                                          &alpha,
+                                          m.outputTensor,
+                                          output.get<T>(),
+                                          m.outputTensor,
+                                          output_grad.get<T>(),
+                                          m.inputTensor,
+                                          input.get<T>(),
+                                          &beta,
+                                          m.inputTensor,
+                                          input_grad.get<T>()));
+    } else {
+      size_t num_elements = input.shape.num_elements();
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(elewise_unary_backward_kernel<T>),
+                         GET_BLOCKS(num_elements),
+                         CUDA_NUM_THREADS,
+                         0,
+                         stream,
+                         num_elements,
+                         m->scalar,
+                         m->op_type,
+                         output.get<T>(),
+                         output_grad.get<T>(),
+                         input.get<T>(),
+                         input_grad.get<T>());
+    }
+  }
+};
+void forward_kernel(ffStream_t stream,
+                    ElementUnaryPerDeviceState const &device_state,
+                    ElementUnaryUnifiedAttrs const &attrs,
+                    PerDeviceFFHandle const &handle,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<ForwardKernel>{}(
+      input.data_type, stream, device_state, attrs, handle, input, output);
+}
+
+void backward_kernel(ffStream_t stream,
+                     ElementUnaryPerDeviceState const &device_state,
+                     ElementUnaryUnifiedAttrs const &attrs,
+                     PerDeviceFFHandle const &handle,
+                     GenericTensorAccessorR const &input,
+                     GenericTensorAccessorW const &input_grad,
+                     GenericTensorAccessorR const &output,
+                     GenericTensorAccessorR const &output_grad) {
+  DataTypeDispatch1<BackwardKernel>{}(input.data_type,
+                                      stream,
+                                      device_state,
+                                      attrs,
+                                      handle,
+                                      input,
+                                      input_grad,
+                                      output,
+                                      output_grad);
+}
+
 } // namespace ElementUnary
 } // namespace Kernels
 } // namespace FlexFlow
diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp
index 17edfea5c1..578f08a567 100644
--- a/lib/kernels/src/hip/embedding_kernels.cpp
+++ b/lib/kernels/src/hip/embedding_kernels.cpp
@@ -13,146 +13,17 @@
  * limitations under the License.
  */
 
-#include "kernels/embedding_kernels.h"
+#include "device.h"
 #include "kernels/datatype_dispatch.h"
-#include "kernels/hip_helper.h"
+#include "kernels/embedding_kernels.h"
 #include <hip/hip_runtime.h>
 
 namespace FlexFlow {
 namespace Kernels {
 namespace Embedding {
 
-template <typename TI, typename TD>
-struct ForwardKernel {
-  void operator()(hipStream_t stream,
-                  AggrMode aggr,
-                  GenericTensorAccessorR const &input,
-                  GenericTensorAccessorW const &output,
-                  GenericTensorAccessorR const &weight,
-                  int in_dim,
-                  int out_dim,
-                  int batch_size) {
-    assert(input.data_type == DT_INT32 || input.data_type == DT_INT64);
-    assert(weight.data_type == DT_HALF || weight.data_type == DT_FLOAT ||
-           weight.data_type == DT_DOUBLE);
-
-    if (aggr == AGGR_MODE_NONE) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr<TI, TD>),
-                         GET_BLOCKS(output.shape.get_volume()),
-                         CUDA_NUM_THREADS,
-                         0,
-                         stream,
-                         input.get<TI>(),
-                         output.get<TD>(),
-                         weight.get<TD>(),
-                         out_dim,
-                         batch_size);
-    } else {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr<TI, TD>),
-                         GET_BLOCKS(output.shape.get_volume()),
-                         CUDA_NUM_THREADS,
-                         0,
-                         stream,
-                         input.get<TI>(),
-                         output.get<TD>(),
-                         weight.get<TD>(),
-                         out_dim,
-                         in_dim,
-                         batch_size,
-                         aggr);
-    }
-  }
-}
-
-template <typename TI, typename TD>
-struct BackwardKernel {
-  void operator()(hipStream_t stream,
-                  AggrMode aggr,
-                  GenericTensorAccessorR const &input,
-                  GenericTensorAccessorR const &output,
-                  GenericTensorAccessorW const &weight_grad,
-                  int in_dim,
-                  int out_dim,
-                  int batch_size) {
-    assert(input.data_type == DT_INT32 || input.data_type == DT_INT64);
-    assert(output.data_type == DT_HALF || output.data_type == DT_FLOAT,
-           || output.data_type == DT_DOUBLE);
-    if (aggr == AGGR_MODE_NONE) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr<TI, TD>),
-                         GET_BLOCKS(output.shape.get_volume()),
-                         CUDA_NUM_THREADS,
-                         0,
-                         stream,
-                         input.get<TI>(),
-                         output.get<TD>(),
-
weight_grad.get(), - out_dim, - batch_size); - } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), - GET_BLOCKS(output.shape.get_volume()), - CUDA_NUM_THREADS, - 0, - stream, - input.get(), - output.get(), - weight_grad.get(), - out_dim, - in_dim, - batch_size, - aggr); - } - } -} - -void forward_kernel(hipStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &weight, - DataType input_data_type, - DataType output_data_type, - AggrMode aggr, - int in_dim, - int out_dim, - int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, - stream, - aggr, - input, - output, - weight, - in_dim, - out_dim, - batch_size); -} - -void backward_kernel(hipStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &weight_grad, - DataType input_data_type, - DataType output_data_type, - AggrMode aggr, - int in_dim, - int out_dim, - int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, - stream, - aggr, - input, - output, - weight, - in_dim, - out_dim, - batch_size); -} - -void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { +void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p){ hipStream_t stream; - - // Randomly initialize the intput tensor to avoid out of index range issues hipLaunchKernelGGL(HIP_KERNEL_NAME(rand_generate_int), GET_BLOCKS(size), CUDA_NUM_THREADS, @@ -163,10 +34,8 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { p); } -void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { +void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p){ hipStream_t stream; - - // Randomly initialize the intput tensor to avoid out of index range issues hipLaunchKernelGGL(HIP_KERNEL_NAME(rand_generate_int), GET_BLOCKS(size), CUDA_NUM_THREADS, @@ -179,48 +48,131 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { template __global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size) { + TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); +template +__global__ void embed_forward_with_aggr(TI const *input, + TD *output, + TD const *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr); +template +__global__ void embed_backward_no_aggr( + TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); +template +__global__ void embed_backward_with_aggr(TI const *input, + TD const *output, + TD *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr); + +template +__global__ void embed_forward_no_aggr(int32_t const *input, + TD *output, + TD const *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; int idx = i / out_dim; int off = i % out_dim; - TI wordIdx = input[idx]; + int32_t wordIdx = input[idx]; output[i] = embed[wordIdx * out_dim + off]; } } -template -__global__ void embed_forward_with_aggr(TI const *input, +template +__global__ void embed_forward_no_aggr(int64_t const *input, + TD *output, + TD const *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + output[i] = 0; + int idx = i / out_dim; + int off = i % out_dim; + int64_t wordIdx = input[idx]; + output[i] = embed[wordIdx * out_dim + off]; + } +} + +template +__global__ void embed_forward_with_aggr(int32_t const *input, TD 
*output, TD const *embed, int out_dim, int in_dim, int batch_size, - AggrMode aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; int idx = i / out_dim; int off = i % out_dim; for (int j = 0; j < in_dim; j++) { - TI wordIdx = input[idx * in_dim + j]; + int32_t wordIdx = input[idx * in_dim + j]; output[i] = output[i] + embed[wordIdx * out_dim + off]; - if (aggr == AGGR_MODE_SUM) { + if (aggr == AggregateOp::SUM) { } else { - assert(aggr == AGGR_MODE_AVG); + assert(aggr == AggregateOp::AVG); output[i] = output[i] * scale; } } } } -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size) { +template +__global__ void embed_forward_with_aggr(int64_t const *input, + TD *output, + TD const *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { + TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + output[i] = 0; int idx = i / out_dim; int off = i % out_dim; - TI wordIdx = input[idx]; + for (int j = 0; j < in_dim; j++) { + int64_t wordIdx = input[idx * in_dim + j]; + output[i] = output[i] + embed[wordIdx * out_dim + off]; + if (aggr == AggregateOp::SUM) { + } else { + assert(aggr == AggregateOp::AVG); + output[i] = output[i] * scale; + } + } + } +} + +template +__global__ void embed_backward_no_aggr(int32_t const *input, + TD const *output, + TD *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int32_t wordIdx = input[idx]; + atomicAdd(embed + wordIdx * out_dim + off, output[i]); + } +} + +template +__global__ void embed_backward_no_aggr(int64_t const *input, + TD const *output, + TD *embed, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + int64_t wordIdx = input[idx]; atomicAdd(embed + wordIdx * out_dim + off, output[i]); } } @@ -228,15 +180,15 @@ __global__ void embed_backward_no_aggr( // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; - int wordIdx = input[idx]; + int32_t wordIdx = input[idx]; #if __CUDA_ARCH__ >= 700 atomicAdd(embed + wordIdx * out_dim + off, output[i]); #else @@ -269,27 +221,53 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template -__global__ void embed_backward_with_aggr(TI const *input, +template +__global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - AggrMode aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; TD gradient; - if (aggr == AGGR_MODE_SUM) { + if (aggr == AggregateOp::SUM) { gradient = output[i]; } else { - assert(aggr == AGGR_MODE_AVG); + assert(aggr == AggregateOp::AVG); gradient = output[i] * scale; } for (int j = 0; j < in_dim; j++) { - TI wordIdx = input[idx * in_dim + j]; + int32_t wordIdx = input[idx * in_dim + j]; + atomicAdd(embed + wordIdx * out_dim + off, gradient); + } + } +} + +template +__global__ void embed_backward_with_aggr(int64_t const *input, + TD const *output, + 
TD *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { + TD scale = 1.0f / in_dim; + CUDA_KERNEL_LOOP(i, batch_size * out_dim) { + int idx = i / out_dim; + int off = i % out_dim; + TD gradient; + if (aggr == AggregateOp::SUM) { + gradient = output[i]; + } else { + assert(aggr == AggregateOp::AVG); + gradient = output[i] * scale; + } + for (int j = 0; j < in_dim; j++) { + int64_t wordIdx = input[idx * in_dim + j]; atomicAdd(embed + wordIdx * out_dim + off, gradient); } } @@ -298,26 +276,26 @@ __global__ void embed_backward_with_aggr(TI const *input, // Specialization for half type template <> -__global__ void embed_backward_with_aggr(int const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - AggrMode aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; half gradient; - if (aggr == AGGR_MODE_SUM) { + if (aggr == AggregateOp::SUM) { gradient = output[i]; } else { - assert(aggr == AGGR_MODE_AVG); + assert(aggr == AggregateOp::AVG); gradient = output[i] * scale; } for (int j = 0; j < in_dim; j++) { - int wordIdx = input[idx * in_dim + j]; + int32_t wordIdx = input[idx * in_dim + j]; #if __CUDA_ARCH__ >= 700 atomicAdd(embed + wordIdx * out_dim + off, gradient); #else @@ -337,16 +315,16 @@ __global__ void embed_backward_with_aggr(int64_t const *input, int out_dim, int in_dim, int batch_size, - AggrMode aggr) { + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; half gradient; - if (aggr == AGGR_MODE_SUM) { + if (aggr == AggregateOp::SUM) { gradient = output[i]; } else { - assert(aggr == AGGR_MODE_AVG); + assert(aggr == AggregateOp::AVG); gradient = output[i] * scale; } for (int j = 0; j < in_dim; j++) { @@ -370,6 +348,139 @@ __global__ void rand_generate_int(TD *ptr, size_t size, TD p) { } } +template +struct ForwardKernel { + void operator()(hipStream_t stream, + AggrMode aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + assert(input.data_type == DataType::INT32 || + input.data_type == DataType::INT64); + assert(weight.data_type == DataType::HALF || + weight.data_type == DataType::FLOAT || + weight.data_type == DataType::DOUBLE); + + + if (aggr == AggregateOp::NONE) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + GET_BLOCKS(output.shape.get_volume()), + CUDA_NUM_THREADS, + 0, + stream, + input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + GET_BLOCKS(output.shape.get_volume()), + CUDA_NUM_THREADS, + 0, + stream, + input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr); + } + } +} + +template +struct BackwardKernel { + void operator()(hipStream_t stream, + AggrMode aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + assert(input.data_type == DataType::INT32 || + input.data_type == DataType::INT64); + assert(output.data_type == DataType::HALF || + 
output.data_type == DataType::FLOAT || + output.data_type == DataType::DOUBLE); + if (aggr == AggregateOp::NONE) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + GET_BLOCKS(output.shape.get_volume()), + CUDA_NUM_THREADS, + 0, + stream, + input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + GET_BLOCKS(output.shape.get_volume()), + CUDA_NUM_THREADS, + 0, + stream, + input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr); + } + } +} + +void forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + AggrMode aggr, + int in_dim, + int out_dim, + int batch_size) { + DataTypeDispatch2{}(input_data_type, + output_data_type, + stream, + aggr, + input, + output, + weight, + in_dim, + out_dim, + batch_size); +} + +void backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &weight_grad, + DataType input_data_type, + DataType output_data_type, + AggrMode aggr, + int in_dim, + int out_dim, + int batch_size) { + DataTypeDispatch2{}(input_data_type, + output_data_type, + stream, + aggr, + input, + output, + weight, + in_dim, + out_dim, + batch_size); +} + } // namespace Embedding } // namespace Kernels } // namespace FlexFlow From 6604d6dd3157a73b089f2b41d9c6147c8bba2b3a Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Mon, 15 Apr 2024 16:12:09 -0400 Subject: [PATCH 2/5] format fix --- .../src/hip/element_binary_kernels.cpp | 441 +++++++++--------- lib/kernels/src/hip/element_unary_kernels.cpp | 7 +- lib/kernels/src/hip/embedding_kernels.cpp | 7 +- 3 files changed, 227 insertions(+), 228 deletions(-) diff --git a/lib/kernels/src/hip/element_binary_kernels.cpp b/lib/kernels/src/hip/element_binary_kernels.cpp index b02da92280..e53285712c 100644 --- a/lib/kernels/src/hip/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/element_binary_kernels.cpp @@ -203,7 +203,8 @@ void forward_kernel(hipStream_t stream, // cudnnOpTensor if (broadcast_inputLHS) { // currently only handle add and sub - assert(op_type == Op::EW_SUB || op_type == Op::EW_ADD || op_type == Op::EW_MUL); + assert(op_type == Op::EW_SUB || op_type == Op::EW_ADD || + op_type == Op::EW_MUL); if (op_type == Op::EW_SUB || op_type == Op::EW_ADD) { checkCUDNN(miopenOpTensor(handle.dnn, m.opDesc, @@ -227,239 +228,239 @@ void forward_kernel(hipStream_t stream, &alpha1, m.outputTensor, out_ptr)); - } else if (op_type == Op::EW_MUL) { - checkCUDNN(cudnnSetOpTensorDescriptor(m.opDesc, - CUDNN_OP_TENSOR_MUL, - CUDNN_DATA_FLOAT, - CUDNN_NOT_PROPAGATE_NAN)); - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.inputLHSTensor, - lhs_ptr, - &alpha2, - m.inputRHSTensor, - rhs_ptr, - &beta, - m.outputTensor, - out_ptr)); - checkCUDNN(cudnnSetOpTensorDescriptor(m.opDesc, - CUDNN_OP_TENSOR_ADD, - CUDNN_DATA_FLOAT, - CUDNN_NOT_PROPAGATE_NAN)); - - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &beta, - m.outputTensor, - out_ptr, - &alpha1, - m.inputLHSTensor, - lhs_ptr, - &beta, - m.outputTensor, - out_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.inputLHSTensor, - lhs_ptr, - &alpha2, - m.inputRHSTensor, - rhs_ptr, - &beta, - m.outputTensor, - out_ptr)); - } -} - -void backward_kernel(hipStream_t stream, - 
ElementBinaryPerDeviceState const &m, - float const *out_grad_ptr, - float const *lhs_ptr, - float const *rhs_ptr, - float *lhs_grad_ptr, - float *rhs_grad_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - bool broadcast_inputRHS, - PerDeviceFFHandle handle) { - checkCUDA(hipblasSetStream(handle.blas, stream)); - checkCUDNN(miopenSetStream(handle.dnn, stream)); - - if (m.op_type == Op::EW_ADD || m.op_type == Op::EW_SUB) { - float alpha = 1.0f, beta = 1.0f; - if (lhs_grad_ptr != nullptr) { - if (broadcast_inputLHS) { - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - miopenTensorOpAdd, - &alpha, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - } - } - if (m.op_type == Op::EW_SUB) { - alpha = -1.0f; - } - if (rhs_grad_ptr != nullptr) { - if (broadcast_inputRHS) { - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - miopenTensorOpAdd, - &alpha, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - } - } - } else if (m.op_type == Op::EW_MUL) { - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f, zero = 0.0f; - if (lhs_grad_ptr != nullptr) { - if (broadcast_inputLHS){ - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.inputRHSTensor, - rhs_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha1, - m.outputTensor, - out_grad_ptr, - &zero, - m.inputLHSTensor, - lhs_grad_ptr)); - }else{ - checkCUDNN(miopenOpTensor(handle.dnn, + } else if (op_type == Op::EW_MUL) { + checkCUDNN(cudnnSetOpTensorDescriptor(m.opDesc, + CUDNN_OP_TENSOR_MUL, + CUDNN_DATA_FLOAT, + CUDNN_NOT_PROPAGATE_NAN)); + checkCUDNN(miopenOpTensor(handle.dnn, m.opDesc, &alpha1, - m.outputTensor, - out_grad_ptr, + m.inputLHSTensor, + lhs_ptr, &alpha2, m.inputRHSTensor, rhs_ptr, &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - } - } - if (rhs_grad_ptr != nullptr) { - if (broadcast_inputRHS){ - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.inputLHSTensor, - lhs_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha1, - m.outputTensor, - out_grad_ptr, - &zero, - m.inputRHSTensor, - rhs_grad_ptr)); - }else{ - checkCUDNN(miopenOpTensor(handle.dnn, + m.outputTensor, + out_ptr)); + checkCUDNN(cudnnSetOpTensorDescriptor(m.opDesc, + CUDNN_OP_TENSOR_ADD, + CUDNN_DATA_FLOAT, + CUDNN_NOT_PROPAGATE_NAN)); + + checkCUDNN(miopenOpTensor(handle.dnn, m.opDesc, - &alpha1, + &beta, m.outputTensor, - out_grad_ptr, - &alpha2, + out_ptr, + &alpha1, m.inputLHSTensor, lhs_ptr, &beta, + m.outputTensor, + out_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + 
m.opDesc, + &alpha1, + m.inputLHSTensor, + lhs_ptr, + &alpha2, m.inputRHSTensor, - rhs_grad_ptr)); - } - } - } else if (op_type == Op::EW_MIN || op_type == Op::EW_MAX) { - float alpha = 1.0f, beta = 1.0f; - miopenDataType_t data_type; - int n; - int dims[MAX_TENSOR_DIM]; - int strides[MAX_TENSOR_DIM]; - checkCUDNN(miopenGetTensorDescriptorSize(m.outputTensor, &n)); - size_t volume = 1; - for (int i = 0; i < n; i++) { - volume *= dims[i]; + rhs_ptr, + &beta, + m.outputTensor, + out_ptr)); } - // launch hip kernel - hipLaunchKernelGGL(elewise_binary_backward_kernel, - dim3((volume + 255) / 256), - dim3(256), - 0, - stream, - volume, - alpha, - beta, - op_type, - out_grad_ptr, - lhs_ptr, - rhs_ptr, - lhs_grad_ptr, - rhs_grad_ptr); - else { - assert(false && "Unsupported ElementWise Binary Type"); } -} -} // namespace ElementBinary + void backward_kernel(hipStream_t stream, + ElementBinaryPerDeviceState const &m, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + PerDeviceFFHandle handle) { + checkCUDA(hipblasSetStream(handle.blas, stream)); + checkCUDNN(miopenSetStream(handle.dnn, stream)); + + if (m.op_type == Op::EW_ADD || m.op_type == Op::EW_SUB) { + float alpha = 1.0f, beta = 1.0f; + if (lhs_grad_ptr != nullptr) { + if (broadcast_inputLHS) { + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + miopenTensorOpAdd, + &alpha, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + } + } + if (m.op_type == Op::EW_SUB) { + alpha = -1.0f; + } + if (rhs_grad_ptr != nullptr) { + if (broadcast_inputRHS) { + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + miopenTensorOpAdd, + &alpha, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + } + } + } else if (m.op_type == Op::EW_MUL) { + float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f, zero = 0.0f; + if (lhs_grad_ptr != nullptr) { + if (broadcast_inputLHS) { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha1, + m.outputTensor, + out_grad_ptr, + &zero, + m.inputLHSTensor, + lhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + } + } + if (rhs_grad_ptr != nullptr) { + if (broadcast_inputRHS) { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputLHSTensor, + lhs_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + 
checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha1, + m.outputTensor, + out_grad_ptr, + &zero, + m.inputRHSTensor, + rhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputLHSTensor, + lhs_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + } + } + } else if (op_type == Op::EW_MIN || op_type == Op::EW_MAX) { + float alpha = 1.0f, beta = 1.0f; + miopenDataType_t data_type; + int n; + int dims[MAX_TENSOR_DIM]; + int strides[MAX_TENSOR_DIM]; + checkCUDNN(miopenGetTensorDescriptorSize(m.outputTensor, &n)); + size_t volume = 1; + for (int i = 0; i < n; i++) { + volume *= dims[i]; + } + // launch hip kernel + hipLaunchKernelGGL(elewise_binary_backward_kernel, + dim3((volume + 255) / 256), + dim3(256), + 0, + stream, + volume, + alpha, + beta, + op_type, + out_grad_ptr, + lhs_ptr, + rhs_ptr, + lhs_grad_ptr, + rhs_grad_ptr); + else { + assert(false && "Unsupported ElementWise Binary Type"); + } + } + + } // namespace ElementBinary } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/hip/element_unary_kernels.cpp b/lib/kernels/src/hip/element_unary_kernels.cpp index a13cd5a73b..d6ca2054e9 100644 --- a/lib/kernels/src/hip/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/element_unary_kernels.cpp @@ -13,12 +13,12 @@ * limitations under the License. */ +#include "kernels/element_unary_kernels.h" #include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" -#include #include +#include namespace FlexFlow { namespace Kernels { @@ -84,8 +84,7 @@ ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, checkCUDNN(cudnnSetTensorDescriptorFromDomain(inputTensor, input_shape)); checkCUDNN(cudnnSetTensorDescriptorFromDomain(outputTensor, output_shape)); return {inputTensor, outputTensor, actiDesc}; - } - +} template __global__ void elewise_unary_forward_kernel( diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 578f08a567..7ca3149f2f 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -13,16 +13,16 @@ * limitations under the License. 
*/ +#include "kernels/embedding_kernels.h" #include "device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/embedding_kernels.h" #include namespace FlexFlow { namespace Kernels { namespace Embedding { -void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p){ +void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { hipStream_t stream; hipLaunchKernelGGL(HIP_KERNEL_NAME(rand_generate_int), GET_BLOCKS(size), @@ -34,7 +34,7 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p){ p); } -void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p){ +void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { hipStream_t stream; hipLaunchKernelGGL(HIP_KERNEL_NAME(rand_generate_int), GET_BLOCKS(size), @@ -364,7 +364,6 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), From 696b69443167d1ebb0f37d61985f7d97d1961449 Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Mon, 15 Apr 2024 17:36:00 -0400 Subject: [PATCH 3/5] small fix to binary --- .../src/hip/element_binary_kernels.cpp | 517 ++++++++---------- 1 file changed, 243 insertions(+), 274 deletions(-) diff --git a/lib/kernels/src/hip/element_binary_kernels.cpp b/lib/kernels/src/hip/element_binary_kernels.cpp index e53285712c..017253c325 100644 --- a/lib/kernels/src/hip/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/element_binary_kernels.cpp @@ -14,13 +14,74 @@ */ #include "kernels/element_binary_kernels.h" -#include "kernels/hip_helper.h" +#include "device.h" +#include "kernels/ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/op.h" #include namespace FlexFlow { namespace Kernels { namespace ElementBinary { +using OperatorType = Op; + +__global__ void elewise_binary_backward_kernel(coord_t volume, + float const alpha, + float const beta, + OperatorType type, + float const *out_grad, + float const *lhs, + float const *rhs, + float *lhs_grad, + float *rhs_grad) { + CUDA_KERNEL_LOOP(i, volume) { + switch (type) { + case Op::EW_ADD: { + lhs_grad[i] = alpha * out_grad[i] + beta * lhs_grad[i]; + rhs_grad[i] = alpha * out_grad[i] + beta * rhs_grad[i]; + break; + } + case Op::EW_SUB: { + lhs_grad[i] = alpha * out_grad[i] + beta * lhs_grad[i]; + rhs_grad[i] = -alpha * out_grad[i] + beta * rhs_grad[i]; + break; + } + case Op::EW_MUL: { + lhs_grad[i] = alpha * out_grad[i] * rhs[i] + beta * lhs_grad[i]; + rhs_grad[i] = alpha * out_grad[i] * lhs[i] + beta * rhs_grad[i]; + break; + } + case Op::EW_DIV: { + lhs_grad[i] = alpha * out_grad[i] / rhs[i] + beta * lhs_grad[i]; + rhs_grad[i] = -alpha * out_grad[i] * lhs[i] / (rhs[i] * rhs[i]) + + beta * rhs_grad[i]; + break; + } + case Op::EW_MAX: { + lhs_grad[i] = (lhs[i] >= rhs[i]) + ? alpha * out_grad[i] + beta * lhs_grad[i] + : beta * lhs_grad[i]; + rhs_grad[i] = (rhs[i] >= lhs[i]) + ? alpha * out_grad[i] + beta * rhs_grad[i] + : beta * rhs_grad[i]; + break; + } + case Op::EW_MIN: { + lhs_grad[i] = (lhs[i] <= rhs[i]) + ? alpha * out_grad[i] + beta * lhs_grad[i] + : beta * lhs_grad[i]; + rhs_grad[i] = (rhs[i] <= lhs[i]) + ? 
alpha * out_grad[i] + beta * rhs_grad[i] + : beta * rhs_grad[i]; + break; + } + default: + assert(false); + } + } +} + ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, bool should_broadcast_lhs, @@ -82,99 +143,6 @@ ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, return per_device_state; } -__global__ void elewise_binary_forward_kernel(coord_t volume, - float const alpha, - float const beta, - OperatorType type, - float const *in1, - float const *in2, - float *out) { - switch (type) { - case Op::EW_ADD: { - CUDA_KERNEL_LOOP(i, volume) { - out[i] = alpha * (in1[i] + in2[i]) + beta * out[i]; - } - break; - } - case Op::EW_SUB: { - CUDA_KERNEL_LOOP(i, volume) { - out[i] = alpha * (in1[i] - in2[i]) + beta * out[i]; - } - break; - } - case Op::EW_MUL: { - CUDA_KERNEL_LOOP(i, volume) { - out[i] = alpha * in1[i] * in2[i] + beta * out[i]; - } - break; - } - case Op::EW_DIV: { - CUDA_KERNEL_LOOP(i, volume) { - out[i] = alpha * (in1[i] / in2[i]) + beta * out[i]; - } - break; - } - default: - assert(false); - } -} - -__global__ void elewise_binary_backward_kernel(coord_t volume, - float const alpha, - float const beta, - OperatorType type, - float const *out_grad, - float const *in1, - float const *in2, - float *in1_grad, - float *in2_grad) { - CUDA_KERNEL_LOOP(i, volume) { - switch (type) { - case Op::EW_ADD: { - in1_grad[i] = alpha * out_grad[i] + beta * in1_grad[i]; - in2_grad[i] = alpha * out_grad[i] + beta * in2_grad[i]; - break; - } - case Op::EW_SUB: { - in1_grad[i] = alpha * out_grad[i] + beta * in1_grad[i]; - in2_grad[i] = -alpha * out_grad[i] + beta * in2_grad[i]; - break; - } - case Op::EW_MUL: { - in1_grad[i] = alpha * out_grad[i] * in2[i] + beta * in1_grad[i]; - in2_grad[i] = alpha * out_grad[i] * in1[i] + beta * in2_grad[i]; - break; - } - case Op::EW_DIV: { - in1_grad[i] = alpha * out_grad[i] / in2[i] + beta * in1_grad[i]; - in2_grad[i] = -alpha * out_grad[i] * in1[i] / (in2[i] * in2[i]) + - beta * in2_grad[i]; - break; - } - case Op::EW_MAX: { - lhs_grad[i] = (lhs[i] >= rhs[i]) - ? alpha * out_grad[i] + beta * lhs_grad[i] - : beta * lhs_grad[i]; - rhs_grad[i] = (rhs[i] >= lhs[i]) - ? alpha * out_grad[i] + beta * rhs_grad[i] - : beta * rhs_grad[i]; - break; - } - case Op::EW_MIN: { - lhs_grad[i] = (lhs[i] <= rhs[i]) - ? alpha * out_grad[i] + beta * lhs_grad[i] - : beta * lhs_grad[i]; - rhs_grad[i] = (rhs[i] <= lhs[i]) - ? 
alpha * out_grad[i] + beta * rhs_grad[i] - : beta * rhs_grad[i]; - break; - } - default: - assert(false); - } - } -} - void forward_kernel(hipStream_t stream, ElementBinaryPerDeviceState const &m, float const *lhs_ptr, @@ -274,193 +242,194 @@ void forward_kernel(hipStream_t stream, out_ptr)); } } +} - void backward_kernel(hipStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *out_grad_ptr, - float const *lhs_ptr, - float const *rhs_ptr, - float *lhs_grad_ptr, - float *rhs_grad_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - bool broadcast_inputRHS, - PerDeviceFFHandle handle) { - checkCUDA(hipblasSetStream(handle.blas, stream)); - checkCUDNN(miopenSetStream(handle.dnn, stream)); +void backward_kernel(hipStream_t stream, + ElementBinaryPerDeviceState const &m, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + PerDeviceFFHandle handle) { + checkCUDA(hipblasSetStream(handle.blas, stream)); + checkCUDNN(miopenSetStream(handle.dnn, stream)); - if (m.op_type == Op::EW_ADD || m.op_type == Op::EW_SUB) { - float alpha = 1.0f, beta = 1.0f; - if (lhs_grad_ptr != nullptr) { - if (broadcast_inputLHS) { - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - miopenTensorOpAdd, - &alpha, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - } - } - if (m.op_type == Op::EW_SUB) { - alpha = -1.0f; - } - if (rhs_grad_ptr != nullptr) { - if (broadcast_inputRHS) { - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - miopenTensorOpAdd, - &alpha, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.outputTensor, - out_grad_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - } - } - } else if (m.op_type == Op::EW_MUL) { - float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f, zero = 0.0f; - if (lhs_grad_ptr != nullptr) { - if (broadcast_inputLHS) { - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.inputRHSTensor, - rhs_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha1, - m.outputTensor, - out_grad_ptr, - &zero, - m.inputLHSTensor, - lhs_grad_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.inputRHSTensor, - rhs_ptr, - &beta, - m.inputLHSTensor, - lhs_grad_ptr)); - } + if (m.op_type == Op::EW_ADD || m.op_type == Op::EW_SUB) { + float alpha = 1.0f, beta = 1.0f; + if (lhs_grad_ptr != nullptr) { + if (broadcast_inputLHS) { + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + } else { + 
checkCUDNN(miopenOpTensor(handle.dnn, + miopenTensorOpAdd, + &alpha, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); } - if (rhs_grad_ptr != nullptr) { - if (broadcast_inputRHS) { - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.inputLHSTensor, - lhs_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - checkCUDNN(miopenReduceTensor(handle.dnn, - m.reduceAddDesc, - nullptr /*indices*/, - 0 /*indicesSizeInBytes*/, - handle.workSpace, - handle.workSpaceSize, - &alpha1, - m.outputTensor, - out_grad_ptr, - &zero, - m.inputRHSTensor, - rhs_grad_ptr)); - } else { - checkCUDNN(miopenOpTensor(handle.dnn, - m.opDesc, - &alpha1, - m.outputTensor, - out_grad_ptr, - &alpha2, - m.inputLHSTensor, - lhs_ptr, - &beta, - m.inputRHSTensor, - rhs_grad_ptr)); - } + } + if (m.op_type == Op::EW_SUB) { + alpha = -1.0f; + } + if (rhs_grad_ptr != nullptr) { + if (broadcast_inputRHS) { + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + miopenTensorOpAdd, + &alpha, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.outputTensor, + out_grad_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); } - } else if (op_type == Op::EW_MIN || op_type == Op::EW_MAX) { - float alpha = 1.0f, beta = 1.0f; - miopenDataType_t data_type; - int n; - int dims[MAX_TENSOR_DIM]; - int strides[MAX_TENSOR_DIM]; - checkCUDNN(miopenGetTensorDescriptorSize(m.outputTensor, &n)); - size_t volume = 1; - for (int i = 0; i < n; i++) { - volume *= dims[i]; + } + } else if (m.op_type == Op::EW_MUL) { + float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f, zero = 0.0f; + if (lhs_grad_ptr != nullptr) { + if (broadcast_inputLHS) { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha1, + m.outputTensor, + out_grad_ptr, + &zero, + m.inputLHSTensor, + lhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputRHSTensor, + rhs_ptr, + &beta, + m.inputLHSTensor, + lhs_grad_ptr)); } - // launch hip kernel - hipLaunchKernelGGL(elewise_binary_backward_kernel, - dim3((volume + 255) / 256), - dim3(256), - 0, - stream, - volume, - alpha, - beta, - op_type, - out_grad_ptr, - lhs_ptr, - rhs_ptr, - lhs_grad_ptr, - rhs_grad_ptr); - else { - assert(false && "Unsupported ElementWise Binary Type"); + } + if (rhs_grad_ptr != nullptr) { + if (broadcast_inputRHS) { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + m.inputLHSTensor, + lhs_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); + checkCUDNN(miopenReduceTensor(handle.dnn, + m.reduceAddDesc, + nullptr /*indices*/, + 0 /*indicesSizeInBytes*/, + handle.workSpace, + handle.workSpaceSize, + &alpha1, + m.outputTensor, + out_grad_ptr, + &zero, + m.inputRHSTensor, + rhs_grad_ptr)); + } else { + checkCUDNN(miopenOpTensor(handle.dnn, + m.opDesc, + &alpha1, + m.outputTensor, + out_grad_ptr, + &alpha2, + 
m.inputLHSTensor, + lhs_ptr, + &beta, + m.inputRHSTensor, + rhs_grad_ptr)); } } + } else if (op_type == Op::EW_MIN || op_type == Op::EW_MAX) { + float alpha = 1.0f, beta = 1.0f; + miopenDataType_t data_type; + int n; + int dims[MAX_TENSOR_DIM]; + int strides[MAX_TENSOR_DIM]; + checkCUDNN(miopenGetTensorDescriptorSize(m.outputTensor, &n)); + size_t volume = 1; + for (int i = 0; i < n; i++) { + volume *= dims[i]; + } + // launch hip kernel + hipLaunchKernelGGL(elewise_binary_backward_kernel, + dim3((volume + 255) / 256), + dim3(256), + 0, + stream, + volume, + alpha, + beta, + op_type, + out_grad_ptr, + lhs_ptr, + rhs_ptr, + lhs_grad_ptr, + rhs_grad_ptr); + } else { + assert(false && "Unsupported ElementWise Binary Type"); + } +} - } // namespace ElementBinary +} // namespace ElementBinary } // namespace Kernels } // namespace FlexFlow From 2a681c28c50f65a76def354622fbe43c291212e0 Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Wed, 22 May 2024 09:53:04 -0400 Subject: [PATCH 4/5] fixed problems --- lib/kernels/src/hip/element_binary_kernels.cpp | 14 +++++++------- lib/kernels/src/hip/element_unary_kernels.cpp | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lib/kernels/src/hip/element_binary_kernels.cpp b/lib/kernels/src/hip/element_binary_kernels.cpp index 017253c325..e9589585d8 100644 --- a/lib/kernels/src/hip/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/element_binary_kernels.cpp @@ -105,16 +105,16 @@ ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, switch (op_type) { case Op::EW_ADD: case Op::EW_SUB: - mode = MIOPEN_OP_TENSOR_ADD; + mode = miopenTensorOpAdd; break; case Op::EW_MUL: - mode = MIOPEN_OP_TENSOR_MUL; + mode = miopenTensorOpMul; break; case Op::EW_MAX: - mode = MIOPEN_OP_TENSOR_MAX; + mode = miopenOpTensorMax; break; case Op::EW_MIN: - mode = MIOPEN_OP_TENSOR_MIN; + mode = miopenOpTensorMin; break; default: assert(false); @@ -280,7 +280,7 @@ void backward_kernel(hipStream_t stream, &alpha, m.outputTensor, out_grad_ptr, - &alpha2, + &alpha, m.outputTensor, out_grad_ptr, &beta, @@ -412,8 +412,8 @@ void backward_kernel(hipStream_t stream, } // launch hip kernel hipLaunchKernelGGL(elewise_binary_backward_kernel, - dim3((volume + 255) / 256), - dim3(256), + GET_BLOCKS(volume), + CUDA_NUM_THREADS, 0, stream, volume, diff --git a/lib/kernels/src/hip/element_unary_kernels.cpp b/lib/kernels/src/hip/element_unary_kernels.cpp index d6ca2054e9..e14018fa24 100644 --- a/lib/kernels/src/hip/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/element_unary_kernels.cpp @@ -65,16 +65,16 @@ ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, miopenActivationMode_t mode; switch (op_type) { case Op::SIGMOID: - mode = CUDNN_ACTIVATION_SIGMOID; + mode = miopenActivationLOGISTIC; break; case Op::RELU: - mode = CUDNN_ACTIVATION_RELU; + mode = miopenActivationRELU; break; case Op::TANH: - mode = CUDNN_ACTIVATION_TANH; + mode = miopenActivationTANH; break; case Op::ELU: - mode = CUDNN_ACTIVATION_ELU; + mode = miopenActivationELU; break; default: assert(false); @@ -235,8 +235,8 @@ struct ForwardKernel { 0, stream, num_elements, - (T)m->scalar, - m->op_type, + (T)m.scalar, + m.op_type, input.get(), output.get()); } @@ -278,8 +278,8 @@ struct BackwardKernel { 0, stream, num_elements, - m->scalar, - m->op_type, + m.scalar, + m.op_type, output.get(), output_grad.get(), input.get(), From 8dee9a3eb107610305fdfe8394b025adbb7c188c Mon Sep 17 00:00:00 2001 From: Bob Chen Date: Fri, 24 May 2024 16:43:01 -0400 Subject: [PATCH 5/5] fix 
alpha

---
 lib/kernels/src/hip/element_binary_kernels.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/kernels/src/hip/element_binary_kernels.cpp b/lib/kernels/src/hip/element_binary_kernels.cpp
index e9589585d8..232e377797 100644
--- a/lib/kernels/src/hip/element_binary_kernels.cpp
+++ b/lib/kernels/src/hip/element_binary_kernels.cpp
@@ -311,7 +311,7 @@ void backward_kernel(hipStream_t stream,
                                   &alpha,
                                   m.outputTensor,
                                   out_grad_ptr,
-                                  &alpha2,
+                                  &alpha,
                                   m.outputTensor,
                                   out_grad_ptr,
                                   &beta,
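
Note on PATCH 3/5 (the issue is also present in 1/5 and 2/5): in the
Op::EW_MIN / Op::EW_MAX branch of backward_kernel, `dims` is declared but
never filled, so the `volume` loop multiplies uninitialized values;
miopenGetTensorDescriptorSize() only retrieves the dimension count. A
minimal sketch of the presumably intended query, using MIOpen's
miopenGetTensorDescriptor() to populate `dims` and `strides`:

    // Fill dims/strides before computing volume; as posted, `dims` is
    // read uninitialized and `data_type`/`strides` are never written.
    miopenDataType_t data_type;
    int n;
    int dims[MAX_TENSOR_DIM];
    int strides[MAX_TENSOR_DIM];
    checkCUDNN(miopenGetTensorDescriptorSize(m.outputTensor, &n));
    checkCUDNN(miopenGetTensorDescriptor(
        m.outputTensor, &data_type, dims, strides));
    size_t volume = 1;
    for (int i = 0; i < n; i++) {
      volume *= dims[i];
    }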
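
PATCH 4/5 settles on miopenOpTensorMax / miopenOpTensorMin for the min/max
cases, but those identifiers do not exist in MIOpen; the enumerators the
original file used (see the lines PATCH 1/5 removes) are miopenTensorOpMax
and miopenTensorOpMin, matching the miopenTensorOpAdd / miopenTensorOpMul
spellings in the same switch. The corrected cases would read:

    case Op::EW_MAX:
      mode = miopenTensorOpMax;
      break;
    case Op::EW_MIN:
      mode = miopenTensorOpMin;
      break;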
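
In PATCH 1/5's embedding_kernels.cpp, the new backward_kernel dispatches on
`weight`, which is not in scope there (the parameter is `weight_grad`), and
the new ForwardKernel / BackwardKernel struct definitions lack their closing
semicolons. A sketch of the corrected dispatch; the <BackwardKernel>
template argument is a presumed reconstruction, since the angle-bracket text
is garbled in this copy of the series:

    DataTypeDispatch2<BackwardKernel>{}(input_data_type,
                                        output_data_type,
                                        stream,
                                        aggr,
                                        input,
                                        output,
                                        weight_grad, // was `weight`
                                        in_dim,
                                        out_dim,
                                        batch_size);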
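
The broadcast Op::EW_MUL path added to forward_kernel in PATCH 1/5 calls
cudnnSetOpTensorDescriptor with CUDNN_OP_TENSOR_* constants, which are cuDNN
symbols with no MIOpen counterpart in this HIP file. Assuming m.opDesc holds
a miopenTensorOp_t (backward_kernel already passes miopenTensorOpAdd
directly to miopenOpTensor), the multiply step could presumably pass the op
itself instead of mutating a descriptor; a sketch:

    // Replace the cudnnSetOpTensorDescriptor round-trip by passing the
    // op enum straight to miopenOpTensor for the multiply step.
    checkCUDNN(miopenOpTensor(handle.dnn,
                              miopenTensorOpMul,
                              &alpha1,
                              m.inputLHSTensor,
                              lhs_ptr,
                              &alpha2,
                              m.inputRHSTensor,
                              rhs_ptr,
                              &beta,
                              m.outputTensor,
                              out_ptr));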