diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h
index 1995505eac..99b2abb129 100644
--- a/lib/kernels/include/kernels/combine_kernels.h
+++ b/lib/kernels/include/kernels/combine_kernels.h
@@ -10,13 +10,11 @@ namespace Combine {
 
 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output,
-                    DataType dataType);
+                    GenericTensorAccessorW const &output);
 
 void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output_grad,
-                     GenericTensorAccessorW const &input_grad,
-                     DataType dataType);
+                     GenericTensorAccessorW const &input_grad);
 
 } // namespace Combine
 } // namespace Kernels
diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h
index 9d1492c0a7..9f0aa7ba95 100644
--- a/lib/kernels/include/kernels/concat_kernels.h
+++ b/lib/kernels/include/kernels/concat_kernels.h
@@ -11,14 +11,12 @@ namespace Concat {
 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorW const &output,
                     std::vector<GenericTensorAccessorR> const &inputs,
-                    int num_inputs,
-                    ff_dim_t legion_axis);
+                    ff_dim_t axis);
 
 void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output_grad,
                      std::vector<GenericTensorAccessorW> const &input_grads,
-                     int num_inputs,
-                     ff_dim_t legion_axis);
+                     ff_dim_t axis);
 
 } // namespace Concat
 } // namespace Kernels
diff --git a/lib/kernels/src/cuda/combine_kernels.cu b/lib/kernels/src/cuda/combine_kernels.cu
index e0f2f6f560..62eadf5b33 100644
--- a/lib/kernels/src/cuda/combine_kernels.cu
+++ b/lib/kernels/src/cuda/combine_kernels.cu
@@ -50,17 +50,15 @@ struct BackwardKernel {
 
 void forward_kernel(ffStream_t stream,
                     GenericTensorAccessorR const &input,
-                    GenericTensorAccessorW const &output,
-                    DataType data_type) {
-  DataTypeDispatch1<ForwardKernel>{}(data_type, stream, input, output);
+                    GenericTensorAccessorW const &output) {
+  DataTypeDispatch1<ForwardKernel>{}(input.data_type, stream, input, output);
 }
 
 void backward_kernel(ffStream_t stream,
                      GenericTensorAccessorR const &output_grad,
-                     GenericTensorAccessorW const &input_grad,
-                     DataType data_type) {
+                     GenericTensorAccessorW const &input_grad) {
   DataTypeDispatch1<BackwardKernel>{}(
-      data_type, stream, output_grad, input_grad);
+      input_grad.data_type, stream, output_grad, input_grad);
 }
 
 } // namespace Combine
diff --git a/lib/kernels/src/cuda/concat_kernels.cu b/lib/kernels/src/cuda/concat_kernels.cu
index 6e96388d22..928b7b1945 100644
--- a/lib/kernels/src/cuda/concat_kernels.cu
+++ b/lib/kernels/src/cuda/concat_kernels.cu
@@ -25,11 +25,11 @@ namespace Concat {
 
 void calc_blk_size(size_t &num_blocks,
                    size_t &blk_size,
                    ArrayShape const &shape,
-                   req<ff_dim_t> legion_axis) {
+                   ff_dim_t axis) {
   num_blocks = 1;
   blk_size = 1;
   for (int d = 0; d < shape.num_dims(); d++) {
-    if (d <= legion_axis) {
+    if (d <= axis) {
       blk_size *= shape[legion_dim_t(d)];
     } else {
       num_blocks *= shape[legion_dim_t(d)];
@@ -37,18 +37,17 @@ void calc_blk_size(size_t &num_blocks,
   }
 }
 
-void forward_kernel(ffStream_t stream,
+void forward_kernel(cudaStream_t stream,
                     GenericTensorAccessorW const &output,
                     std::vector<GenericTensorAccessorR> const &inputs,
-                    int num_inputs,
-                    ff_dim_t legion_axis) {
+                    ff_dim_t axis) {
   size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
+  int num_inputs = inputs.size();
   assert(num_inputs <= MAX_NUM_INPUTS);
-  calc_blk_size(num_blocks, output_blk_size, output.shape, legion_axis);
+  calc_blk_size(num_blocks, output_blk_size, output.shape, axis);
   for (int i = 0; i < num_inputs; i++) {
     size_t input_num_blocks = 1;
-    calc_blk_size(
-        input_num_blocks, input_blk_sizes[i], inputs[i].shape, legion_axis);
+    calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis);
     assert(input_num_blocks == num_blocks);
   }
 
@@ -66,20 +65,19 @@ void forward_kernel(ffStream_t stream,
   }
 }
 
-void backward_kernel(ffStream_t stream,
+void backward_kernel(cudaStream_t stream,
                      GenericTensorAccessorR const &output_grad,
                      std::vector<GenericTensorAccessorW> const &input_grads,
-                     int num_inputs,
-                     ff_dim_t legion_axis) {
+                     ff_dim_t axis) {
   size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
+  int num_inputs = input_grads.size();
   assert(num_inputs <= MAX_NUM_INPUTS);
-  ArrayShape shape = output_grad.shape;
-  calc_blk_size(num_blocks, output_blk_size, shape, legion_axis);
+  calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis);
   for (int i = 0; i < num_inputs; i++) {
-    shape = input_grads[i].shape;
     size_t input_num_blocks = 1;
-    calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, legion_axis);
+    calc_blk_size(
+        input_num_blocks, input_blk_sizes[i], input_grads[i].shape, axis);
     assert(input_num_blocks == num_blocks);
   }
diff --git a/lib/kernels/src/cuda/conv_2d_kernels.cu b/lib/kernels/src/cuda/conv_2d_kernels.cu
index f54b045db8..d0d4c4c3b5 100644
--- a/lib/kernels/src/cuda/conv_2d_kernels.cu
+++ b/lib/kernels/src/cuda/conv_2d_kernels.cu
@@ -268,11 +268,11 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
 
 void forward_kernel(cudaStream_t stream,
                     Conv2DPerDeviceState const &m,
-                    optional<Activation> const &activation,
                     float const *input_ptr,
                     float *output_ptr,
                     float const *filter_ptr,
-                    float const *bias_ptr) {
+                    float const *bias_ptr,
+                    optional<Activation> activation) {
 
   checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
 
   float alpha = 1.0f, beta = 0.0f;
@@ -313,14 +313,14 @@ void forward_kernel(cudaStream_t stream,
 
 void backward_kernel(cudaStream_t stream,
                      Conv2DPerDeviceState const &m,
-                     optional<Activation> const &activation,
                      float const *input_ptr,
                      float *input_grad_ptr,
                      float const *output_ptr,
                      float *output_grad_ptr,
-                     float const *kernel_ptr,
-                     float *kernel_grad_ptr,
-                     float *bias_grad_ptr) {
+                     float const *filter_ptr,
+                     float *filter_grad_ptr,
+                     float *bias_grad_ptr,
+                     optional<Activation> activation) {
 
   checkCUDNN(cudnnSetStream(m.handle.dnn, stream));
 
   float alpha = 1.0f;
@@ -355,7 +355,7 @@ void backward_kernel(cudaStream_t stream,
                                               m.handle.workSpaceSize,
                                               &alpha,
                                               m.filterDesc,
-                                              kernel_grad_ptr));
+                                              filter_grad_ptr));
   // Compute bias gradiant
   // NOTE: we use alpha for bias_grad to accumulate gradients
   if (bias_grad_ptr != NULL) {
diff --git a/lib/kernels/src/hip/combine_kernels.cpp b/lib/kernels/src/hip/combine_kernels.cpp
index 1a6fa79f10..b9a862af6d 100644
--- a/lib/kernels/src/hip/combine_kernels.cpp
+++ b/lib/kernels/src/hip/combine_kernels.cpp
@@ -19,10 +19,6 @@
 #include <hip/hip_runtime.h>
 
 namespace FlexFlow {
-
-CombinePerDeviceState::CombinePerDeviceState(FFHandler handler)
-    : PerDeviceOpState(handler) {}
-
 namespace Kernels {
 namespace Combine {
 
@@ -57,18 +53,16 @@ struct BackwardKernel {
 };
 
 void forward_kernel(ffStream_t stream,
-                    CombinePerDeviceState const *m,
                     GenericTensorAccessorR const &input,
                     GenericTensorAccessorW const &output) {
-  DataTypeDispatch1<ForwardKernel>{}(m->data_type, stream, input, output);
+  DataTypeDispatch1<ForwardKernel>{}(input.data_type, stream, input, output);
 }
 
 void backward_kernel(ffStream_t stream,
-                     CombinePerDeviceState const *m,
                      GenericTensorAccessorR const &output_grad,
                      GenericTensorAccessorW const &input_grad) {
   DataTypeDispatch1<BackwardKernel>{}(
-      m->data_type, stream, output_grad, input_grad);
+      input_grad.data_type, stream, output_grad, input_grad);
 }
 
 } // namespace Combine
diff --git a/lib/kernels/src/hip/concat_kernels.cpp b/lib/kernels/src/hip/concat_kernels.cpp
index e818f8b568..6eac034a4b 100644
--- a/lib/kernels/src/hip/concat_kernels.cpp
+++ b/lib/kernels/src/hip/concat_kernels.cpp
@@ -18,59 +18,35 @@
 #include <hip/hip_runtime.h>
 
 namespace FlexFlow {
-
-// declare Legion names
-using Legion::coord_t;
-using Legion::Rect;
-
 namespace Kernels {
 namespace Concat {
 
-void init_meta(ConcatPerDeviceState *m, int legion_axis) {
-  m->legion_axis = legion_axis;
-}
-
-template <int N>
-void calc_blk_size(coord_t &num_blocks,
-                   coord_t &blk_size,
-                   Rect<N> rect,
-                   int axis) {
+void calc_blk_size(size_t &num_blocks,
+                   size_t &blk_size,
+                   ArrayShape const &shape,
+                   ff_dim_t axis) {
   num_blocks = 1;
   blk_size = 1;
-  for (int d = 0; d < N; d++) {
+  for (int d = 0; d < shape.num_dims(); d++) {
     if (d <= axis) {
-      blk_size *= (rect.hi[d] - rect.lo[d] + 1);
+      blk_size *= shape[legion_dim_t(d)];
     } else {
-      num_blocks *= (rect.hi[d] - rect.lo[d] + 1);
+      num_blocks *= shape[legion_dim_t(d)];
     }
   }
 }
 
 void forward_kernel(hipStream_t stream,
                     GenericTensorAccessorW const &output,
-                    GenericTensorAccessorR const *inputs,
-                    int num_inputs,
-                    int axis) {
-  coord_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
+                    std::vector<GenericTensorAccessorR> const &inputs,
+                    ff_dim_t axis) {
+  size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
+  int num_inputs = inputs.size();
   assert(num_inputs <= MAX_NUM_INPUTS);
-  switch (output.domain.get_dim()) {
-#define DIMFUNC(DIM)                                                          \
-  case DIM: {                                                                 \
-    Rect<DIM> rect = output.domain;                                           \
-    calc_blk_size(num_blocks, output_blk_size, rect, axis);                   \
-    for (int i = 0; i < num_inputs; i++) {                                    \
-      rect = inputs[i].domain;                                                \
-      coord_t input_num_blocks = 1;                                           \
-      calc_blk_size(input_num_blocks, input_blk_sizes[i], rect, axis);        \
-      assert(input_num_blocks == num_blocks);                                 \
-    }                                                                         \
-    break;                                                                    \
-  }
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      fprintf(stderr, "Unsupported concat dimension number");
-      assert(false);
+  calc_blk_size(num_blocks, output_blk_size, output.shape, axis);
+  for (int i = 0; i < num_inputs; i++) {
+    size_t input_num_blocks = 1;
+    calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis);
+    assert(input_num_blocks == num_blocks);
   }
 
   off_t offset = 0;
@@ -89,31 +65,19 @@ void forward_kernel(hipStream_t stream,
   }
 }
 
-void backward_kernel(ffStream_t stream,
+void backward_kernel(hipStream_t stream,
                      GenericTensorAccessorR const &output_grad,
-                     GenericTensorAccessorW const *input_grads,
-                     int num_inputs,
-                     int axis) {
-  coord_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
+                     std::vector<GenericTensorAccessorW> const &input_grads,
+                     ff_dim_t axis) {
+  size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
+  int num_inputs = input_grads.size();
   assert(num_inputs <= MAX_NUM_INPUTS);
-  switch (output_grad.domain.get_dim()) {
-#define DIMFUNC(DIM)                                                          \
-  case DIM: {                                                                 \
-    Rect<DIM> rect = output_grad.domain;                                      \
-    calc_blk_size(num_blocks, output_blk_size, rect, axis);                   \
-    for (int i = 0; i < num_inputs; i++) {                                    \
-      rect = input_grads[i].domain;                                           \
-      coord_t input_num_blocks = 1;                                           \
-      calc_blk_size(input_num_blocks, input_blk_sizes[i], rect, axis);        \
-      assert(input_num_blocks == num_blocks);                                 \
-    }                                                                         \
-    break;                                                                    \
-  }
-    LEGION_FOREACH_N(DIMFUNC)
-#undef DIMFUNC
-    default:
-      fprintf(stderr, "Unsupported concat dimension number");
-      assert(false);
+  calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis);
+  for (int i = 0; i < num_inputs; i++) {
+    size_t input_num_blocks = 1;
+    calc_blk_size(
+        input_num_blocks, input_blk_sizes[i], input_grads[i].shape, axis);
+    assert(input_num_blocks == num_blocks);
   }
 
   off_t offset = 0;
@@ -130,12 +94,6 @@ void backward_kernel(ffStream_t stream,
                        output_blk_size);
     offset += input_blk_sizes[i];
   }
-
-  // Rect<2> output_rect(Point<2>(0, 0), Point<2>(output_blk_size-1, batch_size
-  // - 1)); Rect<2> input_rect(Point<2>(0, 0), Point<2>(input_blk_sizes[0]-1,
-  // batch_size - 1)); print_tensor<2, float>(output_grad - output_blk_size,
-  // output_rect, "[Concat:backward:output]"); print_tensor<2,
-  // float>(input_grads[0], input_rect, "[Concat:backward:input0]");
 }
 
 } // namespace Concat
diff --git a/lib/kernels/src/hip/conv_2d_kernels.cpp b/lib/kernels/src/hip/conv_2d_kernels.cpp
index 761a59b545..4f5d2f1644 100644
--- a/lib/kernels/src/hip/conv_2d_kernels.cpp
+++ b/lib/kernels/src/hip/conv_2d_kernels.cpp
@@ -179,14 +179,10 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
   // Require that input_c is divisible by conv->groups
   assert(input_c % groups == 0);
 
-  checkCUDNN(miopenSet4dTensorDescriptor(m->filterDesc,
-                                         miopenFloat,
-                                         output_c,
-                                         input_c / groups,
-                                         kernel_h,
-                                         kernel_w));
+  checkCUDNN(miopenSet4dTensorDescriptor(
+      filterDesc, miopenFloat, output_c, input_c / groups, kernel_h, kernel_w));
 
-  checkCUDNN(miopenInitConvolutionDescriptor(m->convDesc,
+  checkCUDNN(miopenInitConvolutionDescriptor(convDesc,
                                              miopenConvolution,
                                              pad_h, // conv->padding_h,
                                              pad_w, // conv->padding_w,
@@ -195,11 +191,11 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
                                              1 /*upscale_x*/,
                                              1 /*upscale_y*/));
   if (groups != 1) {
-    checkCUDNN(miopenSetConvolutionGroupCount(m->convDesc, groups));
+    checkCUDNN(miopenSetConvolutionGroupCount(convDesc, groups));
   }
 
   // TODO: enable tensor core when possible
-  if (m->handle.allowTensorOpMathConversion) {
+  if (handle.allowTensorOpMathConversion) {
     // checkCUDNN(hipdnnSetConvolutionMathType(m->convDesc,
     // CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
   } else {
@@ -209,72 +205,75 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle,
 
   int n, c, h, w;
   checkCUDNN(miopenGetConvolutionForwardOutputDim(
-      m->convDesc, m->inputTensor, m->filterDesc, &n, &c, &h, &w));
+      convDesc, inputTensor, filterDesc, &n, &c, &h, &w));
   assert(n == output_n);
   assert(c == output_c);
   assert(h == output_h);
   assert(w == output_w);
 
   checkCUDNN(
-      miopenSet4dTensorDescriptor(m->outputTensor, miopenFloat, n, c, h, w));
+      miopenSet4dTensorDescriptor(outputTensor, miopenFloat, n, c, h, w));
 
-  float time;
   // select forward algorithm
-  m->fwdAlgo = selectConvolutionForwardAlgorithm(m->handle.dnn,
-                                                 m->inputTensor,
-                                                 input_ptr,
-                                                 m->filterDesc,
-                                                 kernel_ptr,
-                                                 m->convDesc,
-                                                 m->handle.workSpace,
-                                                 m->handle.workSpaceSize,
-                                                 m->outputTensor,
-                                                 output_ptr,
-                                                 &time);
-  if (forward_time != nullptr) {
-    *forward_time += time;
-  }
+  fwdAlgo = selectConvolutionForwardAlgorithm(handle.dnn,
+                                              inputTensor,
+                                              input.get_float_ptr(),
+                                              filterDesc,
+                                              filter_ptr,
+                                              convDesc,
+                                              handle.workSpace,
+                                              handle.workSpaceSize,
+                                              outputTensor,
+                                              output.get_float_ptr(),
+                                              nullptr);
   // select backward filter algorithm
-  m->bwdFilterAlgo =
-      selectConvolutionBackwardFilterAlgorithm(m->handle.dnn,
-                                               m->inputTensor,
-                                               input_ptr,
-                                               m->outputTensor,
-                                               output_ptr,
-                                               m->convDesc,
-                                               m->handle.workSpace,
-                                               m->handle.workSpaceSize,
-                                               m->filterDesc,
-                                               kernel_grad_ptr,
-                                               &time);
-  if (backward_time != nullptr) {
-    *backward_time += time;
-  }
+  bwdFilterAlgo =
+      selectConvolutionBackwardFilterAlgorithm(handle.dnn,
+                                               inputTensor,
+                                               input.get_float_ptr(),
+                                               outputTensor,
+                                               output.get_float_ptr(),
+                                               convDesc,
+                                               handle.workSpace,
+                                               handle.workSpaceSize,
+                                               filterDesc,
+                                               filter_grad_ptr,
+                                               nullptr);
   // select backward data algorithm
-  m->bwdDataAlgo =
-      selectConvolutionBackwardDataAlgorithm(m->handle.dnn,
-                                             m->filterDesc,
-                                             kernel_ptr,
-                                             m->outputTensor,
-                                             output_ptr,
-                                             m->convDesc,
-                                             m->handle.workSpace,
-                                             m->handle.workSpaceSize,
-                                             m->inputTensor,
-                                             (float *)input_ptr,
-                                             &time);
-  if (backward_time != nullptr) {
-    *backward_time += time;
-  }
-  if (m->relu) {
+  bwdDataAlgo =
+      selectConvolutionBackwardDataAlgorithm(handle.dnn,
+                                             filterDesc,
+                                             filter_ptr,
+                                             outputTensor,
+                                             output.get_float_ptr(),
+                                             convDesc,
+                                             handle.workSpace,
+                                             handle.workSpaceSize,
+                                             inputTensor,
+                                             (void *)input.get_float_ptr(),
+                                             nullptr);
+  if (activation.has_value()) {
     checkCUDNN(miopenSetActivationDescriptor(
-        m->actiDesc, miopenActivationRELU, 0.0, 0.0, 0.0));
+        actiDesc, miopenActivationRELU, 0.0, 0.0, 0.0));
   }
+
+  Conv2DPerDeviceState per_device_state = {handle,
+                                           inputTensor,
+                                           biasTensor,
+                                           outputTensor,
+                                           filterDesc,
+                                           actiDesc,
+                                           convDesc,
+                                           fwdAlgo,
+                                           bwdFilterAlgo,
+                                           bwdDataAlgo};
+  return per_device_state;
 }
 
-void forward_kernel(ffStream_t stream,
+void forward_kernel(hipStream_t stream,
                     Conv2DPerDeviceState const &m,
                     float const *input_ptr,
                     float *output_ptr,
@@ -282,46 +281,46 @@ void forward_kernel(ffStream_t stream,
                     float const *bias_ptr,
                     optional<Activation> activation) {
 
-  checkCUDNN(miopenSetStream(m->handle.dnn, stream));
+  checkCUDNN(miopenSetStream(m.handle.dnn, stream));
 
   float alpha = 1.0f, beta = 0.0f;
-  checkCUDNN(miopenConvolutionForward(m->handle.dnn,
+  checkCUDNN(miopenConvolutionForward(m.handle.dnn,
                                       &alpha,
-                                      m->inputTensor,
+                                      m.inputTensor,
                                       input_ptr,
-                                      m->filterDesc,
+                                      m.filterDesc,
                                       filter_ptr,
-                                      m->convDesc,
-                                      m->fwdAlgo,
+                                      m.convDesc,
+                                      m.fwdAlgo,
                                       &beta,
-                                      m->outputTensor,
+                                      m.outputTensor,
                                       output_ptr,
-                                      m->handle.workSpace,
-                                      m->handle.workSpaceSize));
+                                      m.handle.workSpace,
+                                      m.handle.workSpaceSize));
 
   // use_bias == True
   if (bias_ptr != NULL) {
-    checkCUDNN(miopenConvolutionForwardBias(m->handle.dnn,
+    checkCUDNN(miopenConvolutionForwardBias(m.handle.dnn,
                                             &alpha,
-                                            m->biasTensor,
+                                            m.biasTensor,
                                             bias_ptr,
                                             &alpha,
-                                            m->outputTensor,
+                                            m.outputTensor,
                                             output_ptr));
   }
-  if (m->relu) {
-    checkCUDNN(miopenActivationForward(m->handle.dnn,
-                                       m->actiDesc,
+  if (activation.has_value()) {
+    checkCUDNN(miopenActivationForward(m.handle.dnn,
+                                       m.actiDesc,
                                        &alpha,
-                                       m->outputTensor,
+                                       m.outputTensor,
                                        output_ptr,
                                        &beta,
-                                       m->outputTensor,
+                                       m.outputTensor,
                                        output_ptr));
   }
 }
 
-void backward_kernel(ffStream_t stream,
+void backward_kernel(hipStream_t stream,
                      Conv2DPerDeviceState const &m,
                      float const *input_ptr,
                      float *input_grad_ptr,
@@ -332,14 +331,14 @@ void backward_kernel(ffStream_t stream,
                      float *bias_grad_ptr,
                      optional<Activation> activation) {
 
-  checkCUDNN(miopenSetStream(m->handle.dnn, stream));
+  checkCUDNN(miopenSetStream(m.handle.dnn, stream));
 
   float alpha = 1.0f;
   float beta = 0.0f;
-  if (m->relu) {
+  if (activation.has_value()) {
     miopenDataType_t dataType;
     int n, c, h, w, nStride, cStride, hStride, wStride;
-    checkCUDNN(miopenGet4dTensorDescriptor(m->outputTensor,
+    checkCUDNN(miopenGet4dTensorDescriptor(m.outputTensor,
                                            &dataType,
                                            &n,
                                            &c,
@@ -360,46 +359,46 @@ void backward_kernel(ffStream_t stream,
   }
   // Compute filter gradiant
   // NOTE: we use alpha for kernel_grad to accumulate gradients
-  checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn,
+  checkCUDNN(miopenConvolutionBackwardWeights(m.handle.dnn,
                                               &alpha,
-                                              m->outputTensor,
+                                              m.outputTensor,
                                               output_grad_ptr,
-                                              m->inputTensor,
+                                              m.inputTensor,
                                               input_ptr,
-                                              m->convDesc,
-                                              m->bwdFilterAlgo,
+                                              m.convDesc,
+                                              m.bwdFilterAlgo,
                                               &beta,
-                                              m->filterDesc,
-                                              kernel_grad_ptr,
-                                              m->handle.workSpace,
-                                              m->handle.workSpaceSize));
+                                              m.filterDesc,
+                                              filter_grad_ptr,
+                                              m.handle.workSpace,
+                                              m.handle.workSpaceSize));
   // Compute bias gradiant
   // NOTE: we use alpha for bias_grad to accumulate gradients
   if (bias_grad_ptr != NULL) {
-    checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn,
+    checkCUDNN(miopenConvolutionBackwardBias(m.handle.dnn,
                                              &alpha,
-                                             m->outputTensor,
+                                             m.outputTensor,
                                              output_grad_ptr,
                                              &beta,
-                                             m->biasTensor,
+                                             m.biasTensor,
                                              bias_grad_ptr));
   }
   // Compute data gradiant
   // NOTE: we use alpha for input_grad to accumulate gradients
   if (input_grad_ptr != NULL) {
-    checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn,
+    checkCUDNN(miopenConvolutionBackwardData(m.handle.dnn,
                                              &alpha,
-                                             m->outputTensor,
+                                             m.outputTensor,
                                              output_grad_ptr,
-                                             m->filterDesc,
-                                             kernel_ptr,
-                                             m->convDesc,
-                                             m->bwdDataAlgo,
+                                             m.filterDesc,
+                                             filter_ptr,
+                                             m.convDesc,
+                                             m.bwdDataAlgo,
                                              &beta,
-                                             m->inputTensor,
+                                             m.inputTensor,
                                              input_grad_ptr,
-                                             m->handle.workSpace,
-                                             m->handle.workSpaceSize));
+                                             m.handle.workSpace,
+                                             m.handle.workSpaceSize));
   }
 }
diff --git a/lib/kernels/src/hip/element_binary_kernels.cpp b/lib/kernels/src/hip/element_binary_kernels.cpp
index fabe7821c7..5d29c27837 100644
--- a/lib/kernels/src/hip/element_binary_kernels.cpp
+++ b/lib/kernels/src/hip/element_binary_kernels.cpp
@@ -18,52 +18,68 @@
 #include <hip/hip_runtime.h>
 
 namespace FlexFlow {
-// declare Legion names
-using Legion::coord_t;
-using Legion::Domain;
-
-ElementBinaryPerDeviceState::ElementBinaryPerDeviceState(FFHandler handler)
-    : OpPerDeviceState(handler) {
-  checkCUDNN(miopenCreateTensorDescriptor(&input1Tensor));
-  checkCUDNN(miopenCreateTensorDescriptor(&input2Tensor));
-  checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
-  checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceAddDesc));
-  op_type = OP_NOOP;
-}
-
 namespace Kernels {
 namespace ElementBinary {
 
-/*static*/
-void init_kernel(ElementBinaryPerDeviceState *m,
-                 Domain const &input1_domain,
-                 Domain const &input2_domain,
-                 Domain const &output_domain) {
+ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle,
+                                        OperatorType op_type,
+                                        bool should_broadcast_lhs,
+                                        bool should_broadcast_rhs,
+                                        ArrayShape lhs_shape,
+                                        ArrayShape rhs_shape,
+                                        ArrayShape output_shape) {
+  ffTensorDescriptor_t inputLHSTensor;
+  ffTensorDescriptor_t inputRHSTensor;
+  ffTensorDescriptor_t outputTensor;
+  ffOpTensorDescriptor_t opDesc;
+  ffReduceTensorDescriptor_t reduceAddDesc;
   miopenTensorOp_t mode;
-  switch (m->op_type) {
-    case OP_EW_ADD:
-    case OP_EW_SUB:
+
+  checkCUDNN(miopenCreateTensorDescriptor(&inputLHSTensor));
+  checkCUDNN(miopenCreateTensorDescriptor(&inputRHSTensor));
+  checkCUDNN(miopenCreateTensorDescriptor(&outputTensor));
+  checkCUDNN(miopenCreateOpTensorDescriptor(&opDesc));
+  checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceAddDesc));
+
+  switch (op_type) {
+    case Op::EW_ADD:
+    case Op::EW_SUB:
       mode = miopenTensorOpAdd;
       break;
-    case OP_EW_MUL:
+    case Op::EW_MUL:
       mode = miopenTensorOpMul;
       break;
+    case Op::EW_MAX:
+      mode = miopenTensorOpMax;
+      break;
+    case Op::EW_MIN:
+      mode = miopenTensorOpMin;
+      break;
     default:
      assert(false);
   }
-  m->opDesc = mode;
-  checkCUDNN(miopenSetReduceTensorDescriptor(m->reduceAddDesc,
+  checkCUDNN(miopenSetOpTensorDescriptor(
+      opDesc, mode, miopenFloat, MIOPEN_PROPAGATE_NAN));
+  checkCUDNN(miopenSetReduceTensorDescriptor(reduceAddDesc,
                                              MIOPEN_REDUCE_TENSOR_ADD,
                                              miopenFloat,
                                              MIOPEN_PROPAGATE_NAN,
                                              MIOPEN_REDUCE_TENSOR_NO_INDICES,
                                              MIOPEN_32BIT_INDICES));
   checkCUDNN(
-      cudnnSetTensorDescriptorFromDomain(m->input1Tensor, input1_domain));
+      miopenSetTensorDescriptorFromArrayShape(inputLHSTensor, lhs_shape));
   checkCUDNN(
-      cudnnSetTensorDescriptorFromDomain(m->input2Tensor, input2_domain));
+      miopenSetTensorDescriptorFromArrayShape(inputRHSTensor, rhs_shape));
   checkCUDNN(
-      cudnnSetTensorDescriptorFromDomain(m->outputTensor, output_domain));
+      miopenSetTensorDescriptorFromArrayShape(outputTensor, output_shape));
+
+  ElementBinaryPerDeviceState per_device_state = {handle,
+                                                  inputLHSTensor,
+                                                  inputRHSTensor,
+                                                  outputTensor,
+                                                  opDesc,
+                                                  reduceAddDesc};
+  return per_device_state;
 }
 
 __global__ void elewise_binary_forward_kernel(coord_t volume,
@@ -74,25 +90,25 @@ __global__ void elewise_binary_forward_kernel(coord_t volume,
                                               float const *in2,
                                               float *out) {
   switch (type) {
-    case OP_EW_ADD: {
+    case Op::EW_ADD: {
       CUDA_KERNEL_LOOP(i, volume) {
         out[i] = alpha * (in1[i] + in2[i]) + beta * out[i];
       }
       break;
     }
-    case OP_EW_SUB: {
+    case Op::EW_SUB: {
       CUDA_KERNEL_LOOP(i, volume) {
         out[i] = alpha * (in1[i] - in2[i]) + beta * out[i];
       }
       break;
     }
-    case OP_EW_MUL: {
+    case Op::EW_MUL: {
       CUDA_KERNEL_LOOP(i, volume) {
         out[i] = alpha * in1[i] * in2[i] + beta * out[i];
       }
       break;
     }
-    case OP_EW_DIV: {
+    case Op::EW_DIV: {
       CUDA_KERNEL_LOOP(i, volume) {
         out[i] = alpha * (in1[i] / in2[i]) + beta * out[i];
       }
@@ -114,22 +130,22 @@ __global__ void elewise_binary_backward_kernel(coord_t volume,
                                                float *in2_grad) {
   CUDA_KERNEL_LOOP(i, volume) {
     switch (type) {
-      case OP_EW_ADD: {
+      case Op::EW_ADD: {
         in1_grad[i] = alpha * out_grad[i] + beta * in1_grad[i];
         in2_grad[i] = alpha * out_grad[i] + beta * in2_grad[i];
         break;
       }
-      case OP_EW_SUB: {
+      case Op::EW_SUB: {
         in1_grad[i] = alpha * out_grad[i] + beta * in1_grad[i];
         in2_grad[i] = -alpha * out_grad[i] + beta * in2_grad[i];
         break;
       }
-      case OP_EW_MUL: {
+      case Op::EW_MUL: {
         in1_grad[i] = alpha * out_grad[i] * in2[i] + beta * in1_grad[i];
         in2_grad[i] = alpha * out_grad[i] * in1[i] + beta * in2_grad[i];
         break;
       }
-      case OP_EW_DIV: {
+      case Op::EW_DIV: {
         in1_grad[i] = alpha * out_grad[i] / in2[i] + beta * in1_grad[i];
         in2_grad[i] = -alpha * out_grad[i] * in1[i] / (in2[i] * in2[i]) +
                       beta * in2_grad[i];
@@ -143,163 +159,170 @@ __global__ void elewise_binary_backward_kernel(coord_t volume,
 
 void forward_kernel(hipStream_t stream,
-                    ElementBinaryPerDeviceState const *m,
-                    float const *in1_ptr,
-                    float const *in2_ptr,
-                    float *out_ptr) {
-  checkCUDA(hipblasSetStream(m->handle.blas, stream));
-  checkCUDNN(miopenSetStream(m->handle.dnn, stream));
+                    ElementBinaryPerDeviceState const &m,
+                    float const *lhs_ptr,
+                    float const *rhs_ptr,
+                    float *out_ptr,
+                    OperatorType op_type,
+                    bool broadcast_inputLHS,
+                    PerDeviceFFHandle handle) {
+  checkCUDA(hipblasSetStream(handle.blas, stream));
+  checkCUDNN(miopenSetStream(handle.dnn, stream));
 
   float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f;
-  switch (m->op_type) {
-    case OP_EW_SUB:
+  switch (op_type) {
+    case Op::EW_SUB:
       alpha2 = -1.0f;
       break;
-    case OP_EW_ADD:
-    case OP_EW_MUL:
+    case Op::EW_ADD:
+    case Op::EW_MUL:
       break;
     default:
       assert(false);
   }
   // cudnn currently does not support broadcasting the first input in
   // cudnnOpTensor
-  if (m->broadcast_input1) {
+  if (broadcast_inputLHS) {
     // currently only handle add and sub
-    assert(m->op_type == OP_EW_SUB || m->op_type == OP_EW_ADD);
-    checkCUDNN(miopenOpTensor(m->handle.dnn,
-                              m->opDesc,
+    assert(op_type == Op::EW_SUB || op_type == Op::EW_ADD);
+    checkCUDNN(miopenOpTensor(handle.dnn,
+                              m.opDesc,
                               &beta,
-                              m->outputTensor,
+                              m.outputTensor,
                               out_ptr,
                               &alpha1,
-                              m->input1Tensor,
-                              in1_ptr,
+                              m.inputLHSTensor,
+                              lhs_ptr,
                               &beta,
-                              m->outputTensor,
+                              m.outputTensor,
                               out_ptr));
-    checkCUDNN(miopenOpTensor(m->handle.dnn,
-                              m->opDesc,
+    checkCUDNN(miopenOpTensor(handle.dnn,
+                              m.opDesc,
                               &beta,
-                              m->outputTensor,
+                              m.outputTensor,
                               out_ptr,
                               &alpha2,
-                              m->input2Tensor,
-                              in2_ptr,
+                              m.inputRHSTensor,
+                              rhs_ptr,
                               &alpha1,
-                              m->outputTensor,
+                              m.outputTensor,
                               out_ptr));
   } else {
-    checkCUDNN(miopenOpTensor(m->handle.dnn,
-                              m->opDesc,
+    checkCUDNN(miopenOpTensor(handle.dnn,
+                              m.opDesc,
                               &alpha1,
-                              m->input1Tensor,
-                              in1_ptr,
+                              m.inputLHSTensor,
+                              lhs_ptr,
                               &alpha2,
-                              m->input2Tensor,
-                              in2_ptr,
+                              m.inputRHSTensor,
+                              rhs_ptr,
                               &beta,
-                              m->outputTensor,
+                              m.outputTensor,
                               out_ptr));
   }
 }
 
 void backward_kernel(hipStream_t stream,
-                     ElementBinaryPerDeviceState const *m,
+                     ElementBinaryPerDeviceState const &m,
                      float const *out_grad_ptr,
-                     float const *in1_ptr,
-                     float const *in2_ptr,
-                     float *in1_grad_ptr,
-                     float *in2_grad_ptr) {
-  checkCUDA(hipblasSetStream(m->handle.blas, stream));
-  checkCUDNN(miopenSetStream(m->handle.dnn, stream));
+                     float const *lhs_ptr,
+                     float const *rhs_ptr,
+                     float *lhs_grad_ptr,
+                     float *rhs_grad_ptr,
+                     OperatorType op_type,
+                     bool broadcast_inputLHS,
+                     bool broadcast_inputRHS,
+                     PerDeviceFFHandle handle) {
+  checkCUDA(hipblasSetStream(handle.blas, stream));
+  checkCUDNN(miopenSetStream(handle.dnn, stream));
 
-  if (m->op_type == OP_EW_ADD || m->op_type == OP_EW_SUB) {
+  if (op_type == Op::EW_ADD || op_type == Op::EW_SUB) {
     float alpha = 1.0f, alpha2 = 0.0f, beta = 1.0f;
-    if (in1_grad_ptr != nullptr) {
-      if (m->broadcast_input1) {
-        checkCUDNN(miopenReduceTensor(m->handle.dnn,
-                                      m->reduceAddDesc,
+    if (lhs_grad_ptr != nullptr) {
+      if (broadcast_inputLHS) {
+        checkCUDNN(miopenReduceTensor(handle.dnn,
+                                      m.reduceAddDesc,
                                       nullptr /*indices*/,
                                       0 /*indicesSizeInBytes*/,
-                                      m->handle.workSpace,
-                                      m->handle.workSpaceSize,
+                                      handle.workSpace,
+                                      handle.workSpaceSize,
                                       &alpha,
-                                      m->outputTensor,
+                                      m.outputTensor,
                                       out_grad_ptr,
                                       &beta,
-                                      m->input1Tensor,
-                                      in1_grad_ptr));
+                                      m.inputLHSTensor,
+                                      lhs_grad_ptr));
       } else {
-        checkCUDNN(miopenOpTensor(m->handle.dnn,
+        checkCUDNN(miopenOpTensor(handle.dnn,
                                   miopenTensorOpAdd,
                                   &alpha,
-                                  m->outputTensor,
+                                  m.outputTensor,
                                   out_grad_ptr,
                                   &alpha2,
-                                  m->outputTensor,
+                                  m.outputTensor,
                                   out_grad_ptr,
                                   &beta,
-                                  m->input1Tensor,
-                                  in1_grad_ptr));
+                                  m.inputLHSTensor,
+                                  lhs_grad_ptr));
       }
     }
-    if (m->op_type == OP_EW_SUB) {
+    if (op_type == Op::EW_SUB) {
      alpha = -1.0f;
    }
-    if (in2_grad_ptr != nullptr) {
-      if (m->broadcast_input2) {
-        checkCUDNN(miopenReduceTensor(m->handle.dnn,
-                                      m->reduceAddDesc,
+    if (rhs_grad_ptr != nullptr) {
+      if (broadcast_inputRHS) {
+        checkCUDNN(miopenReduceTensor(handle.dnn,
+                                      m.reduceAddDesc,
                                       nullptr /*indices*/,
                                       0 /*indicesSizeInBytes*/,
-                                      m->handle.workSpace,
-                                      m->handle.workSpaceSize,
+                                      handle.workSpace,
+                                      handle.workSpaceSize,
                                       &alpha,
-                                      m->outputTensor,
+                                      m.outputTensor,
                                       out_grad_ptr,
                                       &beta,
-                                      m->input2Tensor,
-                                      in2_grad_ptr));
+                                      m.inputRHSTensor,
+                                      rhs_grad_ptr));
       } else {
-        checkCUDNN(miopenOpTensor(m->handle.dnn,
+        checkCUDNN(miopenOpTensor(handle.dnn,
                                   miopenTensorOpAdd,
                                   &alpha,
-                                  m->outputTensor,
+                                  m.outputTensor,
                                   out_grad_ptr,
                                   &alpha2,
-                                  m->outputTensor,
+                                  m.outputTensor,
                                   out_grad_ptr,
                                   &beta,
-                                  m->input2Tensor,
-                                  in2_grad_ptr));
+                                  m.inputRHSTensor,
+                                  rhs_grad_ptr));
       }
     }
-  } else if (m->op_type == OP_EW_MUL) {
+  } else if (op_type == Op::EW_MUL) {
     float alpha1 = 1.0f, alpha2 = 1.0f, beta = 1.0f;
-    if (in1_grad_ptr != nullptr) {
-      checkCUDNN(miopenOpTensor(m->handle.dnn,
-                                m->opDesc,
+    if (lhs_grad_ptr != nullptr) {
+      checkCUDNN(miopenOpTensor(handle.dnn,
+                                m.opDesc,
                                 &alpha1,
-                                m->outputTensor,
+                                m.outputTensor,
                                 out_grad_ptr,
                                 &alpha2,
-                                m->input2Tensor,
-                                in2_ptr,
+                                m.inputRHSTensor,
+                                rhs_ptr,
                                 &beta,
-                                m->input1Tensor,
-                                in1_grad_ptr));
+                                m.inputLHSTensor,
+                                lhs_grad_ptr));
     }
-    if (in2_grad_ptr != nullptr) {
-      checkCUDNN(miopenOpTensor(m->handle.dnn,
-                                m->opDesc,
+    if (rhs_grad_ptr != nullptr) {
+      checkCUDNN(miopenOpTensor(handle.dnn,
+                                m.opDesc,
                                 &alpha1,
-                                m->outputTensor,
+                                m.outputTensor,
                                 out_grad_ptr,
                                 &alpha2,
-                                m->input2Tensor,
-                                in1_ptr,
+                                m.inputRHSTensor,
+                                lhs_ptr,
                                 &beta,
-                                m->input1Tensor,
-                                in2_grad_ptr));
+                                m.inputLHSTensor,
+                                rhs_grad_ptr));
     }
   } else {
     assert(false && "Unsupported ElementWise Binary Type");
diff --git a/lib/runtime/src/ops/combine.cc b/lib/runtime/src/ops/combine.cc
index a5f4d4f73b..46d5ebb4fe 100644
--- a/lib/runtime/src/ops/combine.cc
+++ b/lib/runtime/src/ops/combine.cc
@@ -56,8 +56,7 @@ static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
                  profiling,
                  "[Combine] forward_time = %.2lfms\n",
                  input,
-                 output,
-                 input.data_type);
+                 output);
 }
 
 static void forward_task(Task const *task,
@@ -78,8 +77,7 @@ static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
                  profiling,
-                 "[Combine] forward_time = %.2lfms\n",
+                 "[Combine] backward_time = %.2lfms\n",
                  input_grad,
-                 output_grad,
-                 input_grad.data_type);
+                 output_grad);
 }
 
 static void backward_task(Task const *task,
diff --git a/lib/runtime/src/ops/concat.cc b/lib/runtime/src/ops/concat.cc
index 0b276f405a..1ce549cc57 100644
--- a/lib/runtime/src/ops/concat.cc
+++ b/lib/runtime/src/ops/concat.cc
@@ -62,7 +62,6 @@ static optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
                  "[Concat] forward_time = %.2lfms\n",
                  output,
                  inputs,
-                 attrs.num_inputs,
                  attrs.axis);
 }
 
@@ -88,7 +87,6 @@ static optional<float> backward_task_impl(TaskArgumentAccessor const &acc) {
                  "[Concat] backward_time = %.2lfms\n",
                  output_grad,
                  input_grads,
-                 attrs.num_inputs,
                  attrs.axis);
 }
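
For reviewers, a minimal sketch of how a call site changes under the new Concat signatures. The wrapper function below is hypothetical and not part of this patch; only the Kernels::Concat API it calls comes from this diff.

// Hypothetical caller: `launch_concat_forward` is illustrative only.
#include "kernels/concat_kernels.h"

#include <vector>

using namespace FlexFlow;

void launch_concat_forward(ffStream_t stream,
                           GenericTensorAccessorW const &output,
                           std::vector<GenericTensorAccessorR> const &inputs,
                           ff_dim_t axis) {
  // Before this patch the caller also passed `num_inputs` (and, for
  // Combine, a DataType). The kernel now derives the count from
  // inputs.size() and asserts it is <= MAX_NUM_INPUTS internally.
  Kernels::Concat::forward_kernel(stream, output, inputs, axis);
}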