diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 15369b2e13..c65c2befb8 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -34,7 +34,10 @@ class GenericTensorAccessorW { ArrayShape shape; req ptr; }; -FF_VISITABLE_STRUCT(GenericTensorAccessorW, data_type, shape, ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, + data_type, + shape, + ptr); class GenericTensorAccessorR { public: @@ -59,7 +62,10 @@ class GenericTensorAccessorR { ArrayShape shape; req ptr; }; -FF_VISITABLE_STRUCT(GenericTensorAccessorR, data_type, shape, ptr); +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, + data_type, + shape, + ptr); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 50b3c0601f..e8ae6fd014 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,66 +1,74 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H +#include "kernels/accessor.h" #include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "op-attrs/activation.h" +#include "utils/visitable.h" namespace FlexFlow { -class Conv2DPerDeviceState : public PerDeviceOpState { -public: - Conv2DPerDeviceState(FFHandler handler); - ffTensorDescriptor_t inputTensor, biasTensor, outputTensor; +struct Conv2DPerDeviceState { + PerDeviceFFHandle handle; + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t biasTensor; + ffTensorDescriptor_t outputTensor; ffFilterDescriptor_t filterDesc; ffActivationDescriptor_t actiDesc; ffConvolutionDescriptor_t convDesc; ffConvolutionFwdAlgo_t fwdAlgo; ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; - ffConvolutionBwdDataAlgo_t bwdDataAlgo; - bool relu, use_bias; - char op_name[MAX_OPNAME]; + req bwdDataAlgo; }; +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, + handle, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo); + namespace Kernels { namespace Conv2D { -void init_kernel(Conv2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int kernel_h, - int kernel_w, - int groups, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - float const *input_ptr, - float *output_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *forward_time = nullptr, - float *backward_time = nullptr); +Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + optional activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); void forward_kernel(ffStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, float const *input_ptr, float *output_ptr, float const *filter_ptr, - float const *bias_ptr); + float const *bias_ptr, + optional activation); void backward_kernel(ffStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, float const *input_ptr, float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr); + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + optional activation); } // namespace Conv2D } // namespace Kernels diff --git a/lib/kernels/include/kernels/device.h b/lib/kernels/include/kernels/device.h index 663f5c2b3f..b1571da1b3 100644 --- a/lib/kernels/include/kernels/device.h +++ b/lib/kernels/include/kernels/device.h @@ -25,8 +25,6 @@ #include #include -namespace FlexFlow { - #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) typedef cudaStream_t ffStream_t; cudaError_t get_legion_stream(cudaStream_t *stream); @@ -79,6 +77,8 @@ typedef hipError_t ffError_t; #error "Unknown device" #endif +namespace FlexFlow { + #define FatalError(s) \ do { \ std::stringstream _where, _message; \ diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 9df2f376e2..e15ee4ff32 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -6,6 +6,7 @@ #endif #include "kernels/device.h" +#include "utils/visitable.h" namespace FlexFlow { @@ -22,6 +23,23 @@ struct PerDeviceFFHandle { #endif }; +#ifdef FF_USE_NCCL +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, + dnn, + blas, + workSpace, + workSpaceSize, + allowTensorOpMathConversion, + ncclComm); +#else +FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, + dnn, + blas, + workSpace, + workSpaceSize, + allowTensorOpMathConversion); +#endif + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/cuda/conv_2d_kernels.cu b/lib/kernels/src/cuda/conv_2d_kernels.cu index 265954aa17..f54b045db8 100644 --- a/lib/kernels/src/cuda/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/conv_2d_kernels.cu @@ -1,44 +1,161 @@ +#include "device.h" #include "kernels/conv_2d_kernels.h" -#include "kernels/cuda_helper.h" +#include "kernels/device.h" namespace FlexFlow { +namespace Kernels { +namespace Conv2D { + +cudnnConvolutionBwdDataAlgo_t selectConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + void const *w, + const cudnnTensorDescriptor_t dyDesc, + void const *dy, + const cudnnConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + cudnnConvolutionBwdDataAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(cudnnFindConvolutionBackwardDataAlgorithmEx(handle, + wDesc, + w, + dyDesc, + dy, + convDesc, + dxDesc, + dx, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize)); + assert(cnt > 0); + checkCUDNN(perfResults[0].status); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].algo; +} + +cudnnConvolutionFwdAlgo_t selectConvolutionForwardAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + void const *x, + const cudnnFilterDescriptor_t wDesc, + void const *w, + const cudnnConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const cudnnTensorDescriptor_t yDesc, + void *y, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + cudnnConvolutionFwdAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(handle, + xDesc, + x, + wDesc, + w, + convDesc, + yDesc, + y, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize)); + assert(cnt > 0); + checkCUDNN(perfResults[0].status); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].algo; +} + +cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + void const *x, + const cudnnTensorDescriptor_t dyDesc, + void const *dy, + const cudnnConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + cudnnConvolutionBwdFilterAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(cudnnFindConvolutionBackwardFilterAlgorithmEx(handle, + xDesc, + x, + dyDesc, + dy, + convDesc, + dwDesc, + dw, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize)); + assert(cnt > 0); + checkCUDNN(perfResults[0].status); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].algo; +} + +Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + optional activation, + int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { + + ffTensorDescriptor_t inputTensor; + ffTensorDescriptor_t biasTensor; + ffTensorDescriptor_t outputTensor; + ffFilterDescriptor_t filterDesc; + ffActivationDescriptor_t actiDesc; + ffConvolutionDescriptor_t convDesc; + ffConvolutionFwdAlgo_t fwdAlgo; + ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; + ffConvolutionBwdDataAlgo_t bwdDataAlgo; + + int input_w = input.shape[legion_dim_t(0)]; + int input_h = input.shape[legion_dim_t(1)]; + int input_c = input.shape[legion_dim_t(2)]; + int input_n = input.shape[legion_dim_t(3)]; + + int output_w = output.shape[legion_dim_t(0)]; + int output_h = output.shape[legion_dim_t(1)]; + int output_c = output.shape[legion_dim_t(2)]; + int output_n = output.shape[legion_dim_t(3)]; -Conv2DPerDeviceState::Conv2DPerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc)); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); -} -namespace Kernels { -namespace Conv2D { - -void init_kernel(Conv2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int kernel_h, - int kernel_w, - int groups, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - float const *input_ptr, - float *output_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *forward_time, - float *backward_time) { - checkCUDNN(cudnnSetTensor4dDescriptor(m->inputTensor, + checkCUDNN(cudnnSetTensor4dDescriptor(inputTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, input_n, @@ -47,16 +164,11 @@ void init_kernel(Conv2DPerDeviceState *m, input_w)); checkCUDNN(cudnnSetTensor4dDescriptor( - m->biasTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); + biasTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); - printf("filterDim: kernel(%d %d) c_in(%d), c_out(%d)\n", - kernel_h, - kernel_w, - input_c / groups, - output_c); - checkCUDNN(cudnnSetFilter4dDescriptor(m->filterDesc, + checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, output_c, @@ -64,9 +176,9 @@ void init_kernel(Conv2DPerDeviceState *m, kernel_h, kernel_w)); - checkCUDNN(cudnnSetConvolution2dDescriptor(m->convDesc, - pad_h, // conv->padding_h, - pad_w, // conv->padding_w, + checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + pad_h, + pad_w, stride_h, stride_w, 1 /*upscale_x*/, @@ -74,132 +186,134 @@ void init_kernel(Conv2DPerDeviceState *m, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); if (groups != 1) { - checkCUDNN(cudnnSetConvolutionGroupCount(m->convDesc, groups)); + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, groups)); } // enable tensor core when possible - if (m->handle.allowTensorOpMathConversion) { + if (handle.allowTensorOpMathConversion) { checkCUDNN(cudnnSetConvolutionMathType( - m->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); + convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); } else { - checkCUDNN(cudnnSetConvolutionMathType(m->convDesc, CUDNN_TENSOR_OP_MATH)); + checkCUDNN(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH)); } int n, c, h, w; checkCUDNN(cudnnGetConvolution2dForwardOutputDim( - m->convDesc, m->inputTensor, m->filterDesc, &n, &c, &h, &w)); + convDesc, inputTensor, filterDesc, &n, &c, &h, &w)); assert(n == output_n); assert(c == output_c); assert(h == output_h); assert(w == output_w); checkCUDNN(cudnnSetTensor4dDescriptor( - m->outputTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); + outputTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w)); float time; // select forward algorithm - m->fwdAlgo = selectConvolutionForwardAlgorithm(m->handle.dnn, - m->inputTensor, - input_ptr, - m->filterDesc, - kernel_ptr, - m->convDesc, - m->handle.workSpace, - m->handle.workSpaceSize, - m->outputTensor, - output_ptr, - &time); - if (forward_time != nullptr) { - *forward_time += time; - } + fwdAlgo = selectConvolutionForwardAlgorithm(handle.dnn, + inputTensor, + input.get_float_ptr(), + filterDesc, + filter_ptr, + convDesc, + handle.workSpace, + handle.workSpaceSize, + outputTensor, + output.get_float_ptr(), + nullptr); // select backward filter algorithm - m->bwdFilterAlgo = - selectConvolutionBackwardFilterAlgorithm(m->handle.dnn, - m->inputTensor, - input_ptr, - m->outputTensor, - output_ptr, - m->convDesc, - m->handle.workSpace, - m->handle.workSpaceSize, - m->filterDesc, - kernel_grad_ptr, - &time); - if (backward_time != nullptr) { - *backward_time += time; - } + bwdFilterAlgo = + selectConvolutionBackwardFilterAlgorithm(handle.dnn, + inputTensor, + input.get_float_ptr(), + outputTensor, + output.get_float_ptr(), + convDesc, + handle.workSpace, + handle.workSpaceSize, + filterDesc, + filter_grad_ptr, + nullptr); // select backward data algorithm - m->bwdDataAlgo = - selectConvolutionBackwardDataAlgorithm(m->handle.dnn, - m->filterDesc, - kernel_ptr, - m->outputTensor, - output_ptr, - m->convDesc, - m->handle.workSpace, - m->handle.workSpaceSize, - m->inputTensor, - (float *)input_ptr, - &time); - if (backward_time != nullptr) { - *backward_time += time; - } - - if (m->relu) { + bwdDataAlgo = selectConvolutionBackwardDataAlgorithm(handle.dnn, + filterDesc, + filter_ptr, + outputTensor, + output.get_float_ptr(), + convDesc, + handle.workSpace, + handle.workSpaceSize, + inputTensor, + input.get_float_ptr(), + nullptr); + if (activation.has_value()) { checkCUDNN(cudnnSetActivationDescriptor( - m->actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); + actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } + + Conv2DPerDeviceState per_device_state = {handle, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo}; + return per_device_state; } void forward_kernel(cudaStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, + optional const &activation, float const *input_ptr, float *output_ptr, float const *filter_ptr, float const *bias_ptr) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnConvolutionForward(m->handle.dnn, + checkCUDNN(cudnnConvolutionForward(m.handle.dnn, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, - m->filterDesc, + m.filterDesc, filter_ptr, - m->convDesc, - m->fwdAlgo, - m->handle.workSpace, - m->handle.workSpaceSize, + m.convDesc, + m.fwdAlgo, + m.handle.workSpace, + m.handle.workSpaceSize, &beta, - m->outputTensor, + m.outputTensor, output_ptr)); - // use_bias == True if (bias_ptr != NULL) { - checkCUDNN(cudnnAddTensor(m->handle.dnn, + checkCUDNN(cudnnAddTensor(m.handle.dnn, &alpha, - m->biasTensor, + m.biasTensor, bias_ptr, &alpha, - m->outputTensor, + m.outputTensor, output_ptr)); } - if (m->relu) { - checkCUDNN(cudnnActivationForward(m->handle.dnn, - m->actiDesc, + if (activation.has_value()) { + checkCUDNN(cudnnActivationForward(m.handle.dnn, + m.actiDesc, &alpha, - m->outputTensor, + m.outputTensor, output_ptr, &beta, - m->outputTensor, + m.outputTensor, output_ptr)); } } void backward_kernel(cudaStream_t stream, - Conv2DPerDeviceState const *m, + Conv2DPerDeviceState const &m, + optional const &activation, float const *input_ptr, float *input_grad_ptr, float const *output_ptr, @@ -207,14 +321,14 @@ void backward_kernel(cudaStream_t stream, float const *kernel_ptr, float *kernel_grad_ptr, float *bias_grad_ptr) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; // float beta = 0.0f; - if (m->relu) { + if (activation.has_value()) { cudnnDataType_t dataType; int n, c, h, w, nStride, cStride, hStride, wStride; - checkCUDNN(cudnnGetTensor4dDescriptor(m->outputTensor, + checkCUDNN(cudnnGetTensor4dDescriptor(m.outputTensor, &dataType, &n, &c, @@ -229,166 +343,49 @@ void backward_kernel(cudaStream_t stream, } // Compute filter gradiant // NOTE: we use alpha for kernel_grad to accumulate gradients - checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, + checkCUDNN(cudnnConvolutionBackwardFilter(m.handle.dnn, &alpha, - m->inputTensor, + m.inputTensor, input_ptr, - m->outputTensor, + m.outputTensor, output_grad_ptr, - m->convDesc, - m->bwdFilterAlgo, - m->handle.workSpace, - m->handle.workSpaceSize, + m.convDesc, + m.bwdFilterAlgo, + m.handle.workSpace, + m.handle.workSpaceSize, &alpha, - m->filterDesc, + m.filterDesc, kernel_grad_ptr)); // Compute bias gradiant // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { - checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, + checkCUDNN(cudnnConvolutionBackwardBias(m.handle.dnn, &alpha, - m->outputTensor, + m.outputTensor, output_grad_ptr, &alpha, - m->biasTensor, + m.biasTensor, bias_grad_ptr)); } // Compute data gradiant // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { - checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, + checkCUDNN(cudnnConvolutionBackwardData(m.handle.dnn, &alpha, - m->filterDesc, + m.filterDesc, kernel_ptr, - m->outputTensor, + m.outputTensor, output_grad_ptr, - m->convDesc, - m->bwdDataAlgo, - m->handle.workSpace, - m->handle.workSpaceSize, + m.convDesc, + m.bwdDataAlgo, + m.handle.workSpace, + m.handle.workSpaceSize, &alpha, - m->inputTensor, + m.inputTensor, input_grad_ptr)); } } -cudnnConvolutionFwdAlgo_t selectConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - void const *x, - const cudnnFilterDescriptor_t wDesc, - void const *w, - const cudnnConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const cudnnTensorDescriptor_t yDesc, - void *y, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionFwdAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(cudnnFindConvolutionForwardAlgorithmEx(handle, - xDesc, - x, - wDesc, - w, - convDesc, - yDesc, - y, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize)); - assert(cnt > 0); - checkCUDNN(perfResults[0].status); - printf("forwardAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].algo; -} - -cudnnConvolutionBwdDataAlgo_t selectConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - void const *w, - const cudnnTensorDescriptor_t dyDesc, - void const *dy, - const cudnnConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionBwdDataAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(cudnnFindConvolutionBackwardDataAlgorithmEx(handle, - wDesc, - w, - dyDesc, - dy, - convDesc, - dxDesc, - dx, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize)); - assert(cnt > 0); - checkCUDNN(perfResults[0].status); - printf("bwdDataAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].algo; -} - -cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - void const *x, - const cudnnTensorDescriptor_t dyDesc, - void const *dy, - const cudnnConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - cudnnConvolutionBwdFilterAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(cudnnFindConvolutionBackwardFilterAlgorithmEx(handle, - xDesc, - x, - dyDesc, - dy, - convDesc, - dwDesc, - dw, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize)); - assert(cnt > 0); - checkCUDNN(perfResults[0].status); - printf("bwdFilterAlgo(%d) time(%.2lf)\n", - perfResults[0].algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].algo; -} - } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h index dd3226ce5a..5c4239a5cf 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/device.h @@ -3,7 +3,8 @@ #include "kernels/array_shape.h" #include "kernels/device.h" -#include "op-attrs/ffconst.h" +#include "op-attrs/datatype.h" +#include "op-attrs/op.h" #if defined(FF_USE_CUDA) #include @@ -23,6 +24,9 @@ #error "Unknown device" #endif +using ::FlexFlow::DataType; +using ::FlexFlow::OperatorType; + #define checkCUDNN(status) \ do { \ std::stringstream _error; \ diff --git a/lib/kernels/src/hip/conv_2d_kernels.cpp b/lib/kernels/src/hip/conv_2d_kernels.cpp index 9c6a7f15ae..761a59b545 100644 --- a/lib/kernels/src/hip/conv_2d_kernels.cpp +++ b/lib/kernels/src/hip/conv_2d_kernels.cpp @@ -14,59 +14,171 @@ */ #include "kernels/conv_2d_kernels.h" -#include "kernels/hip_helper.h" -#include namespace FlexFlow { +namespace Kernels { +namespace Conv2D { + +miopenConvFwdAlgorithm_t selectConvolutionForwardAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + void const *x, + const miopenTensorDescriptor_t wDesc, + void const *w, + const miopenConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const miopenTensorDescriptor_t yDesc, + void *y, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + miopenConvAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(miopenFindConvolutionForwardAlgorithm(handle, + xDesc, + x, + wDesc, + w, + convDesc, + yDesc, + y, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize, + false)); + assert(cnt > 0); + // checkCUDNN(perfResults[0].status); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].fwd_algo; +} + +miopenConvBwdWeightsAlgorithm_t selectConvolutionBackwardFilterAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + void const *x, + const miopenTensorDescriptor_t dyDesc, + void const *dy, + const miopenConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const miopenTensorDescriptor_t dwDesc, + void *dw, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + miopenConvAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(miopenFindConvolutionBackwardWeightsAlgorithm(handle, + dyDesc, + dy, + xDesc, + x, + convDesc, + dwDesc, + dw, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize, + false)); + assert(cnt > 0); + // checkCUDNN(perfResults[0].status); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].bwd_weights_algo; +} + +miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t wDesc, + void const *w, + const miopenTensorDescriptor_t dyDesc, + void const *dy, + const miopenConvolutionDescriptor_t convDesc, + void *workSpace, + size_t workSpaceSize, + const miopenTensorDescriptor_t dxDesc, + void *dx, + float *time) { + int const reqAlgCnt = 8; + int cnt = 0; + miopenConvAlgoPerf_t perfResults[reqAlgCnt]; + checkCUDNN(miopenFindConvolutionBackwardDataAlgorithm(handle, + dyDesc, + dy, + wDesc, + w, + convDesc, + dxDesc, + dx, + reqAlgCnt, + &cnt, + perfResults, + workSpace, + workSpaceSize, + false)); + assert(cnt > 0); + // checkCUDNN(perfResults[0].status); + if (time != nullptr) { + *time = perfResults[0].time; + } + return perfResults[0].bwd_data_algo; +} + +Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, + optional activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { + + miopenCreateTensorDescriptor inputTensor; + miopenCreateTensorDescriptor biasTensor; + miopenCreateTensorDescriptor outputTensor; + miopenCreateTensorDescriptor filterDesc; + miopenActivationDescriptor_t actiDesc; + miopenActivationDescriptor_t convDesc; + miopenConvFwdAlgorithm_t fwdAlgo; + miopenConvBwdWeightsAlgorithm_t bwdFilterAlgo; + miopenConvBwdDataAlgorithm_t bwdDataAlgo; + + int input_w = input.shape[legion_dim_t(0)]; + int input_h = input.shape[legion_dim_t(1)]; + int input_c = input.shape[legion_dim_t(2)]; + int input_n = input.shape[legion_dim_t(3)]; + + int output_w = output.shape[legion_dim_t(0)]; + int output_h = output.shape[legion_dim_t(1)]; + int output_c = output.shape[legion_dim_t(2)]; + int output_n = output.shape[legion_dim_t(3)]; -Conv2DPerDeviceState::Conv2DPerDeviceState(FFHandler handler) - : PerDeviceOpState(handler) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&filterDesc)); checkCUDNN(miopenCreateConvolutionDescriptor(&convDesc)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); -} - -namespace Kernels { -namespace Conv2D { -void init_kernel(Conv2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int kernel_h, - int kernel_w, - int groups, - int stride_h, - int stride_w, - int pad_h, - int pad_w, - float const *input_ptr, - float *output_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *forward_time, - float *backward_time) { checkCUDNN(miopenSet4dTensorDescriptor( - m->inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); + inputTensor, miopenFloat, input_n, input_c, input_h, input_w)); - checkCUDNN(miopenSet4dTensorDescriptor( - m->biasTensor, miopenFloat, 1, output_c, 1, 1)); + checkCUDNN( + miopenSet4dTensorDescriptor(biasTensor, miopenFloat, 1, output_c, 1, 1)); // Require that input_c is divisible by conv->groups assert(input_c % groups == 0); - printf("filterDim: kernel(%d %d) c_in(%d), c_out(%d)\n", - kernel_h, - kernel_w, - input_c / groups, - output_c); checkCUDNN(miopenSet4dTensorDescriptor(m->filterDesc, miopenFloat, output_c, @@ -162,12 +274,13 @@ void init_kernel(Conv2DPerDeviceState *m, } } -void forward_kernel(hipStream_t stream, - Conv2DPerDeviceState const *m, +void forward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, float const *input_ptr, float *output_ptr, float const *filter_ptr, - float const *bias_ptr) { + float const *bias_ptr, + optional activation) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); @@ -208,15 +321,16 @@ void forward_kernel(hipStream_t stream, } } -void backward_kernel(hipStream_t stream, - Conv2DPerDeviceState const *m, +void backward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, float const *input_ptr, float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr) { + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + optional activation) { checkCUDNN(miopenSetStream(m->handle.dnn, stream)); @@ -289,126 +403,6 @@ void backward_kernel(hipStream_t stream, } } -miopenConvFwdAlgorithm_t selectConvolutionForwardAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - void const *x, - const miopenTensorDescriptor_t wDesc, - void const *w, - const miopenConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const miopenTensorDescriptor_t yDesc, - void *y, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - miopenConvAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(miopenFindConvolutionForwardAlgorithm(handle, - xDesc, - x, - wDesc, - w, - convDesc, - yDesc, - y, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize, - false)); - assert(cnt > 0); - // checkCUDNN(perfResults[0].status); - printf("forwardAlgo(%d) time(%.2lf)\n", - perfResults[0].fwd_algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].fwd_algo; -} - -miopenConvBwdWeightsAlgorithm_t selectConvolutionBackwardFilterAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - void const *x, - const miopenTensorDescriptor_t dyDesc, - void const *dy, - const miopenConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const miopenTensorDescriptor_t dwDesc, - void *dw, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - miopenConvAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(miopenFindConvolutionBackwardWeightsAlgorithm(handle, - dyDesc, - dy, - xDesc, - x, - convDesc, - dwDesc, - dw, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize, - false)); - assert(cnt > 0); - // checkCUDNN(perfResults[0].status); - printf("bwdFilterAlgo(%d) time(%.2lf)\n", - perfResults[0].bwd_weights_algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].bwd_weights_algo; -} - -miopenConvBwdDataAlgorithm_t selectConvolutionBackwardDataAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t wDesc, - void const *w, - const miopenTensorDescriptor_t dyDesc, - void const *dy, - const miopenConvolutionDescriptor_t convDesc, - void *workSpace, - size_t workSpaceSize, - const miopenTensorDescriptor_t dxDesc, - void *dx, - float *time) { - int const reqAlgCnt = 8; - int cnt = 0; - miopenConvAlgoPerf_t perfResults[reqAlgCnt]; - checkCUDNN(miopenFindConvolutionBackwardDataAlgorithm(handle, - dyDesc, - dy, - wDesc, - w, - convDesc, - dxDesc, - dx, - reqAlgCnt, - &cnt, - perfResults, - workSpace, - workSpaceSize, - false)); - assert(cnt > 0); - // checkCUDNN(perfResults[0].status); - printf("bwdDataAlgo(%d) time(%.2lf)\n", - perfResults[0].bwd_data_algo, - perfResults[0].time); - if (time != nullptr) { - *time = perfResults[0].time; - } - return perfResults[0].bwd_data_algo; -} - } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.cc b/lib/runtime/src/ops/conv_2d.cc index e362c73f92..e0e1872a06 100644 --- a/lib/runtime/src/ops/conv_2d.cc +++ b/lib/runtime/src/ops/conv_2d.cc @@ -1,1051 +1,250 @@ #include "conv_2d.h" #include "kernels/conv_2d_kernels.h" -#include "layer.h" #include "legion/legion_utilities.h" #include "mpark/variant.hpp" -#include "task_spec.h" +#include "op-attrs/get_output_shapes.h" #include "utils/hash-utils.h" namespace FlexFlow { -enum Slots { - INPUT, - OUTPUT, - FILTER, - BIAS, - FILTER_GRAD, - INPUT_GRAD, - OUTPUT_GRAD, - BIAS_GRAD, - ATTRS, - PROFILING, -} - -// declare Legion names -using Legion::ArgumentMap; using Legion::Context; -using Legion::coord_t; -using Legion::Domain; -using Legion::FutureMap; -using Legion::IndexLauncher; -using Legion::InlineLauncher; using Legion::PhysicalRegion; -using Legion::Predicate; -using Legion::Rect; -using Legion::RegionRequirement; using Legion::Runtime; using Legion::Task; -using Legion::TaskArgument; -using Legion::TaskLauncher; using namespace FlexFlow::Kernels::Conv2D; -Tensor FFModel::conv2d(Tensor const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - ActiMode activation, - int groups, - bool use_bias, - Layer const *shared_op, - Initializer *kernel_initializer, - Initializer *bias_initializer, - char const *name) { - assert(input->num_dims() == 4); /*NCHW*/ - - Conv2DAttrs attrs = {outChannels, - kernelH, - kernelW, - strideH, - strideW, - paddingH, - paddingW, - groups, - activation, - use_bias}; - - TensorShape output_shape = get_output_shape(attrs, input->get_shape()); - Tensor output = this->tensor_mgr.create(output_shape, CreateGrad::YES, conv); - - std::vector weights; - - TensorShape kernel_shape = get_kernel_shape(attrs, input->get_shape()); - weights.push_back(this->tensor_mgr.create( - kernel_shape, CreateGrad::YES, kernel_initializer, CHOSEN_SYNC_TYPE)); - - if (use_bias) { - TensorShape bias_shape = get_bias_shape(attrs, input->get_shape()); - weights.push_back(this->tensor_mgr.create( - bias_shape, CreateGrad::YES, bias_initializer, CHOSEN_SYNC_TYPE)); - } - - Layer *conv = - this->layer_mgr.create(attrs, DT_FLOAT, name, {input}, weights, {output}); - - //{ - // int numdims = 4; - // int dims[MAX_TENSOR_DIM]; - // dims[3] = input->dims[3]; - // dims[2] = outChannels; - // dims[1] = 1 + (input->dims[1] + 2 * paddingH - kernelH) / strideH; - // dims[0] = 1 + (input->dims[0] + 2 * paddingW - kernelW) / strideW; - // conv->outputs[0] = create_tensor_legion_ordering( - // numdims, dims, DT_FLOAT, conv, 0, true /*create_grad*/); - //} - //{ - // int dims[4] = {kernelW, kernelH, input->dims[2], outChannels}; - // conv->weights[0] = create_weight_legion_ordering(4, - // dims, - // DT_FLOAT, - // conv, - // true /*create_grad*/, - // kernel_initializer, - // CHOSEN_SYNC_TYPE); - //} - // if (use_bias) { - // int dims[1] = {outChannels}; - // conv->weights[1] = create_weight_legion_ordering(1, - // dims, - // DT_FLOAT, - // conv, - // true /*create_grad*/, - // bias_initializer, - // CHOSEN_SYNC_TYPE); - //} - conv->add_initializer("kernel", kernel_initializer); - conv->add_initializer("bias", bias_initializer); - /* layers.push_back(conv); */ - return conv->outputs[0]; -} - -Op *Conv2D::create_operator_from_layer( - FFModel &model, - Layer const *layer, - std::vector const &inputs) { - return new Conv2D(model, - get(layer->attrs), - inputs, - layer->name, - false /*allocate_weights*/ - ); -} - -/* void Conv2DParams::mark_replica_dims( */ -/* ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM], */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ -/* if (output_dims != nullptr) { */ -/* output_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ -/* } */ -/* if (kernel_dims != nullptr) { */ -/* kernel_dims[Conv2DOutput::REPLICA].is_replica_dim = true; */ -/* } */ -/* if (bias_dims != nullptr) { */ -/* bias_dims[Conv2DBias::REPLICA_1].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_2].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_3].is_replica_dim = true; */ -/* bias_dims[Conv2DBias::REPLICA_4].is_replica_dim = true; */ -/* } */ -/* } */ - -/* int Conv2DParams::output_size(ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM]) const - * { */ -/* int input_w = input.dims[Conv2DInput::WIDTH].size; */ -/* int input_h = input.dims[Conv2DInput::HEIGHT].size; */ - -/* output_dims[Conv2DOutput::SAMPLE].size = - * input.dims[Conv2DInput::SAMPLE].size; */ -/* output_dims[Conv2DOutput::CHANNEL].size = out_channels; */ -/* output_dims[Conv2DOutput::HEIGHT].size = */ -/* 1 + (input_h + 2 * padding_h - kernel_h) / stride_h; */ -/* output_dims[Conv2DOutput::WIDTH].size = */ -/* 1 + (input_w + 2 * padding_w - kernel_w) / stride_w; */ - -/* return input.num_dims; */ -/* }; */ - -/* int Conv2DParams::kernel_size(ParallelTensorShape const &input, */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM]) const - * { */ -/* kernel_dims[Conv2DKernel::CHANNEL_OUT].size = this->out_channels; */ -/* kernel_dims[Conv2DKernel::CHANNEL_IN].size = */ -/* input.dims[Conv2DInput::CHANNEL].size / this->groups; */ -/* kernel_dims[Conv2DKernel::HEIGHT].size = */ -/* this->kernel_h * input.dims[Conv2DInput::HEIGHT].degree; */ -/* kernel_dims[Conv2DKernel::WIDTH].size = */ -/* this->kernel_w * input.dims[Conv2DInput::WIDTH].degree; */ - -/* return Conv2DKernel::NUMDIM; */ -/* } */ - -/* int Conv2DParams::bias_size(ParallelTensorShape const &input, */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM]) const { */ -/* bias_dims[Conv2DBias::CHANNEL].size = this->out_channels; */ - -/* return Conv2DBias::NUMDIM; */ -/* }; */ - -/* void Conv2DParams::solve_dims(ParallelTensorShape const &input, */ -/* ParallelDim output_dims[MAX_TENSOR_DIM], */ -/* int *output_ndims, */ -/* ParallelDim kernel_dims[MAX_TENSOR_DIM], */ -/* int *kernel_ndims, */ -/* ParallelDim bias_dims[MAX_TENSOR_DIM], */ -/* int *bias_ndims) const { */ -/* assert((output_dims == nullptr) == (output_ndims == nullptr)); */ -/* assert((kernel_dims == nullptr) == (kernel_ndims == nullptr)); */ -/* assert((bias_dims == nullptr) == (bias_ndims == nullptr)); */ - -/* std::vector mapping; */ -/* Conv2D::construct_mappings(mapping, this->use_bias); */ - -/* this->mark_replica_dims(input, output_dims, kernel_dims, bias_dims); */ - -/* std::vector output_dim_sets; */ -/* if (output_dims != nullptr) { */ -/* output_dim_sets.push_back(output_dims); */ -/* } */ - -/* std::vector weight_dim_sets; */ -/* if (kernel_dims != nullptr) { */ -/* weight_dim_sets.push_back(kernel_dims); */ -/* } */ -/* if (bias_dims != nullptr && this->use_bias) { */ -/* weight_dim_sets.push_back(bias_dims); */ -/* } */ - -/* solve_parallel_dim_mappings( */ -/* mapping, {input.dims}, weight_dim_sets, output_dim_sets); */ - -/* if (output_dims != nullptr) { */ -/* *output_ndims = this->output_size(input, output_dims); */ -/* } */ -/* if (kernel_dims != nullptr) { */ -/* *kernel_ndims = this->kernel_size(input, kernel_dims); */ -/* } */ -/* if (bias_dims != nullptr && this->use_bias) { */ -/* *bias_ndims = this->bias_size(input, bias_dims); */ -/* } */ -/* } */ - -/*static*/ -/* void Conv2D::construct_mappings(std::vector &out, - */ -/* bool use_bias) { */ -/* Conv2D::construct_output_mappings(out); */ -/* Conv2D::construct_weight_mappings(out, use_bias); */ -/* } */ - -/*static*/ -/* void Conv2D::construct_output_mappings( */ -/* std::vector &out) { */ -/* Op::construct_output_parallel_dims( */ -/* out, */ -/* {{Conv2DInput::CHANNEL, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DOutput::REPLICA}, */ -/* {Conv2DInput::SAMPLE, MappingOperation::PARTITION, - * Conv2DOutput::SAMPLE}, */ -/* {Conv2DInput::REPLICA, */ -/* MappingOperation::PARTITION, */ -/* Conv2DOutput::CHANNEL}, */ -/* {Conv2DInput::HEIGHT, MappingOperation::PARTITION, - * Conv2DOutput::HEIGHT}, */ -/* {Conv2DInput::WIDTH, MappingOperation::PARTITION, - * Conv2DOutput::WIDTH}}); */ -/* } */ - -/*static*/ -/* void Conv2D::construct_weight_mappings( */ -/* std::vector &out, bool use_bias) { */ -/* Op::construct_weight_parallel_dims( */ -/* out, */ -/* { */ -/* {Conv2DInput::REPLICA, */ -/* MappingOperation::PARTITION, */ -/* Conv2DKernel::CHANNEL_OUT}, */ -/* {Conv2DInput::SAMPLE, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::REPLICA}, */ -/* {Conv2DInput::CHANNEL, */ -/* MappingOperation::PARTITION, */ -/* Conv2DKernel::CHANNEL_IN}, */ -/* {Conv2DInput::HEIGHT, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::HEIGHT}, // Kernel::{HEIGHT, WEIGHT} would both work - */ -/* // here */ -/* {Conv2DInput::WIDTH, */ -/* MappingOperation::REPLICATE, */ -/* Conv2DKernel::WIDTH}, // same as above */ -/* }, */ -/* Conv2DInput::INDEX, */ -/* Conv2DKernel::INDEX); */ - -/* if (use_bias) { */ -/* Op::construct_weight_parallel_dims( */ -/* out, */ -/* {{Conv2DInput::REPLICA, Conv2DBias::REPLICA_1}, */ -/* {Conv2DInput::SAMPLE, Conv2DBias::REPLICA_2}, */ -/* {Conv2DInput::CHANNEL, Conv2DBias::CHANNEL}, */ -/* {Conv2DInput::HEIGHT, Conv2DBias::REPLICA_3}, */ -/* {Conv2DInput::WIDTH, Conv2DBias::REPLICA_4}}, */ -/* Conv2DInput::INDEX, */ -/* Conv2DBias::INDEX); */ -/* } */ -/* } */ - -Conv2D::Conv2D(FFModel &model, - Conv2D const &other, - const ParallelTensor input, - bool allocate_weights) - : Conv2D(model, - other.layer_guid, - input, - other.out_channels, - other.kernel_h, - other.kernel_w, - other.stride_h, - other.stride_w, - other.padding_h, - other.padding_w, - other.activation, - other.groups, - other.use_bias, - allocate_weights, - other.name) {} - -Conv2D::Conv2D(FFModel &model, - Conv2DAttrs const &attrs, - std::vector const &inputs, - char const *name, - bool allocate_weights) - : Conv2D(model, - params.layer_guid, - input, - params.out_channels, - params.kernel_h, - params.kernel_w, - params.stride_h, - params.stride_w, - params.padding_h, - params.padding_w, - params.activation, - params.groups, - params.use_bias, - allocate_weights, - name) {} - -/* bool Conv2DParams::is_valid(ParallelTensorShape const &input) const { */ -/* ParallelTensorShape output_shape, kernel_shape, bias_shape; */ -/* this->solve_dims(input, */ -/* output_shape.dims, */ -/* &output_shape.num_dims, */ -/* kernel_shape.dims, */ -/* &kernel_shape.num_dims, */ -/* bias_shape.dims, */ -/* &bias_shape.num_dims); */ -/* bool is_valid = true; */ -/* is_valid &= input.is_valid(); */ -/* is_valid &= output_shape.is_valid(); */ -/* is_valid &= kernel_shape.is_valid(); */ -/* if (use_bias) { */ -/* is_valid &= bias_shape.is_valid(); */ -/* } */ - -/* // TODO FIXME: Currently disable parallelizing the height and width - * dimension */ -/* if (input.dims[0].degree > 1 || input.dims[1].degree > 1) { */ -/* return false; */ -/* } */ - -/* return is_valid; */ -/* } */ - -Conv2D::Conv2D(FFModel &model, - LayerID const &_layer_guid, - const ParallelTensor input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - ActiMode activation, - int groups, - bool use_bias, - bool allocate_weights, - char const *name) - : Op(model, - OP_CONV2D, - DT_FLOAT, - name, - 1 /*inputs*/, - use_bias ? 2 : 1 /*weights*/, - allocate_weights, - 1 /*outputs*/, - input), - in_channels(input->dims[Conv2DInput::CHANNEL].size / - input->dims[Conv2DInput::CHANNEL].degree), - out_channels(outChannels), kernel_h(kernelH), kernel_w(kernelW), - stride_h(strideH), stride_w(strideW), padding_h(paddingH), - padding_w(paddingW), activation(activation), groups(groups), - use_bias(use_bias) { - // overwrite layer_guid - layer_guid = _layer_guid; - assert(input->num_dims == Conv2DInput::NUMDIM); - assert(this->stride_h > 0); - assert(this->stride_w > 0); - - ParallelDim output_dims[MAX_TENSOR_DIM], kernel_dims[MAX_TENSOR_DIM], - bias_dims[MAX_TENSOR_DIM]; - int output_ndims, kernel_ndims, bias_ndims; - - this->construct_mappings(*this->parallel_dims_mapping, this->use_bias); - this->get_params().solve_dims(this->inputs[0]->get_shape(), - output_dims, - &output_ndims, - kernel_dims, - &kernel_ndims, - bias_dims, - &bias_ndims); - - if (allocate_weights) { - Initializer *kernel_initializer = new GlorotUniform(std::rand() /*seed*/); - - weights[Conv2DKernel::INDEX] = - model.create_parallel_weight_legion_ordering(kernel_ndims, - kernel_dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - kernel_initializer, - CHOSEN_SYNC_TYPE); - - if (use_bias) { - Initializer *bias_initializer = new ZeroInitializer(); - - weights[Conv2DBias::INDEX] = - model.create_parallel_weight_legion_ordering(bias_ndims, - bias_dims, - DT_FLOAT, - NULL /*owner_op*/, - true /*create_grad*/, - bias_initializer, - CHOSEN_SYNC_TYPE); - } - } - - outputs[0] = model.create_parallel_tensor_legion_ordering( - output_ndims, output_dims, DT_FLOAT, this); - - assert(check_output_input_weight_parallel_dims(allocate_weights)); -} - -static OpTaskSignature get_init_task_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_arg_slot(PROFILING); - - init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT, WRITE_ONLY); - init.add_param_slot(FILTER); - init.add_param_slot(BIAS); - init.add_param_grad_slot(FILTER_GRAD, WRITE_ONLY); - init.add_input_grad_slot(INPUT_GRAD); - - return init; -} - -static OpTaskSignature get_fwd_task_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT, WRITE_ONLY); - fwd.add_param_slot(FILTER); - fwd.add_param_slot(BIAS); - - return fwd; -} - -static OpTaskSignature get_bwd_task_signature() { - OpTaskSignature bwd(OpTaskType::BWD); - - bwd.add_arg_slot(ATTRS); - - bwd.add_input_slot(INPUT); - bwd.add_input_grad_slot(INPUT_GRAD, READ_WRITE); - bwd.add_output_slot(OUTPUT); - bwd.add_output_grad_slot(OUTPUT_GRAD, READ_WRITE); - bwd.add_param_slot(FILTER); - bwd.add_param_grad_slot(FILTER_GRAD, READ_WRITE); - bwd.add_param_grad_slot(BIAS_GRAD, READ_WRITE); - - return bwd; -} - -OpTaskBinding Conv2d::get_init_task_binding() const { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, this->attrs); - binding.bind_arg(PROFILING, this->profiling); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, param_tensor(0)); - binding.bind(BIAS, param_tensor(1)); - binding.bind(FILTER_GRAD, param_tensor(0).grad()); - binding.bind(INPUT_GRAD, input_tensor(0).grad()); - - return binding; -} +enum Slots { + INPUT, + OUTPUT, + FILTER, + BIAS, + ATTRS, + PROFILING, + PER_DEVICE_STATE, + HANDLE +}; -OpTaskBinding Conv2d::get_fwd_task_binding() const { +OpTaskInvocation init(Conv2DAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(ATTRS, this->attrs); - binding.bind(INPUT, input_tensor(0)); binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, param_tensor(0)); - binding.bind(BIAS, param_tensor(1)); + binding.bind(FILTER, weight_tensor(0)); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(HANDLE, ff_handle()); - return binding; + return {CONV2D_INIT_TASK_ID, binding}; } -OpTaskBinding Conv2d::get_bwd_task_binding() const { +OpTaskInvocation forward(Conv2DAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(ATTRS, this->attrs); + binding.bind_arg(ATTRS, attrs); + binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(PER_DEVICE_STATE, + per_device_op_state()); binding.bind(INPUT, input_tensor(0)); - binding.bind(INPUT_GRAD, input_tensor(0).grad()); binding.bind(OUTPUT, output_tensor(0)); - binding.bind(OUTPUT_GRAD, output_tensor(0).grad()); - binding.bind(FILTER, param_tensor(0)); - binding.bind(FILTER_GRAD, param_tensor(0).grad()); - binding.bind(BIAS_GRAD, param_tensor(1).grad()); - - return binding; -} - -void Conv2D::init(FFModel const &ff) { - this->execute_task(ff, CONV2D_INIT_TASK_ID, get_init_task_signature()); - // assert(check_output_input_weight_same_parallel_is()); - // parallel_is = outputs[0]->parallel_is; - // ArgumentMap argmap; - // Context ctx = ff.config.lg_ctx; - // Runtime *runtime = ff.config.lg_hlr; - // set_argumentmap_for_init(ff, argmap); - // IndexLauncher launcher(CONV2D_INIT_TASK_ID, - // parallel_is, - // TaskArgument(this, sizeof(Conv2D)), - // argmap, - // Predicate::TRUE_PRED, - // false /*must*/, - // 0 /*mapper_id*/, - // outputs[0]->machine_view.hash()); - // launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - // 0 /*projection id*/, - // READ_ONLY, - // EXCLUSIVE, - // inputs[0]->region)); - // launcher.add_field(0, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // outputs[0]->region)); - // launcher.add_field(1, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(weights[0]->part, - // 0 /*projection id*/, - // READ_ONLY, - // EXCLUSIVE, - // weights[0]->region)); - // launcher.add_field(2, FID_DATA); - // // launcher.add_region_requirement( - // // RegionRequirement(weights[1]->part, 0/*projection id*/, - // // READ_ONLY, EXCLUSIVE, weights[1]->region)); - // // launcher.add_field(3, FID_DATA); - // launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, - // 0 /*projection id*/, - // WRITE_ONLY, - // EXCLUSIVE, - // weights[0]->region_grad)); - // launcher.add_field(3, FID_DATA); - // // launcher.add_region_requirement( - // // RegionRequirement(inputs[0]->part_grad, 0/*projection id*/, - // // WRITE_ONLY, EXCLUSIVE, inputs[0]->region_grad)); - // // launcher.add_field(4, FID_DATA); - // FutureMap fm = runtime->execute_index_space(ctx, launcher); - // fm.wait_all_results(); - // set_opmeta_from_futuremap(ff, fm); -} - -/* - regions[0]: input - regions[1]: output - regions[2](I): filter - regions[3](I): bias - regions[4](O): filter_grad - regions[5](O): input_grad -*/ -PerDeviceOpState *Conv2D::init_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 4); - assert(task->regions.size() == 4); - // Conv2D const *conv = (Conv2D *)task->args; + binding.bind(FILTER, weight_tensor(0)); + binding.bind(BIAS, weight_tensor(1)); + + return {CONV2D_FWD_TASK_ID, binding}; +} + +OpTaskInvocation backward(Conv2DAttrs const &attrs) { + OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); + + return {CONV2D_BWD_TASK_ID, binding}; +} + +static DeviceSpecific + init_task_impl(TaskArgumentAccessor const &acc) { + + PerDeviceFFHandle handle = acc.get_argument(HANDLE); + auto attrs = acc.get_argument(ATTRS); + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto filter = acc.get_tensor(FILTER); + auto filter_grad = acc.get_tensor_grad(FILTER); + + DeviceSpecific per_device_state = + acc.create_device_specific( + init_kernel(handle, + attrs.activation, + attrs.kernel_h, + attrs.kernel_w, + attrs.groups, + attrs.padding_h, + attrs.padding_w, + attrs.stride_h, + attrs.stride_w, + input, + output, + filter.get_float_ptr(), + filter_grad.get_float_ptr())); + return per_device_state; +} + +static DeviceSpecific + init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { TaskArgumentAccessor acc(task, regions, ctx, runtime); - FFHandler handle = *((FFHandler const *)task->local_args); - auto const &attrs = acc.get_argument(ATTRS); - bool profiling = acc.get_argument(PROFILING); - // TensorAccessorR acc_input( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); - // TensorAccessorW acc_output(regions[1], - // task->regions[1], - // FID_DATA, - // ctx, - // runtime, - // false - // /*readOutput*/); - // TensorAccessorR acc_kernel( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); - // TensorAccessorR acc_bias( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - // TensorAccessorW acc_kernel_grad( - // regions[3], - // task->regions[3], - // FID_DATA, - // ctx, - // runtime, - // false /*readOutput*/); - // TensorAccessorW acc_input_grad( - // regions[4], task->regions[4], FID_DATA, ctx, runtime, - // false/*readOutput*/); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto filter = acc.get_tensor(FILTER); - auto bias = acc.get_tensor(BIAS); - auto filter_grad = acc.get_tensor(FILTER_GRAD); - auto input_grad = acc.get_tensor(INPUT_GRAD); - - Conv2DPerDeviceState *m = new Conv2DPerDeviceState(handle); - m->relu = attrs.activation == AC_MODE_RELU; - m->use_bias = attrs.use_bias; - m->profiling = profiling; - // m->trainableInputs[0] = conv->trainableInputs[0]; ?? - std::strcpy(m->op_name, attrs.name); - - int input_w = input.shape[0]; - int input_h = input.shape[1]; - int input_c = input.shape[2]; - int input_n = input.shape[3]; - int output_w = output.shape[0]; - int output_h = output.shape[1]; - int output_c = output.shape[2]; - int output_n = output.shape[3]; - - printf("init conv (input): n(%d) c(%d) h(%d) w(%d)\n", - input_n, - input_c, - input_h, - input_w); - printf("init conv (output): n(%d) c(%d) h(%d) w(%d)\n", - output_n, - output_c, - output_h, - output_w); - - // printf("convDim: padding(%d %d) stride(%d %d)\n", conv->padding_h, - // conv->padding_w, conv->stride_h, conv->stride_w); - int pad_h = - ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; - int pad_w = - ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; - if (pad_h != attrs.padding_h) { - printf("Warning: changing conv_padding_h to satisfy output_h size\n"); - } - if (pad_w != attrs.padding_w) { - printf("Warning: changing conv_padding_w to satisfy output_w size\n"); - } - - init_kernel(m, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.stride_h, - attrs.stride_w, - pad_h, - pad_w, - input.get_float_ptr(), - output.get_float_ptr(), - filter.get_float_ptr(), - filter_grad.get_float_ptr()); - - return m; -} - -// TaskSpec Conv2D::get_tasks_spec() const { -// OpTasksSpec spec { -// CONV2D_INIT_TASK_ID, -// CONV2D_FWD_TASK_ID, -// CONV2D_BWD_TASK_ID -// }; -// auto &fwd = spec.get_fwd(); - -// fwd.add_input_slot(INPUT); -// fwd.add_param_slot(KERNEL); -// fwd.add_output_slot(OUTPUT); - -// auto input = spec.input_tensor(0); -// auto kernel = spec.param_tensor(0); -// auto bias = spec.param_tensor(1); -// auto output = spec.output_tensor(0); - -// fwd[INPUT] = input; -// fwd[KERNEL] = kernel; -// if (this->use_bias) { -// fwd[BIAS] = bias; -// } -// fwd[OUTPUT] = output; - -// return spec; -// } - -/* TaskSpec Conv2D::get_forward_task_spec() const { */ -/* TaskSpec spec = { CONV2D_FWD_TASK_ID, Pass::FWD }; */ - -/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ -/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ -/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ -/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ - -/* spec.add_input(INPUT, input); */ -/* spec.add_input(KERNEL, kernel); */ - -/* if (this->use_bias) { */ -/* spec.add_input(BIAS, bias); */ -/* } */ - -/* spec.add_output(OUTPUT, output); */ - -/* return spec; */ -/* } */ - -/* TaskSpec Conv2D::get_backward_task_spec() const { */ -/* TaskSpec spec = { CONV2D_BWD_TASK_ID, Pass::BWD }; */ - -/* auto input = spec.add_tensor(TensorRole::INPUT, 0); */ -/* auto kernel = spec.add_tensor(TensorRole::PARAM, 0); */ -/* auto bias = spec.add_tensor(TensorRole::BIAS, 1); */ -/* auto output = spec.add_tensor(TensorRole::OUTPUT, 0); */ - -/* spec.add_input(INPUT, input); */ -/* spec.add_output(INPUT_GRAD, input.grad); */ -/* spec.add_input(KERNEL, kernel); */ -/* spec.add_output(KERNEL_GRAD, kernel.grad); */ - -/* if (this->use_bias) { */ -/* spec.add_input(BIAS, bias); */ -/* spec.add_output(BIAS_GRAD, bias.grad); */ -/* } */ - -/* spec.add_input(OUTPUT, output); */ -/* spec.add_input(OUTPUT_GRAD, output.grad); */ - -/* return spec; */ -/* } */ - -void Conv2D::forward(FFModel const &ff) { - this->execute_task(ff, CONV2D_FWD_TASK_ID, get_fwd_task_signature()); -} - -void Conv2D::backward(FFModel const &ff) { - this->execute_task(ff, CONV2D_bWD_TASK_ID, get_bwd_task_signature()); -} - -/* - regions[0](I): input - regions[1](O): output - regions[2](I): filter - regions[3](I): bias -*/ -void Conv2D::forward_task(Task const *task, + return init_task_impl(acc); +} + +static optional forward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); + + auto input = acc.get_tensor(INPUT); + auto filter = acc.get_tensor(FILTER); + auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); + + return profile(forward_kernel, + profiling, + "[Conv2d] forward_time = %.2lfms\n", + per_device_state, + input.get_float_ptr(), + output.get_float_ptr(), + filter.get_float_ptr(), + bias.get_float_ptr(), + attrs.activation); +} + +static void forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + TaskArgumentAccessor acc(task, regions, ctx, runtime); + forward_task_impl(acc); +} + +static optional backward_task_impl(TaskArgumentAccessor const &acc) { + ProfilingSettings profiling = acc.get_argument(PROFILING); + auto per_device_state = + acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); + + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto filter = acc.get_tensor(FILTER); + + auto input_grad = acc.get_tensor_grad(INPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto filter_grad = acc.get_tensor_grad(FILTER); + auto bias_grad = acc.get_tensor_grad(BIAS); + + return profile(backward_kernel, + profiling, + "[Conv2d] backward_time = %.2lfms\n", + per_device_state, + input.get_float_ptr(), + input_grad.get_float_ptr(), + output.get_float_ptr(), + output_grad.get_float_ptr(), + filter.get_float_ptr(), + filter_grad.get_float_ptr(), + bias_grad.get_float_ptr(), + attrs.activation); +} + +static void backward_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - Conv2DPerDeviceState const *m = *((Conv2DPerDeviceState **)task->local_args); - TaskArgumentAccessor acc(task, regions, ctx, runtime); + backward_task_impl(acc); +} - auto input = acc.get_tensor(INPUT); - auto filter = acc.get_tensor(FILTER); - auto bias = acc.get_tensor(BIAS); - auto output = acc.get_tensor(OUTPUT); +CostMetrics measure_operator_cost(SimEnvFactory const &sim, + Conv2DAttrs const &attrs, + InputParallelTensorDesc const &input_shape, + InputParallelTensorDesc const &filter_shape, + InputParallelTensorDesc const &bias_shape, + ProfilingSettings const &settings, + MachineView const &mv) { - // TensorAccessorR acc_input( - // regions[0], task->regions[0], FID_DATA, ctx, runtime); - // TensorAccessorW acc_output(regions[1], - // task->regions[1], - // FID_DATA, - // ctx, - // runtime, - // false - // /*readOutput*/); - // TensorAccessorR acc_kernel( - // regions[2], task->regions[2], FID_DATA, ctx, runtime); - // float const *acc_bias_ptr = NULL; - // if (m->use_bias) { - // TensorAccessorR acc_bias( - // regions[3], task->regions[3], FID_DATA, ctx, runtime); - // acc_bias_ptr = acc_bias.ptr; - // } + auto env = sim.new_environment(); - profile(forward_kernel, - m->profiling, - "[Conv2d] forward_time = %.2lfms\n", - m, - input.get_float_ptr(), - output.get_float_ptr(), - filter.get_float_ptr(), - bias.get_float_ptr()); -} + ParallelTensorShape output_shape = get_output_shape(attrs, input_shape.shape); -/* - region(I): input - region(I/O): input_grad (if trainableInputs[0]) - region(I): output - region(I/O): output_grad - region(I): filter - region(I/O): filter_grad - region(I/O): bias_grad (if use_bias) -*/ -void Conv2D::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - // Conv2D* conv = (Conv2D*) task->args; - Conv2DPerDeviceState const *m = *((Conv2DPerDeviceState **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + - static_cast(m->use_bias))); - assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + - static_cast(m->use_bias))); - size_t rid = 0; - TensorAccessorR acc_input( - regions[rid], task->regions[rid], FID_DATA, ctx, runtime); - rid++; - float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { - TensorAccessorW acc_input_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - acc_input_grad_ptr = acc_input_grad.ptr; - rid++; - } - TensorAccessorR acc_output( - regions[rid], task->regions[rid], FID_DATA, ctx, runtime); - rid++; - TensorAccessorW acc_output_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - rid++; - TensorAccessorR acc_kernel( - regions[rid], task->regions[rid], FID_DATA, ctx, runtime); - rid++; - TensorAccessorW acc_kernel_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - rid++; - float *acc_bias_grad_ptr = NULL; - if (m->use_bias) { - TensorAccessorW acc_bias_grad( - regions[rid], - task->regions[rid], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - acc_bias_grad_ptr = static_cast(acc_bias_grad.ptr); - rid++; - } - assert(rid == regions.size()); + SimTaskBinding init_binding; + init_binding.bind(INPUT, input_shape); + init_binding.bind(OUTPUT, output_shape); + init_binding.bind(FILTER, filter_shape); + init_binding.bind_arg(ATTRS, attrs); + init_binding.bind_arg(HANDLE, ff_handle()); - backward_kernel_wrapper(m, - acc_input.ptr, - acc_input_grad_ptr, - acc_output.ptr, - acc_output_grad.ptr, - acc_kernel.ptr, - acc_kernel_grad.ptr, - acc_bias_grad_ptr); -} + auto init_accessor = env.get_init_accessor(CONV2D_INIT_TASK_ID, init_binding); + DeviceSpecific per_device_state = + init_task_impl(init_accessor); -bool Conv2D::estimate_sync_cost(Simulator *sim, - MachineView const &view, - CostMetrics &cost_metrics) const { - ParallelDim kernel_dims[MAX_TENSOR_DIM], bias_dims[MAX_TENSOR_DIM]; - int kernel_ndims, bias_ndims; + SimTaskBinding fwd_binding; + fwd_binding.bind_arg(PROFILING, settings); + fwd_binding.bind_arg(PER_DEVICE_STATE, per_device_state); + init_binding.bind_arg(ATTRS, attrs); - this->get_params().solve_dims(this->inputs[0]->get_shape(), - nullptr, - nullptr, - kernel_dims, - &kernel_ndims, - bias_dims, - &bias_ndims); + fwd_binding.bind(INPUT, input_shape); + fwd_binding.bind(OUTPUT, output_shape); + fwd_binding.bind(FILTER, filter_shape); + fwd_binding.bind(BIAS, bias_shape); - cost_metrics.sync_time = - sim->default_estimate_sync_cost(kernel_dims, kernel_ndims, view); + SimTaskBinding bwd_binding = infer_bwd_binding(fwd_binding); - if (this->use_bias) { - cost_metrics.sync_time += - sim->default_estimate_sync_cost(bias_dims, bias_ndims, view); - } + auto fwd_accessor = env.get_fwd_accessor(CONV2D_FWD_TASK_ID, fwd_binding); + auto bwd_accessor = env.get_bwd_accessor(CONV2D_BWD_TASK_ID, bwd_binding); - return true; + float forward_time = forward_task_impl(fwd_accessor).value(); + float backward_time = backward_task_impl(bwd_accessor).value(); + + float sync_time = default_estimate_sync_time(env); + return make_metrics(forward_time, backward_time, sync_time, env); } -tl::optional Conv2D::as_dot() const { - RecordFormatter rr; - RecordFormatter r; +template <> +void register_task() { + OpTaskSignature init(OpTaskType::INIT); - r << this->inputs[0]->get_shape().as_dot(); - r << "in_channels" << this->in_channels; - r << "out_channels" << this->out_channels; - r << "kernel_h" << this->kernel_h; - r << "kernel_w" << this->kernel_w; - r << "padding_h" << this->padding_h; - r << "padding_w" << this->padding_w; - r << "stride_h" << this->stride_h; - r << "stride_w" << this->stride_w; - r << this->outputs[0]->get_shape().as_dot(); - rr << r; + init.add_input_slot(INPUT); + init.add_output_slot(OUTPUT); + init.add_weight_slot(FILTER); + init.add_arg_slot(ATTRS); + init.add_unchecked_arg_slot(HANDLE); - return rr; -} + init.add_return_value(); -bool Conv2D::measure_operator_cost(Simulator *sim, - MachineView const &mv, - CostMetrics &cost_metrics) const { - ParallelTensorBase sub_output, sub_input; - if (!outputs[0]->get_sub_tensor(mv, sub_output)) { - return false; - } - if (!inputs[0]->get_sub_tensor(mv, sub_input)) { - return false; - } - int input_w = sub_input.dims[0].size; - int input_h = sub_input.dims[1].size; - int input_c = sub_input.dims[2].size; - int input_n = sub_input.dims[3].size; - int output_w = sub_output.dims[0].size; - int output_h = sub_output.dims[1].size; - int output_c = sub_output.dims[2].size; - int output_n = sub_output.dims[3].size; - int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; - int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; + register_task(CONV2D_INIT_TASK_ID, "Conv2D Init", init, init_task); +} - Conv2DPerDeviceState *m = sim->conv2d_meta; - m->relu = activation == AC_MODE_RELU; - // require input_c is divisible by groups +template <> +void register_task() { + OpTaskSignature fwd(OpTaskType::FWD); - // allocate tensors in simulator - sim->free_all(); - float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); - assert(input_ptr != NULL); - cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + fwd.add_arg_slot(PROFILING); + fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + init.add_arg_slot(ATTRS); - float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); - assert(output_ptr != NULL); - cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); + fwd.add_input_slot(INPUT); + fwd.add_output_slot(OUTPUT); + fwd.add_weight_slot(FILTER); + fwd.add_weight_slot(BIAS); - float *weight_ptr = (float *)sim->allocate( - (size_t)output_c * input_c * kernel_h * kernel_w / groups, DT_FLOAT); - assert(weight_ptr != NULL); - float *bias_ptr = (float *)sim->allocate(output_c, DT_FLOAT); - assert(bias_ptr != NULL); - cost_metrics.weights_memory += cost_metrics.total_mem_diff_from(sim->offset); + register_task(CONV2D_FWD_TASK_ID, "Conv2D Fwd", fwd, forward_task); +} - init_kernel(m, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - kernel_h, - kernel_w, - groups, - stride_h, - stride_w, - pad_h, - pad_w, - input_ptr, - output_ptr, - weight_ptr, - weight_ptr, // note we reuse weight_ptr for kernel_grad_ptr here - // to avoid allocating another tensor - &cost_metrics.forward_time, - &cost_metrics.backward_time); +template <> +void register_task() { + OpTaskSignature bwd = + infer_bwd_signature(get_op_signature(CONV2D_FWD_TASK_ID)); - log_measure.debug("[Measure Conv2D] name(%s) input(%d %d %d %d) weight(%d %d " - "%d %d) output(%d %d %d %d) stride(%d %d) padding(%d %d) " - "forward_time(%.4lf) backward_time(%.4lf)\n", - name, - input_n, - input_c, - input_h, - input_w, - output_c, - input_c / groups, - kernel_h, - kernel_w, - output_n, - output_c, - output_h, - output_w, - stride_h, - stride_w, - padding_h, - padding_w, - cost_metrics.forward_time, - cost_metrics.backward_time); - return true; + register_task(CONV2D_BWD_TASK_ID, "Conv2D Bwd", bwd, backward_task); } } // namespace FlexFlow diff --git a/lib/runtime/src/ops/conv_2d.h b/lib/runtime/src/ops/conv_2d.h index 382538b70a..7225099a47 100644 --- a/lib/runtime/src/ops/conv_2d.h +++ b/lib/runtime/src/ops/conv_2d.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_CONV_2D_H #include "op-attrs/ops/conv_2d.h" -#include "op_task_invocation.h" #include "sim_environment.h" +#include "task_spec/op_task_invocation.h" namespace FlexFlow { @@ -24,113 +24,6 @@ CostMetrics measure_operator_cost(SimEnvFactory const &sim_factory, ProfilingSettings const &settings, MachineView const &machine_view); -/* namespace Conv2DInput { */ -/* static constexpr int INDEX = 0; */ - -/* enum { WIDTH = 0, HEIGHT = 1, CHANNEL = 2, SAMPLE = 3, REPLICA = 4, NUMDIM }; - */ -/* } */ - -/* namespace Conv2DOutput { */ -/* enum { WIDTH = 0, HEIGHT = 1, CHANNEL = 2, SAMPLE = 3, REPLICA = 4, NUMDIM }; - */ -/* } */ - -/* namespace Conv2DKernel { */ -/* static constexpr int INDEX = 0; */ - -/* enum { */ -/* WIDTH = 0, */ -/* HEIGHT = 1, */ -/* CHANNEL_IN = 2, */ -/* CHANNEL_OUT = 3, */ -/* REPLICA = 4, */ -/* NUMDIM */ -/* }; */ -/* } */ - -/* /1* namespace Conv2DBias { *1/ */ -/* /1* static constexpr int INDEX = 1; *1/ */ - -/* class Conv2D : public Op { */ -/* public: */ -/* Conv2D(FFModel &model, */ -/* LayerID const &layer_guid, */ -/* const ParallelTensor input, */ -/* int outChannels, */ -/* int kernelH, */ -/* int kernelW, */ -/* int strideH, */ -/* int strideW, */ -/* int paddingH, */ -/* int paddingW, */ -/* ActiMode activation, */ -/* int groups, */ -/* bool use_bias, */ -/* bool allocate_weights, */ -/* char const *name); */ -/* Conv2D(FFModel &model, */ -/* Conv2D const &other, */ -/* const ParallelTensor input, */ -/* bool allocate_weights); */ -/* Conv2D(FFModel &model, */ -/* Conv2DAttrs const &attrs, */ -/* std::vector const &inputs, */ -/* char const *name = nullptr, */ -/* bool allocate_weights = false); */ -/* void init(FFModel const &) override; */ -/* void forward(FFModel const &) override; */ -/* void backward(FFModel const &) override; */ -/* // void update(const FFModel&); */ -/* // Parameter* get_parameter(int index); */ -/* // void create_weights(FFModel& model); */ -/* // void create_input_partition(FFModel& model); */ -/* static Op * */ -/* create_operator_from_layer(FFModel &model, */ -/* Layer const *layer, */ -/* std::vector const &inputs); - */ - -/* static PerDeviceOpState *init_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void forward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* static void backward_task(Legion::Task const *task, */ -/* std::vector const - * ®ions, */ -/* Legion::Context ctx, */ -/* Legion::Runtime *runtime); */ -/* bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ -/* bool estimate_sync_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const override; */ - -/* /1* static void *1/ */ -/* /1* construct_output_mappings(std::vector &); - * *1/ */ -/* /1* static void construct_mappings(std::vector &, - * *1/ */ -/* /1* bool use_bias); *1/ */ -/* /1* static void - * construct_weight_mappings(std::vector &, *1/ */ -/* /1* bool use_bias); *1/ */ - -/* public: */ -/* int in_channels, out_channels, kernel_h, kernel_w, stride_h, stride_w, */ -/* padding_h, padding_w; */ -/* ActiMode activation; */ -/* int groups; */ -/* bool use_bias; */ -/* }; */ - } // namespace FlexFlow #endif