From 9e22a6d6a51669a3e0e4049ffba0a5ab45ad5cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Fri, 12 Feb 2021 10:13:53 +0100 Subject: [PATCH 1/9] Use mkldnn deconvolution primitive in deconvolution --- .../nn/mkldnn/mkldnn_deconvolution.cc | 561 +++++++++--------- 1 file changed, 276 insertions(+), 285 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 65bf93298b95..9d59c65e7891 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -31,69 +31,96 @@ namespace mxnet { namespace op { -bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, +using DeconvFwd = mkldnn::deconvolution_forward; +using DeconvFwdPD = mkldnn::deconvolution_forward::primitive_desc; + +using DeconvBwdData = mkldnn::deconvolution_backward_data; +using DeconvBwdDataPD = mkldnn::deconvolution_backward_data::primitive_desc; + +using DeconvBwdWeight = mkldnn::deconvolution_backward_weights; +using DeconvBwdWeightPD = mkldnn::deconvolution_backward_weights::primitive_desc; + +bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { if (params.kernel.ndim() != 2) return false; return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && input.shape().ndim() == 4; } -static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) { - mkldnn::memory::dims dims(1); - // This is deconvolution on 4D data. The second dimension is the channel. - dims[0] = md.data.dims[1]; - return mkldnn::memory::desc( - dims, static_cast(md.data.data_type), - mkldnn::memory::format_tag::any); +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). +static inline mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { + auto &d = desc.data; + int offset = int(num_groups > 1); + int dim0 = offset + 0; + int dim1 = offset + 1; + std::swap(d.dims[dim0], d.dims[dim1]); + std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); + if (d.format_kind != dnnl_format_kind_any) { + std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); + // as padding is not supported, these are always zeros? + std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); + // for blocked format: change indices + for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { + auto &val = d.format_desc.blocking.inner_idxs[i]; + if (val == dim0) { + val = dim1; + } else if (val == dim1) { + val = dim0; + } + } + } + return desc; } -std::shared_ptr GetDeconvBwd_( - const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, - bool has_bias, const mkldnn::memory::desc &out_md, - const mkldnn::engine &engine, const mkldnn::memory::dims &strides, - const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) { - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. 
Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - if (!has_bias) { - mkldnn::convolution_forward::desc desc( - mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, - strides, dilates, padding, padding); - auto deconv_pd = - std::make_shared(desc, - engine); - while (deconv_pd->dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weights_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; - } - return deconv_pd; +// Applies IOLogicalSwapDesc to arr +static inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { + mkldnn::memory::desc desc; + if (arr.IsMKLDNNData()) { + desc = arr.GetMKLDNNData()->get_desc(); } else { - auto bias_md = GetBiasDesc(data_md); - mkldnn::convolution_forward::desc desc( - mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md, - data_md, strides, dilates, padding, padding); - auto deconv_pd = - std::make_shared(desc, - engine); - while (deconv_pd->dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weights_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; - } - return deconv_pd; + const auto &temp = GetWeightDesc(arr, num_groups); + desc = mkldnn::memory::desc( + temp.dims(), temp.data_type(), + static_cast(GetDefaultFormat(temp.data.ndims))); } + const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); +} + +// Version of GetWeightDesc for deconvolution (with swap) +static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weights, int num_groups) { + return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); } -std::shared_ptr -GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, const NDArray &output) { +// Imposes the plain format on memory descriptors with padding +// Changing only one at a time, so maybe better implementations will be selected +// (than entirely plain one) +void ImposePlainWherePadding(mkldnn::memory::desc &src_md, mkldnn::memory::desc &dst_md, + mkldnn::memory::desc &weight_md, size_t src_size, size_t dst_size, + size_t wei_size) { + if (src_size != GetMemDescSize(src_md)) { + CHECK(src_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + src_md = GetDesc(src_md, GetDefaultFormat(src_md)); + } else if (dst_size != GetMemDescSize(dst_md)) { + CHECK(dst_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + dst_md = GetDesc(dst_md, GetDefaultFormat(dst_md)); + } else if (wei_size != GetMemDescSize(weight_md)) { + CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + int num_groups = (weight_md.data.ndims > src_md.data.ndims) ? 
weight_md.data.dims[0] : 1; + weight_md = IOLogicalSwapDesc(weight_md, num_groups); + weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); + } +} + +std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, + const NDArray &output) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); + auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); + auto bias_md = bias ? GetMemDesc(*bias) + : mkldnn::memory::desc{ + {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; auto engine = CpuEngine::Get()->get_engine(); CHECK_GE(param.stride.ndim(), 2); CHECK_GE(param.pad.ndim(), 2); @@ -107,32 +134,41 @@ GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, mkldnn::memory::dims dilate{0, 0}; dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; - auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, - strides, padding, dilate); - mkldnn::convolution_backward_data::desc desc( - mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md, - strides, dilate, padding, padding); - auto deconv_pd = - std::make_shared( - desc, engine, *bwd_pd); + auto desc = [&]() { + return DeconvFwd::desc( + mkldnn::prop_kind::forward_training, // TODO: check if this should be constant + mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, out_md, strides, + dilate, padding, padding); + }; + auto deconv_pd = + std::make_shared( + desc(), engine); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->diff_src_desc().get_size() != GetMemDescSize(out_md) || + while (deconv_pd->dst_desc().get_size() != GetMemDescSize(out_md) || + deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; + // for deconvolution primitive next_impl always fails. Keep this? 
+ if (!deconv_pd->next_impl()) { + ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->dst_desc().get_size(), + deconv_pd->src_desc().get_size(), + deconv_pd->weights_desc().get_size()); + *deconv_pd = DeconvFwdPD(desc(), engine); + } } + return deconv_pd; } -std::shared_ptr -GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, - const NDArray &output) { +std::shared_ptr +GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, + const NDArray &output, + const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); + auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Get()->get_engine(); CHECK_GE(param.stride.ndim(), 2); @@ -147,18 +183,41 @@ GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, mkldnn::memory::dims dilate{0, 0}; dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; - return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, strides, - padding, dilate); + auto desc = [&]() { + return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, + data_md, weight_md, out_md, strides, dilate, + padding, padding); + }; + auto deconv_pd = + std::make_shared(desc(), engine, fwd_pd); + // MKL-DNN introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, MKL-DNN operators + // still reuse memory from memory planning, so here we need to select a + // suboptimal kernel for computation that has the expected memory size requirements + while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || + deconv_pd->diff_src_desc().get_size() != GetMemDescSize(data_md) || + deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { + if (!deconv_pd->next_impl()) { + ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), + deconv_pd->diff_src_desc().get_size(), + deconv_pd->weights_desc().get_size()); + *deconv_pd = DeconvBwdDataPD(desc(), engine, fwd_pd); + } + } + return deconv_pd; } -std::shared_ptr +std::shared_ptr GetDeconvBwdWeightsImpl( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights,const NDArray *bias, const NDArray &output, + const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); + auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); + auto bias_md = bias ? 
GetMemDesc(*bias) + : mkldnn::memory::desc{ + {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; auto engine = CpuEngine::Get()->get_engine(); CHECK_GE(param.stride.ndim(), 2); CHECK_GE(param.pad.ndim(), 2); @@ -172,98 +231,61 @@ GetDeconvBwdWeightsImpl( mkldnn::memory::dims dilate{0, 0}; dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; + auto desc = [&]() { + return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, + data_md, weight_md, bias_md, out_md, + strides, dilate, padding, padding); + }; + auto deconv_pd = std::make_shared( + desc(), engine, fwd_pd); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements - if (!has_bias) { - mkldnn::convolution_backward_weights::desc desc( - mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md, - strides, dilate, padding, padding); - auto deconv_pd = - std::make_shared( - desc, engine, fwd_pd); - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->diff_weights_desc().get_size() != - GetMemDescSize(weight_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; + while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || + deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || + deconv_pd->diff_weights_desc().get_size() != GetMemDescSize(weight_md)) { + if (!deconv_pd->next_impl()) { + ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), + deconv_pd->src_desc().get_size(), + deconv_pd->diff_weights_desc().get_size()); + *deconv_pd = DeconvBwdWeightPD(desc(), engine, fwd_pd); } - return deconv_pd; - } else { - auto bias_md = GetBiasDesc(data_md); - mkldnn::convolution_backward_weights::desc desc( - mkldnn::algorithm::convolution_direct, out_md, weight_md, bias_md, - data_md, strides, dilate, padding, padding); - auto deconv_pd = - std::make_shared( - desc, engine, fwd_pd); - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->diff_weights_desc().get_size() != - GetMemDescSize(weight_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; - } - return deconv_pd; } + return deconv_pd; } class MKLDNNDeconvForward { public: - MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, + MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights,const NDArray *bias, const NDArray &output); - const mkldnn::convolution_backward_data &GetFwd() const { return *fwd; } + const DeconvFwd &GetFwd() const { return *fwd; } - const mkldnn::convolution_backward_data::primitive_desc &GetPd() const { - return *fwd_pd; - } + const DeconvFwdPD &GetPd() const { return *fwd_pd; } private: - std::shared_ptr fwd; - std::shared_ptr fwd_pd; + std::shared_ptr fwd; + std::shared_ptr fwd_pd; }; // class MKLDNNDeconvForward -MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam ¶m, - const NDArray &data, - const NDArray &weights, bool has_bias, +MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) - 
: fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) { - fwd = std::make_shared(GetPd()); + : fwd_pd(GetDeconvFwdImpl(param, data, weights, bias, output)) { + fwd = std::make_shared(GetPd()); } -static void MKLDNNDeconvFwdBiasPostProcess( - const DeconvolutionParam ¶m, const OpContext &ctx, const NDArray &bias, - const std::vector &out_data) { - // add bias, broadcast bias to dim 1: channel - if (!param.no_bias) { - // MKLDNN only supports float right now. - typedef float DType; - Stream *s = ctx.get_stream(); - Tensor b = bias.data().get(s); - // The output data is stored in a special MKLDNN format, - // converts its format to the default format. - // Unfortunately, MKLDNN doesn't support broadcast. - auto out_data_def = out_data[deconv::kOut].Reorder2Default(); - Tensor out_cpu = out_data_def.data().get(s); - out_cpu += mshadow::expr::broadcast<1>(b, out_cpu.shape_); - } -} - -MKLDNNDeconvForward &GetDeconvFwd(const nnvm::NodeAttrs &attrs, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { +MKLDNNDeconvForward &GetDeconvFwd(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, + const NDArray &output) { + using deconv_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - fwds; + static thread_local deconv_fwd_map fwds; #else - static MX_THREAD_LOCAL - std::unordered_map - fwds; + static MX_THREAD_LOCAL deconv_fwd_map fwds; #endif - const DeconvolutionParam ¶m = nnvm::get(attrs.parsed); DeconvSignature key(param); // Here we can sign the conv op with NDArray because conv primitive will // decide the right layout for the, so we only need to get the shape and the @@ -275,15 +297,13 @@ MKLDNNDeconvForward &GetDeconvFwd(const nnvm::NodeAttrs &attrs, auto it = fwds.find(key); if (it == fwds.end()) { - bool has_bias = (bias != nullptr); - auto fwd = MKLDNNDeconvForward(param, data, weights, has_bias, output); + auto fwd = MKLDNNDeconvForward(param, data, weights, bias, output); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { @@ -294,11 +314,9 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, auto &weight = in_data[deconv::kWeight]; const NDArray *bias = param.no_bias ? nullptr : &in_data[deconv::kBias]; - MKLDNNDeconvForward &fwd = - GetDeconvFwd(attrs, data, weight, bias, out_data[deconv::kOut]); + MKLDNNDeconvForward &fwd = + GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); - auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().diff_dst_desc()); - const mkldnn::memory *weight_mem; if (ctx.is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. @@ -306,128 +324,94 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, // This asks the engine to change the layout of the weight array after // it's used. weight.Reorder2DefaultAsync(); - weight_mem = - GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); } else { // For inference, we want to reorder the weight array so we don't need to // reorder data every time. if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. The // data conversion happens after the weight array is used. 
- weight.MKLDNNDataReorderAsync(fwd.GetPd().weights_desc()); - weight_mem = - GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); - + weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); } else { - weight_mem = weight.GetMKLDNNData(); - CHECK(weight_mem->get_desc() == fwd.GetPd().weights_desc()); + CHECK(weight.GetMKLDNNData()->get_desc() == + IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); } } - mkldnn_output_t out_mem; - out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().diff_src_desc(), - req[deconv::kOut]); + // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. + // For that, we would pass input tensor in place of output and output tensor in place of + // input (for appropriate convolution primitives: deconvolution forward = convolution backward + // data, deconvolution backward data = convolution forward). Convolution primitive expects + // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: + // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight + // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet + // provides such tensor. + + // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), + // but this time we don't swap input and output tensors, so o = output_channels, i = input_channels, + // so the current weight tensor won't fit (when oihw != iohw). But actually, underneath deconvolution + // MKLDNN also uses convolution, so even though it expects the weight tensor with shape (o, i, h, w), + // it wants it in iohw format, so it's physical representation match current weight tensor. 
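  // Editorial illustration (not part of the original commit; the shapes are hypothetical): for a
  // deconvolution with 8 input channels, 16 output channels, a 3x3 kernel and num_group == 1,
  // MXNet stores the weight as (i, o, h, w) = (8, 16, 3, 3). The MKLDNN deconvolution primitive
  // describes the very same buffer as (o, i, h, w) = (16, 8, 3, 3) laid out as iohw, e.g.:
  //   mkldnn::memory::desc mxnet_view({8, 16, 3, 3}, mkldnn::memory::data_type::f32,
  //                                   mkldnn::memory::format_tag::oihw);    // MXNet's logical view
  //   mkldnn::memory::desc mkldnn_view = IOLogicalSwapDesc(mxnet_view, 1);  // dims (16, 8, 3, 3),
  //                                                                         // strides kept (iohw)
  // Only the logical metadata differs, so the swap needs no reorder of the underlying data.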
+ + // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN operations + IOLogicalSwapMKLDNNMem(weight, param.num_group); + + auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().src_desc()); + const mkldnn::memory *weight_mem = + GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); + mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); mkldnn_args_map_t net_args; + if (bias) { + const mkldnn::memory *bias_mem = in_data[deconv::kBias].GetMKLDNNData(); + net_args.insert({MKLDNN_ARG_BIAS, *bias_mem}); + } - net_args.insert({MKLDNN_ARG_DIFF_DST, *data_mem}); + net_args.insert({MKLDNN_ARG_SRC, *data_mem}); net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem}); - net_args.insert({MKLDNN_ARG_DIFF_SRC, *out_mem.second}); + net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); CommitOutput(out_data[deconv::kOut], out_mem); MKLDNNStream::Get()->Submit(); - MKLDNNDeconvFwdBiasPostProcess(param, ctx, *bias, out_data); + // swap back from oihw to iohw + IOLogicalSwapMKLDNNMem(weight, param.num_group); } -class MKLDNNDeconvBackwardData { - std::shared_ptr bwd; +class MKLDNNDeconvBackward { + std::shared_ptr bwd_data_pd_; + std::shared_ptr bwd_weight_pd_; + std::shared_ptr bwd_data_; + std::shared_ptr bwd_weight_; public: - std::shared_ptr bwd_pd; - MKLDNNDeconvBackwardData(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output); - - const mkldnn::convolution_forward &GetBwd() const { return *bwd; } - const mkldnn::convolution_forward::primitive_desc &GetDataPd() const { - return *bwd_pd; + MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { + const auto fwd_pd = GetDeconvFwdImpl(param, data, weights, bias, output); + bwd_data_pd_ = GetDeconvBwdDataImpl(param, data, weights, output, *fwd_pd); + bwd_weight_pd_ = GetDeconvBwdWeightsImpl(param, data, weights, bias, output, *fwd_pd); + bwd_data_ = std::make_shared(GetDataPd()); + bwd_weight_ = std::make_shared(GetWeightsPd()); } -}; - -MKLDNNDeconvBackwardData::MKLDNNDeconvBackwardData( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output) - : bwd_pd(GetDeconvBwdDataImpl(param, data, weights, false, output)) { - bwd = std::make_shared(GetDataPd()); -} -typedef ParamOpSign MKLDNNDeconvSignature; - -static inline MKLDNNDeconvBackwardData &GetDeconvBwdData( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - bwds; -#else - static MX_THREAD_LOCAL std::unordered_map - bwds; -#endif - MKLDNNDeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. 
- key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); + const DeconvBwdData &GetBwdData() const { return *bwd_data_; } - auto it = bwds.find(key); - if (it == bwds.end()) { - auto bwd = MKLDNNDeconvBackwardData(param, data, weights, output); - it = AddToCache(&bwds, key, bwd); - } - return it->second; -} + const DeconvBwdWeight &GetBwdWeights() const { return *bwd_weight_; } -class MKLDNNDeconvBackwardWeights { - std::shared_ptr bwd; + const DeconvBwdDataPD &GetDataPd() const { return *bwd_data_pd_; } - public: - std::shared_ptr - bwd_data_pd; - MKLDNNDeconvBackwardWeights( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &bwd_data_pd); - const mkldnn::convolution_backward_weights &GetBwd() const { return *bwd; } - const mkldnn::convolution_backward_weights::primitive_desc &GetWeightsPd() - const { - return *bwd_data_pd; - } + const DeconvBwdWeightPD &GetWeightsPd() const { return *bwd_weight_pd_; } }; -MKLDNNDeconvBackwardWeights::MKLDNNDeconvBackwardWeights( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &bwd_data_pd) - : bwd_data_pd(GetDeconvBwdWeightsImpl(param, data, weights, false, output, - bwd_data_pd)) { - bwd = std::make_shared(GetWeightsPd()); -} +typedef ParamOpSign MKLDNNDeconvSignature; -static inline MKLDNNDeconvBackwardWeights &GetDeconvBwdWeights( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &bwd_data_pd) { +static inline MKLDNNDeconvBackward &GetDeconvBwd( + const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { + using mkldnn_deconv_bwd_map = + std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - bwds; + static thread_local mkldnn_deconv_bwd_map bwds; #else - static MX_THREAD_LOCAL std::unordered_map - bwds; + static MX_THREAD_LOCAL mkldnn_deconv_bwd_map bwds; #endif MKLDNNDeconvSignature key(param); // Here we can sign the conv op with NDArray because conv primitive will @@ -436,16 +420,12 @@ static inline MKLDNNDeconvBackwardWeights &GetDeconvBwdWeights( key.AddSign(data); key.AddSign(weights); key.AddSign(output); + if (bias) key.AddSign(*bias); auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = - MKLDNNDeconvBackwardWeights(param, data, weights, output, bwd_data_pd); - auto ins_ret = bwds.insert( - std::pair(key, - bwd)); - CHECK(ins_ret.second); - it = ins_ret.first; + auto bwd = MKLDNNDeconvBackward(param, data, weights, bias, output); + it = AddToCache(&bwds, key, bwd); } return it->second; } @@ -461,63 +441,74 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, auto &data = inputs[deconv::kData + 1]; auto &weight = inputs[deconv::kWeight + 1]; + const auto *bias = param.no_bias ? 
nullptr : &inputs[deconv::kBias + 1]; auto &out_grad = inputs[deconv::kOut]; CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - MKLDNNDeconvBackwardData &bwd_data = - GetDeconvBwdData(param, data, weight, inputs[deconv::kOut]); + MKLDNNDeconvBackward &deconvBwd = + GetDeconvBwd(param, data, weight, bias, out_grad); auto out_grad_mem = - out_grad.GetMKLDNNDataReorder(bwd_data.GetDataPd().src_desc()); + out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); if (req[deconv::kData]) { - auto weight_mem = GetWeights(weight, bwd_data.GetDataPd().weights_desc(), + // swap is explained in MKLDNNDeconvolutionForward + IOLogicalSwapMKLDNNMem(weight, param.num_group); + auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), param.num_group); auto in_grad_mem = - CreateMKLDNNMem(in_grad[deconv::kData], bwd_data.GetDataPd().dst_desc(), + CreateMKLDNNMem(in_grad[deconv::kData], deconvBwd.GetDataPd().diff_src_desc(), req[deconv::kData]); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_SRC, *out_grad_mem}, + mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DST, *in_grad_mem.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(bwd_data.GetBwd(), net_args); + {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; + MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdData(), net_args); CommitOutput(in_grad[deconv::kData], in_grad_mem); } - if (req[deconv::kWeight]) { - MKLDNNDeconvBackwardWeights &bwd_weights = GetDeconvBwdWeights( - param, data, weight, inputs[deconv::kOut], bwd_data.GetDataPd()); - if (bwd_data.GetDataPd().src_desc() != - bwd_weights.GetWeightsPd().src_desc()) - out_grad_mem = - out_grad.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().src_desc()); - auto data_mem = - data.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().diff_dst_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad( - in_grad[deconv::kWeight], - bwd_weights.GetWeightsPd().diff_weights_desc(), req[deconv::kWeight]); + if (req[deconv::kWeight] || req[deconv::kBias]) { + if (deconvBwd.GetDataPd().diff_dst_desc() != + deconvBwd.GetWeightsPd().diff_dst_desc()) + out_grad_mem = + out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); + auto data_mem = + data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); + mkldnn_output_t in_grad_weight; + const mkldnn::memory::desc &wei_md = deconvBwd.GetWeightsPd().diff_weights_desc(); + // swaps are explained in MKLDNNDeconvolutionForward + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because of logical swap) + // We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is always in default format), + // so here we check if after a swap, wei_md will have a default format + if (req[deconv::kWeight] == OpReqType::kWriteTo && + IsDefaultFormat(IOLogicalSwapDesc(wei_md, param.num_group))) { + in_grad_weight = {OutDataOp::Noop, + const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData(wei_md)}; + } else { + IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); + in_grad_weight = CreateMKLDNNWeightGrad( + in_grad[deconv::kWeight], + wei_md, req[deconv::kWeight]); + } mkldnn_args_map_t net_args = { - {MKLDNN_ARG_SRC, *out_grad_mem}, - {MKLDNN_ARG_DIFF_DST, *data_mem}, + {MKLDNN_ARG_DIFF_DST, *out_grad_mem}, + {MKLDNN_ARG_SRC, *data_mem}, {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(bwd_weights.GetBwd(), net_args); + mkldnn_output_t 
in_grad_bias; + if (!param.no_bias) { + in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], + deconvBwd.GetWeightsPd().diff_bias_desc(), req[deconv::kBias]); + net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second}); + } + MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdWeights(), net_args); CommitOutput(in_grad[deconv::kWeight], in_grad_weight); + // CommitOutput Should run after RegisterPrimArgs for memory dependency + if (!param.no_bias) CommitOutput(in_grad[deconv::kBias], in_grad_bias); } MKLDNNStream::Get()->Submit(); - if (!param.no_bias) { - typedef float DType; - Stream *s = ctx.get_stream(); - Tensor gbias = - in_grad[deconv::kBias].data().get(s); - - NDArray temp = inputs[deconv::kOut]; - if (temp.IsMKLDNNData()) { - temp = temp.Reorder2Default(); - } - - Tensor grad = temp.data().get(s); - Assign(gbias, req[deconv::kBias], - mshadow::expr::sumall_except_dim<1>(grad)); - } + // swap back from oihw to iohw + if (req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, param.num_group); + if (req[deconv::kWeight] || req[deconv::kBias]) + IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); } } // namespace op From ebbb70495b765650518bf0c13e750e1ef254ab0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Tue, 9 Mar 2021 16:47:26 +0100 Subject: [PATCH 2/9] Apply clang-format --- .../nn/mkldnn/mkldnn_deconvolution.cc | 147 ++++++++---------- 1 file changed, 64 insertions(+), 83 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 9d59c65e7891..cc8faca3ef88 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -40,11 +40,10 @@ using DeconvBwdDataPD = mkldnn::deconvolution_backward_data::primitive_desc; using DeconvBwdWeight = mkldnn::deconvolution_backward_weights; using DeconvBwdWeightPD = mkldnn::deconvolution_backward_weights::primitive_desc; -bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, - const NDArray &input) { +bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { if (params.kernel.ndim() != 2) return false; - return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) - && input.shape().ndim() == 4; + return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && + input.shape().ndim() == 4; } // Swaps the logical order of dimensions that in plain format would correspond to input and output @@ -87,7 +86,7 @@ static inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); } -// Version of GetWeightDesc for deconvolution (with swap) +// Version of GetWeightDesc for deconvolution (with swap) static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weights, int num_groups) { return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); } @@ -140,9 +139,7 @@ std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, c mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, out_md, strides, dilate, padding, padding); }; - auto deconv_pd = - std::make_shared( - desc(), engine); + auto deconv_pd = std::make_shared(desc(), engine); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. 
Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a @@ -162,11 +159,10 @@ std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, c return deconv_pd; } -std::shared_ptr -GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, - const NDArray &output, - const DeconvFwdPD &fwd_pd) { +std::shared_ptr GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, + const NDArray &data, const NDArray &weights, + const NDArray &output, + const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); @@ -184,12 +180,10 @@ GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; auto desc = [&]() { - return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, - data_md, weight_md, out_md, strides, dilate, - padding, padding); + return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, + strides, dilate, padding, padding); }; - auto deconv_pd = - std::make_shared(desc(), engine, fwd_pd); + auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a @@ -207,11 +201,9 @@ GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, return deconv_pd; } -std::shared_ptr -GetDeconvBwdWeightsImpl( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights,const NDArray *bias, const NDArray &output, - const DeconvFwdPD &fwd_pd) { +std::shared_ptr GetDeconvBwdWeightsImpl( + const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output, const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); @@ -232,12 +224,10 @@ GetDeconvBwdWeightsImpl( dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; auto desc = [&]() { - return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, - data_md, weight_md, bias_md, out_md, - strides, dilate, padding, padding); + return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, + bias_md, out_md, strides, dilate, padding, padding); }; - auto deconv_pd = std::make_shared( - desc(), engine, fwd_pd); + auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators @@ -258,9 +248,8 @@ GetDeconvBwdWeightsImpl( class MKLDNNDeconvForward { public: - MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights,const NDArray *bias, - const NDArray &output); + MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output); const DeconvFwd &GetFwd() const { return *fwd; } const DeconvFwdPD &GetPd() const { return *fwd_pd; } @@ -314,8 +303,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c auto &weight = in_data[deconv::kWeight]; const NDArray *bias = param.no_bias ? 
nullptr : &in_data[deconv::kBias]; - MKLDNNDeconvForward &fwd = - GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); + MKLDNNDeconvForward &fwd = GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); if (ctx.is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it @@ -337,28 +325,31 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c } } - // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. - // For that, we would pass input tensor in place of output and output tensor in place of - // input (for appropriate convolution primitives: deconvolution forward = convolution backward - // data, deconvolution backward data = convolution forward). Convolution primitive expects - // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: - // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight - // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet + // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. + // For that, we would pass input tensor in place of output and output tensor in place of + // input (for appropriate convolution primitives: deconvolution forward = convolution backward + // data, deconvolution backward data = convolution forward). Convolution primitive expects + // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: + // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight + // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet // provides such tensor. - // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), - // but this time we don't swap input and output tensors, so o = output_channels, i = input_channels, - // so the current weight tensor won't fit (when oihw != iohw). But actually, underneath deconvolution - // MKLDNN also uses convolution, so even though it expects the weight tensor with shape (o, i, h, w), - // it wants it in iohw format, so it's physical representation match current weight tensor. - - // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN operations + // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), + // but this time we don't swap input and output tensors, so o = output_channels, i = + // input_channels, so the current weight tensor won't fit (when oihw != iohw). But actually, + // underneath deconvolution MKLDNN also uses convolution, so even though it expects the weight + // tensor with shape (o, i, h, w), it wants it in iohw format, so it's physical representation + // match current weight tensor. 
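  // Editorial note (illustration, not part of the original commit): IOLogicalSwapDesc only
  // exchanges the logical metadata (dims and strides) of the two channel dimensions, so applying
  // it twice restores the original descriptor and the underlying weight buffer can be
  // reinterpreted in place. That is why the forward pass below can present MXNet's (i, o, h, w)
  // weight to MKLDNN as (o, i, h, w) and then restore the original view once the primitive has
  // been submitted:
  //   IOLogicalSwapMKLDNNMem(weight, param.num_group);  // (i, o, h, w) -> (o, i, h, w) view
  //   /* ... register and submit the deconvolution primitive ... */
  //   IOLogicalSwapMKLDNNMem(weight, param.num_group);  // swap back to MXNet's view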
+ + // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN + // operations IOLogicalSwapMKLDNNMem(weight, param.num_group); auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().src_desc()); const mkldnn::memory *weight_mem = GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); - mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); + mkldnn_output_t out_mem = + CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); mkldnn_args_map_t net_args; if (bias) { const mkldnn::memory *bias_mem = in_data[deconv::kBias].GetMKLDNNData(); @@ -383,8 +374,8 @@ class MKLDNNDeconvBackward { std::shared_ptr bwd_weight_; public: - MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, const NDArray &output) { + MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { const auto fwd_pd = GetDeconvFwdImpl(param, data, weights, bias, output); bwd_data_pd_ = GetDeconvBwdDataImpl(param, data, weights, output, *fwd_pd); bwd_weight_pd_ = GetDeconvBwdWeightsImpl(param, data, weights, bias, output, *fwd_pd); @@ -403,9 +394,9 @@ class MKLDNNDeconvBackward { typedef ParamOpSign MKLDNNDeconvSignature; -static inline MKLDNNDeconvBackward &GetDeconvBwd( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, const NDArray &output) { +static inline MKLDNNDeconvBackward &GetDeconvBwd(const DeconvolutionParam ¶m, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { using mkldnn_deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL @@ -430,8 +421,7 @@ static inline MKLDNNDeconvBackward &GetDeconvBwd( return it->second; } -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { @@ -444,20 +434,15 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const auto *bias = param.no_bias ? 
nullptr : &inputs[deconv::kBias + 1]; auto &out_grad = inputs[deconv::kOut]; - CHECK_NE(req[deconv::kWeight], kWriteInplace) - << "cannot write weight inplace"; - MKLDNNDeconvBackward &deconvBwd = - GetDeconvBwd(param, data, weight, bias, out_grad); - auto out_grad_mem = - out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + MKLDNNDeconvBackward &deconvBwd = GetDeconvBwd(param, data, weight, bias, out_grad); + auto out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); if (req[deconv::kData]) { // swap is explained in MKLDNNDeconvolutionForward - IOLogicalSwapMKLDNNMem(weight, param.num_group); - auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), - param.num_group); - auto in_grad_mem = - CreateMKLDNNMem(in_grad[deconv::kData], deconvBwd.GetDataPd().diff_src_desc(), - req[deconv::kData]); + IOLogicalSwapMKLDNNMem(weight, param.num_group); + auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), param.num_group); + auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], + deconvBwd.GetDataPd().diff_src_desc(), req[deconv::kData]); mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, {MKLDNN_ARG_WEIGHTS, *weight_mem}, {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; @@ -465,33 +450,29 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, CommitOutput(in_grad[deconv::kData], in_grad_mem); } if (req[deconv::kWeight] || req[deconv::kBias]) { - if (deconvBwd.GetDataPd().diff_dst_desc() != - deconvBwd.GetWeightsPd().diff_dst_desc()) - out_grad_mem = - out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); - auto data_mem = - data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); + if (deconvBwd.GetDataPd().diff_dst_desc() != deconvBwd.GetWeightsPd().diff_dst_desc()) + out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); + auto data_mem = data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); mkldnn_output_t in_grad_weight; const mkldnn::memory::desc &wei_md = deconvBwd.GetWeightsPd().diff_weights_desc(); // swaps are explained in MKLDNNDeconvolutionForward - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because of logical swap) - // We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is always in default format), - // so here we check if after a swap, wei_md will have a default format + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of logical swap) We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is + // always in default format), so here we check if after a swap, wei_md will have a default + // format if (req[deconv::kWeight] == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, param.num_group))) { in_grad_weight = {OutDataOp::Noop, const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData(wei_md)}; } else { IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); - in_grad_weight = CreateMKLDNNWeightGrad( - in_grad[deconv::kWeight], - wei_md, req[deconv::kWeight]); + in_grad_weight = + CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], wei_md, req[deconv::kWeight]); } - mkldnn_args_map_t net_args = { - {MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; + mkldnn_args_map_t net_args = 
{{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, + {MKLDNN_ARG_SRC, *data_mem}, + {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; mkldnn_output_t in_grad_bias; if (!param.no_bias) { in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], From d1512586ff29f3557e78e9fe879cb9b010ce8a61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Thu, 11 Mar 2021 12:17:25 +0100 Subject: [PATCH 3/9] Refactor deconvolution version 1 --- .../nn/mkldnn/mkldnn_deconvolution-inl.h | 169 ++++ .../nn/mkldnn/mkldnn_deconvolution.cc | 733 +++++++++--------- 2 files changed, 532 insertions(+), 370 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h new file mode 100644 index 000000000000..f2638013ac3d --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_deconvolution-inl.h + * \brief + * \Author: Paweł Głomski, pawel.glomski@intel.com + */ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include "../deconvolution-inl.h" +#include "./mkldnn_base-inl.h" +#include "./mkldnn_ops-inl.h" + +namespace mxnet { +namespace op { + +using deconv_fwd_t = mkldnn::deconvolution_forward; +using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; + +using deconv_bwd_t = mkldnn::deconvolution_backward_data; +using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; + +using deconv_bwd_weight_t = mkldnn::deconvolution_backward_weights; +using deconv_bwd_weight_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; + +class MKLDNNDeconvFwd { + public: + struct Tensors { + Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, const NDArray &out); + Tensors(bool no_bias, const std::vector &inputs, const std::vector &outputs); + + const NDArray &data; + const NDArray &weight; + const NDArray *bias; + const NDArray &out; + }; + + static MKLDNNDeconvFwd &GetCached(const DeconvolutionParam ¶m, const Tensors &tensors); + static std::shared_ptr MakePD(const DeconvolutionParam ¶m, + const Tensors &tensors); + + MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors); + void ControlWeightFormat(uint32_t num_group, bool is_train, const NDArray &weight); + void Execute(uint32_t num_group, const std::vector &req, const Tensors &tensors); + + private: + const mkldnn::memory *DataMem(const NDArray &data) const; + const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; + const mkldnn::memory *BiasMem(const NDArray &bias) const; + + 
mkldnn_output_t OutMem(OpReqType req, const NDArray &out) const; + + std::shared_ptr fwd; + std::shared_ptr fwd_pd; +}; + +class MKLDNNDeconvBwd { + public: + struct ReadTensors { + ReadTensors(bool no_bias, const std::vector &inputs); + const NDArray &data; + const NDArray &weight; + const NDArray *bias; + const NDArray &out_grad; + }; + struct WriteTensors { + WriteTensors(bool no_bias, const std::vector &outputs); + const NDArray &data_grad; + const NDArray &weight_grad; + const NDArray *bias_grad; + }; + + static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, const ReadTensors &rt); + static std::shared_ptr MakeDataPD(const DeconvolutionParam ¶m, + const ReadTensors &rt, + const deconv_fwd_pd_t &fwd_pd); + static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, + const ReadTensors &rt, + const deconv_fwd_pd_t &fwd_pd); + + MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt); + void Execute(uint32_t num_group, const std::vector &req, const ReadTensors &rt, + const WriteTensors &wt); + + private: + void IOSwapWeightTensors(uint32_t num_group, const std::vector &req, + const NDArray &weight, const NDArray &weight_grad); + + const mkldnn::memory *ScheduleBwdData(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt); + + void ScheduleBwdWeight(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt, + const mkldnn::memory *out_grad_mem); + + const mkldnn::memory *DataMem(const NDArray &data) const; + const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; + const mkldnn::memory *OutGradMem(const NDArray &out_grad) const; // for bwd data + const mkldnn::memory *OutGradMem(const NDArray &out_grad, // for bwd weight + const mkldnn::memory *out_grad_mem) const; + + mkldnn_output_t DataGradMem(OpReqType req, const NDArray &data_grad) const; + mkldnn_output_t WeightGradMem(uint32_t num_group, OpReqType req, + const NDArray &weight_grad) const; + mkldnn_output_t BiasGradMem(OpReqType req, const NDArray *bias) const; + + std::shared_ptr bwd_data_pd; + std::shared_ptr bwd_weight_pd; + std::shared_ptr bwd_data; + std::shared_ptr bwd_weight; +}; // namespace op + +struct DeconvDescCreator { + DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weight, + const NDArray *bias, const NDArray &out); + + // Imposes plain formats on memory descriptors with padding + // Changing only one at a time, so maybe better implementations will be selected + // (than entirely plain one) + void ImposePlainWherePadding(size_t data_size, size_t weight_size, size_t out_size); + bool CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const; + + deconv_fwd_t::desc MakeFwdDesc() const; + deconv_bwd_t::desc MakeBwdDataDesc() const; + deconv_bwd_weight_t::desc MakeBwdWeightDesc() const; + + mkldnn::memory::desc data_md; + mkldnn::memory::desc weight_md; + mkldnn::memory::desc bias_md; + mkldnn::memory::desc out_md; + + mkldnn::memory::dims strides; + mkldnn::memory::dims padding; + mkldnn::memory::dims dilates; + + mkldnn::engine &engine; +}; + +mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups); +void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups); + +// Version of GetWeightDesc for deconvolution (with swap) +static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weight, int num_groups) { + return IOLogicalSwapDesc(GetWeightDesc(weight, num_groups), num_groups); +} + +} // namespace op +} // 
namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index cc8faca3ef88..b5e7e4166284 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -24,252 +24,46 @@ #if MXNET_USE_MKLDNN == 1 -#include "../deconvolution-inl.h" -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" +#include "./mkldnn_deconvolution-inl.h" namespace mxnet { namespace op { -using DeconvFwd = mkldnn::deconvolution_forward; -using DeconvFwdPD = mkldnn::deconvolution_forward::primitive_desc; - -using DeconvBwdData = mkldnn::deconvolution_backward_data; -using DeconvBwdDataPD = mkldnn::deconvolution_backward_data::primitive_desc; - -using DeconvBwdWeight = mkldnn::deconvolution_backward_weights; -using DeconvBwdWeightPD = mkldnn::deconvolution_backward_weights::primitive_desc; - bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { if (params.kernel.ndim() != 2) return false; return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && input.shape().ndim() == 4; } -// Swaps the logical order of dimensions that in plain format would correspond to input and output -// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). -static inline mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { - auto &d = desc.data; - int offset = int(num_groups > 1); - int dim0 = offset + 0; - int dim1 = offset + 1; - std::swap(d.dims[dim0], d.dims[dim1]); - std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); - if (d.format_kind != dnnl_format_kind_any) { - std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); - // as padding is not supported, these are always zeros? 
- std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); - // for blocked format: change indices - for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { - auto &val = d.format_desc.blocking.inner_idxs[i]; - if (val == dim0) { - val = dim1; - } else if (val == dim1) { - val = dim0; - } - } - } - return desc; -} - -// Applies IOLogicalSwapDesc to arr -static inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { - mkldnn::memory::desc desc; - if (arr.IsMKLDNNData()) { - desc = arr.GetMKLDNNData()->get_desc(); - } else { - const auto &temp = GetWeightDesc(arr, num_groups); - desc = mkldnn::memory::desc( - temp.dims(), temp.data_type(), - static_cast(GetDefaultFormat(temp.data.ndims))); - } - const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); -} - -// Version of GetWeightDesc for deconvolution (with swap) -static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weights, int num_groups) { - return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); -} - -// Imposes the plain format on memory descriptors with padding -// Changing only one at a time, so maybe better implementations will be selected -// (than entirely plain one) -void ImposePlainWherePadding(mkldnn::memory::desc &src_md, mkldnn::memory::desc &dst_md, - mkldnn::memory::desc &weight_md, size_t src_size, size_t dst_size, - size_t wei_size) { - if (src_size != GetMemDescSize(src_md)) { - CHECK(src_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - src_md = GetDesc(src_md, GetDefaultFormat(src_md)); - } else if (dst_size != GetMemDescSize(dst_md)) { - CHECK(dst_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - dst_md = GetDesc(dst_md, GetDefaultFormat(dst_md)); - } else if (wei_size != GetMemDescSize(weight_md)) { - CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - int num_groups = (weight_md.data.ndims > src_md.data.ndims) ? weight_md.data.dims[0] : 1; - weight_md = IOLogicalSwapDesc(weight_md, num_groups); - weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); - } -} +/*############################### Forward ###############################*/ -std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) { - auto data_md = GetMemDesc(data); - auto weight_md = GetDeconvWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto bias_md = bias ? GetMemDesc(*bias) - : mkldnn::memory::desc{ - {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; - auto engine = CpuEngine::Get()->get_engine(); - CHECK_GE(param.stride.ndim(), 2); - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.dilate.ndim(), 2); - mkldnn::memory::dims strides{0, 0}; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - mkldnn::memory::dims padding{0, 0}; - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - mkldnn::memory::dims dilate{0, 0}; - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - auto desc = [&]() { - return DeconvFwd::desc( - mkldnn::prop_kind::forward_training, // TODO: check if this should be constant - mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, out_md, strides, - dilate, padding, padding); - }; - auto deconv_pd = std::make_shared(desc(), engine); - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. 
Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->dst_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { - // for deconvolution primitive next_impl always fails. Keep this? - if (!deconv_pd->next_impl()) { - ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->dst_desc().get_size(), - deconv_pd->src_desc().get_size(), - deconv_pd->weights_desc().get_size()); - *deconv_pd = DeconvFwdPD(desc(), engine); - } - } +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); + const auto ¶m = nnvm::get(attrs.parsed); + const auto &tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); + MKLDNNDeconvFwd &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); - return deconv_pd; -} - -std::shared_ptr GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, - const NDArray &data, const NDArray &weights, - const NDArray &output, - const DeconvFwdPD &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetDeconvWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - CHECK_GE(param.stride.ndim(), 2); - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.dilate.ndim(), 2); - mkldnn::memory::dims strides{0, 0}; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - mkldnn::memory::dims padding{0, 0}; - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - mkldnn::memory::dims dilate{0, 0}; - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - auto desc = [&]() { - return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, - strides, dilate, padding, padding); - }; - auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->diff_src_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { - if (!deconv_pd->next_impl()) { - ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), - deconv_pd->diff_src_desc().get_size(), - deconv_pd->weights_desc().get_size()); - *deconv_pd = DeconvBwdDataPD(desc(), engine, fwd_pd); - } - } - return deconv_pd; -} - -std::shared_ptr GetDeconvBwdWeightsImpl( - const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output, const DeconvFwdPD &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetDeconvWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto bias_md = bias ? 
GetMemDesc(*bias) - : mkldnn::memory::desc{ - {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; - auto engine = CpuEngine::Get()->get_engine(); - CHECK_GE(param.stride.ndim(), 2); - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.dilate.ndim(), 2); - mkldnn::memory::dims strides{0, 0}; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - mkldnn::memory::dims padding{0, 0}; - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - mkldnn::memory::dims dilate{0, 0}; - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - auto desc = [&]() { - return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, - bias_md, out_md, strides, dilate, padding, padding); - }; - auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); - - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->diff_weights_desc().get_size() != GetMemDescSize(weight_md)) { - if (!deconv_pd->next_impl()) { - ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), - deconv_pd->src_desc().get_size(), - deconv_pd->diff_weights_desc().get_size()); - *deconv_pd = DeconvBwdWeightPD(desc(), engine, fwd_pd); - } - } - return deconv_pd; + fwd.ControlWeightFormat(param.num_group, ctx.is_train, tensors.weight); + fwd.Execute(param.num_group, req, tensors); } -class MKLDNNDeconvForward { - public: - MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output); - const DeconvFwd &GetFwd() const { return *fwd; } - - const DeconvFwdPD &GetPd() const { return *fwd_pd; } - - private: - std::shared_ptr fwd; - std::shared_ptr fwd_pd; -}; // class MKLDNNDeconvForward +MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &out) + : data(data), weight(weight), bias(bias), out(out) {} -MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) - : fwd_pd(GetDeconvFwdImpl(param, data, weights, bias, output)) { - fwd = std::make_shared(GetPd()); -} +MKLDNNDeconvFwd::Tensors::Tensors(bool no_bias, const std::vector &inputs, + const std::vector &outputs) + : data(inputs[deconv::kData]), + weight(inputs[deconv::kWeight]), + bias(no_bias ? nullptr : &inputs[deconv::kBias]), + out(outputs[deconv::kOut]) {} -MKLDNNDeconvForward &GetDeconvFwd(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) { - using deconv_fwd_map = std::unordered_map; +MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, + const Tensors &tensors) { + using deconv_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local deconv_fwd_map fwds; #else @@ -279,38 +73,48 @@ MKLDNNDeconvForward &GetDeconvFwd(const DeconvolutionParam ¶m, const NDArray // Here we can sign the conv op with NDArray because conv primitive will // decide the right layout for the, so we only need to get the shape and the // data type of the arrays. 
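[Editorial aside - not part of the patch] The cache lookup below works because a primitive only depends on the shapes and data types of its arguments, never on their contents. A minimal, self-contained sketch of that pattern follows; SimpleSignature, SimpleSignatureHash, CachedPrimitive and GetCachedPrimitive are hypothetical stand-ins for the real ParamOpSign/DeconvSignature key, OpHash and the cached deconvolution primitive.

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for ParamOpSign: the key encodes only shapes and dtypes.
struct SimpleSignature {
  std::string repr;
  void AddSign(const std::vector<int64_t>& shape, int dtype) {
    for (int64_t d : shape) repr += std::to_string(d) + 'x';
    repr += ':' + std::to_string(dtype) + ';';
  }
  bool operator==(const SimpleSignature& other) const { return repr == other.repr; }
};

struct SimpleSignatureHash {
  size_t operator()(const SimpleSignature& s) const { return std::hash<std::string>()(s.repr); }
};

struct CachedPrimitive {};  // the primitive and its primitive_desc would live here

CachedPrimitive& GetCachedPrimitive(const SimpleSignature& key) {
  // One cache per thread, mirroring the thread_local maps used in this file.
  static thread_local std::unordered_map<SimpleSignature, CachedPrimitive, SimpleSignatureHash>
      cache;
  auto it = cache.find(key);
  if (it == cache.end()) it = cache.emplace(key, CachedPrimitive{}).first;
  return it->second;
}
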
- key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); - if (bias) key.AddSign(*bias); + key.AddSign(tensors.data); + key.AddSign(tensors.weight); + key.AddSign(tensors.out); + if (tensors.bias) key.AddSign(*tensors.bias); auto it = fwds.find(key); if (it == fwds.end()) { - auto fwd = MKLDNNDeconvForward(param, data, weights, bias, output); + auto fwd = MKLDNNDeconvFwd(param, tensors); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); - const DeconvolutionParam ¶m = nnvm::get(attrs.parsed); - - auto &data = in_data[deconv::kData]; - auto &weight = in_data[deconv::kWeight]; - const NDArray *bias = param.no_bias ? nullptr : &in_data[deconv::kBias]; +std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionParam ¶m, + const Tensors &tensors) { + DeconvDescCreator ddc(param, tensors.data, tensors.weight, tensors.bias, tensors.out); + auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); + + while (true) { + size_t data_size = pd->src_desc().get_size(); + size_t weight_size = pd->weights_desc().get_size(); + size_t out_size = pd->dst_desc().get_size(); + if (ddc.CheckImpl(data_size, weight_size, out_size)) break; + if (pd->next_impl()) continue; + ddc.ImposePlainWherePadding(data_size, weight_size, out_size); + *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + } + return pd; +} - MKLDNNDeconvForward &fwd = GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); +MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors) + : fwd_pd(MakePD(param, tensors)) { + fwd = std::make_shared(*fwd_pd); +} - if (ctx.is_train) { +void MKLDNNDeconvFwd::ControlWeightFormat(uint32_t num_group, bool is_train, + const NDArray &weight) { + if (is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. if (weight.IsMKLDNNData()) - // This asks the engine to change the layout of the weight array after - // it's used. + // This asks the engine to change the layout of the weight array after it's used. weight.Reorder2DefaultAsync(); } else { // For inference, we want to reorder the weight array so we don't need to @@ -318,178 +122,367 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. The // data conversion happens after the weight array is used. - weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); + weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } else { CHECK(weight.GetMKLDNNData()->get_desc() == - IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); + IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } } +} +void MKLDNNDeconvFwd::Execute(uint32_t num_group, const std::vector &req, + const Tensors &tensors) { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. // For that, we would pass input tensor in place of output and output tensor in place of // input (for appropriate convolution primitives: deconvolution forward = convolution backward // data, deconvolution backward data = convolution forward). 
Convolution primitive expects // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight - // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet - // provides such tensor. - + // tensor with shape (input_channels, output_channels, h, w) and MXNet provides such tensor. + // // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), // but this time we don't swap input and output tensors, so o = output_channels, i = // input_channels, so the current weight tensor won't fit (when oihw != iohw). But actually, // underneath deconvolution MKLDNN also uses convolution, so even though it expects the weight // tensor with shape (o, i, h, w), it wants it in iohw format, so it's physical representation // match current weight tensor. + // + // So here we swap logical order of input and output dimensions for weight tensor just for + // MKLDNN operations + IOLogicalSwapMKLDNNMem(tensors.weight, num_group); + { + mkldnn_args_map_t net_args; + auto out_mem = OutMem(req[deconv::kOut], tensors.out); + + net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, tensors.weight)}); + net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); + if (tensors.bias) net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); - // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN - // operations - IOLogicalSwapMKLDNNMem(weight, param.num_group); - - auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().src_desc()); - const mkldnn::memory *weight_mem = - GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); - mkldnn_output_t out_mem = - CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); - mkldnn_args_map_t net_args; - if (bias) { - const mkldnn::memory *bias_mem = in_data[deconv::kBias].GetMKLDNNData(); - net_args.insert({MKLDNN_ARG_BIAS, *bias_mem}); + // CommitOutput Should run after RegisterPrimArgs for memory dependency + MKLDNNStream::Get()->RegisterPrimArgs(*fwd, net_args); + CommitOutput(tensors.out, out_mem); + MKLDNNStream::Get()->Submit(); } + IOLogicalSwapMKLDNNMem(tensors.weight, num_group); // swap back from oihw to iohw +} - net_args.insert({MKLDNN_ARG_SRC, *data_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem}); - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - CommitOutput(out_data[deconv::kOut], out_mem); - MKLDNNStream::Get()->Submit(); - - // swap back from oihw to iohw - IOLogicalSwapMKLDNNMem(weight, param.num_group); -} - -class MKLDNNDeconvBackward { - std::shared_ptr bwd_data_pd_; - std::shared_ptr bwd_weight_pd_; - std::shared_ptr bwd_data_; - std::shared_ptr bwd_weight_; - - public: - MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { - const auto fwd_pd = GetDeconvFwdImpl(param, data, weights, bias, output); - bwd_data_pd_ = GetDeconvBwdDataImpl(param, data, weights, output, *fwd_pd); - bwd_weight_pd_ = GetDeconvBwdWeightsImpl(param, data, weights, bias, output, *fwd_pd); - bwd_data_ = std::make_shared(GetDataPd()); - bwd_weight_ = std::make_shared(GetWeightsPd()); - } +const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const { + return 
data.GetMKLDNNDataReorder(fwd_pd->src_desc()); +} - const DeconvBwdData &GetBwdData() const { return *bwd_data_; } +const mkldnn::memory *MKLDNNDeconvFwd::WeightMem(uint32_t num_group, const NDArray &weight) const { + return GetWeights(weight, fwd_pd->weights_desc(), num_group); +} + +const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { + return bias.GetMKLDNNData(); +} + +mkldnn_output_t MKLDNNDeconvFwd::OutMem(OpReqType req, const NDArray &out) const { + return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); +} + +/*############################### Backward ###############################*/ + +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - const DeconvBwdWeight &GetBwdWeights() const { return *bwd_weight_; } + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); + const auto ¶m = nnvm::get(attrs.parsed); + const auto &rt = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto &wt = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); + MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, rt); - const DeconvBwdDataPD &GetDataPd() const { return *bwd_data_pd_; } + bwd.Execute(param.num_group, req, rt, wt); +} - const DeconvBwdWeightPD &GetWeightsPd() const { return *bwd_weight_pd_; } -}; +MKLDNNDeconvBwd::ReadTensors::ReadTensors(bool no_bias, const std::vector &inputs) + : data(inputs[deconv::kData + 1]), + weight(inputs[deconv::kWeight + 1]), + bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), + out_grad(inputs[deconv::kOut]) {} -typedef ParamOpSign MKLDNNDeconvSignature; +MKLDNNDeconvBwd::WriteTensors::WriteTensors(bool no_bias, const std::vector &outputs) + : data_grad(outputs[deconv::kData]), + weight_grad(outputs[deconv::kWeight]), + bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {} -static inline MKLDNNDeconvBackward &GetDeconvBwd(const DeconvolutionParam ¶m, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { - using mkldnn_deconv_bwd_map = - std::unordered_map; +MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, + const ReadTensors &rt) { + using mkldnn_deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local mkldnn_deconv_bwd_map bwds; #else static MX_THREAD_LOCAL mkldnn_deconv_bwd_map bwds; #endif - MKLDNNDeconvSignature key(param); + DeconvSignature key(param); // Here we can sign the conv op with NDArray because conv primitive will // decide the right layout for the, so we only need to get the shape and the // data type of the arrays. 
- key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); - if (bias) key.AddSign(*bias); + key.AddSign(rt.data); + key.AddSign(rt.weight); + key.AddSign(rt.out_grad); + if (rt.bias) key.AddSign(*rt.bias); auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = MKLDNNDeconvBackward(param, data, weights, bias, output); + auto bwd = MKLDNNDeconvBwd(param, rt); it = AddToCache(&bwds, key, bwd); } return it->second; } -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); - const std::vector &in_grad = outputs; - const DeconvolutionParam ¶m = nnvm::get(attrs.parsed); +std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const DeconvolutionParam ¶m, + const ReadTensors &rt, + const deconv_fwd_pd_t &fwd_pd) { + DeconvDescCreator ddc(param, rt.data, rt.weight, nullptr, rt.out_grad); + auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + + while (true) { + size_t data_size = pd->diff_src_desc().get_size(); + size_t weight_size = pd->weights_desc().get_size(); + size_t out_size = pd->diff_dst_desc().get_size(); + if (ddc.CheckImpl(data_size, weight_size, out_size)) break; + if (pd->next_impl()) continue; + ddc.ImposePlainWherePadding(data_size, weight_size, out_size); + *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + } + return pd; +} - auto &data = inputs[deconv::kData + 1]; - auto &weight = inputs[deconv::kWeight + 1]; - const auto *bias = param.no_bias ? nullptr : &inputs[deconv::kBias + 1]; - auto &out_grad = inputs[deconv::kOut]; +std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( + const DeconvolutionParam ¶m, const ReadTensors &rt, const deconv_fwd_pd_t &fwd_pd) { + DeconvDescCreator ddc(param, rt.data, rt.weight, rt.bias, rt.out_grad); + auto pd = std::make_shared(ddc.MakeBwdWeightDesc(), ddc.engine, fwd_pd); + + while (true) { + size_t data_size = pd->src_desc().get_size(); + size_t weight_size = pd->diff_weights_desc().get_size(); + size_t out_size = pd->diff_dst_desc().get_size(); + if (ddc.CheckImpl(data_size, weight_size, out_size)) break; + if (pd->next_impl()) continue; + ddc.ImposePlainWherePadding(data_size, weight_size, out_size); + *pd = deconv_bwd_weight_pd_t(ddc.MakeBwdWeightDesc(), ddc.engine, fwd_pd); + } + return pd; +} - CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - MKLDNNDeconvBackward &deconvBwd = GetDeconvBwd(param, data, weight, bias, out_grad); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); +MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt) { + const auto fwd_pd = MKLDNNDeconvFwd::MakePD( // TODO: use cached? 
+ param, MKLDNNDeconvFwd::Tensors(rt.data, rt.weight, rt.bias, rt.out_grad)); + bwd_data_pd = MakeDataPD(param, rt, *fwd_pd); + bwd_weight_pd = MakeWeightsPD(param, rt, *fwd_pd); + bwd_data = std::make_shared(*bwd_data_pd); + bwd_weight = std::make_shared(*bwd_weight_pd); +} + +void MKLDNNDeconvBwd::Execute(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt) { + // swaps are explained in MKLDNNDeconvFwd::Execute + IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); + { + auto out_grad_mem = ScheduleBwdData(num_group, req, rt, wt); + ScheduleBwdWeight(num_group, req, rt, wt, out_grad_mem); + MKLDNNStream::Get()->Submit(); + } + IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); +} + +void MKLDNNDeconvBwd::IOSwapWeightTensors(uint32_t num_group, const std::vector &req, + const NDArray &weight, const NDArray &weight_grad) { + if (req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, num_group); + if (req[deconv::kWeight] || req[deconv::kBias]) IOLogicalSwapMKLDNNMem(weight_grad, num_group); +} + +const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(uint32_t num_group, + const std::vector &req, + const ReadTensors &rt, + const WriteTensors &wt) { if (req[deconv::kData]) { - // swap is explained in MKLDNNDeconvolutionForward - IOLogicalSwapMKLDNNMem(weight, param.num_group); - auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), param.num_group); - auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], - deconvBwd.GetDataPd().diff_src_desc(), req[deconv::kData]); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdData(), net_args); - CommitOutput(in_grad[deconv::kData], in_grad_mem); + mkldnn_args_map_t net_args; + auto out_grad_mem = OutGradMem(rt.out_grad); + auto data_grad_mem = DataGradMem(req[deconv::kData], wt.data_grad); + + net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, rt.weight)}); + net_args.insert({MKLDNN_ARG_DIFF_SRC, *data_grad_mem.second}); + + // CommitOutput Should run after RegisterPrimArgs for memory dependency + MKLDNNStream::Get()->RegisterPrimArgs(*bwd_data, net_args); + CommitOutput(wt.data_grad, data_grad_mem); + return out_grad_mem; // try reuse it in ScheduleBwdWeight } + return nullptr; +} + +void MKLDNNDeconvBwd::ScheduleBwdWeight(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt, + const mkldnn::memory *out_grad_mem) { if (req[deconv::kWeight] || req[deconv::kBias]) { - if (deconvBwd.GetDataPd().diff_dst_desc() != deconvBwd.GetWeightsPd().diff_dst_desc()) - out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); - auto data_mem = data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); - mkldnn_output_t in_grad_weight; - const mkldnn::memory::desc &wei_md = deconvBwd.GetWeightsPd().diff_weights_desc(); - // swaps are explained in MKLDNNDeconvolutionForward - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of logical swap) We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is - // always in default format), so here we check if after a swap, wei_md will have a default - // format - if (req[deconv::kWeight] == OpReqType::kWriteTo && - IsDefaultFormat(IOLogicalSwapDesc(wei_md, param.num_group))) { - 
in_grad_weight = {OutDataOp::Noop, - const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData(wei_md)}; - } else { - IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); - in_grad_weight = - CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], wei_md, req[deconv::kWeight]); - } + mkldnn_args_map_t net_args; + auto weight_grad_mem = WeightGradMem(num_group, req[deconv::kWeight], wt.weight_grad); + auto bias_grad_mem = BiasGradMem(req[deconv::kBias], wt.bias_grad); + + net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(rt.out_grad, out_grad_mem)}); + net_args.insert({MKLDNN_ARG_SRC, *DataMem(rt.data)}); + net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weight_grad_mem.second}); + if (bias_grad_mem.second) net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; - mkldnn_output_t in_grad_bias; - if (!param.no_bias) { - in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], - deconvBwd.GetWeightsPd().diff_bias_desc(), req[deconv::kBias]); - net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second}); - } - MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdWeights(), net_args); - CommitOutput(in_grad[deconv::kWeight], in_grad_weight); // CommitOutput Should run after RegisterPrimArgs for memory dependency - if (!param.no_bias) CommitOutput(in_grad[deconv::kBias], in_grad_bias); + MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weight, net_args); + CommitOutput(wt.weight_grad, weight_grad_mem); + if (bias_grad_mem.second) CommitOutput(*wt.bias_grad, bias_grad_mem); } - MKLDNNStream::Get()->Submit(); +} - // swap back from oihw to iohw - if (req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, param.num_group); - if (req[deconv::kWeight] || req[deconv::kBias]) - IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); +const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { + return data.GetMKLDNNDataReorder(bwd_weight_pd->src_desc()); +} + +const mkldnn::memory *MKLDNNDeconvBwd::WeightMem(uint32_t num_group, const NDArray &weight) const { + return GetWeights(weight, bwd_data_pd->weights_desc(), num_group); +} + +const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { + return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); +} + +const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad, + const mkldnn::memory *out_grad_mem) const { + if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weight_pd->diff_dst_desc()) + return out_grad.GetMKLDNNDataReorder(bwd_weight_pd->diff_dst_desc()); + return out_grad_mem; +} + +mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(OpReqType req, const NDArray &data_grad) const { + return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); +} + +mkldnn_output_t MKLDNNDeconvBwd::WeightGradMem(uint32_t num_group, OpReqType req, + const NDArray &weight_grad) const { + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of the logical swap - explained in MKLDNNDeconvFwd::Execute). 
We try to reuse weight_grad + // memory (which, when not swapped, is always in default format), so here we check if after a + // swap, wei_md will have a default format + const auto &wei_md = bwd_weight_pd->diff_weights_desc(); + if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) + return {OutDataOp::Noop, const_cast(weight_grad).CreateMKLDNNData(wei_md)}; + return CreateMKLDNNWeightGrad(weight_grad, wei_md, req); +} + +mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(OpReqType req, const NDArray *bias) const { + return bias ? CreateMKLDNNMem(*bias, bwd_weight_pd->diff_bias_desc(), req) + : mkldnn_output_t(OutDataOp::Noop, nullptr); +} + +/*############################### DeconvDescCreator ###############################*/ + +DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weight, const NDArray *bias, const NDArray &out) + : data_md(GetMemDesc(data)), + weight_md(GetDeconvWeightDesc(weight, param.num_group)), + bias_md(bias ? GetMemDesc(*bias) : mkldnn::memory::desc()), + out_md(GetMemDesc(out)), + strides(param.stride.ndim()), + padding(param.pad.ndim()), + dilates(param.dilate.ndim()), + engine(CpuEngine::Get()->get_engine()) { + // assuming only deconv2D is supported for now + CHECK(param.stride.ndim() == param.pad.ndim() && param.stride.ndim() == param.dilate.ndim()); + CHECK(param.stride.ndim() == 2); + for (int i = 0; i < param.stride.ndim(); ++i) { + strides[i] = param.stride[i]; + padding[i] = param.pad[i]; + dilates[i] = param.dilate[i] - 1; + } +} + +void DeconvDescCreator::ImposePlainWherePadding(size_t data_size, size_t weight_size, + size_t out_size) { + if (data_size != GetMemDescSize(data_md)) { + CHECK(data_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + data_md = GetDesc(data_md, GetDefaultFormat(data_md)); + } else if (out_size != GetMemDescSize(out_md)) { + CHECK(out_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + out_md = GetDesc(out_md, GetDefaultFormat(out_md)); + } else if (weight_size != GetMemDescSize(weight_md)) { + CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + int num_groups = (weight_md.data.ndims > data_md.data.ndims) ? weight_md.data.dims[0] : 1; + weight_md = IOLogicalSwapDesc(weight_md, num_groups); + weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); + } +} + +bool DeconvDescCreator::CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const { + // MKLDNN introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. 
Currently, MKLDNN operators + // still reuse memory from memory planning, so here we need to accept only a + // kernel that has the expected memory size requirements (which is suboptimal) + return (data_size == GetMemDescSize(data_md) && weight_size == GetMemDescSize(weight_md) && + out_size == GetMemDescSize(out_md)); +} + +deconv_fwd_t::desc DeconvDescCreator::MakeFwdDesc() const { + // TODO: check if forward_training should be constant + return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, + out_md, strides, dilates, padding, padding); +} + +deconv_bwd_t::desc DeconvDescCreator::MakeBwdDataDesc() const { + return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, + strides, dilates, padding, padding); +} + +deconv_bwd_weight_t::desc DeconvDescCreator::MakeBwdWeightDesc() const { + return deconv_bwd_weight_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, + bias_md, out_md, strides, dilates, padding, padding); +} + +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). +mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { + auto &d = desc.data; + int offset = int(num_groups > 1); + int dim0 = offset + 0; + int dim1 = offset + 1; + std::swap(d.dims[dim0], d.dims[dim1]); + std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); + if (d.format_kind != dnnl_format_kind_any) { + std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); + // as padding is not supported, these are always zeros? + std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); + // for blocked format: change indices + for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { + auto &val = d.format_desc.blocking.inner_idxs[i]; + if (val == dim0) { + val = dim1; + } else if (val == dim1) { + val = dim0; + } + } + } + return desc; +} + +// Applies IOLogicalSwapDesc to MKLDNN memory of arr +void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { + mkldnn::memory::desc desc; + if (arr.IsMKLDNNData()) { + desc = arr.GetMKLDNNData()->get_desc(); + } else { + // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use + // descriptor from GetWeightDesc but with default format + const auto &temp = GetWeightDesc(arr, num_groups); + desc = mkldnn::memory::desc( + temp.dims(), temp.data_type(), + static_cast(GetDefaultFormat(temp.data.ndims))); + } + const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); } } // namespace op From 1c8987bb938314f480b4a1102c0abfd70cf1bb53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Mon, 15 Mar 2021 15:09:35 +0100 Subject: [PATCH 4/9] Refactor deconvolution version 2 and use permute_axes in IOLogicalSwapDesc --- .../nn/mkldnn/mkldnn_deconvolution-inl.h | 153 +++--- .../nn/mkldnn/mkldnn_deconvolution.cc | 452 +++++++++--------- 2 files changed, 329 insertions(+), 276 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index f2638013ac3d..404cacc500c5 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -20,10 +20,21 @@ /*! 
* \file mkldnn_deconvolution-inl.h * \brief - * \Author: Paweł Głomski, pawel.glomski@intel.com + * ________ + * Data---->|Deconv| + * Weight-->| FWD |--->out + * Bias---->|______| + * ________ + * Data_grad<----|Deconv|<---out_grad + * Weight_grad<--| BWD |<---data + * Bias_grad<----| |<---Weight + * |______|<---Bias + * + * "out" in this (and .cc) file will always refer to the output of Deconv FWD and + * "out_grad" to its gradient */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ #if MXNET_USE_MKLDNN == 1 #include "../deconvolution-inl.h" @@ -39,18 +50,20 @@ using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; using deconv_bwd_t = mkldnn::deconvolution_backward_data; using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; -using deconv_bwd_weight_t = mkldnn::deconvolution_backward_weights; -using deconv_bwd_weight_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; +using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights; +using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; class MKLDNNDeconvFwd { public: struct Tensors { - Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, const NDArray &out); - Tensors(bool no_bias, const std::vector &inputs, const std::vector &outputs); + Tensors(const NDArray &data, const NDArray &weights, const NDArray *const bias, + const NDArray &out); + Tensors(const bool no_bias, const std::vector &inputs, + const std::vector &outputs); const NDArray &data; - const NDArray &weight; - const NDArray *bias; + const NDArray &weights; + const NDArray *const bias; const NDArray &out; }; @@ -59,15 +72,15 @@ class MKLDNNDeconvFwd { const Tensors &tensors); MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors); - void ControlWeightFormat(uint32_t num_group, bool is_train, const NDArray &weight); - void Execute(uint32_t num_group, const std::vector &req, const Tensors &tensors); + void ControlWeightsFormat(const uint32_t num_group, const bool is_train, const NDArray &weights); + void Execute(const uint32_t num_group, const std::vector &req, const Tensors &tensors); private: const mkldnn::memory *DataMem(const NDArray &data) const; - const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; + const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const; const mkldnn::memory *BiasMem(const NDArray &bias) const; - mkldnn_output_t OutMem(OpReqType req, const NDArray &out) const; + mkldnn_output_t OutMem(const OpReqType req, const NDArray &out) const; std::shared_ptr fwd; std::shared_ptr fwd_pd; @@ -76,75 +89,89 @@ class MKLDNNDeconvFwd { class MKLDNNDeconvBwd { public: struct ReadTensors { - ReadTensors(bool no_bias, const std::vector &inputs); + ReadTensors(const bool no_bias, const std::vector &inputs); const NDArray &data; - const NDArray &weight; - const NDArray *bias; + const NDArray &weights; + const NDArray *const bias; const NDArray &out_grad; }; struct WriteTensors { - WriteTensors(bool no_bias, const std::vector &outputs); + WriteTensors(const bool no_bias, const std::vector &outputs); const NDArray &data_grad; - const NDArray &weight_grad; - const NDArray *bias_grad; + const NDArray &weights_grad; + const NDArray *const bias_grad; }; - static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, 
const ReadTensors &rt); + static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, + const ReadTensors &read_tensors); static std::shared_ptr MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &rt, + const ReadTensors &read_tensors, const deconv_fwd_pd_t &fwd_pd); - static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, - const ReadTensors &rt, - const deconv_fwd_pd_t &fwd_pd); + static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, + const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd); - MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt); - void Execute(uint32_t num_group, const std::vector &req, const ReadTensors &rt, - const WriteTensors &wt); + MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors); + void Execute(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, const WriteTensors &write_tensors); private: - void IOSwapWeightTensors(uint32_t num_group, const std::vector &req, - const NDArray &weight, const NDArray &weight_grad); + void IOSwapWeightsTensors(const uint32_t num_group, const std::vector &req, + const NDArray &weights, const NDArray &weights_grad); - const mkldnn::memory *ScheduleBwdData(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt); + // returns the output gradient memory used to calculate the data (input) gradient, which + // might be reused when calculating the gradient of weights + const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, + const WriteTensors &write_tensors); - void ScheduleBwdWeight(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt, - const mkldnn::memory *out_grad_mem); + void ScheduleBwdWeights(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, const WriteTensors &write_tensors, + const mkldnn::memory *const out_grad_mem); const mkldnn::memory *DataMem(const NDArray &data) const; - const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; - const mkldnn::memory *OutGradMem(const NDArray &out_grad) const; // for bwd data - const mkldnn::memory *OutGradMem(const NDArray &out_grad, // for bwd weight - const mkldnn::memory *out_grad_mem) const; + const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const; + + // for calculating the gradient of data (input) + const mkldnn::memory *OutGradMem(const NDArray &out_grad) const; + // for calculating the gradient of weights + const mkldnn::memory *OutGradMem(const NDArray &out_grad, + const mkldnn::memory *const out_grad_mem) const; - mkldnn_output_t DataGradMem(OpReqType req, const NDArray &data_grad) const; - mkldnn_output_t WeightGradMem(uint32_t num_group, OpReqType req, - const NDArray &weight_grad) const; - mkldnn_output_t BiasGradMem(OpReqType req, const NDArray *bias) const; + mkldnn_output_t DataGradMem(const OpReqType req, const NDArray &data_grad) const; + mkldnn_output_t WeightsGradMem(const uint32_t num_group, const OpReqType req, + const NDArray &weights_grad) const; + mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray *const bias) const; std::shared_ptr bwd_data_pd; - std::shared_ptr bwd_weight_pd; + std::shared_ptr bwd_weights_pd; std::shared_ptr bwd_data; - std::shared_ptr bwd_weight; -}; // namespace op + std::shared_ptr bwd_weights; +}; +// Utility class for creating operation descriptors of deconvolution primitives 
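[Editorial aside - not part of the patch] The CheckImplSizeReq / ImposePlainWherePadding members of the class declared below guard against oneDNN's padded (blocked) formats, whose memory requirement can exceed the dense tensor size that MXNet's memory planning has already allocated. A minimal sketch of that size mismatch, assuming the mkldnn v1.x compatibility header:

#include <mkldnn.hpp>

void PaddedFormatSizeExample() {
  using mkldnn::memory;
  // Plain nchw: 1 * 3 * 224 * 224 floats.
  memory::desc plain_md({1, 3, 224, 224}, memory::data_type::f32, memory::format_tag::nchw);
  // Blocked nChw16c: the channel dimension is padded from 3 up to the block size of 16.
  memory::desc blocked_md({1, 3, 224, 224}, memory::data_type::f32, memory::format_tag::nChw16c);
  // plain_md.get_size()   == 1 *  3 * 224 * 224 * 4 ==   602112 bytes
  // blocked_md.get_size() == 1 * 16 * 224 * 224 * 4 == 3211264 bytes
}
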
struct DeconvDescCreator { - DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weight, - const NDArray *bias, const NDArray &out); - - // Imposes plain formats on memory descriptors with padding - // Changing only one at a time, so maybe better implementations will be selected - // (than entirely plain one) - void ImposePlainWherePadding(size_t data_size, size_t weight_size, size_t out_size); - bool CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const; + DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *const bias, const NDArray &out); + + // Imposes plain formats on memory descriptors with padding (so the next selected implementation + // will pass CheckImplSizeReq). After calling this method, new primitive descriptor (with new + // operator descriptor) should be created, which should select an implementation with matching + // size requirements. + // data_size, weights_size, out_size - size requirements of current implementation + // Returns whether successfully imposed a plain format on any of the data, weights, and output + // memory descriptors. + bool ImposePlainWherePadding(const size_t data_size, const size_t weights_size, + const size_t out_size); + bool CheckImplSizeReq(const size_t data_size, const size_t weights_size, + const size_t out_size) const; deconv_fwd_t::desc MakeFwdDesc() const; deconv_bwd_t::desc MakeBwdDataDesc() const; - deconv_bwd_weight_t::desc MakeBwdWeightDesc() const; + deconv_bwd_weights_t::desc MakeBwdWeightsDesc() const; mkldnn::memory::desc data_md; - mkldnn::memory::desc weight_md; + mkldnn::memory::desc weights_md; mkldnn::memory::desc bias_md; mkldnn::memory::desc out_md; @@ -155,15 +182,19 @@ struct DeconvDescCreator { mkldnn::engine &engine; }; -mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups); -void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups); +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). 
+mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups); + +// Applies IOLogicalSwapDesc to MKLDNN memory of arr +void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups); -// Version of GetWeightDesc for deconvolution (with swap) -static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weight, int num_groups) { - return IOLogicalSwapDesc(GetWeightDesc(weight, num_groups), num_groups); +// Version of GetWeightsDesc for deconvolution (with swap) +inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const int num_groups) { + return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); } } // namespace op } // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index b5e7e4166284..699318d9beec 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -30,12 +30,11 @@ namespace mxnet { namespace op { bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { - if (params.kernel.ndim() != 2) return false; - return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && - input.shape().ndim() == 4; + return params.kernel.ndim() == 2 && input.shape().ndim() == 4 && + (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16); } -/*############################### Forward ###############################*/ +// Forward void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -46,18 +45,18 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c const auto &tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); MKLDNNDeconvFwd &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); - fwd.ControlWeightFormat(param.num_group, ctx.is_train, tensors.weight); + fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights); fwd.Execute(param.num_group, req, tensors); } -MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, - const NDArray &out) - : data(data), weight(weight), bias(bias), out(out) {} +MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights, + const NDArray *const bias, const NDArray &out) + : data(data), weights(weights), bias(bias), out(out) {} -MKLDNNDeconvFwd::Tensors::Tensors(bool no_bias, const std::vector &inputs, +MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector &inputs, const std::vector &outputs) : data(inputs[deconv::kData]), - weight(inputs[deconv::kWeight]), + weights(inputs[deconv::kWeight]), bias(no_bias ? nullptr : &inputs[deconv::kBias]), out(outputs[deconv::kOut]) {} @@ -70,17 +69,16 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, static MX_THREAD_LOCAL deconv_fwd_map fwds; #endif DeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. 
key.AddSign(tensors.data); - key.AddSign(tensors.weight); + key.AddSign(tensors.weights); key.AddSign(tensors.out); - if (tensors.bias) key.AddSign(*tensors.bias); + if (tensors.bias) { + key.AddSign(*tensors.bias); + } auto it = fwds.find(key); if (it == fwds.end()) { - auto fwd = MKLDNNDeconvFwd(param, tensors); + const MKLDNNDeconvFwd fwd(param, tensors); it = AddToCache(&fwds, key, fwd); } return it->second; @@ -88,17 +86,20 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionParam ¶m, const Tensors &tensors) { - DeconvDescCreator ddc(param, tensors.data, tensors.weight, tensors.bias, tensors.out); - auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); - - while (true) { - size_t data_size = pd->src_desc().get_size(); - size_t weight_size = pd->weights_desc().get_size(); - size_t out_size = pd->dst_desc().get_size(); - if (ddc.CheckImpl(data_size, weight_size, out_size)) break; - if (pd->next_impl()) continue; - ddc.ImposePlainWherePadding(data_size, weight_size, out_size); - *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out); + const auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); + const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; + const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; + const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); }; + + while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) { + if (!pd->next_impl()) { + // ImposePlainWherePadding fails when all memory descriptors already have plain formats + // imposed, meaning there is no implementation with plain formats + CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) + << "No implementation of deconvolution forward propagation"; + *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + } } return pd; } @@ -108,243 +109,273 @@ MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors fwd = std::make_shared(*fwd_pd); } -void MKLDNNDeconvFwd::ControlWeightFormat(uint32_t num_group, bool is_train, - const NDArray &weight) { +void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool is_train, + const NDArray &weights) { if (is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. - if (weight.IsMKLDNNData()) - // This asks the engine to change the layout of the weight array after it's used. - weight.Reorder2DefaultAsync(); + if (weights.IsMKLDNNData()) { + // This asks the engine to change the layout of the weights array after it's used. + weights.Reorder2DefaultAsync(); + } } else { - // For inference, we want to reorder the weight array so we don't need to + // For inference, we want to reorder the weights array so we don't need to // reorder data every time. - if (weight.IsDefaultData()) { - // We also need to modify the layout on the original weight array. The - // data conversion happens after the weight array is used. - weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); + if (weights.IsDefaultData()) { + // We also need to modify the layout on the original weights array. + // The data conversion happens after the weights array is used. 
+ weights.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } else { - CHECK(weight.GetMKLDNNData()->get_desc() == + CHECK(weights.GetMKLDNNData()->get_desc() == IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } } } -void MKLDNNDeconvFwd::Execute(uint32_t num_group, const std::vector &req, +void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const std::vector &req, const Tensors &tensors) { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. - // For that, we would pass input tensor in place of output and output tensor in place of - // input (for appropriate convolution primitives: deconvolution forward = convolution backward - // data, deconvolution backward data = convolution forward). Convolution primitive expects - // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: - // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight - // tensor with shape (input_channels, output_channels, h, w) and MXNet provides such tensor. + // For that, we would pass input tensor in place of output and output tensor in place of input + // (for appropriate convolution primitives: deconvolution forward = convolution backward data, + // deconvolution backward data = convolution forward). + // The convolution primitive expects weights tensor with the shape of + // (primitive_out_channels, primitive_in_channels, h, w), but with swapped input and output: + // primitive_out_channels = deconv_in_channels, primitive_in_channels = deconv_out_channels, + // so it becomes (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such tensor. // - // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), - // but this time we don't swap input and output tensors, so o = output_channels, i = - // input_channels, so the current weight tensor won't fit (when oihw != iohw). But actually, - // underneath deconvolution MKLDNN also uses convolution, so even though it expects the weight - // tensor with shape (o, i, h, w), it wants it in iohw format, so it's physical representation - // match current weight tensor. + // MKLDNN deconvolution primitive also (as convolution) expects weights tensor with the shape of + // (primitive_out_channels, primitive_in_channels, h, w), but this time we don't swap input and + // output tensors, so: + // primitive_out_channels = deconv_out_channels, primitive_in_channels = deconv_in_channels, + // thus the current weights tensor won't fit (when deconv_out_channels != deconv_in_channels). + // However, underneath deconvolution MKLDNN also uses convolution, so even though it expects the + // weights tensor with the logical order of oihw, it wants its physical representation to + // match the order of iohw, which is the same as current weights tensor. // - // So here we swap logical order of input and output dimensions for weight tensor just for - // MKLDNN operations - IOLogicalSwapMKLDNNMem(tensors.weight, num_group); + // So here we swap logical order of input and output dimensions for weights tensor just for + // MKLDNN operations. 
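[Editorial aside - not part of the patch] A concrete sketch of the logical swap described in the comment above, using made-up shapes (in_channels = 3, out_channels = 8, 5x5 kernel, no groups); per this commit's title, IOLogicalSwapDesc expresses the same swap through mkldnn::memory::desc::permute_axes.

#include <mkldnn.hpp>

void IOLogicalSwapExample() {
  using mkldnn::memory;
  // Weights as MXNet provides them for deconvolution: (in_channels, out_channels, h, w)
  // in the plain row-major layout (which oneDNN labels oihw for 4D weights).
  memory::desc mxnet_wei_md({3, 8, 5, 5}, memory::data_type::f32, memory::format_tag::oihw);
  // Swapping the first two logical axes (what IOLogicalSwapDesc does for num_groups == 1)
  // gives dims (8, 3, 5, 5) while the strides keep addressing the very same buffer:
  // logically oihw, as the deconvolution primitive expects, physically still iohw.
  memory::desc swapped_md = mxnet_wei_md.permute_axes({1, 0, 2, 3});
  (void)swapped_md;
}
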
+ IOLogicalSwapMKLDNNMem(tensors.weights, num_group); { mkldnn_args_map_t net_args; - auto out_mem = OutMem(req[deconv::kOut], tensors.out); + const auto &out_mem = OutMem(req[deconv::kOut], tensors.out); net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, tensors.weight)}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)}); net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - if (tensors.bias) net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); + if (tensors.bias) { + net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); + } - // CommitOutput Should run after RegisterPrimArgs for memory dependency + // CommitOutput should run after RegisterPrimArgs for memory dependency MKLDNNStream::Get()->RegisterPrimArgs(*fwd, net_args); CommitOutput(tensors.out, out_mem); MKLDNNStream::Get()->Submit(); } - IOLogicalSwapMKLDNNMem(tensors.weight, num_group); // swap back from oihw to iohw + IOLogicalSwapMKLDNNMem(tensors.weights, num_group); // swap back from oihw to iohw } const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const { return data.GetMKLDNNDataReorder(fwd_pd->src_desc()); } -const mkldnn::memory *MKLDNNDeconvFwd::WeightMem(uint32_t num_group, const NDArray &weight) const { - return GetWeights(weight, fwd_pd->weights_desc(), num_group); +const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, fwd_pd->weights_desc(), num_group); } const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { return bias.GetMKLDNNData(); } -mkldnn_output_t MKLDNNDeconvFwd::OutMem(OpReqType req, const NDArray &out) const { +mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const { return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); } -/*############################### Backward ###############################*/ +// Backward void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "Cannot write weights inplace"; TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto ¶m = nnvm::get(attrs.parsed); - const auto &rt = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); - const auto &wt = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); - MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, rt); + const auto &read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto &write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); + MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors); - bwd.Execute(param.num_group, req, rt, wt); + bwd.Execute(param.num_group, req, read_tensors, write_tensors); } -MKLDNNDeconvBwd::ReadTensors::ReadTensors(bool no_bias, const std::vector &inputs) +MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector &inputs) : data(inputs[deconv::kData + 1]), - weight(inputs[deconv::kWeight + 1]), + weights(inputs[deconv::kWeight + 1]), bias(no_bias ? 
nullptr : &inputs[deconv::kBias + 1]), out_grad(inputs[deconv::kOut]) {} -MKLDNNDeconvBwd::WriteTensors::WriteTensors(bool no_bias, const std::vector &outputs) +MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector &outputs) : data_grad(outputs[deconv::kData]), - weight_grad(outputs[deconv::kWeight]), + weights_grad(outputs[deconv::kWeight]), bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {} MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, - const ReadTensors &rt) { - using mkldnn_deconv_bwd_map = std::unordered_map; + const ReadTensors &read_tensors) { + using deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local mkldnn_deconv_bwd_map bwds; + static thread_local deconv_bwd_map bwds; #else - static MX_THREAD_LOCAL mkldnn_deconv_bwd_map bwds; + static MX_THREAD_LOCAL deconv_bwd_map bwds; #endif DeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. - key.AddSign(rt.data); - key.AddSign(rt.weight); - key.AddSign(rt.out_grad); - if (rt.bias) key.AddSign(*rt.bias); + key.AddSign(read_tensors.data); + key.AddSign(read_tensors.weights); + key.AddSign(read_tensors.out_grad); + if (read_tensors.bias) { + key.AddSign(*read_tensors.bias); + } auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = MKLDNNDeconvBwd(param, rt); + const MKLDNNDeconvBwd bwd(param, read_tensors); it = AddToCache(&bwds, key, bwd); } return it->second; } std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &rt, + const ReadTensors &read_tensors, const deconv_fwd_pd_t &fwd_pd) { - DeconvDescCreator ddc(param, rt.data, rt.weight, nullptr, rt.out_grad); - auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); - - while (true) { - size_t data_size = pd->diff_src_desc().get_size(); - size_t weight_size = pd->weights_desc().get_size(); - size_t out_size = pd->diff_dst_desc().get_size(); - if (ddc.CheckImpl(data_size, weight_size, out_size)) break; - if (pd->next_impl()) continue; - ddc.ImposePlainWherePadding(data_size, weight_size, out_size); - *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, nullptr, + read_tensors.out_grad); + const auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); }; + const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; + const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; + + while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) { + if (!pd->next_impl()) { + // ImposePlainWherePadding fails when all memory descriptors already have plain formats + // imposed, meaning there is no implementation with plain formats + CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) + << "No implementation of deconvolution backward propagation"; + *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + } } return pd; } -std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( - const DeconvolutionParam ¶m, const ReadTensors &rt, const deconv_fwd_pd_t &fwd_pd) { - DeconvDescCreator ddc(param, rt.data, rt.weight, rt.bias, rt.out_grad); - auto pd = std::make_shared(ddc.MakeBwdWeightDesc(), ddc.engine, 
fwd_pd); - - while (true) { - size_t data_size = pd->src_desc().get_size(); - size_t weight_size = pd->diff_weights_desc().get_size(); - size_t out_size = pd->diff_dst_desc().get_size(); - if (ddc.CheckImpl(data_size, weight_size, out_size)) break; - if (pd->next_impl()) continue; - ddc.ImposePlainWherePadding(data_size, weight_size, out_size); - *pd = deconv_bwd_weight_pd_t(ddc.MakeBwdWeightDesc(), ddc.engine, fwd_pd); +std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd) { + DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, read_tensors.bias, + read_tensors.out_grad); + const auto pd = + std::make_shared(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; + const auto get_weights_size = [&pd]() { return pd->diff_weights_desc().get_size(); }; + const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; + + while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) { + if (!pd->next_impl()) { + // ImposePlainWherePadding fails when all memory descriptors already have plain formats + // imposed, meaning there is no implementation with plain formats + CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) + << "No implementation of calculating deconvolution weights gradient"; + *pd = deconv_bwd_weights_pd_t(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + } } return pd; } -MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt) { - const auto fwd_pd = MKLDNNDeconvFwd::MakePD( // TODO: use cached? - param, MKLDNNDeconvFwd::Tensors(rt.data, rt.weight, rt.bias, rt.out_grad)); - bwd_data_pd = MakeDataPD(param, rt, *fwd_pd); - bwd_weight_pd = MakeWeightsPD(param, rt, *fwd_pd); +MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { + const auto &fwd_pd = MKLDNNDeconvFwd::MakePD( + param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias, + read_tensors.out_grad)); + bwd_data_pd = MakeDataPD(param, read_tensors, *fwd_pd); + bwd_weights_pd = MakeWeightsPD(param, read_tensors, *fwd_pd); bwd_data = std::make_shared(*bwd_data_pd); - bwd_weight = std::make_shared(*bwd_weight_pd); + bwd_weights = std::make_shared(*bwd_weights_pd); } -void MKLDNNDeconvBwd::Execute(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt) { +void MKLDNNDeconvBwd::Execute(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, const WriteTensors &write_tensors) { // swaps are explained in MKLDNNDeconvFwd::Execute - IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); + IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); { - auto out_grad_mem = ScheduleBwdData(num_group, req, rt, wt); - ScheduleBwdWeight(num_group, req, rt, wt, out_grad_mem); + auto *const out_grad_mem = ScheduleBwdData(num_group, req, read_tensors, write_tensors); + ScheduleBwdWeights(num_group, req, read_tensors, write_tensors, out_grad_mem); MKLDNNStream::Get()->Submit(); } - IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); + IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); } -void MKLDNNDeconvBwd::IOSwapWeightTensors(uint32_t num_group, const std::vector &req, - const NDArray &weight, const NDArray &weight_grad) { - if 
(req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, num_group); - if (req[deconv::kWeight] || req[deconv::kBias]) IOLogicalSwapMKLDNNMem(weight_grad, num_group); +void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, + const std::vector &req, + const NDArray &weights, const NDArray &weights_grad) { + if (req[deconv::kData]) { + IOLogicalSwapMKLDNNMem(weights, num_group); + } + if (req[deconv::kWeight] || req[deconv::kBias]) { + IOLogicalSwapMKLDNNMem(weights_grad, num_group); + } } -const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(uint32_t num_group, +const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(const uint32_t num_group, const std::vector &req, - const ReadTensors &rt, - const WriteTensors &wt) { + const ReadTensors &read_tensors, + const WriteTensors &write_tensors) { if (req[deconv::kData]) { mkldnn_args_map_t net_args; - auto out_grad_mem = OutGradMem(rt.out_grad); - auto data_grad_mem = DataGradMem(req[deconv::kData], wt.data_grad); + auto *const out_grad_mem = OutGradMem(read_tensors.out_grad); + const auto &data_grad_mem = DataGradMem(req[deconv::kData], write_tensors.data_grad); net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, rt.weight)}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)}); net_args.insert({MKLDNN_ARG_DIFF_SRC, *data_grad_mem.second}); - // CommitOutput Should run after RegisterPrimArgs for memory dependency + // CommitOutput should run after RegisterPrimArgs for memory dependency MKLDNNStream::Get()->RegisterPrimArgs(*bwd_data, net_args); - CommitOutput(wt.data_grad, data_grad_mem); - return out_grad_mem; // try reuse it in ScheduleBwdWeight + CommitOutput(write_tensors.data_grad, data_grad_mem); + return out_grad_mem; } return nullptr; } -void MKLDNNDeconvBwd::ScheduleBwdWeight(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt, - const mkldnn::memory *out_grad_mem) { +void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, + const std::vector &req, + const ReadTensors &read_tensors, + const WriteTensors &write_tensors, + const mkldnn::memory *const out_grad_mem) { if (req[deconv::kWeight] || req[deconv::kBias]) { mkldnn_args_map_t net_args; - auto weight_grad_mem = WeightGradMem(num_group, req[deconv::kWeight], wt.weight_grad); - auto bias_grad_mem = BiasGradMem(req[deconv::kBias], wt.bias_grad); - - net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(rt.out_grad, out_grad_mem)}); - net_args.insert({MKLDNN_ARG_SRC, *DataMem(rt.data)}); - net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weight_grad_mem.second}); - if (bias_grad_mem.second) net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); - - // CommitOutput Should run after RegisterPrimArgs for memory dependency - MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weight, net_args); - CommitOutput(wt.weight_grad, weight_grad_mem); - if (bias_grad_mem.second) CommitOutput(*wt.bias_grad, bias_grad_mem); + const auto &weights_grad_mem = + WeightsGradMem(num_group, req[deconv::kWeight], write_tensors.weights_grad); + const auto &bias_grad_mem = BiasGradMem(req[deconv::kBias], write_tensors.bias_grad); + + net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)}); + net_args.insert({MKLDNN_ARG_SRC, *DataMem(read_tensors.data)}); + net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weights_grad_mem.second}); + if (bias_grad_mem.second) { + net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); + } + + // 
CommitOutput should run after RegisterPrimArgs for memory dependency + MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weights, net_args); + CommitOutput(write_tensors.weights_grad, weights_grad_mem); + if (bias_grad_mem.second) { + CommitOutput(*write_tensors.bias_grad, bias_grad_mem); + } } } const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { - return data.GetMKLDNNDataReorder(bwd_weight_pd->src_desc()); + return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); } -const mkldnn::memory *MKLDNNDeconvBwd::WeightMem(uint32_t num_group, const NDArray &weight) const { - return GetWeights(weight, bwd_data_pd->weights_desc(), num_group); +const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); } const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { @@ -352,39 +383,42 @@ const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const } const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad, - const mkldnn::memory *out_grad_mem) const { - if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weight_pd->diff_dst_desc()) - return out_grad.GetMKLDNNDataReorder(bwd_weight_pd->diff_dst_desc()); + const mkldnn::memory *const out_grad_mem) const { + if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weights_pd->diff_dst_desc()) { + return out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); + } return out_grad_mem; } -mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(OpReqType req, const NDArray &data_grad) const { +mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, const NDArray &data_grad) const { return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); } -mkldnn_output_t MKLDNNDeconvBwd::WeightGradMem(uint32_t num_group, OpReqType req, - const NDArray &weight_grad) const { +mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, const OpReqType req, + const NDArray &weights_grad) const { // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weight_grad + // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad // memory (which, when not swapped, is always in default format), so here we check if after a // swap, wei_md will have a default format - const auto &wei_md = bwd_weight_pd->diff_weights_desc(); - if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) - return {OutDataOp::Noop, const_cast(weight_grad).CreateMKLDNNData(wei_md)}; - return CreateMKLDNNWeightGrad(weight_grad, wei_md, req); + const auto &wei_md = bwd_weights_pd->diff_weights_desc(); + if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) { + return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(wei_md)}; + } + return CreateMKLDNNWeightGrad(weights_grad, wei_md, req); } -mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(OpReqType req, const NDArray *bias) const { - return bias ? CreateMKLDNNMem(*bias, bwd_weight_pd->diff_bias_desc(), req) +mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, const NDArray *const bias) const { + return bias ? 
CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) : mkldnn_output_t(OutDataOp::Noop, nullptr); } -/*############################### DeconvDescCreator ###############################*/ +// DeconvDescCreator DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weight, const NDArray *bias, const NDArray &out) + const NDArray &weights, const NDArray *const bias, + const NDArray &out) : data_md(GetMemDesc(data)), - weight_md(GetDeconvWeightDesc(weight, param.num_group)), + weights_md(GetDeconvWeightsDesc(weights, param.num_group)), bias_md(bias ? GetMemDesc(*bias) : mkldnn::memory::desc()), out_md(GetMemDesc(out)), strides(param.stride.ndim()), @@ -401,76 +435,64 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDAr } } -void DeconvDescCreator::ImposePlainWherePadding(size_t data_size, size_t weight_size, - size_t out_size) { - if (data_size != GetMemDescSize(data_md)) { - CHECK(data_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; +bool DeconvDescCreator::ImposePlainWherePadding(const size_t data_size, const size_t weights_size, + const size_t out_size) { + // Changing only one at a time, so maybe better implementations will be selected (than entirely + // plain one) + if (data_md.data.format_kind == dnnl_format_kind_any && data_size != GetMemDescSize(data_md)) { data_md = GetDesc(data_md, GetDefaultFormat(data_md)); - } else if (out_size != GetMemDescSize(out_md)) { - CHECK(out_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + return true; + } else if (out_md.data.format_kind == dnnl_format_kind_any && + out_size != GetMemDescSize(out_md)) { out_md = GetDesc(out_md, GetDefaultFormat(out_md)); - } else if (weight_size != GetMemDescSize(weight_md)) { - CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - int num_groups = (weight_md.data.ndims > data_md.data.ndims) ? weight_md.data.dims[0] : 1; - weight_md = IOLogicalSwapDesc(weight_md, num_groups); - weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); + return true; + } else if (weights_md.data.format_kind == dnnl_format_kind_any && + weights_size != GetMemDescSize(weights_md)) { + const int num_gr = (weights_md.data.ndims > data_md.data.ndims) ? weights_md.data.dims[0] : 1; + weights_md = IOLogicalSwapDesc(weights_md, num_gr); + weights_md = IOLogicalSwapDesc(GetDesc(weights_md, GetDefaultFormat(weights_md)), num_gr); + return true; } + return false; } -bool DeconvDescCreator::CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const { +bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, + const size_t out_size) const { // MKLDNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. 
Currently, MKLDNN operators // still reuse memory from memory planning, so here we need to accept only a // kernel that has the expected memory size requirements (which is suboptimal) - return (data_size == GetMemDescSize(data_md) && weight_size == GetMemDescSize(weight_md) && + return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && out_size == GetMemDescSize(out_md)); } deconv_fwd_t::desc DeconvDescCreator::MakeFwdDesc() const { - // TODO: check if forward_training should be constant return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, + mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md, out_md, strides, dilates, padding, padding); } deconv_bwd_t::desc DeconvDescCreator::MakeBwdDataDesc() const { - return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, + return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, out_md, strides, dilates, padding, padding); } -deconv_bwd_weight_t::desc DeconvDescCreator::MakeBwdWeightDesc() const { - return deconv_bwd_weight_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, - bias_md, out_md, strides, dilates, padding, padding); -} - -// Swaps the logical order of dimensions that in plain format would correspond to input and output -// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). -mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { - auto &d = desc.data; - int offset = int(num_groups > 1); - int dim0 = offset + 0; - int dim1 = offset + 1; - std::swap(d.dims[dim0], d.dims[dim1]); - std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); - if (d.format_kind != dnnl_format_kind_any) { - std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); - // as padding is not supported, these are always zeros? 
- std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); - // for blocked format: change indices - for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { - auto &val = d.format_desc.blocking.inner_idxs[i]; - if (val == dim0) { - val = dim1; - } else if (val == dim1) { - val = dim0; - } - } - } - return desc; +deconv_bwd_weights_t::desc DeconvDescCreator::MakeBwdWeightsDesc() const { + return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, + bias_md, out_md, strides, dilates, padding, padding); +} + +// Utilities + +mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups) { + std::vector order(desc.data.ndims); + std::iota(std::begin(order), std::end(order), 0); + const int offset = int(num_groups > 1); + std::swap(order[offset + 0], order[offset + 1]); + return desc.permute_axes(order); } -// Applies IOLogicalSwapDesc to MKLDNN memory of arr -void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { +void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups) { mkldnn::memory::desc desc; if (arr.IsMKLDNNData()) { desc = arr.GetMKLDNNData()->get_desc(); From 4112d44b5c5e79ba54d18dbccd4985fa2208be13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Fri, 26 Mar 2021 12:20:24 +0100 Subject: [PATCH 5/9] Refactor deconvolution version 3 --- .../nn/mkldnn/mkldnn_deconvolution-inl.h | 261 +++++++++++++++--- .../nn/mkldnn/mkldnn_deconvolution.cc | 234 +++------------- 2 files changed, 257 insertions(+), 238 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index 404cacc500c5..bd5934dcfb07 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -19,19 +19,19 @@ /*! * \file mkldnn_deconvolution-inl.h - * \brief - * ________ - * Data---->|Deconv| - * Weight-->| FWD |--->out - * Bias---->|______| - * ________ - * Data_grad<----|Deconv|<---out_grad - * Weight_grad<--| BWD |<---data - * Bias_grad<----| |<---Weight - * |______|<---Bias + * Naming convention: + * ________ + * (src) data --->|Deconv| + * weights --->| FWD |---> out (dst) + * bias --->|______| + * ________ + * (diff_src) data_grad <---|Deconv|<--- out_grad (diff_dst) + * (diff_weight) weights_grad <---| BWD |<--- data (src) + * (diff_bias) bias_grad <---| |<--- weight + * |______|<--- bias * * "out" in this (and .cc) file will always refer to the output of Deconv FWD and - * "out_grad" to its gradient + * "out_grad" to its gradient. The corresponding MKLDNN names are in parentheses. */ #ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ @@ -47,12 +47,48 @@ namespace op { using deconv_fwd_t = mkldnn::deconvolution_forward; using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; -using deconv_bwd_t = mkldnn::deconvolution_backward_data; +using deconv_bwd_data_t = mkldnn::deconvolution_backward_data; using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights; using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; + + +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). 
+inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, + const uint32_t num_group) { + std::vector order(desc.data.ndims); + std::iota(std::begin(order), std::end(order), 0); + const int offset = int(num_group > 1); + std::swap(order[offset + 0], order[offset + 1]); + return desc.permute_axes(order); +} + +// Applies IOLogicalSwapDesc to MKLDNN memory of arr +inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, const uint32_t num_group) { + mkldnn::memory::desc desc; + if (arr.IsMKLDNNData()) { + desc = arr.GetMKLDNNData()->get_desc(); + } else { + // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use + // descriptor from GetWeightDesc but with default format + const auto &temp = GetWeightDesc(arr, num_group); + desc = mkldnn::memory::desc( + temp.dims(), temp.data_type(), + static_cast(GetDefaultFormat(temp.data.ndims))); + } + const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group)); +} + +// Version of GetWeightsDesc for deconvolution (with swap) +inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const uint32_t num_group) { + return IOLogicalSwapDesc(GetWeightDesc(weights, num_group), num_group); +} + + + class MKLDNNDeconvFwd { public: struct Tensors { @@ -68,12 +104,13 @@ class MKLDNNDeconvFwd { }; static MKLDNNDeconvFwd &GetCached(const DeconvolutionParam ¶m, const Tensors &tensors); - static std::shared_ptr MakePD(const DeconvolutionParam ¶m, - const Tensors &tensors); + static std::shared_ptr CreatePrimitiveDesc(const DeconvolutionParam ¶m, + const Tensors &tensors); MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors); - void ControlWeightsFormat(const uint32_t num_group, const bool is_train, const NDArray &weights); - void Execute(const uint32_t num_group, const std::vector &req, const Tensors &tensors); + void ControlWeightsFormat(const uint32_t num_group, const bool is_train, + const NDArray &weights) const; + void Execute(const uint32_t num_group, const OpReqType req, const Tensors &tensors) const; private: const mkldnn::memory *DataMem(const NDArray &data) const; @@ -82,10 +119,47 @@ class MKLDNNDeconvFwd { mkldnn_output_t OutMem(const OpReqType req, const NDArray &out) const; + private: std::shared_ptr fwd; std::shared_ptr fwd_pd; }; + +MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector &inputs, + const std::vector &outputs) + : data(inputs[deconv::kData]), + weights(inputs[deconv::kWeight]), + bias(no_bias ? 
nullptr : &inputs[deconv::kBias]), + out(outputs[deconv::kOut]) {} + +MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights, + const NDArray *const bias, const NDArray &out) + : data(data), weights(weights), bias(bias), out(out) {} + +MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors) + : fwd_pd(CreatePrimitiveDesc(param, tensors)) { + fwd = std::make_shared(*fwd_pd); +} + +inline const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const { + return data.GetMKLDNNDataReorder(fwd_pd->src_desc()); +} + +inline const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, fwd_pd->weights_desc(), num_group); +} + +inline const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { + return bias.GetMKLDNNData(); +} + +inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const { + return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); +} + + + class MKLDNNDeconvBwd { public: struct ReadTensors { @@ -104,30 +178,33 @@ class MKLDNNDeconvBwd { static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, const ReadTensors &read_tensors); - static std::shared_ptr MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &read_tensors, - const deconv_fwd_pd_t &fwd_pd); - static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, - const ReadTensors &read_tensors, - const deconv_fwd_pd_t &fwd_pd); + + static std::shared_ptr CreateDataPrimitiveDesc( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd); + + static std::shared_ptr CreateWeightsPrimitiveDesc( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd); MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors); + void Execute(const uint32_t num_group, const std::vector &req, - const ReadTensors &read_tensors, const WriteTensors &write_tensors); + const ReadTensors &read_tensors, const WriteTensors &write_tensors) const; private: void IOSwapWeightsTensors(const uint32_t num_group, const std::vector &req, - const NDArray &weights, const NDArray &weights_grad); + const NDArray &weights, const NDArray &weights_grad) const; - // returns the output gradient memory used to calculate the data (input) gradient, which - // might be reused when calculating the gradient of weights - const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const std::vector &req, + // returns the output gradient memory used to calculate the data (input) gradient, + // which might be reused when calculating the gradient of weights + const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const OpReqType req, const ReadTensors &read_tensors, - const WriteTensors &write_tensors); + const WriteTensors &write_tensors) const; void ScheduleBwdWeights(const uint32_t num_group, const std::vector &req, const ReadTensors &read_tensors, const WriteTensors &write_tensors, - const mkldnn::memory *const out_grad_mem); + const mkldnn::memory *const out_grad_mem) const; const mkldnn::memory *DataMem(const NDArray &data) const; const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const; @@ -145,12 +222,94 @@ class MKLDNNDeconvBwd { std::shared_ptr bwd_data_pd; std::shared_ptr bwd_weights_pd; - std::shared_ptr bwd_data; + std::shared_ptr bwd_data; std::shared_ptr bwd_weights; }; + +MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool 
no_bias, const std::vector &inputs) + : data(inputs[deconv::kData + 1]), + weights(inputs[deconv::kWeight + 1]), + bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), + out_grad(inputs[deconv::kOut]) {} + +MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector &outputs) + : data_grad(outputs[deconv::kData]), + weights_grad(outputs[deconv::kWeight]), + bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {} + +MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { + const auto &fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc( + param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias, + read_tensors.out_grad)); + bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd); + bwd_weights_pd = CreateWeightsPrimitiveDesc(param, read_tensors, *fwd_pd); + bwd_data = std::make_shared(*bwd_data_pd); + bwd_weights = std::make_shared(*bwd_weights_pd); +} + +inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, + const std::vector &req, + const NDArray &weights, + const NDArray &weights_grad) const { + if (req[deconv::kData]) { + IOLogicalSwapMKLDNNMem(weights, num_group); + } + if (req[deconv::kWeight] || (req.size() < deconv::kBias && req[deconv::kBias])) { + IOLogicalSwapMKLDNNMem(weights_grad, num_group); + } +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { + return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { + return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem( + const NDArray &out_grad, const mkldnn::memory *const out_grad_mem) const { + return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc()) + ? out_grad_mem + : out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); +} + +inline mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, + const NDArray &data_grad) const { + return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); +} + +inline mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, + const OpReqType req, + const NDArray &weights_grad) const { + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad + // memory (which, when not swapped, is always in default format), so here we check if after a + // swap, weights_md will have a default format + const auto &weights_md = bwd_weights_pd->diff_weights_desc(); + if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(weights_md, num_group))) { + return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(weights_md)}; + } + return CreateMKLDNNWeightGrad(weights_grad, weights_md, req); +} + +inline mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, + const NDArray *const bias) const { + return bias ? 
CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) + : mkldnn_output_t(OutDataOp::Noop, nullptr); +} + + + // Utility class for creating operation descriptors of deconvolution primitives -struct DeconvDescCreator { +class DeconvDescCreator { + public: DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, const NDArray *const bias, const NDArray &out); @@ -166,10 +325,11 @@ struct DeconvDescCreator { bool CheckImplSizeReq(const size_t data_size, const size_t weights_size, const size_t out_size) const; - deconv_fwd_t::desc MakeFwdDesc() const; - deconv_bwd_t::desc MakeBwdDataDesc() const; - deconv_bwd_weights_t::desc MakeBwdWeightsDesc() const; + deconv_fwd_t::desc CreateFwdDesc() const; + deconv_bwd_data_t::desc CreateBwdDataDesc() const; + deconv_bwd_weights_t::desc CreateBwdWeightsDesc() const; + private: mkldnn::memory::desc data_md; mkldnn::memory::desc weights_md; mkldnn::memory::desc bias_md; @@ -178,20 +338,33 @@ struct DeconvDescCreator { mkldnn::memory::dims strides; mkldnn::memory::dims padding; mkldnn::memory::dims dilates; - - mkldnn::engine &engine; }; -// Swaps the logical order of dimensions that in plain format would correspond to input and output -// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). -mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups); -// Applies IOLogicalSwapDesc to MKLDNN memory of arr -void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups); +inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, + const size_t out_size) const { + // MKLDNN introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, MKLDNN operators + // still reuse memory from memory planning, so here we need to accept only a + // kernel that has the expected memory size requirements (which is suboptimal) + return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && + out_size == GetMemDescSize(out_md)); +} -// Version of GetWeightsDesc for deconvolution (with swap) -inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const int num_groups) { - return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); +inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const { + return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md, + out_md, strides, dilates, padding, padding); +} + +inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const { + return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, + out_md, strides, dilates, padding, padding); +} + +inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() const { + return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, + bias_md, out_md, strides, dilates, padding, padding); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 699318d9beec..f248259dbd23 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -19,7 +19,6 @@ /*! 
* \file mkldnn_deconvolution.cc - * \brief */ #if MXNET_USE_MKLDNN == 1 @@ -34,7 +33,7 @@ bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16); } -// Forward + void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -42,24 +41,13 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c const std::vector &outputs) { TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto ¶m = nnvm::get(attrs.parsed); - const auto &tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); - MKLDNNDeconvFwd &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); + const auto tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); + const auto &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights); - fwd.Execute(param.num_group, req, tensors); + fwd.Execute(param.num_group, req[deconv::kOut], tensors); } -MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights, - const NDArray *const bias, const NDArray &out) - : data(data), weights(weights), bias(bias), out(out) {} - -MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector &inputs, - const std::vector &outputs) - : data(inputs[deconv::kData]), - weights(inputs[deconv::kWeight]), - bias(no_bias ? nullptr : &inputs[deconv::kBias]), - out(outputs[deconv::kOut]) {} - MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, const Tensors &tensors) { using deconv_fwd_map = std::unordered_map; @@ -84,10 +72,11 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, return it->second; } -std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionParam ¶m, - const Tensors &tensors) { +std::shared_ptr MKLDNNDeconvFwd::CreatePrimitiveDesc( + const DeconvolutionParam ¶m, const Tensors &tensors) { DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out); - const auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); + const auto &engine = CpuEngine::Get()->get_engine(); + const auto pd = std::make_shared(ddc.CreateFwdDesc(), engine); const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); }; @@ -98,19 +87,14 @@ std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionPara // imposed, meaning there is no implementation with plain formats CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) << "No implementation of deconvolution forward propagation"; - *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + *pd = deconv_fwd_pd_t(ddc.CreateFwdDesc(), engine); } } return pd; } -MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors) - : fwd_pd(MakePD(param, tensors)) { - fwd = std::make_shared(*fwd_pd); -} - void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool is_train, - const NDArray &weights) { + const NDArray &weights) const { if (is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. 
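The loop in CreatePrimitiveDesc above walks through the candidate implementations with next_impl() until one matches the buffer sizes that MXNet's memory planning already reserved, and only when no candidate is left does it impose a plain format and rebuild the descriptor. Below is a minimal, self-contained sketch of that selection pattern using only the standard library; Impl, the byte counts, and the fallback message are invented stand-ins for pd->next_impl(), GetMemDescSize(), and ImposePlainWherePadding(), not real MKLDNN API.

// Hedged sketch of the implementation-selection loop (no MKLDNN dependency).
#include <cstddef>
#include <iostream>
#include <vector>

struct Impl {                                    // stand-in for one candidate implementation
  std::size_t src_size, weights_size, dst_size;  // bytes each tensor would occupy
};

int main() {
  // Sizes expected by MXNet's memory planning (hypothetical numbers).
  const std::size_t src = 1024, weights = 512, dst = 2048;
  // Candidate implementations, e.g. blocked formats first, a plain one last.
  const std::vector<Impl> impls = {{1280, 512, 2048}, {1024, 640, 2048}, {1024, 512, 2048}};

  std::size_t i = 0;
  const auto fits = [&]() {          // mirrors DeconvDescCreator::CheckImplSizeReq
    return impls[i].src_size == src && impls[i].weights_size == weights &&
           impls[i].dst_size == dst;
  };
  while (!fits()) {
    if (++i == impls.size()) {       // next_impl() exhausted all candidates
      std::cout << "fall back: impose a plain format and rebuild the descriptor\n";
      return 1;
    }
  }
  std::cout << "selected implementation #" << i << '\n';  // prints: selected implementation #2
  return 0;
}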
@@ -132,8 +116,8 @@ void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool } } -void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const std::vector &req, - const Tensors &tensors) { +void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const OpReqType req, + const Tensors &tensors) const { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. // For that, we would pass input tensor in place of output and output tensor in place of input // (for appropriate convolution primitives: deconvolution forward = convolution backward data, @@ -157,7 +141,7 @@ void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const std::vectorsrc_desc()); -} -const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, - const NDArray &weights) const { - return GetWeights(weights, fwd_pd->weights_desc(), num_group); -} - -const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { - return bias.GetMKLDNNData(); -} - -mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const { - return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); -} - -// Backward void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -201,24 +168,13 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext & TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto ¶m = nnvm::get(attrs.parsed); - const auto &read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); - const auto &write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); + const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors); bwd.Execute(param.num_group, req, read_tensors, write_tensors); } -MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector &inputs) - : data(inputs[deconv::kData + 1]), - weights(inputs[deconv::kWeight + 1]), - bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), - out_grad(inputs[deconv::kOut]) {} - -MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector &outputs) - : data_grad(outputs[deconv::kData]), - weights_grad(outputs[deconv::kWeight]), - bias_grad(no_bias ? 
nullptr : &outputs[deconv::kBias]) {} - MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { using deconv_bwd_map = std::unordered_map; @@ -243,12 +199,13 @@ MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, return it->second; } -std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &read_tensors, - const deconv_fwd_pd_t &fwd_pd) { +std::shared_ptr MKLDNNDeconvBwd::CreateDataPrimitiveDesc( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd) { DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, nullptr, read_tensors.out_grad); - const auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + const auto &engine = CpuEngine::Get()->get_engine(); + const auto pd = std::make_shared(ddc.CreateBwdDataDesc(), engine, fwd_pd); const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); }; const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; @@ -259,19 +216,20 @@ std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const Deconvol // imposed, meaning there is no implementation with plain formats CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) << "No implementation of deconvolution backward propagation"; - *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + *pd = deconv_bwd_data_pd_t(ddc.CreateBwdDataDesc(), engine, fwd_pd); } } return pd; } -std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( +std::shared_ptr MKLDNNDeconvBwd::CreateWeightsPrimitiveDesc( const DeconvolutionParam ¶m, const ReadTensors &read_tensors, const deconv_fwd_pd_t &fwd_pd) { DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad); + const auto &engine = CpuEngine::Get()->get_engine(); const auto pd = - std::make_shared(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + std::make_shared(ddc.CreateBwdWeightsDesc(), engine, fwd_pd); const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; const auto get_weights_size = [&pd]() { return pd->diff_weights_desc().get_size(); }; const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; @@ -282,53 +240,34 @@ std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( // imposed, meaning there is no implementation with plain formats CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) << "No implementation of calculating deconvolution weights gradient"; - *pd = deconv_bwd_weights_pd_t(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + *pd = deconv_bwd_weights_pd_t(ddc.CreateBwdWeightsDesc(), engine, fwd_pd); } } return pd; } -MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { - const auto &fwd_pd = MKLDNNDeconvFwd::MakePD( - param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias, - read_tensors.out_grad)); - bwd_data_pd = MakeDataPD(param, read_tensors, *fwd_pd); - bwd_weights_pd = MakeWeightsPD(param, read_tensors, *fwd_pd); - bwd_data = std::make_shared(*bwd_data_pd); - bwd_weights = std::make_shared(*bwd_weights_pd); -} - void MKLDNNDeconvBwd::Execute(const uint32_t num_group, const std::vector &req, - const ReadTensors &read_tensors, const WriteTensors &write_tensors) { + const ReadTensors &read_tensors, + const WriteTensors 
&write_tensors) const { // swaps are explained in MKLDNNDeconvFwd::Execute IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); { - auto *const out_grad_mem = ScheduleBwdData(num_group, req, read_tensors, write_tensors); + auto *const out_grad_mem = + ScheduleBwdData(num_group, req[deconv::kData], read_tensors, write_tensors); ScheduleBwdWeights(num_group, req, read_tensors, write_tensors, out_grad_mem); MKLDNNStream::Get()->Submit(); } IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); } -void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, - const std::vector &req, - const NDArray &weights, const NDArray &weights_grad) { - if (req[deconv::kData]) { - IOLogicalSwapMKLDNNMem(weights, num_group); - } - if (req[deconv::kWeight] || req[deconv::kBias]) { - IOLogicalSwapMKLDNNMem(weights_grad, num_group); - } -} - const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(const uint32_t num_group, - const std::vector &req, + const OpReqType req, const ReadTensors &read_tensors, - const WriteTensors &write_tensors) { - if (req[deconv::kData]) { + const WriteTensors &write_tensors) const { + if (req) { mkldnn_args_map_t net_args; auto *const out_grad_mem = OutGradMem(read_tensors.out_grad); - const auto &data_grad_mem = DataGradMem(req[deconv::kData], write_tensors.data_grad); + const auto &data_grad_mem = DataGradMem(req, write_tensors.data_grad); net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)}); @@ -346,12 +285,14 @@ void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, const std::vector &req, const ReadTensors &read_tensors, const WriteTensors &write_tensors, - const mkldnn::memory *const out_grad_mem) { - if (req[deconv::kWeight] || req[deconv::kBias]) { + const mkldnn::memory *const out_grad_mem) const { + OpReqType weight_req = req[deconv::kWeight]; + OpReqType bias_req = req.size() > deconv::kBias ? 
req[deconv::kBias] : OpReqType::kNullOp; + if (weight_req || bias_req) { mkldnn_args_map_t net_args; const auto &weights_grad_mem = - WeightsGradMem(num_group, req[deconv::kWeight], write_tensors.weights_grad); - const auto &bias_grad_mem = BiasGradMem(req[deconv::kBias], write_tensors.bias_grad); + WeightsGradMem(num_group, weight_req, write_tensors.weights_grad); + const auto &bias_grad_mem = BiasGradMem(bias_req, write_tensors.bias_grad); net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)}); net_args.insert({MKLDNN_ARG_SRC, *DataMem(read_tensors.data)}); @@ -369,50 +310,7 @@ void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, } } -const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { - return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); -} - -const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, - const NDArray &weights) const { - return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); -} - -const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { - return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); -} - -const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad, - const mkldnn::memory *const out_grad_mem) const { - if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weights_pd->diff_dst_desc()) { - return out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); - } - return out_grad_mem; -} - -mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, const NDArray &data_grad) const { - return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); -} -mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, const OpReqType req, - const NDArray &weights_grad) const { - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad - // memory (which, when not swapped, is always in default format), so here we check if after a - // swap, wei_md will have a default format - const auto &wei_md = bwd_weights_pd->diff_weights_desc(); - if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) { - return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(wei_md)}; - } - return CreateMKLDNNWeightGrad(weights_grad, wei_md, req); -} - -mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, const NDArray *const bias) const { - return bias ? 
CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) - : mkldnn_output_t(OutDataOp::Noop, nullptr); -} - -// DeconvDescCreator DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, const NDArray *const bias, @@ -423,8 +321,7 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDAr out_md(GetMemDesc(out)), strides(param.stride.ndim()), padding(param.pad.ndim()), - dilates(param.dilate.ndim()), - engine(CpuEngine::Get()->get_engine()) { + dilates(param.dilate.ndim()) { // assuming only deconv2D is supported for now CHECK(param.stride.ndim() == param.pad.ndim() && param.stride.ndim() == param.dilate.ndim()); CHECK(param.stride.ndim() == 2); @@ -456,57 +353,6 @@ bool DeconvDescCreator::ImposePlainWherePadding(const size_t data_size, const si return false; } -bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, - const size_t out_size) const { - // MKLDNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKLDNN operators - // still reuse memory from memory planning, so here we need to accept only a - // kernel that has the expected memory size requirements (which is suboptimal) - return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && - out_size == GetMemDescSize(out_md)); -} - -deconv_fwd_t::desc DeconvDescCreator::MakeFwdDesc() const { - return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md, - out_md, strides, dilates, padding, padding); -} - -deconv_bwd_t::desc DeconvDescCreator::MakeBwdDataDesc() const { - return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, out_md, - strides, dilates, padding, padding); -} - -deconv_bwd_weights_t::desc DeconvDescCreator::MakeBwdWeightsDesc() const { - return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, - bias_md, out_md, strides, dilates, padding, padding); -} - -// Utilities - -mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups) { - std::vector order(desc.data.ndims); - std::iota(std::begin(order), std::end(order), 0); - const int offset = int(num_groups > 1); - std::swap(order[offset + 0], order[offset + 1]); - return desc.permute_axes(order); -} - -void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups) { - mkldnn::memory::desc desc; - if (arr.IsMKLDNNData()) { - desc = arr.GetMKLDNNData()->get_desc(); - } else { - // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use - // descriptor from GetWeightDesc but with default format - const auto &temp = GetWeightDesc(arr, num_groups); - desc = mkldnn::memory::desc( - temp.dims(), temp.data_type(), - static_cast(GetDefaultFormat(temp.data.ndims))); - } - const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); -} - } // namespace op } // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 From 73e6d0b07fb2a820b847617d10a8f54a4ea09e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Wed, 31 Mar 2021 09:32:55 +0200 Subject: [PATCH 6/9] Enable Deconvolution2D test --- tests/python/mkl/test_mkldnn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 60ebbfb97477..579826f9b4b9 100644 --- 
a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -469,10 +469,10 @@ def check_convolution_training(stype): @with_seed() -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") +# @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") def test_Deconvolution(): def check_Deconvolution_training(stype): - for shape in [(3, 3, 10), (3, 3, 10, 10)]: + for shape in [(3, 3, 10, 10)]: # testing only 2D for now data_tmp = np.random.randint(256, size=shape) data = mx.symbol.Variable('data', stype=stype) From c79dff98c16e0ec8824620922144602d84b4bea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Thu, 1 Apr 2021 09:22:08 +0200 Subject: [PATCH 7/9] Fix sanity --- src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h | 5 ++++- src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index bd5934dcfb07..db957da056e8 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -37,6 +37,9 @@ #define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ #if MXNET_USE_MKLDNN == 1 +#include +#include + #include "../deconvolution-inl.h" #include "./mkldnn_base-inl.h" #include "./mkldnn_ops-inl.h" @@ -61,7 +64,7 @@ inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const uint32_t num_group) { std::vector order(desc.data.ndims); std::iota(std::begin(order), std::end(order), 0); - const int offset = int(num_group > 1); + const int offset = static_cast(num_group > 1); std::swap(order[offset + 0], order[offset + 1]); return desc.permute_axes(order); } diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index f248259dbd23..7678567d95c8 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -323,8 +323,9 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDAr padding(param.pad.ndim()), dilates(param.dilate.ndim()) { // assuming only deconv2D is supported for now - CHECK(param.stride.ndim() == param.pad.ndim() && param.stride.ndim() == param.dilate.ndim()); - CHECK(param.stride.ndim() == 2); + CHECK_EQ(param.stride.ndim(), param.pad.ndim()); + CHECK_EQ(param.stride.ndim(), param.dilate.ndim()); + CHECK_EQ(param.stride.ndim(), 2); for (int i = 0; i < param.stride.ndim(); ++i) { strides[i] = param.stride[i]; padding[i] = param.pad[i]; From cab15627b1e963cd2ff7bdaa02d40dd55078ffb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Thu, 1 Apr 2021 11:13:00 +0200 Subject: [PATCH 8/9] Fix windows builds --- src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h | 1 + tests/python/mkl/test_mkldnn.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index db957da056e8..b51ec2a85650 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -39,6 +39,7 @@ #if MXNET_USE_MKLDNN == 1 #include #include +#include #include "../deconvolution-inl.h" #include "./mkldnn_base-inl.h" diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 579826f9b4b9..de0c249f52ab 100644 --- a/tests/python/mkl/test_mkldnn.py +++ 
b/tests/python/mkl/test_mkldnn.py @@ -469,7 +469,6 @@ def check_convolution_training(stype): @with_seed() -# @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") def test_Deconvolution(): def check_Deconvolution_training(stype): for shape in [(3, 3, 10, 10)]: # testing only 2D for now From 74cc40dc40eac6af307b7cbb825089741a9ffed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Mon, 19 Apr 2021 12:42:25 +0200 Subject: [PATCH 9/9] Fix deconvolution with bias test --- tests/python/unittest/test_operator.py | 34 +++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d02ff9537667..29b2f39d5178 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1660,22 +1660,38 @@ def test_deconvolution_forward_with_bias(): def check_deconvolution_forward_with_bias(shape=(1, 16, 5, 5), num_filter=32, num_group=1, kernel=(3, 3), pad=(1, 1)): x = mx.sym.Variable('x') w = mx.sym.Variable('w') - input_data = mx.random.uniform(-5, 5, shape, ctx=mx.cpu()) - y = mx.sym.Deconvolution(data=x, weight=w, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=False, pad=pad) - exe = y.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + b = mx.sym.Variable('b') + y_nb = mx.sym.Deconvolution(data=x, weight=w, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=True, pad=pad) + y_b = mx.sym.Deconvolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=False, pad=pad) + + + exe_nb = y_nb.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + exe_b = y_b.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + + + data = np.random.uniform(-5, 5, size=exe_b.arg_arrays[0].shape) + weights = np.random.normal(size=exe_b.arg_arrays[1].shape) + bias = np.random.normal(size=exe_b.arg_arrays[2].shape) + + def exe_forward(exe): + exe.arg_arrays[0][:] = data + exe.arg_arrays[1][:] = weights + if len(exe.arg_arrays) == 3: + exe.arg_arrays[2][:] = bias + return exe.forward(is_train=False)[0].asnumpy() + + out_nb = exe_forward(exe_nb) + out_b = exe_forward(exe_b) + bias = np.broadcast_to(bias, [np.prod(out_nb.shape[2:])] + [num_filter]).T + bias = np.broadcast_to(bias.reshape((num_filter, *out_nb.shape[2:])), out_b.shape) + assert_almost_equal(out_nb + bias, out_b) - exe.arg_arrays[0][:] = np.random.normal(size=exe.arg_arrays[0].shape) - exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape) - exe.forward(is_train=False) - o = exe.outputs[0] - t = o.asnumpy() check_deconvolution_forward_with_bias((1, 16, 5), 32, 1, (3,), (1,)) check_deconvolution_forward_with_bias((32, 16, 5), 32, 1, (3,), (1,)) check_deconvolution_forward_with_bias((1, 16, 5, 5), 32, 1, (3, 3), (1, 1)) check_deconvolution_forward_with_bias((32, 16, 5, 5), 32, 1, (3, 3), (1, 1)) - def check_nearest_upsampling_with_shape(shapes, scale, root_scale): arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context()) for i, shape in zip(range(len(shapes)), shapes)} arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)}
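The channel swap that recurs throughout these changes (oihw to iohw, or goihw to giohw for grouped weights) reduces to a permutation vector built with std::iota and std::swap, which IOLogicalSwapDesc then hands to mkldnn::memory::desc::permute_axes. Below is a standalone sketch of just that permutation, runnable with the standard library alone; SwapOrder is a hypothetical helper name used here for illustration, not part of the change itself.

// Builds the axis order that swaps the input- and output-channel dimensions of a plain
// weights layout: oihw -> iohw (ndims == 4), goihw -> giohw (grouped, ndims == 5).
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

std::vector<int> SwapOrder(const int ndims, const int num_group) {
  std::vector<int> order(ndims);
  std::iota(order.begin(), order.end(), 0);    // identity permutation 0, 1, ..., ndims-1
  const int offset = num_group > 1 ? 1 : 0;    // skip the leading groups dimension
  std::swap(order[offset], order[offset + 1]);
  return order;
}

int main() {
  for (const int axis : SwapOrder(4, 1)) std::cout << axis << ' ';  // prints: 1 0 2 3
  std::cout << '\n';
  for (const int axis : SwapOrder(5, 2)) std::cout << axis << ' ';  // prints: 0 2 1 3 4
  std::cout << '\n';
  return 0;
}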