From 9e22a6d6a51669a3e0e4049ffba0a5ab45ad5cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Fri, 12 Feb 2021 10:13:53 +0100 Subject: [PATCH 1/9] Use mkldnn deconvolution primitive in deconvolution --- .../nn/mkldnn/mkldnn_deconvolution.cc | 561 +++++++++--------- 1 file changed, 276 insertions(+), 285 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 65bf93298b95..9d59c65e7891 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -31,69 +31,96 @@ namespace mxnet { namespace op { -bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, +using DeconvFwd = mkldnn::deconvolution_forward; +using DeconvFwdPD = mkldnn::deconvolution_forward::primitive_desc; + +using DeconvBwdData = mkldnn::deconvolution_backward_data; +using DeconvBwdDataPD = mkldnn::deconvolution_backward_data::primitive_desc; + +using DeconvBwdWeight = mkldnn::deconvolution_backward_weights; +using DeconvBwdWeightPD = mkldnn::deconvolution_backward_weights::primitive_desc; + +bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { if (params.kernel.ndim() != 2) return false; return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && input.shape().ndim() == 4; } -static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) { - mkldnn::memory::dims dims(1); - // This is deconvolution on 4D data. The second dimension is the channel. - dims[0] = md.data.dims[1]; - return mkldnn::memory::desc( - dims, static_cast(md.data.data_type), - mkldnn::memory::format_tag::any); +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). +static inline mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { + auto &d = desc.data; + int offset = int(num_groups > 1); + int dim0 = offset + 0; + int dim1 = offset + 1; + std::swap(d.dims[dim0], d.dims[dim1]); + std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); + if (d.format_kind != dnnl_format_kind_any) { + std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); + // as padding is not supported, these are always zeros? + std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); + // for blocked format: change indices + for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { + auto &val = d.format_desc.blocking.inner_idxs[i]; + if (val == dim0) { + val = dim1; + } else if (val == dim1) { + val = dim0; + } + } + } + return desc; } -std::shared_ptr GetDeconvBwd_( - const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, - bool has_bias, const mkldnn::memory::desc &out_md, - const mkldnn::engine &engine, const mkldnn::memory::dims &strides, - const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) { - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. 
Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - if (!has_bias) { - mkldnn::convolution_forward::desc desc( - mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, - strides, dilates, padding, padding); - auto deconv_pd = - std::make_shared(desc, - engine); - while (deconv_pd->dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weights_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; - } - return deconv_pd; +// Applies IOLogicalSwapDesc to arr +static inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { + mkldnn::memory::desc desc; + if (arr.IsMKLDNNData()) { + desc = arr.GetMKLDNNData()->get_desc(); } else { - auto bias_md = GetBiasDesc(data_md); - mkldnn::convolution_forward::desc desc( - mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md, - data_md, strides, dilates, padding, padding); - auto deconv_pd = - std::make_shared(desc, - engine); - while (deconv_pd->dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weights_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; - } - return deconv_pd; + const auto &temp = GetWeightDesc(arr, num_groups); + desc = mkldnn::memory::desc( + temp.dims(), temp.data_type(), + static_cast(GetDefaultFormat(temp.data.ndims))); } + const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); +} + +// Version of GetWeightDesc for deconvolution (with swap) +static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weights, int num_groups) { + return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); } -std::shared_ptr -GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, const NDArray &output) { +// Imposes the plain format on memory descriptors with padding +// Changing only one at a time, so maybe better implementations will be selected +// (than entirely plain one) +void ImposePlainWherePadding(mkldnn::memory::desc &src_md, mkldnn::memory::desc &dst_md, + mkldnn::memory::desc &weight_md, size_t src_size, size_t dst_size, + size_t wei_size) { + if (src_size != GetMemDescSize(src_md)) { + CHECK(src_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + src_md = GetDesc(src_md, GetDefaultFormat(src_md)); + } else if (dst_size != GetMemDescSize(dst_md)) { + CHECK(dst_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + dst_md = GetDesc(dst_md, GetDefaultFormat(dst_md)); + } else if (wei_size != GetMemDescSize(weight_md)) { + CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + int num_groups = (weight_md.data.ndims > src_md.data.ndims) ? 
weight_md.data.dims[0] : 1; + weight_md = IOLogicalSwapDesc(weight_md, num_groups); + weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); + } +} + +std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, + const NDArray &output) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); + auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); + auto bias_md = bias ? GetMemDesc(*bias) + : mkldnn::memory::desc{ + {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; auto engine = CpuEngine::Get()->get_engine(); CHECK_GE(param.stride.ndim(), 2); CHECK_GE(param.pad.ndim(), 2); @@ -107,32 +134,41 @@ GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, mkldnn::memory::dims dilate{0, 0}; dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; - auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, - strides, padding, dilate); - mkldnn::convolution_backward_data::desc desc( - mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md, - strides, dilate, padding, padding); - auto deconv_pd = - std::make_shared( - desc, engine, *bwd_pd); + auto desc = [&]() { + return DeconvFwd::desc( + mkldnn::prop_kind::forward_training, // TODO: check if this should be constant + mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, out_md, strides, + dilate, padding, padding); + }; + auto deconv_pd = + std::make_shared( + desc(), engine); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->diff_src_desc().get_size() != GetMemDescSize(out_md) || + while (deconv_pd->dst_desc().get_size() != GetMemDescSize(out_md) || + deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; + // for deconvolution primitive next_impl always fails. Keep this? 
+ if (!deconv_pd->next_impl()) { + ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->dst_desc().get_size(), + deconv_pd->src_desc().get_size(), + deconv_pd->weights_desc().get_size()); + *deconv_pd = DeconvFwdPD(desc(), engine); + } } + return deconv_pd; } -std::shared_ptr -GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, - const NDArray &output) { +std::shared_ptr +GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, + const NDArray &output, + const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); + auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); auto engine = CpuEngine::Get()->get_engine(); CHECK_GE(param.stride.ndim(), 2); @@ -147,18 +183,41 @@ GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, mkldnn::memory::dims dilate{0, 0}; dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; - return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, strides, - padding, dilate); + auto desc = [&]() { + return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, + data_md, weight_md, out_md, strides, dilate, + padding, padding); + }; + auto deconv_pd = + std::make_shared(desc(), engine, fwd_pd); + // MKL-DNN introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, MKL-DNN operators + // still reuse memory from memory planning, so here we need to select a + // suboptimal kernel for computation that has the expected memory size requirements + while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || + deconv_pd->diff_src_desc().get_size() != GetMemDescSize(data_md) || + deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { + if (!deconv_pd->next_impl()) { + ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), + deconv_pd->diff_src_desc().get_size(), + deconv_pd->weights_desc().get_size()); + *deconv_pd = DeconvBwdDataPD(desc(), engine, fwd_pd); + } + } + return deconv_pd; } -std::shared_ptr +std::shared_ptr GetDeconvBwdWeightsImpl( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights,const NDArray *bias, const NDArray &output, + const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); + auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); + auto bias_md = bias ? 
GetMemDesc(*bias) + : mkldnn::memory::desc{ + {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; auto engine = CpuEngine::Get()->get_engine(); CHECK_GE(param.stride.ndim(), 2); CHECK_GE(param.pad.ndim(), 2); @@ -172,98 +231,61 @@ GetDeconvBwdWeightsImpl( mkldnn::memory::dims dilate{0, 0}; dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; + auto desc = [&]() { + return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, + data_md, weight_md, bias_md, out_md, + strides, dilate, padding, padding); + }; + auto deconv_pd = std::make_shared( + desc(), engine, fwd_pd); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a // suboptimal kernel for computation that has the expected memory size requirements - if (!has_bias) { - mkldnn::convolution_backward_weights::desc desc( - mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md, - strides, dilate, padding, padding); - auto deconv_pd = - std::make_shared( - desc, engine, fwd_pd); - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->diff_weights_desc().get_size() != - GetMemDescSize(weight_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; + while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || + deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || + deconv_pd->diff_weights_desc().get_size() != GetMemDescSize(weight_md)) { + if (!deconv_pd->next_impl()) { + ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), + deconv_pd->src_desc().get_size(), + deconv_pd->diff_weights_desc().get_size()); + *deconv_pd = DeconvBwdWeightPD(desc(), engine, fwd_pd); } - return deconv_pd; - } else { - auto bias_md = GetBiasDesc(data_md); - mkldnn::convolution_backward_weights::desc desc( - mkldnn::algorithm::convolution_direct, out_md, weight_md, bias_md, - data_md, strides, dilate, padding, padding); - auto deconv_pd = - std::make_shared( - desc, engine, fwd_pd); - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->diff_weights_desc().get_size() != - GetMemDescSize(weight_md)) { - CHECK(deconv_pd->next_impl()) << "No implementation"; - } - return deconv_pd; } + return deconv_pd; } class MKLDNNDeconvForward { public: - MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, bool has_bias, + MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights,const NDArray *bias, const NDArray &output); - const mkldnn::convolution_backward_data &GetFwd() const { return *fwd; } + const DeconvFwd &GetFwd() const { return *fwd; } - const mkldnn::convolution_backward_data::primitive_desc &GetPd() const { - return *fwd_pd; - } + const DeconvFwdPD &GetPd() const { return *fwd_pd; } private: - std::shared_ptr fwd; - std::shared_ptr fwd_pd; + std::shared_ptr fwd; + std::shared_ptr fwd_pd; }; // class MKLDNNDeconvForward -MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam ¶m, - const NDArray &data, - const NDArray &weights, bool has_bias, +MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) - 
: fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) { - fwd = std::make_shared(GetPd()); + : fwd_pd(GetDeconvFwdImpl(param, data, weights, bias, output)) { + fwd = std::make_shared(GetPd()); } -static void MKLDNNDeconvFwdBiasPostProcess( - const DeconvolutionParam ¶m, const OpContext &ctx, const NDArray &bias, - const std::vector &out_data) { - // add bias, broadcast bias to dim 1: channel - if (!param.no_bias) { - // MKLDNN only supports float right now. - typedef float DType; - Stream *s = ctx.get_stream(); - Tensor b = bias.data().get(s); - // The output data is stored in a special MKLDNN format, - // converts its format to the default format. - // Unfortunately, MKLDNN doesn't support broadcast. - auto out_data_def = out_data[deconv::kOut].Reorder2Default(); - Tensor out_cpu = out_data_def.data().get(s); - out_cpu += mshadow::expr::broadcast<1>(b, out_cpu.shape_); - } -} - -MKLDNNDeconvForward &GetDeconvFwd(const nnvm::NodeAttrs &attrs, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { +MKLDNNDeconvForward &GetDeconvFwd(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, + const NDArray &output) { + using deconv_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - fwds; + static thread_local deconv_fwd_map fwds; #else - static MX_THREAD_LOCAL - std::unordered_map - fwds; + static MX_THREAD_LOCAL deconv_fwd_map fwds; #endif - const DeconvolutionParam ¶m = nnvm::get(attrs.parsed); DeconvSignature key(param); // Here we can sign the conv op with NDArray because conv primitive will // decide the right layout for the, so we only need to get the shape and the @@ -275,15 +297,13 @@ MKLDNNDeconvForward &GetDeconvFwd(const nnvm::NodeAttrs &attrs, auto it = fwds.find(key); if (it == fwds.end()) { - bool has_bias = (bias != nullptr); - auto fwd = MKLDNNDeconvForward(param, data, weights, has_bias, output); + auto fwd = MKLDNNDeconvForward(param, data, weights, bias, output); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { @@ -294,11 +314,9 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, auto &weight = in_data[deconv::kWeight]; const NDArray *bias = param.no_bias ? nullptr : &in_data[deconv::kBias]; - MKLDNNDeconvForward &fwd = - GetDeconvFwd(attrs, data, weight, bias, out_data[deconv::kOut]); + MKLDNNDeconvForward &fwd = + GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); - auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().diff_dst_desc()); - const mkldnn::memory *weight_mem; if (ctx.is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. @@ -306,128 +324,94 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, // This asks the engine to change the layout of the weight array after // it's used. weight.Reorder2DefaultAsync(); - weight_mem = - GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); } else { // For inference, we want to reorder the weight array so we don't need to // reorder data every time. if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. The // data conversion happens after the weight array is used. 
- weight.MKLDNNDataReorderAsync(fwd.GetPd().weights_desc()); - weight_mem = - GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); - + weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); } else { - weight_mem = weight.GetMKLDNNData(); - CHECK(weight_mem->get_desc() == fwd.GetPd().weights_desc()); + CHECK(weight.GetMKLDNNData()->get_desc() == + IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); } } - mkldnn_output_t out_mem; - out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().diff_src_desc(), - req[deconv::kOut]); + // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. + // For that, we would pass input tensor in place of output and output tensor in place of + // input (for appropriate convolution primitives: deconvolution forward = convolution backward + // data, deconvolution backward data = convolution forward). Convolution primitive expects + // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: + // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight + // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet + // provides such tensor. + + // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), + // but this time we don't swap input and output tensors, so o = output_channels, i = input_channels, + // so the current weight tensor won't fit (when oihw != iohw). But actually, underneath deconvolution + // MKLDNN also uses convolution, so even though it expects the weight tensor with shape (o, i, h, w), + // it wants it in iohw format, so it's physical representation match current weight tensor. 
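  // Editorial illustration (not part of the original commit; the shapes are hypothetical): for a
  // deconvolution with 8 input channels, 16 output channels, a 3x3 kernel and num_group == 1,
  // MXNet stores the weight as (i, o, h, w) = (8, 16, 3, 3). The MKLDNN deconvolution primitive
  // describes the very same buffer as (o, i, h, w) = (16, 8, 3, 3) laid out as iohw, e.g.:
  //   mkldnn::memory::desc mxnet_view({8, 16, 3, 3}, mkldnn::memory::data_type::f32,
  //                                   mkldnn::memory::format_tag::oihw);    // MXNet's logical view
  //   mkldnn::memory::desc mkldnn_view = IOLogicalSwapDesc(mxnet_view, 1);  // dims (16, 8, 3, 3),
  //                                                                         // strides kept (iohw)
  // Only the logical metadata differs, so the swap needs no reorder of the underlying data.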
+ + // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN operations + IOLogicalSwapMKLDNNMem(weight, param.num_group); + + auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().src_desc()); + const mkldnn::memory *weight_mem = + GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); + mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); mkldnn_args_map_t net_args; + if (bias) { + const mkldnn::memory *bias_mem = in_data[deconv::kBias].GetMKLDNNData(); + net_args.insert({MKLDNN_ARG_BIAS, *bias_mem}); + } - net_args.insert({MKLDNN_ARG_DIFF_DST, *data_mem}); + net_args.insert({MKLDNN_ARG_SRC, *data_mem}); net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem}); - net_args.insert({MKLDNN_ARG_DIFF_SRC, *out_mem.second}); + net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); CommitOutput(out_data[deconv::kOut], out_mem); MKLDNNStream::Get()->Submit(); - MKLDNNDeconvFwdBiasPostProcess(param, ctx, *bias, out_data); + // swap back from oihw to iohw + IOLogicalSwapMKLDNNMem(weight, param.num_group); } -class MKLDNNDeconvBackwardData { - std::shared_ptr bwd; +class MKLDNNDeconvBackward { + std::shared_ptr bwd_data_pd_; + std::shared_ptr bwd_weight_pd_; + std::shared_ptr bwd_data_; + std::shared_ptr bwd_weight_; public: - std::shared_ptr bwd_pd; - MKLDNNDeconvBackwardData(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output); - - const mkldnn::convolution_forward &GetBwd() const { return *bwd; } - const mkldnn::convolution_forward::primitive_desc &GetDataPd() const { - return *bwd_pd; + MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { + const auto fwd_pd = GetDeconvFwdImpl(param, data, weights, bias, output); + bwd_data_pd_ = GetDeconvBwdDataImpl(param, data, weights, output, *fwd_pd); + bwd_weight_pd_ = GetDeconvBwdWeightsImpl(param, data, weights, bias, output, *fwd_pd); + bwd_data_ = std::make_shared(GetDataPd()); + bwd_weight_ = std::make_shared(GetWeightsPd()); } -}; - -MKLDNNDeconvBackwardData::MKLDNNDeconvBackwardData( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output) - : bwd_pd(GetDeconvBwdDataImpl(param, data, weights, false, output)) { - bwd = std::make_shared(GetDataPd()); -} -typedef ParamOpSign MKLDNNDeconvSignature; - -static inline MKLDNNDeconvBackwardData &GetDeconvBwdData( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - bwds; -#else - static MX_THREAD_LOCAL std::unordered_map - bwds; -#endif - MKLDNNDeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. 
- key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); + const DeconvBwdData &GetBwdData() const { return *bwd_data_; } - auto it = bwds.find(key); - if (it == bwds.end()) { - auto bwd = MKLDNNDeconvBackwardData(param, data, weights, output); - it = AddToCache(&bwds, key, bwd); - } - return it->second; -} + const DeconvBwdWeight &GetBwdWeights() const { return *bwd_weight_; } -class MKLDNNDeconvBackwardWeights { - std::shared_ptr bwd; + const DeconvBwdDataPD &GetDataPd() const { return *bwd_data_pd_; } - public: - std::shared_ptr - bwd_data_pd; - MKLDNNDeconvBackwardWeights( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &bwd_data_pd); - const mkldnn::convolution_backward_weights &GetBwd() const { return *bwd; } - const mkldnn::convolution_backward_weights::primitive_desc &GetWeightsPd() - const { - return *bwd_data_pd; - } + const DeconvBwdWeightPD &GetWeightsPd() const { return *bwd_weight_pd_; } }; -MKLDNNDeconvBackwardWeights::MKLDNNDeconvBackwardWeights( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &bwd_data_pd) - : bwd_data_pd(GetDeconvBwdWeightsImpl(param, data, weights, false, output, - bwd_data_pd)) { - bwd = std::make_shared(GetWeightsPd()); -} +typedef ParamOpSign MKLDNNDeconvSignature; -static inline MKLDNNDeconvBackwardWeights &GetDeconvBwdWeights( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &bwd_data_pd) { +static inline MKLDNNDeconvBackward &GetDeconvBwd( + const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { + using mkldnn_deconv_bwd_map = + std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local std::unordered_map - bwds; + static thread_local mkldnn_deconv_bwd_map bwds; #else - static MX_THREAD_LOCAL std::unordered_map - bwds; + static MX_THREAD_LOCAL mkldnn_deconv_bwd_map bwds; #endif MKLDNNDeconvSignature key(param); // Here we can sign the conv op with NDArray because conv primitive will @@ -436,16 +420,12 @@ static inline MKLDNNDeconvBackwardWeights &GetDeconvBwdWeights( key.AddSign(data); key.AddSign(weights); key.AddSign(output); + if (bias) key.AddSign(*bias); auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = - MKLDNNDeconvBackwardWeights(param, data, weights, output, bwd_data_pd); - auto ins_ret = bwds.insert( - std::pair(key, - bwd)); - CHECK(ins_ret.second); - it = ins_ret.first; + auto bwd = MKLDNNDeconvBackward(param, data, weights, bias, output); + it = AddToCache(&bwds, key, bwd); } return it->second; } @@ -461,63 +441,74 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, auto &data = inputs[deconv::kData + 1]; auto &weight = inputs[deconv::kWeight + 1]; + const auto *bias = param.no_bias ? 
nullptr : &inputs[deconv::kBias + 1]; auto &out_grad = inputs[deconv::kOut]; CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - MKLDNNDeconvBackwardData &bwd_data = - GetDeconvBwdData(param, data, weight, inputs[deconv::kOut]); + MKLDNNDeconvBackward &deconvBwd = + GetDeconvBwd(param, data, weight, bias, out_grad); auto out_grad_mem = - out_grad.GetMKLDNNDataReorder(bwd_data.GetDataPd().src_desc()); + out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); if (req[deconv::kData]) { - auto weight_mem = GetWeights(weight, bwd_data.GetDataPd().weights_desc(), + // swap is explained in MKLDNNDeconvolutionForward + IOLogicalSwapMKLDNNMem(weight, param.num_group); + auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), param.num_group); auto in_grad_mem = - CreateMKLDNNMem(in_grad[deconv::kData], bwd_data.GetDataPd().dst_desc(), + CreateMKLDNNMem(in_grad[deconv::kData], deconvBwd.GetDataPd().diff_src_desc(), req[deconv::kData]); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_SRC, *out_grad_mem}, + mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DST, *in_grad_mem.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(bwd_data.GetBwd(), net_args); + {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; + MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdData(), net_args); CommitOutput(in_grad[deconv::kData], in_grad_mem); } - if (req[deconv::kWeight]) { - MKLDNNDeconvBackwardWeights &bwd_weights = GetDeconvBwdWeights( - param, data, weight, inputs[deconv::kOut], bwd_data.GetDataPd()); - if (bwd_data.GetDataPd().src_desc() != - bwd_weights.GetWeightsPd().src_desc()) - out_grad_mem = - out_grad.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().src_desc()); - auto data_mem = - data.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().diff_dst_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad( - in_grad[deconv::kWeight], - bwd_weights.GetWeightsPd().diff_weights_desc(), req[deconv::kWeight]); + if (req[deconv::kWeight] || req[deconv::kBias]) { + if (deconvBwd.GetDataPd().diff_dst_desc() != + deconvBwd.GetWeightsPd().diff_dst_desc()) + out_grad_mem = + out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); + auto data_mem = + data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); + mkldnn_output_t in_grad_weight; + const mkldnn::memory::desc &wei_md = deconvBwd.GetWeightsPd().diff_weights_desc(); + // swaps are explained in MKLDNNDeconvolutionForward + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because of logical swap) + // We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is always in default format), + // so here we check if after a swap, wei_md will have a default format + if (req[deconv::kWeight] == OpReqType::kWriteTo && + IsDefaultFormat(IOLogicalSwapDesc(wei_md, param.num_group))) { + in_grad_weight = {OutDataOp::Noop, + const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData(wei_md)}; + } else { + IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); + in_grad_weight = CreateMKLDNNWeightGrad( + in_grad[deconv::kWeight], + wei_md, req[deconv::kWeight]); + } mkldnn_args_map_t net_args = { - {MKLDNN_ARG_SRC, *out_grad_mem}, - {MKLDNN_ARG_DIFF_DST, *data_mem}, + {MKLDNN_ARG_DIFF_DST, *out_grad_mem}, + {MKLDNN_ARG_SRC, *data_mem}, {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(bwd_weights.GetBwd(), net_args); + mkldnn_output_t 
in_grad_bias; + if (!param.no_bias) { + in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], + deconvBwd.GetWeightsPd().diff_bias_desc(), req[deconv::kBias]); + net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second}); + } + MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdWeights(), net_args); CommitOutput(in_grad[deconv::kWeight], in_grad_weight); + // CommitOutput Should run after RegisterPrimArgs for memory dependency + if (!param.no_bias) CommitOutput(in_grad[deconv::kBias], in_grad_bias); } MKLDNNStream::Get()->Submit(); - if (!param.no_bias) { - typedef float DType; - Stream *s = ctx.get_stream(); - Tensor gbias = - in_grad[deconv::kBias].data().get(s); - - NDArray temp = inputs[deconv::kOut]; - if (temp.IsMKLDNNData()) { - temp = temp.Reorder2Default(); - } - - Tensor grad = temp.data().get(s); - Assign(gbias, req[deconv::kBias], - mshadow::expr::sumall_except_dim<1>(grad)); - } + // swap back from oihw to iohw + if (req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, param.num_group); + if (req[deconv::kWeight] || req[deconv::kBias]) + IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); } } // namespace op From ebbb70495b765650518bf0c13e750e1ef254ab0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Tue, 9 Mar 2021 16:47:26 +0100 Subject: [PATCH 2/9] Apply clang-format --- .../nn/mkldnn/mkldnn_deconvolution.cc | 147 ++++++++---------- 1 file changed, 64 insertions(+), 83 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 9d59c65e7891..cc8faca3ef88 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -40,11 +40,10 @@ using DeconvBwdDataPD = mkldnn::deconvolution_backward_data::primitive_desc; using DeconvBwdWeight = mkldnn::deconvolution_backward_weights; using DeconvBwdWeightPD = mkldnn::deconvolution_backward_weights::primitive_desc; -bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, - const NDArray &input) { +bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { if (params.kernel.ndim() != 2) return false; - return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) - && input.shape().ndim() == 4; + return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && + input.shape().ndim() == 4; } // Swaps the logical order of dimensions that in plain format would correspond to input and output @@ -87,7 +86,7 @@ static inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); } -// Version of GetWeightDesc for deconvolution (with swap) +// Version of GetWeightDesc for deconvolution (with swap) static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weights, int num_groups) { return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); } @@ -140,9 +139,7 @@ std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, c mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, out_md, strides, dilate, padding, padding); }; - auto deconv_pd = - std::make_shared( - desc(), engine); + auto deconv_pd = std::make_shared(desc(), engine); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. 
Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a @@ -162,11 +159,10 @@ std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, c return deconv_pd; } -std::shared_ptr -GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, - const NDArray &output, - const DeconvFwdPD &fwd_pd) { +std::shared_ptr GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, + const NDArray &data, const NDArray &weights, + const NDArray &output, + const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); @@ -184,12 +180,10 @@ GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; auto desc = [&]() { - return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, - data_md, weight_md, out_md, strides, dilate, - padding, padding); + return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, + strides, dilate, padding, padding); }; - auto deconv_pd = - std::make_shared(desc(), engine, fwd_pd); + auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators // still reuse memory from memory planning, so here we need to select a @@ -207,11 +201,9 @@ GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, const NDArray &data, return deconv_pd; } -std::shared_ptr -GetDeconvBwdWeightsImpl( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights,const NDArray *bias, const NDArray &output, - const DeconvFwdPD &fwd_pd) { +std::shared_ptr GetDeconvBwdWeightsImpl( + const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output, const DeconvFwdPD &fwd_pd) { auto data_md = GetMemDesc(data); auto weight_md = GetDeconvWeightDesc(weights, param.num_group); auto out_md = GetMemDesc(output); @@ -232,12 +224,10 @@ GetDeconvBwdWeightsImpl( dilate[0] = param.dilate[0] - 1; dilate[1] = param.dilate[1] - 1; auto desc = [&]() { - return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, - data_md, weight_md, bias_md, out_md, - strides, dilate, padding, padding); + return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, + bias_md, out_md, strides, dilate, padding, padding); }; - auto deconv_pd = std::make_shared( - desc(), engine, fwd_pd); + auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); // MKL-DNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. Currently, MKL-DNN operators @@ -258,9 +248,8 @@ GetDeconvBwdWeightsImpl( class MKLDNNDeconvForward { public: - MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights,const NDArray *bias, - const NDArray &output); + MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output); const DeconvFwd &GetFwd() const { return *fwd; } const DeconvFwdPD &GetPd() const { return *fwd_pd; } @@ -314,8 +303,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c auto &weight = in_data[deconv::kWeight]; const NDArray *bias = param.no_bias ? 
nullptr : &in_data[deconv::kBias]; - MKLDNNDeconvForward &fwd = - GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); + MKLDNNDeconvForward &fwd = GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); if (ctx.is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it @@ -337,28 +325,31 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c } } - // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. - // For that, we would pass input tensor in place of output and output tensor in place of - // input (for appropriate convolution primitives: deconvolution forward = convolution backward - // data, deconvolution backward data = convolution forward). Convolution primitive expects - // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: - // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight - // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet + // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. + // For that, we would pass input tensor in place of output and output tensor in place of + // input (for appropriate convolution primitives: deconvolution forward = convolution backward + // data, deconvolution backward data = convolution forward). Convolution primitive expects + // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: + // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight + // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet // provides such tensor. - // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), - // but this time we don't swap input and output tensors, so o = output_channels, i = input_channels, - // so the current weight tensor won't fit (when oihw != iohw). But actually, underneath deconvolution - // MKLDNN also uses convolution, so even though it expects the weight tensor with shape (o, i, h, w), - // it wants it in iohw format, so it's physical representation match current weight tensor. - - // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN operations + // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), + // but this time we don't swap input and output tensors, so o = output_channels, i = + // input_channels, so the current weight tensor won't fit (when oihw != iohw). But actually, + // underneath deconvolution MKLDNN also uses convolution, so even though it expects the weight + // tensor with shape (o, i, h, w), it wants it in iohw format, so it's physical representation + // match current weight tensor. 
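  // Editorial note (illustration, not part of the original commit): IOLogicalSwapDesc only
  // exchanges the logical metadata (dims and strides) of the two channel dimensions, so applying
  // it twice restores the original descriptor and the underlying weight buffer can be
  // reinterpreted in place. That is why the forward pass below can present MXNet's (i, o, h, w)
  // weight to MKLDNN as (o, i, h, w) and then restore the original view once the primitive has
  // been submitted:
  //   IOLogicalSwapMKLDNNMem(weight, param.num_group);  // (i, o, h, w) -> (o, i, h, w) view
  //   /* ... register and submit the deconvolution primitive ... */
  //   IOLogicalSwapMKLDNNMem(weight, param.num_group);  // swap back to MXNet's view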
+ + // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN + // operations IOLogicalSwapMKLDNNMem(weight, param.num_group); auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().src_desc()); const mkldnn::memory *weight_mem = GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); - mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); + mkldnn_output_t out_mem = + CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); mkldnn_args_map_t net_args; if (bias) { const mkldnn::memory *bias_mem = in_data[deconv::kBias].GetMKLDNNData(); @@ -383,8 +374,8 @@ class MKLDNNDeconvBackward { std::shared_ptr bwd_weight_; public: - MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, const NDArray &output) { + MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { const auto fwd_pd = GetDeconvFwdImpl(param, data, weights, bias, output); bwd_data_pd_ = GetDeconvBwdDataImpl(param, data, weights, output, *fwd_pd); bwd_weight_pd_ = GetDeconvBwdWeightsImpl(param, data, weights, bias, output, *fwd_pd); @@ -403,9 +394,9 @@ class MKLDNNDeconvBackward { typedef ParamOpSign MKLDNNDeconvSignature; -static inline MKLDNNDeconvBackward &GetDeconvBwd( - const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, const NDArray &output) { +static inline MKLDNNDeconvBackward &GetDeconvBwd(const DeconvolutionParam ¶m, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { using mkldnn_deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL @@ -430,8 +421,7 @@ static inline MKLDNNDeconvBackward &GetDeconvBwd( return it->second; } -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { @@ -444,20 +434,15 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const auto *bias = param.no_bias ? 
nullptr : &inputs[deconv::kBias + 1]; auto &out_grad = inputs[deconv::kOut]; - CHECK_NE(req[deconv::kWeight], kWriteInplace) - << "cannot write weight inplace"; - MKLDNNDeconvBackward &deconvBwd = - GetDeconvBwd(param, data, weight, bias, out_grad); - auto out_grad_mem = - out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + MKLDNNDeconvBackward &deconvBwd = GetDeconvBwd(param, data, weight, bias, out_grad); + auto out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); if (req[deconv::kData]) { // swap is explained in MKLDNNDeconvolutionForward - IOLogicalSwapMKLDNNMem(weight, param.num_group); - auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), - param.num_group); - auto in_grad_mem = - CreateMKLDNNMem(in_grad[deconv::kData], deconvBwd.GetDataPd().diff_src_desc(), - req[deconv::kData]); + IOLogicalSwapMKLDNNMem(weight, param.num_group); + auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), param.num_group); + auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], + deconvBwd.GetDataPd().diff_src_desc(), req[deconv::kData]); mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, {MKLDNN_ARG_WEIGHTS, *weight_mem}, {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; @@ -465,33 +450,29 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, CommitOutput(in_grad[deconv::kData], in_grad_mem); } if (req[deconv::kWeight] || req[deconv::kBias]) { - if (deconvBwd.GetDataPd().diff_dst_desc() != - deconvBwd.GetWeightsPd().diff_dst_desc()) - out_grad_mem = - out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); - auto data_mem = - data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); + if (deconvBwd.GetDataPd().diff_dst_desc() != deconvBwd.GetWeightsPd().diff_dst_desc()) + out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); + auto data_mem = data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); mkldnn_output_t in_grad_weight; const mkldnn::memory::desc &wei_md = deconvBwd.GetWeightsPd().diff_weights_desc(); // swaps are explained in MKLDNNDeconvolutionForward - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because of logical swap) - // We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is always in default format), - // so here we check if after a swap, wei_md will have a default format + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of logical swap) We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is + // always in default format), so here we check if after a swap, wei_md will have a default + // format if (req[deconv::kWeight] == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, param.num_group))) { in_grad_weight = {OutDataOp::Noop, const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData(wei_md)}; } else { IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); - in_grad_weight = CreateMKLDNNWeightGrad( - in_grad[deconv::kWeight], - wei_md, req[deconv::kWeight]); + in_grad_weight = + CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], wei_md, req[deconv::kWeight]); } - mkldnn_args_map_t net_args = { - {MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; + mkldnn_args_map_t net_args = 
{{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, + {MKLDNN_ARG_SRC, *data_mem}, + {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; mkldnn_output_t in_grad_bias; if (!param.no_bias) { in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], From d1512586ff29f3557e78e9fe879cb9b010ce8a61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Thu, 11 Mar 2021 12:17:25 +0100 Subject: [PATCH 3/9] Refactor deconvolution version 1 --- .../nn/mkldnn/mkldnn_deconvolution-inl.h | 169 ++++ .../nn/mkldnn/mkldnn_deconvolution.cc | 733 +++++++++--------- 2 files changed, 532 insertions(+), 370 deletions(-) create mode 100644 src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h new file mode 100644 index 000000000000..f2638013ac3d --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_deconvolution-inl.h + * \brief + * \Author: Paweł Głomski, pawel.glomski@intel.com + */ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include "../deconvolution-inl.h" +#include "./mkldnn_base-inl.h" +#include "./mkldnn_ops-inl.h" + +namespace mxnet { +namespace op { + +using deconv_fwd_t = mkldnn::deconvolution_forward; +using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; + +using deconv_bwd_t = mkldnn::deconvolution_backward_data; +using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; + +using deconv_bwd_weight_t = mkldnn::deconvolution_backward_weights; +using deconv_bwd_weight_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; + +class MKLDNNDeconvFwd { + public: + struct Tensors { + Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, const NDArray &out); + Tensors(bool no_bias, const std::vector &inputs, const std::vector &outputs); + + const NDArray &data; + const NDArray &weight; + const NDArray *bias; + const NDArray &out; + }; + + static MKLDNNDeconvFwd &GetCached(const DeconvolutionParam ¶m, const Tensors &tensors); + static std::shared_ptr MakePD(const DeconvolutionParam ¶m, + const Tensors &tensors); + + MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors); + void ControlWeightFormat(uint32_t num_group, bool is_train, const NDArray &weight); + void Execute(uint32_t num_group, const std::vector &req, const Tensors &tensors); + + private: + const mkldnn::memory *DataMem(const NDArray &data) const; + const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; + const mkldnn::memory *BiasMem(const NDArray &bias) const; + + 
mkldnn_output_t OutMem(OpReqType req, const NDArray &out) const; + + std::shared_ptr fwd; + std::shared_ptr fwd_pd; +}; + +class MKLDNNDeconvBwd { + public: + struct ReadTensors { + ReadTensors(bool no_bias, const std::vector &inputs); + const NDArray &data; + const NDArray &weight; + const NDArray *bias; + const NDArray &out_grad; + }; + struct WriteTensors { + WriteTensors(bool no_bias, const std::vector &outputs); + const NDArray &data_grad; + const NDArray &weight_grad; + const NDArray *bias_grad; + }; + + static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, const ReadTensors &rt); + static std::shared_ptr MakeDataPD(const DeconvolutionParam ¶m, + const ReadTensors &rt, + const deconv_fwd_pd_t &fwd_pd); + static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, + const ReadTensors &rt, + const deconv_fwd_pd_t &fwd_pd); + + MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt); + void Execute(uint32_t num_group, const std::vector &req, const ReadTensors &rt, + const WriteTensors &wt); + + private: + void IOSwapWeightTensors(uint32_t num_group, const std::vector &req, + const NDArray &weight, const NDArray &weight_grad); + + const mkldnn::memory *ScheduleBwdData(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt); + + void ScheduleBwdWeight(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt, + const mkldnn::memory *out_grad_mem); + + const mkldnn::memory *DataMem(const NDArray &data) const; + const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; + const mkldnn::memory *OutGradMem(const NDArray &out_grad) const; // for bwd data + const mkldnn::memory *OutGradMem(const NDArray &out_grad, // for bwd weight + const mkldnn::memory *out_grad_mem) const; + + mkldnn_output_t DataGradMem(OpReqType req, const NDArray &data_grad) const; + mkldnn_output_t WeightGradMem(uint32_t num_group, OpReqType req, + const NDArray &weight_grad) const; + mkldnn_output_t BiasGradMem(OpReqType req, const NDArray *bias) const; + + std::shared_ptr bwd_data_pd; + std::shared_ptr bwd_weight_pd; + std::shared_ptr bwd_data; + std::shared_ptr bwd_weight; +}; // namespace op + +struct DeconvDescCreator { + DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weight, + const NDArray *bias, const NDArray &out); + + // Imposes plain formats on memory descriptors with padding + // Changing only one at a time, so maybe better implementations will be selected + // (than entirely plain one) + void ImposePlainWherePadding(size_t data_size, size_t weight_size, size_t out_size); + bool CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const; + + deconv_fwd_t::desc MakeFwdDesc() const; + deconv_bwd_t::desc MakeBwdDataDesc() const; + deconv_bwd_weight_t::desc MakeBwdWeightDesc() const; + + mkldnn::memory::desc data_md; + mkldnn::memory::desc weight_md; + mkldnn::memory::desc bias_md; + mkldnn::memory::desc out_md; + + mkldnn::memory::dims strides; + mkldnn::memory::dims padding; + mkldnn::memory::dims dilates; + + mkldnn::engine &engine; +}; + +mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups); +void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups); + +// Version of GetWeightDesc for deconvolution (with swap) +static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weight, int num_groups) { + return IOLogicalSwapDesc(GetWeightDesc(weight, num_groups), num_groups); +} + +} // namespace op +} // 
namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index cc8faca3ef88..b5e7e4166284 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -24,252 +24,46 @@ #if MXNET_USE_MKLDNN == 1 -#include "../deconvolution-inl.h" -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" +#include "./mkldnn_deconvolution-inl.h" namespace mxnet { namespace op { -using DeconvFwd = mkldnn::deconvolution_forward; -using DeconvFwdPD = mkldnn::deconvolution_forward::primitive_desc; - -using DeconvBwdData = mkldnn::deconvolution_backward_data; -using DeconvBwdDataPD = mkldnn::deconvolution_backward_data::primitive_desc; - -using DeconvBwdWeight = mkldnn::deconvolution_backward_weights; -using DeconvBwdWeightPD = mkldnn::deconvolution_backward_weights::primitive_desc; - bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { if (params.kernel.ndim() != 2) return false; return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && input.shape().ndim() == 4; } -// Swaps the logical order of dimensions that in plain format would correspond to input and output -// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). -static inline mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { - auto &d = desc.data; - int offset = int(num_groups > 1); - int dim0 = offset + 0; - int dim1 = offset + 1; - std::swap(d.dims[dim0], d.dims[dim1]); - std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); - if (d.format_kind != dnnl_format_kind_any) { - std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); - // as padding is not supported, these are always zeros? 
- std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); - // for blocked format: change indices - for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { - auto &val = d.format_desc.blocking.inner_idxs[i]; - if (val == dim0) { - val = dim1; - } else if (val == dim1) { - val = dim0; - } - } - } - return desc; -} - -// Applies IOLogicalSwapDesc to arr -static inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { - mkldnn::memory::desc desc; - if (arr.IsMKLDNNData()) { - desc = arr.GetMKLDNNData()->get_desc(); - } else { - const auto &temp = GetWeightDesc(arr, num_groups); - desc = mkldnn::memory::desc( - temp.dims(), temp.data_type(), - static_cast(GetDefaultFormat(temp.data.ndims))); - } - const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); -} - -// Version of GetWeightDesc for deconvolution (with swap) -static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weights, int num_groups) { - return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); -} - -// Imposes the plain format on memory descriptors with padding -// Changing only one at a time, so maybe better implementations will be selected -// (than entirely plain one) -void ImposePlainWherePadding(mkldnn::memory::desc &src_md, mkldnn::memory::desc &dst_md, - mkldnn::memory::desc &weight_md, size_t src_size, size_t dst_size, - size_t wei_size) { - if (src_size != GetMemDescSize(src_md)) { - CHECK(src_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - src_md = GetDesc(src_md, GetDefaultFormat(src_md)); - } else if (dst_size != GetMemDescSize(dst_md)) { - CHECK(dst_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - dst_md = GetDesc(dst_md, GetDefaultFormat(dst_md)); - } else if (wei_size != GetMemDescSize(weight_md)) { - CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - int num_groups = (weight_md.data.ndims > src_md.data.ndims) ? weight_md.data.dims[0] : 1; - weight_md = IOLogicalSwapDesc(weight_md, num_groups); - weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); - } -} +/*############################### Forward ###############################*/ -std::shared_ptr GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) { - auto data_md = GetMemDesc(data); - auto weight_md = GetDeconvWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto bias_md = bias ? GetMemDesc(*bias) - : mkldnn::memory::desc{ - {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; - auto engine = CpuEngine::Get()->get_engine(); - CHECK_GE(param.stride.ndim(), 2); - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.dilate.ndim(), 2); - mkldnn::memory::dims strides{0, 0}; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - mkldnn::memory::dims padding{0, 0}; - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - mkldnn::memory::dims dilate{0, 0}; - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - auto desc = [&]() { - return DeconvFwd::desc( - mkldnn::prop_kind::forward_training, // TODO: check if this should be constant - mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, out_md, strides, - dilate, padding, padding); - }; - auto deconv_pd = std::make_shared(desc(), engine); - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. 
Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->dst_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { - // for deconvolution primitive next_impl always fails. Keep this? - if (!deconv_pd->next_impl()) { - ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->dst_desc().get_size(), - deconv_pd->src_desc().get_size(), - deconv_pd->weights_desc().get_size()); - *deconv_pd = DeconvFwdPD(desc(), engine); - } - } +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); + const auto ¶m = nnvm::get(attrs.parsed); + const auto &tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); + MKLDNNDeconvFwd &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); - return deconv_pd; -} - -std::shared_ptr GetDeconvBwdDataImpl(const DeconvolutionParam ¶m, - const NDArray &data, const NDArray &weights, - const NDArray &output, - const DeconvFwdPD &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetDeconvWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - CHECK_GE(param.stride.ndim(), 2); - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.dilate.ndim(), 2); - mkldnn::memory::dims strides{0, 0}; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - mkldnn::memory::dims padding{0, 0}; - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - mkldnn::memory::dims dilate{0, 0}; - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - auto desc = [&]() { - return DeconvBwdData::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, - strides, dilate, padding, padding); - }; - auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->diff_src_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) { - if (!deconv_pd->next_impl()) { - ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), - deconv_pd->diff_src_desc().get_size(), - deconv_pd->weights_desc().get_size()); - *deconv_pd = DeconvBwdDataPD(desc(), engine, fwd_pd); - } - } - return deconv_pd; -} - -std::shared_ptr GetDeconvBwdWeightsImpl( - const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output, const DeconvFwdPD &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetDeconvWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto bias_md = bias ? 
GetMemDesc(*bias) - : mkldnn::memory::desc{ - {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any}; - auto engine = CpuEngine::Get()->get_engine(); - CHECK_GE(param.stride.ndim(), 2); - CHECK_GE(param.pad.ndim(), 2); - CHECK_GE(param.dilate.ndim(), 2); - mkldnn::memory::dims strides{0, 0}; - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - mkldnn::memory::dims padding{0, 0}; - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - mkldnn::memory::dims dilate{0, 0}; - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - auto desc = [&]() { - return DeconvBwdWeight::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, - bias_md, out_md, strides, dilate, padding, padding); - }; - auto deconv_pd = std::make_shared(desc(), engine, fwd_pd); - - // MKL-DNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKL-DNN operators - // still reuse memory from memory planning, so here we need to select a - // suboptimal kernel for computation that has the expected memory size requirements - while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(out_md) || - deconv_pd->src_desc().get_size() != GetMemDescSize(data_md) || - deconv_pd->diff_weights_desc().get_size() != GetMemDescSize(weight_md)) { - if (!deconv_pd->next_impl()) { - ImposePlainWherePadding(data_md, out_md, weight_md, deconv_pd->diff_dst_desc().get_size(), - deconv_pd->src_desc().get_size(), - deconv_pd->diff_weights_desc().get_size()); - *deconv_pd = DeconvBwdWeightPD(desc(), engine, fwd_pd); - } - } - return deconv_pd; + fwd.ControlWeightFormat(param.num_group, ctx.is_train, tensors.weight); + fwd.Execute(param.num_group, req, tensors); } -class MKLDNNDeconvForward { - public: - MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output); - const DeconvFwd &GetFwd() const { return *fwd; } - - const DeconvFwdPD &GetPd() const { return *fwd_pd; } - - private: - std::shared_ptr fwd; - std::shared_ptr fwd_pd; -}; // class MKLDNNDeconvForward +MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &out) + : data(data), weight(weight), bias(bias), out(out) {} -MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) - : fwd_pd(GetDeconvFwdImpl(param, data, weights, bias, output)) { - fwd = std::make_shared(GetPd()); -} +MKLDNNDeconvFwd::Tensors::Tensors(bool no_bias, const std::vector &inputs, + const std::vector &outputs) + : data(inputs[deconv::kData]), + weight(inputs[deconv::kWeight]), + bias(no_bias ? nullptr : &inputs[deconv::kBias]), + out(outputs[deconv::kOut]) {} -MKLDNNDeconvForward &GetDeconvFwd(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) { - using deconv_fwd_map = std::unordered_map; +MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, + const Tensors &tensors) { + using deconv_fwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local deconv_fwd_map fwds; #else @@ -279,38 +73,48 @@ MKLDNNDeconvForward &GetDeconvFwd(const DeconvolutionParam ¶m, const NDArray // Here we can sign the conv op with NDArray because conv primitive will // decide the right layout for the, so we only need to get the shape and the // data type of the arrays. 
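[Editorial aside - not part of the patch] The cache lookup below works because a primitive only depends on the shapes and data types of its arguments, never on their contents. A minimal, self-contained sketch of that pattern follows; SimpleSignature, SimpleSignatureHash, CachedPrimitive and GetCachedPrimitive are hypothetical stand-ins for the real ParamOpSign/DeconvSignature key, OpHash and the cached deconvolution primitive.

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for ParamOpSign: the key encodes only shapes and dtypes.
struct SimpleSignature {
  std::string repr;
  void AddSign(const std::vector<int64_t>& shape, int dtype) {
    for (int64_t d : shape) repr += std::to_string(d) + 'x';
    repr += ':' + std::to_string(dtype) + ';';
  }
  bool operator==(const SimpleSignature& other) const { return repr == other.repr; }
};

struct SimpleSignatureHash {
  size_t operator()(const SimpleSignature& s) const { return std::hash<std::string>()(s.repr); }
};

struct CachedPrimitive {};  // the primitive and its primitive_desc would live here

CachedPrimitive& GetCachedPrimitive(const SimpleSignature& key) {
  // One cache per thread, mirroring the thread_local maps used in this file.
  static thread_local std::unordered_map<SimpleSignature, CachedPrimitive, SimpleSignatureHash>
      cache;
  auto it = cache.find(key);
  if (it == cache.end()) it = cache.emplace(key, CachedPrimitive{}).first;
  return it->second;
}
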
- key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); - if (bias) key.AddSign(*bias); + key.AddSign(tensors.data); + key.AddSign(tensors.weight); + key.AddSign(tensors.out); + if (tensors.bias) key.AddSign(*tensors.bias); auto it = fwds.find(key); if (it == fwds.end()) { - auto fwd = MKLDNNDeconvForward(param, data, weights, bias, output); + auto fwd = MKLDNNDeconvFwd(param, tensors); it = AddToCache(&fwds, key, fwd); } return it->second; } -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); - const DeconvolutionParam ¶m = nnvm::get(attrs.parsed); - - auto &data = in_data[deconv::kData]; - auto &weight = in_data[deconv::kWeight]; - const NDArray *bias = param.no_bias ? nullptr : &in_data[deconv::kBias]; +std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionParam ¶m, + const Tensors &tensors) { + DeconvDescCreator ddc(param, tensors.data, tensors.weight, tensors.bias, tensors.out); + auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); + + while (true) { + size_t data_size = pd->src_desc().get_size(); + size_t weight_size = pd->weights_desc().get_size(); + size_t out_size = pd->dst_desc().get_size(); + if (ddc.CheckImpl(data_size, weight_size, out_size)) break; + if (pd->next_impl()) continue; + ddc.ImposePlainWherePadding(data_size, weight_size, out_size); + *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + } + return pd; +} - MKLDNNDeconvForward &fwd = GetDeconvFwd(param, data, weight, bias, out_data[deconv::kOut]); +MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors) + : fwd_pd(MakePD(param, tensors)) { + fwd = std::make_shared(*fwd_pd); +} - if (ctx.is_train) { +void MKLDNNDeconvFwd::ControlWeightFormat(uint32_t num_group, bool is_train, + const NDArray &weight) { + if (is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. if (weight.IsMKLDNNData()) - // This asks the engine to change the layout of the weight array after - // it's used. + // This asks the engine to change the layout of the weight array after it's used. weight.Reorder2DefaultAsync(); } else { // For inference, we want to reorder the weight array so we don't need to @@ -318,178 +122,367 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c if (weight.IsDefaultData()) { // We also need to modify the layout on the original weight array. The // data conversion happens after the weight array is used. - weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); + weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } else { CHECK(weight.GetMKLDNNData()->get_desc() == - IOLogicalSwapDesc(fwd.GetPd().weights_desc(), param.num_group)); + IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } } +} +void MKLDNNDeconvFwd::Execute(uint32_t num_group, const std::vector &req, + const Tensors &tensors) { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. // For that, we would pass input tensor in place of output and output tensor in place of // input (for appropriate convolution primitives: deconvolution forward = convolution backward // data, deconvolution backward data = convolution forward). 
Convolution primitive expects // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight - // tensor with shape (input_channels, output_channels, h, w), which is (i, o, h, w) and MXNet - // provides such tensor. - + // tensor with shape (input_channels, output_channels, h, w) and MXNet provides such tensor. + // // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), // but this time we don't swap input and output tensors, so o = output_channels, i = // input_channels, so the current weight tensor won't fit (when oihw != iohw). But actually, // underneath deconvolution MKLDNN also uses convolution, so even though it expects the weight // tensor with shape (o, i, h, w), it wants it in iohw format, so it's physical representation // match current weight tensor. + // + // So here we swap logical order of input and output dimensions for weight tensor just for + // MKLDNN operations + IOLogicalSwapMKLDNNMem(tensors.weight, num_group); + { + mkldnn_args_map_t net_args; + auto out_mem = OutMem(req[deconv::kOut], tensors.out); + + net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, tensors.weight)}); + net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); + if (tensors.bias) net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); - // So here we swap logical order of input and output dimensions for weight tensor just for MKLDNN - // operations - IOLogicalSwapMKLDNNMem(weight, param.num_group); - - auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().src_desc()); - const mkldnn::memory *weight_mem = - GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group); - mkldnn_output_t out_mem = - CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().dst_desc(), req[deconv::kOut]); - mkldnn_args_map_t net_args; - if (bias) { - const mkldnn::memory *bias_mem = in_data[deconv::kBias].GetMKLDNNData(); - net_args.insert({MKLDNN_ARG_BIAS, *bias_mem}); + // CommitOutput Should run after RegisterPrimArgs for memory dependency + MKLDNNStream::Get()->RegisterPrimArgs(*fwd, net_args); + CommitOutput(tensors.out, out_mem); + MKLDNNStream::Get()->Submit(); } + IOLogicalSwapMKLDNNMem(tensors.weight, num_group); // swap back from oihw to iohw +} - net_args.insert({MKLDNN_ARG_SRC, *data_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *weight_mem}); - net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args); - CommitOutput(out_data[deconv::kOut], out_mem); - MKLDNNStream::Get()->Submit(); - - // swap back from oihw to iohw - IOLogicalSwapMKLDNNMem(weight, param.num_group); -} - -class MKLDNNDeconvBackward { - std::shared_ptr bwd_data_pd_; - std::shared_ptr bwd_weight_pd_; - std::shared_ptr bwd_data_; - std::shared_ptr bwd_weight_; - - public: - MKLDNNDeconvBackward(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { - const auto fwd_pd = GetDeconvFwdImpl(param, data, weights, bias, output); - bwd_data_pd_ = GetDeconvBwdDataImpl(param, data, weights, output, *fwd_pd); - bwd_weight_pd_ = GetDeconvBwdWeightsImpl(param, data, weights, bias, output, *fwd_pd); - bwd_data_ = std::make_shared(GetDataPd()); - bwd_weight_ = std::make_shared(GetWeightsPd()); - } +const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const { + return 
data.GetMKLDNNDataReorder(fwd_pd->src_desc()); +} - const DeconvBwdData &GetBwdData() const { return *bwd_data_; } +const mkldnn::memory *MKLDNNDeconvFwd::WeightMem(uint32_t num_group, const NDArray &weight) const { + return GetWeights(weight, fwd_pd->weights_desc(), num_group); +} + +const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { + return bias.GetMKLDNNData(); +} + +mkldnn_output_t MKLDNNDeconvFwd::OutMem(OpReqType req, const NDArray &out) const { + return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); +} + +/*############################### Backward ###############################*/ + +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - const DeconvBwdWeight &GetBwdWeights() const { return *bwd_weight_; } + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); + const auto ¶m = nnvm::get(attrs.parsed); + const auto &rt = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto &wt = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); + MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, rt); - const DeconvBwdDataPD &GetDataPd() const { return *bwd_data_pd_; } + bwd.Execute(param.num_group, req, rt, wt); +} - const DeconvBwdWeightPD &GetWeightsPd() const { return *bwd_weight_pd_; } -}; +MKLDNNDeconvBwd::ReadTensors::ReadTensors(bool no_bias, const std::vector &inputs) + : data(inputs[deconv::kData + 1]), + weight(inputs[deconv::kWeight + 1]), + bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), + out_grad(inputs[deconv::kOut]) {} -typedef ParamOpSign MKLDNNDeconvSignature; +MKLDNNDeconvBwd::WriteTensors::WriteTensors(bool no_bias, const std::vector &outputs) + : data_grad(outputs[deconv::kData]), + weight_grad(outputs[deconv::kWeight]), + bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {} -static inline MKLDNNDeconvBackward &GetDeconvBwd(const DeconvolutionParam ¶m, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { - using mkldnn_deconv_bwd_map = - std::unordered_map; +MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, + const ReadTensors &rt) { + using mkldnn_deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL static thread_local mkldnn_deconv_bwd_map bwds; #else static MX_THREAD_LOCAL mkldnn_deconv_bwd_map bwds; #endif - MKLDNNDeconvSignature key(param); + DeconvSignature key(param); // Here we can sign the conv op with NDArray because conv primitive will // decide the right layout for the, so we only need to get the shape and the // data type of the arrays. 
- key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); - if (bias) key.AddSign(*bias); + key.AddSign(rt.data); + key.AddSign(rt.weight); + key.AddSign(rt.out_grad); + if (rt.bias) key.AddSign(*rt.bias); auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = MKLDNNDeconvBackward(param, data, weights, bias, output); + auto bwd = MKLDNNDeconvBwd(param, rt); it = AddToCache(&bwds, key, bwd); } return it->second; } -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); - const std::vector &in_grad = outputs; - const DeconvolutionParam ¶m = nnvm::get(attrs.parsed); +std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const DeconvolutionParam ¶m, + const ReadTensors &rt, + const deconv_fwd_pd_t &fwd_pd) { + DeconvDescCreator ddc(param, rt.data, rt.weight, nullptr, rt.out_grad); + auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + + while (true) { + size_t data_size = pd->diff_src_desc().get_size(); + size_t weight_size = pd->weights_desc().get_size(); + size_t out_size = pd->diff_dst_desc().get_size(); + if (ddc.CheckImpl(data_size, weight_size, out_size)) break; + if (pd->next_impl()) continue; + ddc.ImposePlainWherePadding(data_size, weight_size, out_size); + *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + } + return pd; +} - auto &data = inputs[deconv::kData + 1]; - auto &weight = inputs[deconv::kWeight + 1]; - const auto *bias = param.no_bias ? nullptr : &inputs[deconv::kBias + 1]; - auto &out_grad = inputs[deconv::kOut]; +std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( + const DeconvolutionParam ¶m, const ReadTensors &rt, const deconv_fwd_pd_t &fwd_pd) { + DeconvDescCreator ddc(param, rt.data, rt.weight, rt.bias, rt.out_grad); + auto pd = std::make_shared(ddc.MakeBwdWeightDesc(), ddc.engine, fwd_pd); + + while (true) { + size_t data_size = pd->src_desc().get_size(); + size_t weight_size = pd->diff_weights_desc().get_size(); + size_t out_size = pd->diff_dst_desc().get_size(); + if (ddc.CheckImpl(data_size, weight_size, out_size)) break; + if (pd->next_impl()) continue; + ddc.ImposePlainWherePadding(data_size, weight_size, out_size); + *pd = deconv_bwd_weight_pd_t(ddc.MakeBwdWeightDesc(), ddc.engine, fwd_pd); + } + return pd; +} - CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - MKLDNNDeconvBackward &deconvBwd = GetDeconvBwd(param, data, weight, bias, out_grad); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetDataPd().diff_dst_desc()); +MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt) { + const auto fwd_pd = MKLDNNDeconvFwd::MakePD( // TODO: use cached? 
+ param, MKLDNNDeconvFwd::Tensors(rt.data, rt.weight, rt.bias, rt.out_grad)); + bwd_data_pd = MakeDataPD(param, rt, *fwd_pd); + bwd_weight_pd = MakeWeightsPD(param, rt, *fwd_pd); + bwd_data = std::make_shared(*bwd_data_pd); + bwd_weight = std::make_shared(*bwd_weight_pd); +} + +void MKLDNNDeconvBwd::Execute(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt) { + // swaps are explained in MKLDNNDeconvFwd::Execute + IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); + { + auto out_grad_mem = ScheduleBwdData(num_group, req, rt, wt); + ScheduleBwdWeight(num_group, req, rt, wt, out_grad_mem); + MKLDNNStream::Get()->Submit(); + } + IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); +} + +void MKLDNNDeconvBwd::IOSwapWeightTensors(uint32_t num_group, const std::vector &req, + const NDArray &weight, const NDArray &weight_grad) { + if (req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, num_group); + if (req[deconv::kWeight] || req[deconv::kBias]) IOLogicalSwapMKLDNNMem(weight_grad, num_group); +} + +const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(uint32_t num_group, + const std::vector &req, + const ReadTensors &rt, + const WriteTensors &wt) { if (req[deconv::kData]) { - // swap is explained in MKLDNNDeconvolutionForward - IOLogicalSwapMKLDNNMem(weight, param.num_group); - auto weight_mem = GetWeights(weight, deconvBwd.GetDataPd().weights_desc(), param.num_group); - auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], - deconvBwd.GetDataPd().diff_src_desc(), req[deconv::kData]); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_WEIGHTS, *weight_mem}, - {MKLDNN_ARG_DIFF_SRC, *in_grad_mem.second}}; - MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdData(), net_args); - CommitOutput(in_grad[deconv::kData], in_grad_mem); + mkldnn_args_map_t net_args; + auto out_grad_mem = OutGradMem(rt.out_grad); + auto data_grad_mem = DataGradMem(req[deconv::kData], wt.data_grad); + + net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, rt.weight)}); + net_args.insert({MKLDNN_ARG_DIFF_SRC, *data_grad_mem.second}); + + // CommitOutput Should run after RegisterPrimArgs for memory dependency + MKLDNNStream::Get()->RegisterPrimArgs(*bwd_data, net_args); + CommitOutput(wt.data_grad, data_grad_mem); + return out_grad_mem; // try reuse it in ScheduleBwdWeight } + return nullptr; +} + +void MKLDNNDeconvBwd::ScheduleBwdWeight(uint32_t num_group, const std::vector &req, + const ReadTensors &rt, const WriteTensors &wt, + const mkldnn::memory *out_grad_mem) { if (req[deconv::kWeight] || req[deconv::kBias]) { - if (deconvBwd.GetDataPd().diff_dst_desc() != deconvBwd.GetWeightsPd().diff_dst_desc()) - out_grad_mem = out_grad.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().diff_dst_desc()); - auto data_mem = data.GetMKLDNNDataReorder(deconvBwd.GetWeightsPd().src_desc()); - mkldnn_output_t in_grad_weight; - const mkldnn::memory::desc &wei_md = deconvBwd.GetWeightsPd().diff_weights_desc(); - // swaps are explained in MKLDNNDeconvolutionForward - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of logical swap) We try to reuse in_grad[deconv::kWeight] memory (which, when not swapped, is - // always in default format), so here we check if after a swap, wei_md will have a default - // format - if (req[deconv::kWeight] == OpReqType::kWriteTo && - IsDefaultFormat(IOLogicalSwapDesc(wei_md, param.num_group))) { - 
in_grad_weight = {OutDataOp::Noop, - const_cast(in_grad[deconv::kWeight]).CreateMKLDNNData(wei_md)}; - } else { - IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); - in_grad_weight = - CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], wei_md, req[deconv::kWeight]); - } + mkldnn_args_map_t net_args; + auto weight_grad_mem = WeightGradMem(num_group, req[deconv::kWeight], wt.weight_grad); + auto bias_grad_mem = BiasGradMem(req[deconv::kBias], wt.bias_grad); + + net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(rt.out_grad, out_grad_mem)}); + net_args.insert({MKLDNN_ARG_SRC, *DataMem(rt.data)}); + net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weight_grad_mem.second}); + if (bias_grad_mem.second) net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); - mkldnn_args_map_t net_args = {{MKLDNN_ARG_DIFF_DST, *out_grad_mem}, - {MKLDNN_ARG_SRC, *data_mem}, - {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}}; - mkldnn_output_t in_grad_bias; - if (!param.no_bias) { - in_grad_bias = CreateMKLDNNMem(in_grad[deconv::kBias], - deconvBwd.GetWeightsPd().diff_bias_desc(), req[deconv::kBias]); - net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second}); - } - MKLDNNStream::Get()->RegisterPrimArgs(deconvBwd.GetBwdWeights(), net_args); - CommitOutput(in_grad[deconv::kWeight], in_grad_weight); // CommitOutput Should run after RegisterPrimArgs for memory dependency - if (!param.no_bias) CommitOutput(in_grad[deconv::kBias], in_grad_bias); + MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weight, net_args); + CommitOutput(wt.weight_grad, weight_grad_mem); + if (bias_grad_mem.second) CommitOutput(*wt.bias_grad, bias_grad_mem); } - MKLDNNStream::Get()->Submit(); +} - // swap back from oihw to iohw - if (req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, param.num_group); - if (req[deconv::kWeight] || req[deconv::kBias]) - IOLogicalSwapMKLDNNMem(in_grad[deconv::kWeight], param.num_group); +const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { + return data.GetMKLDNNDataReorder(bwd_weight_pd->src_desc()); +} + +const mkldnn::memory *MKLDNNDeconvBwd::WeightMem(uint32_t num_group, const NDArray &weight) const { + return GetWeights(weight, bwd_data_pd->weights_desc(), num_group); +} + +const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { + return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); +} + +const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad, + const mkldnn::memory *out_grad_mem) const { + if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weight_pd->diff_dst_desc()) + return out_grad.GetMKLDNNDataReorder(bwd_weight_pd->diff_dst_desc()); + return out_grad_mem; +} + +mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(OpReqType req, const NDArray &data_grad) const { + return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); +} + +mkldnn_output_t MKLDNNDeconvBwd::WeightGradMem(uint32_t num_group, OpReqType req, + const NDArray &weight_grad) const { + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of the logical swap - explained in MKLDNNDeconvFwd::Execute). 
We try to reuse weight_grad + // memory (which, when not swapped, is always in default format), so here we check if after a + // swap, wei_md will have a default format + const auto &wei_md = bwd_weight_pd->diff_weights_desc(); + if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) + return {OutDataOp::Noop, const_cast(weight_grad).CreateMKLDNNData(wei_md)}; + return CreateMKLDNNWeightGrad(weight_grad, wei_md, req); +} + +mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(OpReqType req, const NDArray *bias) const { + return bias ? CreateMKLDNNMem(*bias, bwd_weight_pd->diff_bias_desc(), req) + : mkldnn_output_t(OutDataOp::Noop, nullptr); +} + +/*############################### DeconvDescCreator ###############################*/ + +DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, + const NDArray &weight, const NDArray *bias, const NDArray &out) + : data_md(GetMemDesc(data)), + weight_md(GetDeconvWeightDesc(weight, param.num_group)), + bias_md(bias ? GetMemDesc(*bias) : mkldnn::memory::desc()), + out_md(GetMemDesc(out)), + strides(param.stride.ndim()), + padding(param.pad.ndim()), + dilates(param.dilate.ndim()), + engine(CpuEngine::Get()->get_engine()) { + // assuming only deconv2D is supported for now + CHECK(param.stride.ndim() == param.pad.ndim() && param.stride.ndim() == param.dilate.ndim()); + CHECK(param.stride.ndim() == 2); + for (int i = 0; i < param.stride.ndim(); ++i) { + strides[i] = param.stride[i]; + padding[i] = param.pad[i]; + dilates[i] = param.dilate[i] - 1; + } +} + +void DeconvDescCreator::ImposePlainWherePadding(size_t data_size, size_t weight_size, + size_t out_size) { + if (data_size != GetMemDescSize(data_md)) { + CHECK(data_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + data_md = GetDesc(data_md, GetDefaultFormat(data_md)); + } else if (out_size != GetMemDescSize(out_md)) { + CHECK(out_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + out_md = GetDesc(out_md, GetDefaultFormat(out_md)); + } else if (weight_size != GetMemDescSize(weight_md)) { + CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + int num_groups = (weight_md.data.ndims > data_md.data.ndims) ? weight_md.data.dims[0] : 1; + weight_md = IOLogicalSwapDesc(weight_md, num_groups); + weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); + } +} + +bool DeconvDescCreator::CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const { + // MKLDNN introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. 
Currently, MKLDNN operators + // still reuse memory from memory planning, so here we need to accept only a + // kernel that has the expected memory size requirements (which is suboptimal) + return (data_size == GetMemDescSize(data_md) && weight_size == GetMemDescSize(weight_md) && + out_size == GetMemDescSize(out_md)); +} + +deconv_fwd_t::desc DeconvDescCreator::MakeFwdDesc() const { + // TODO: check if forward_training should be constant + return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, + out_md, strides, dilates, padding, padding); +} + +deconv_bwd_t::desc DeconvDescCreator::MakeBwdDataDesc() const { + return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, + strides, dilates, padding, padding); +} + +deconv_bwd_weight_t::desc DeconvDescCreator::MakeBwdWeightDesc() const { + return deconv_bwd_weight_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, + bias_md, out_md, strides, dilates, padding, padding); +} + +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). +mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { + auto &d = desc.data; + int offset = int(num_groups > 1); + int dim0 = offset + 0; + int dim1 = offset + 1; + std::swap(d.dims[dim0], d.dims[dim1]); + std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); + if (d.format_kind != dnnl_format_kind_any) { + std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); + // as padding is not supported, these are always zeros? + std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); + // for blocked format: change indices + for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { + auto &val = d.format_desc.blocking.inner_idxs[i]; + if (val == dim0) { + val = dim1; + } else if (val == dim1) { + val = dim0; + } + } + } + return desc; +} + +// Applies IOLogicalSwapDesc to MKLDNN memory of arr +void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { + mkldnn::memory::desc desc; + if (arr.IsMKLDNNData()) { + desc = arr.GetMKLDNNData()->get_desc(); + } else { + // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use + // descriptor from GetWeightDesc but with default format + const auto &temp = GetWeightDesc(arr, num_groups); + desc = mkldnn::memory::desc( + temp.dims(), temp.data_type(), + static_cast(GetDefaultFormat(temp.data.ndims))); + } + const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); } } // namespace op From 1c8987bb938314f480b4a1102c0abfd70cf1bb53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Mon, 15 Mar 2021 15:09:35 +0100 Subject: [PATCH 4/9] Refactor deconvolution version 2 and use permute_axes in IOLogicalSwapDesc --- .../nn/mkldnn/mkldnn_deconvolution-inl.h | 153 +++--- .../nn/mkldnn/mkldnn_deconvolution.cc | 452 +++++++++--------- 2 files changed, 329 insertions(+), 276 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index f2638013ac3d..404cacc500c5 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -20,10 +20,21 @@ /*! 
* \file mkldnn_deconvolution-inl.h * \brief - * \Author: Paweł Głomski, pawel.glomski@intel.com + * ________ + * Data---->|Deconv| + * Weight-->| FWD |--->out + * Bias---->|______| + * ________ + * Data_grad<----|Deconv|<---out_grad + * Weight_grad<--| BWD |<---data + * Bias_grad<----| |<---Weight + * |______|<---Bias + * + * "out" in this (and .cc) file will always refer to the output of Deconv FWD and + * "out_grad" to its gradient */ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ #if MXNET_USE_MKLDNN == 1 #include "../deconvolution-inl.h" @@ -39,18 +50,20 @@ using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; using deconv_bwd_t = mkldnn::deconvolution_backward_data; using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; -using deconv_bwd_weight_t = mkldnn::deconvolution_backward_weights; -using deconv_bwd_weight_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; +using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights; +using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; class MKLDNNDeconvFwd { public: struct Tensors { - Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, const NDArray &out); - Tensors(bool no_bias, const std::vector &inputs, const std::vector &outputs); + Tensors(const NDArray &data, const NDArray &weights, const NDArray *const bias, + const NDArray &out); + Tensors(const bool no_bias, const std::vector &inputs, + const std::vector &outputs); const NDArray &data; - const NDArray &weight; - const NDArray *bias; + const NDArray &weights; + const NDArray *const bias; const NDArray &out; }; @@ -59,15 +72,15 @@ class MKLDNNDeconvFwd { const Tensors &tensors); MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors); - void ControlWeightFormat(uint32_t num_group, bool is_train, const NDArray &weight); - void Execute(uint32_t num_group, const std::vector &req, const Tensors &tensors); + void ControlWeightsFormat(const uint32_t num_group, const bool is_train, const NDArray &weights); + void Execute(const uint32_t num_group, const std::vector &req, const Tensors &tensors); private: const mkldnn::memory *DataMem(const NDArray &data) const; - const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; + const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const; const mkldnn::memory *BiasMem(const NDArray &bias) const; - mkldnn_output_t OutMem(OpReqType req, const NDArray &out) const; + mkldnn_output_t OutMem(const OpReqType req, const NDArray &out) const; std::shared_ptr fwd; std::shared_ptr fwd_pd; @@ -76,75 +89,89 @@ class MKLDNNDeconvFwd { class MKLDNNDeconvBwd { public: struct ReadTensors { - ReadTensors(bool no_bias, const std::vector &inputs); + ReadTensors(const bool no_bias, const std::vector &inputs); const NDArray &data; - const NDArray &weight; - const NDArray *bias; + const NDArray &weights; + const NDArray *const bias; const NDArray &out_grad; }; struct WriteTensors { - WriteTensors(bool no_bias, const std::vector &outputs); + WriteTensors(const bool no_bias, const std::vector &outputs); const NDArray &data_grad; - const NDArray &weight_grad; - const NDArray *bias_grad; + const NDArray &weights_grad; + const NDArray *const bias_grad; }; - static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, 
const ReadTensors &rt); + static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, + const ReadTensors &read_tensors); static std::shared_ptr MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &rt, + const ReadTensors &read_tensors, const deconv_fwd_pd_t &fwd_pd); - static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, - const ReadTensors &rt, - const deconv_fwd_pd_t &fwd_pd); + static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, + const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd); - MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt); - void Execute(uint32_t num_group, const std::vector &req, const ReadTensors &rt, - const WriteTensors &wt); + MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors); + void Execute(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, const WriteTensors &write_tensors); private: - void IOSwapWeightTensors(uint32_t num_group, const std::vector &req, - const NDArray &weight, const NDArray &weight_grad); + void IOSwapWeightsTensors(const uint32_t num_group, const std::vector &req, + const NDArray &weights, const NDArray &weights_grad); - const mkldnn::memory *ScheduleBwdData(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt); + // returns the output gradient memory used to calculate the data (input) gradient, which + // might be reused when calculating the gradient of weights + const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, + const WriteTensors &write_tensors); - void ScheduleBwdWeight(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt, - const mkldnn::memory *out_grad_mem); + void ScheduleBwdWeights(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, const WriteTensors &write_tensors, + const mkldnn::memory *const out_grad_mem); const mkldnn::memory *DataMem(const NDArray &data) const; - const mkldnn::memory *WeightMem(uint32_t num_group, const NDArray &weight) const; - const mkldnn::memory *OutGradMem(const NDArray &out_grad) const; // for bwd data - const mkldnn::memory *OutGradMem(const NDArray &out_grad, // for bwd weight - const mkldnn::memory *out_grad_mem) const; + const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const; + + // for calculating the gradient of data (input) + const mkldnn::memory *OutGradMem(const NDArray &out_grad) const; + // for calculating the gradient of weights + const mkldnn::memory *OutGradMem(const NDArray &out_grad, + const mkldnn::memory *const out_grad_mem) const; - mkldnn_output_t DataGradMem(OpReqType req, const NDArray &data_grad) const; - mkldnn_output_t WeightGradMem(uint32_t num_group, OpReqType req, - const NDArray &weight_grad) const; - mkldnn_output_t BiasGradMem(OpReqType req, const NDArray *bias) const; + mkldnn_output_t DataGradMem(const OpReqType req, const NDArray &data_grad) const; + mkldnn_output_t WeightsGradMem(const uint32_t num_group, const OpReqType req, + const NDArray &weights_grad) const; + mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray *const bias) const; std::shared_ptr bwd_data_pd; - std::shared_ptr bwd_weight_pd; + std::shared_ptr bwd_weights_pd; std::shared_ptr bwd_data; - std::shared_ptr bwd_weight; -}; // namespace op + std::shared_ptr bwd_weights; +}; +// Utility class for creating operation descriptors of deconvolution primitives 
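[Editorial aside - not part of the patch] The CheckImplSizeReq / ImposePlainWherePadding members of the class declared below guard against oneDNN's padded (blocked) formats, whose memory requirement can exceed the dense tensor size that MXNet's memory planning has already allocated. A minimal sketch of that size mismatch, assuming the mkldnn v1.x compatibility header:

#include <mkldnn.hpp>

void PaddedFormatSizeExample() {
  using mkldnn::memory;
  // Plain nchw: 1 * 3 * 224 * 224 floats.
  memory::desc plain_md({1, 3, 224, 224}, memory::data_type::f32, memory::format_tag::nchw);
  // Blocked nChw16c: the channel dimension is padded from 3 up to the block size of 16.
  memory::desc blocked_md({1, 3, 224, 224}, memory::data_type::f32, memory::format_tag::nChw16c);
  // plain_md.get_size()   == 1 *  3 * 224 * 224 * 4 ==   602112 bytes
  // blocked_md.get_size() == 1 * 16 * 224 * 224 * 4 == 3211264 bytes
}
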
struct DeconvDescCreator { - DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weight, - const NDArray *bias, const NDArray &out); - - // Imposes plain formats on memory descriptors with padding - // Changing only one at a time, so maybe better implementations will be selected - // (than entirely plain one) - void ImposePlainWherePadding(size_t data_size, size_t weight_size, size_t out_size); - bool CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const; + DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + const NDArray *const bias, const NDArray &out); + + // Imposes plain formats on memory descriptors with padding (so the next selected implementation + // will pass CheckImplSizeReq). After calling this method, new primitive descriptor (with new + // operator descriptor) should be created, which should select an implementation with matching + // size requirements. + // data_size, weights_size, out_size - size requirements of current implementation + // Returns whether successfully imposed a plain format on any of the data, weights, and output + // memory descriptors. + bool ImposePlainWherePadding(const size_t data_size, const size_t weights_size, + const size_t out_size); + bool CheckImplSizeReq(const size_t data_size, const size_t weights_size, + const size_t out_size) const; deconv_fwd_t::desc MakeFwdDesc() const; deconv_bwd_t::desc MakeBwdDataDesc() const; - deconv_bwd_weight_t::desc MakeBwdWeightDesc() const; + deconv_bwd_weights_t::desc MakeBwdWeightsDesc() const; mkldnn::memory::desc data_md; - mkldnn::memory::desc weight_md; + mkldnn::memory::desc weights_md; mkldnn::memory::desc bias_md; mkldnn::memory::desc out_md; @@ -155,15 +182,19 @@ struct DeconvDescCreator { mkldnn::engine &engine; }; -mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups); -void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups); +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). 
+mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups); + +// Applies IOLogicalSwapDesc to MKLDNN memory of arr +void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups); -// Version of GetWeightDesc for deconvolution (with swap) -static inline mkldnn::memory::desc GetDeconvWeightDesc(const NDArray &weight, int num_groups) { - return IOLogicalSwapDesc(GetWeightDesc(weight, num_groups), num_groups); +// Version of GetWeightsDesc for deconvolution (with swap) +inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const int num_groups) { + return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); } } // namespace op } // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index b5e7e4166284..699318d9beec 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -30,12 +30,11 @@ namespace mxnet { namespace op { bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) { - if (params.kernel.ndim() != 2) return false; - return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) && - input.shape().ndim() == 4; + return params.kernel.ndim() == 2 && input.shape().ndim() == 4 && + (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16); } -/*############################### Forward ###############################*/ +// Forward void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -46,18 +45,18 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c const auto &tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); MKLDNNDeconvFwd &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); - fwd.ControlWeightFormat(param.num_group, ctx.is_train, tensors.weight); + fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights); fwd.Execute(param.num_group, req, tensors); } -MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weight, const NDArray *bias, - const NDArray &out) - : data(data), weight(weight), bias(bias), out(out) {} +MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights, + const NDArray *const bias, const NDArray &out) + : data(data), weights(weights), bias(bias), out(out) {} -MKLDNNDeconvFwd::Tensors::Tensors(bool no_bias, const std::vector &inputs, +MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector &inputs, const std::vector &outputs) : data(inputs[deconv::kData]), - weight(inputs[deconv::kWeight]), + weights(inputs[deconv::kWeight]), bias(no_bias ? nullptr : &inputs[deconv::kBias]), out(outputs[deconv::kOut]) {} @@ -70,17 +69,16 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, static MX_THREAD_LOCAL deconv_fwd_map fwds; #endif DeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. 
key.AddSign(tensors.data); - key.AddSign(tensors.weight); + key.AddSign(tensors.weights); key.AddSign(tensors.out); - if (tensors.bias) key.AddSign(*tensors.bias); + if (tensors.bias) { + key.AddSign(*tensors.bias); + } auto it = fwds.find(key); if (it == fwds.end()) { - auto fwd = MKLDNNDeconvFwd(param, tensors); + const MKLDNNDeconvFwd fwd(param, tensors); it = AddToCache(&fwds, key, fwd); } return it->second; @@ -88,17 +86,20 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionParam ¶m, const Tensors &tensors) { - DeconvDescCreator ddc(param, tensors.data, tensors.weight, tensors.bias, tensors.out); - auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); - - while (true) { - size_t data_size = pd->src_desc().get_size(); - size_t weight_size = pd->weights_desc().get_size(); - size_t out_size = pd->dst_desc().get_size(); - if (ddc.CheckImpl(data_size, weight_size, out_size)) break; - if (pd->next_impl()) continue; - ddc.ImposePlainWherePadding(data_size, weight_size, out_size); - *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out); + const auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); + const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; + const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; + const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); }; + + while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) { + if (!pd->next_impl()) { + // ImposePlainWherePadding fails when all memory descriptors already have plain formats + // imposed, meaning there is no implementation with plain formats + CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) + << "No implementation of deconvolution forward propagation"; + *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + } } return pd; } @@ -108,243 +109,273 @@ MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors fwd = std::make_shared(*fwd_pd); } -void MKLDNNDeconvFwd::ControlWeightFormat(uint32_t num_group, bool is_train, - const NDArray &weight) { +void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool is_train, + const NDArray &weights) { if (is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. - if (weight.IsMKLDNNData()) - // This asks the engine to change the layout of the weight array after it's used. - weight.Reorder2DefaultAsync(); + if (weights.IsMKLDNNData()) { + // This asks the engine to change the layout of the weights array after it's used. + weights.Reorder2DefaultAsync(); + } } else { - // For inference, we want to reorder the weight array so we don't need to + // For inference, we want to reorder the weights array so we don't need to // reorder data every time. - if (weight.IsDefaultData()) { - // We also need to modify the layout on the original weight array. The - // data conversion happens after the weight array is used. - weight.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); + if (weights.IsDefaultData()) { + // We also need to modify the layout on the original weights array. + // The data conversion happens after the weights array is used. 
+ weights.MKLDNNDataReorderAsync(IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } else { - CHECK(weight.GetMKLDNNData()->get_desc() == + CHECK(weights.GetMKLDNNData()->get_desc() == IOLogicalSwapDesc(fwd_pd->weights_desc(), num_group)); } } } -void MKLDNNDeconvFwd::Execute(uint32_t num_group, const std::vector &req, +void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const std::vector &req, const Tensors &tensors) { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. - // For that, we would pass input tensor in place of output and output tensor in place of - // input (for appropriate convolution primitives: deconvolution forward = convolution backward - // data, deconvolution backward data = convolution forward). Convolution primitive expects - // weight tensor with shape (o, i, h, w), but because we swapped input and output tensors: - // o = input_channels, i = output_channels. So in that case, deconvolution needs a weight - // tensor with shape (input_channels, output_channels, h, w) and MXNet provides such tensor. + // For that, we would pass input tensor in place of output and output tensor in place of input + // (for appropriate convolution primitives: deconvolution forward = convolution backward data, + // deconvolution backward data = convolution forward). + // The convolution primitive expects weights tensor with the shape of + // (primitive_out_channels, primitive_in_channels, h, w), but with swapped input and output: + // primitive_out_channels = deconv_in_channels, primitive_in_channels = deconv_out_channels, + // so it becomes (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such tensor. // - // MKLDNN's deconvolution primitive also expects weight tensor with shape (o, i, h, w), - // but this time we don't swap input and output tensors, so o = output_channels, i = - // input_channels, so the current weight tensor won't fit (when oihw != iohw). But actually, - // underneath deconvolution MKLDNN also uses convolution, so even though it expects the weight - // tensor with shape (o, i, h, w), it wants it in iohw format, so it's physical representation - // match current weight tensor. + // MKLDNN deconvolution primitive also (as convolution) expects weights tensor with the shape of + // (primitive_out_channels, primitive_in_channels, h, w), but this time we don't swap input and + // output tensors, so: + // primitive_out_channels = deconv_out_channels, primitive_in_channels = deconv_in_channels, + // thus the current weights tensor won't fit (when deconv_out_channels != deconv_in_channels). + // However, underneath deconvolution MKLDNN also uses convolution, so even though it expects the + // weights tensor with the logical order of oihw, it wants its physical representation to + // match the order of iohw, which is the same as current weights tensor. // - // So here we swap logical order of input and output dimensions for weight tensor just for - // MKLDNN operations - IOLogicalSwapMKLDNNMem(tensors.weight, num_group); + // So here we swap logical order of input and output dimensions for weights tensor just for + // MKLDNN operations. 
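[Editorial aside - not part of the patch] A concrete sketch of the logical swap described in the comment above, using made-up shapes (in_channels = 3, out_channels = 8, 5x5 kernel, no groups); per this commit's title, IOLogicalSwapDesc expresses the same swap through mkldnn::memory::desc::permute_axes.

#include <mkldnn.hpp>

void IOLogicalSwapExample() {
  using mkldnn::memory;
  // Weights as MXNet provides them for deconvolution: (in_channels, out_channels, h, w)
  // in the plain row-major layout (which oneDNN labels oihw for 4D weights).
  memory::desc mxnet_wei_md({3, 8, 5, 5}, memory::data_type::f32, memory::format_tag::oihw);
  // Swapping the first two logical axes (what IOLogicalSwapDesc does for num_groups == 1)
  // gives dims (8, 3, 5, 5) while the strides keep addressing the very same buffer:
  // logically oihw, as the deconvolution primitive expects, physically still iohw.
  memory::desc swapped_md = mxnet_wei_md.permute_axes({1, 0, 2, 3});
  (void)swapped_md;
}
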
+ IOLogicalSwapMKLDNNMem(tensors.weights, num_group); { mkldnn_args_map_t net_args; - auto out_mem = OutMem(req[deconv::kOut], tensors.out); + const auto &out_mem = OutMem(req[deconv::kOut], tensors.out); net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, tensors.weight)}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)}); net_args.insert({MKLDNN_ARG_DST, *out_mem.second}); - if (tensors.bias) net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); + if (tensors.bias) { + net_args.insert({MKLDNN_ARG_BIAS, *BiasMem(*tensors.bias)}); + } - // CommitOutput Should run after RegisterPrimArgs for memory dependency + // CommitOutput should run after RegisterPrimArgs for memory dependency MKLDNNStream::Get()->RegisterPrimArgs(*fwd, net_args); CommitOutput(tensors.out, out_mem); MKLDNNStream::Get()->Submit(); } - IOLogicalSwapMKLDNNMem(tensors.weight, num_group); // swap back from oihw to iohw + IOLogicalSwapMKLDNNMem(tensors.weights, num_group); // swap back from oihw to iohw } const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const { return data.GetMKLDNNDataReorder(fwd_pd->src_desc()); } -const mkldnn::memory *MKLDNNDeconvFwd::WeightMem(uint32_t num_group, const NDArray &weight) const { - return GetWeights(weight, fwd_pd->weights_desc(), num_group); +const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, fwd_pd->weights_desc(), num_group); } const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { return bias.GetMKLDNNData(); } -mkldnn_output_t MKLDNNDeconvFwd::OutMem(OpReqType req, const NDArray &out) const { +mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const { return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); } -/*############################### Backward ###############################*/ +// Backward void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "Cannot write weights inplace"; TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto ¶m = nnvm::get(attrs.parsed); - const auto &rt = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); - const auto &wt = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); - MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, rt); + const auto &read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto &write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); + MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors); - bwd.Execute(param.num_group, req, rt, wt); + bwd.Execute(param.num_group, req, read_tensors, write_tensors); } -MKLDNNDeconvBwd::ReadTensors::ReadTensors(bool no_bias, const std::vector &inputs) +MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector &inputs) : data(inputs[deconv::kData + 1]), - weight(inputs[deconv::kWeight + 1]), + weights(inputs[deconv::kWeight + 1]), bias(no_bias ? 
nullptr : &inputs[deconv::kBias + 1]), out_grad(inputs[deconv::kOut]) {} -MKLDNNDeconvBwd::WriteTensors::WriteTensors(bool no_bias, const std::vector &outputs) +MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector &outputs) : data_grad(outputs[deconv::kData]), - weight_grad(outputs[deconv::kWeight]), + weights_grad(outputs[deconv::kWeight]), bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {} MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, - const ReadTensors &rt) { - using mkldnn_deconv_bwd_map = std::unordered_map; + const ReadTensors &read_tensors) { + using deconv_bwd_map = std::unordered_map; #if DMLC_CXX11_THREAD_LOCAL - static thread_local mkldnn_deconv_bwd_map bwds; + static thread_local deconv_bwd_map bwds; #else - static MX_THREAD_LOCAL mkldnn_deconv_bwd_map bwds; + static MX_THREAD_LOCAL deconv_bwd_map bwds; #endif DeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. - key.AddSign(rt.data); - key.AddSign(rt.weight); - key.AddSign(rt.out_grad); - if (rt.bias) key.AddSign(*rt.bias); + key.AddSign(read_tensors.data); + key.AddSign(read_tensors.weights); + key.AddSign(read_tensors.out_grad); + if (read_tensors.bias) { + key.AddSign(*read_tensors.bias); + } auto it = bwds.find(key); if (it == bwds.end()) { - auto bwd = MKLDNNDeconvBwd(param, rt); + const MKLDNNDeconvBwd bwd(param, read_tensors); it = AddToCache(&bwds, key, bwd); } return it->second; } std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &rt, + const ReadTensors &read_tensors, const deconv_fwd_pd_t &fwd_pd) { - DeconvDescCreator ddc(param, rt.data, rt.weight, nullptr, rt.out_grad); - auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); - - while (true) { - size_t data_size = pd->diff_src_desc().get_size(); - size_t weight_size = pd->weights_desc().get_size(); - size_t out_size = pd->diff_dst_desc().get_size(); - if (ddc.CheckImpl(data_size, weight_size, out_size)) break; - if (pd->next_impl()) continue; - ddc.ImposePlainWherePadding(data_size, weight_size, out_size); - *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, nullptr, + read_tensors.out_grad); + const auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); }; + const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; + const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; + + while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) { + if (!pd->next_impl()) { + // ImposePlainWherePadding fails when all memory descriptors already have plain formats + // imposed, meaning there is no implementation with plain formats + CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) + << "No implementation of deconvolution backward propagation"; + *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + } } return pd; } -std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( - const DeconvolutionParam ¶m, const ReadTensors &rt, const deconv_fwd_pd_t &fwd_pd) { - DeconvDescCreator ddc(param, rt.data, rt.weight, rt.bias, rt.out_grad); - auto pd = std::make_shared(ddc.MakeBwdWeightDesc(), ddc.engine, 
fwd_pd); - - while (true) { - size_t data_size = pd->src_desc().get_size(); - size_t weight_size = pd->diff_weights_desc().get_size(); - size_t out_size = pd->diff_dst_desc().get_size(); - if (ddc.CheckImpl(data_size, weight_size, out_size)) break; - if (pd->next_impl()) continue; - ddc.ImposePlainWherePadding(data_size, weight_size, out_size); - *pd = deconv_bwd_weight_pd_t(ddc.MakeBwdWeightDesc(), ddc.engine, fwd_pd); +std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd) { + DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, read_tensors.bias, + read_tensors.out_grad); + const auto pd = + std::make_shared(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; + const auto get_weights_size = [&pd]() { return pd->diff_weights_desc().get_size(); }; + const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; + + while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) { + if (!pd->next_impl()) { + // ImposePlainWherePadding fails when all memory descriptors already have plain formats + // imposed, meaning there is no implementation with plain formats + CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) + << "No implementation of calculating deconvolution weights gradient"; + *pd = deconv_bwd_weights_pd_t(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + } } return pd; } -MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &rt) { - const auto fwd_pd = MKLDNNDeconvFwd::MakePD( // TODO: use cached? - param, MKLDNNDeconvFwd::Tensors(rt.data, rt.weight, rt.bias, rt.out_grad)); - bwd_data_pd = MakeDataPD(param, rt, *fwd_pd); - bwd_weight_pd = MakeWeightsPD(param, rt, *fwd_pd); +MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { + const auto &fwd_pd = MKLDNNDeconvFwd::MakePD( + param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias, + read_tensors.out_grad)); + bwd_data_pd = MakeDataPD(param, read_tensors, *fwd_pd); + bwd_weights_pd = MakeWeightsPD(param, read_tensors, *fwd_pd); bwd_data = std::make_shared(*bwd_data_pd); - bwd_weight = std::make_shared(*bwd_weight_pd); + bwd_weights = std::make_shared(*bwd_weights_pd); } -void MKLDNNDeconvBwd::Execute(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt) { +void MKLDNNDeconvBwd::Execute(const uint32_t num_group, const std::vector &req, + const ReadTensors &read_tensors, const WriteTensors &write_tensors) { // swaps are explained in MKLDNNDeconvFwd::Execute - IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); + IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); { - auto out_grad_mem = ScheduleBwdData(num_group, req, rt, wt); - ScheduleBwdWeight(num_group, req, rt, wt, out_grad_mem); + auto *const out_grad_mem = ScheduleBwdData(num_group, req, read_tensors, write_tensors); + ScheduleBwdWeights(num_group, req, read_tensors, write_tensors, out_grad_mem); MKLDNNStream::Get()->Submit(); } - IOSwapWeightTensors(num_group, req, rt.weight, wt.weight_grad); + IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); } -void MKLDNNDeconvBwd::IOSwapWeightTensors(uint32_t num_group, const std::vector &req, - const NDArray &weight, const NDArray &weight_grad) { - if 
(req[deconv::kData]) IOLogicalSwapMKLDNNMem(weight, num_group); - if (req[deconv::kWeight] || req[deconv::kBias]) IOLogicalSwapMKLDNNMem(weight_grad, num_group); +void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, + const std::vector &req, + const NDArray &weights, const NDArray &weights_grad) { + if (req[deconv::kData]) { + IOLogicalSwapMKLDNNMem(weights, num_group); + } + if (req[deconv::kWeight] || req[deconv::kBias]) { + IOLogicalSwapMKLDNNMem(weights_grad, num_group); + } } -const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(uint32_t num_group, +const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(const uint32_t num_group, const std::vector &req, - const ReadTensors &rt, - const WriteTensors &wt) { + const ReadTensors &read_tensors, + const WriteTensors &write_tensors) { if (req[deconv::kData]) { mkldnn_args_map_t net_args; - auto out_grad_mem = OutGradMem(rt.out_grad); - auto data_grad_mem = DataGradMem(req[deconv::kData], wt.data_grad); + auto *const out_grad_mem = OutGradMem(read_tensors.out_grad); + const auto &data_grad_mem = DataGradMem(req[deconv::kData], write_tensors.data_grad); net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); - net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightMem(num_group, rt.weight)}); + net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)}); net_args.insert({MKLDNN_ARG_DIFF_SRC, *data_grad_mem.second}); - // CommitOutput Should run after RegisterPrimArgs for memory dependency + // CommitOutput should run after RegisterPrimArgs for memory dependency MKLDNNStream::Get()->RegisterPrimArgs(*bwd_data, net_args); - CommitOutput(wt.data_grad, data_grad_mem); - return out_grad_mem; // try reuse it in ScheduleBwdWeight + CommitOutput(write_tensors.data_grad, data_grad_mem); + return out_grad_mem; } return nullptr; } -void MKLDNNDeconvBwd::ScheduleBwdWeight(uint32_t num_group, const std::vector &req, - const ReadTensors &rt, const WriteTensors &wt, - const mkldnn::memory *out_grad_mem) { +void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, + const std::vector &req, + const ReadTensors &read_tensors, + const WriteTensors &write_tensors, + const mkldnn::memory *const out_grad_mem) { if (req[deconv::kWeight] || req[deconv::kBias]) { mkldnn_args_map_t net_args; - auto weight_grad_mem = WeightGradMem(num_group, req[deconv::kWeight], wt.weight_grad); - auto bias_grad_mem = BiasGradMem(req[deconv::kBias], wt.bias_grad); - - net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(rt.out_grad, out_grad_mem)}); - net_args.insert({MKLDNN_ARG_SRC, *DataMem(rt.data)}); - net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weight_grad_mem.second}); - if (bias_grad_mem.second) net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); - - // CommitOutput Should run after RegisterPrimArgs for memory dependency - MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weight, net_args); - CommitOutput(wt.weight_grad, weight_grad_mem); - if (bias_grad_mem.second) CommitOutput(*wt.bias_grad, bias_grad_mem); + const auto &weights_grad_mem = + WeightsGradMem(num_group, req[deconv::kWeight], write_tensors.weights_grad); + const auto &bias_grad_mem = BiasGradMem(req[deconv::kBias], write_tensors.bias_grad); + + net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)}); + net_args.insert({MKLDNN_ARG_SRC, *DataMem(read_tensors.data)}); + net_args.insert({MKLDNN_ARG_DIFF_WEIGHTS, *weights_grad_mem.second}); + if (bias_grad_mem.second) { + net_args.insert({MKLDNN_ARG_DIFF_BIAS, *bias_grad_mem.second}); + } + + // 
CommitOutput should run after RegisterPrimArgs for memory dependency + MKLDNNStream::Get()->RegisterPrimArgs(*bwd_weights, net_args); + CommitOutput(write_tensors.weights_grad, weights_grad_mem); + if (bias_grad_mem.second) { + CommitOutput(*write_tensors.bias_grad, bias_grad_mem); + } } } const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { - return data.GetMKLDNNDataReorder(bwd_weight_pd->src_desc()); + return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); } -const mkldnn::memory *MKLDNNDeconvBwd::WeightMem(uint32_t num_group, const NDArray &weight) const { - return GetWeights(weight, bwd_data_pd->weights_desc(), num_group); +const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); } const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { @@ -352,39 +383,42 @@ const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const } const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad, - const mkldnn::memory *out_grad_mem) const { - if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weight_pd->diff_dst_desc()) - return out_grad.GetMKLDNNDataReorder(bwd_weight_pd->diff_dst_desc()); + const mkldnn::memory *const out_grad_mem) const { + if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weights_pd->diff_dst_desc()) { + return out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); + } return out_grad_mem; } -mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(OpReqType req, const NDArray &data_grad) const { +mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, const NDArray &data_grad) const { return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); } -mkldnn_output_t MKLDNNDeconvBwd::WeightGradMem(uint32_t num_group, OpReqType req, - const NDArray &weight_grad) const { +mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, const OpReqType req, + const NDArray &weights_grad) const { // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weight_grad + // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad // memory (which, when not swapped, is always in default format), so here we check if after a // swap, wei_md will have a default format - const auto &wei_md = bwd_weight_pd->diff_weights_desc(); - if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) - return {OutDataOp::Noop, const_cast(weight_grad).CreateMKLDNNData(wei_md)}; - return CreateMKLDNNWeightGrad(weight_grad, wei_md, req); + const auto &wei_md = bwd_weights_pd->diff_weights_desc(); + if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) { + return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(wei_md)}; + } + return CreateMKLDNNWeightGrad(weights_grad, wei_md, req); } -mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(OpReqType req, const NDArray *bias) const { - return bias ? CreateMKLDNNMem(*bias, bwd_weight_pd->diff_bias_desc(), req) +mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, const NDArray *const bias) const { + return bias ? 
CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) : mkldnn_output_t(OutDataOp::Noop, nullptr); } -/*############################### DeconvDescCreator ###############################*/ +// DeconvDescCreator DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, - const NDArray &weight, const NDArray *bias, const NDArray &out) + const NDArray &weights, const NDArray *const bias, + const NDArray &out) : data_md(GetMemDesc(data)), - weight_md(GetDeconvWeightDesc(weight, param.num_group)), + weights_md(GetDeconvWeightsDesc(weights, param.num_group)), bias_md(bias ? GetMemDesc(*bias) : mkldnn::memory::desc()), out_md(GetMemDesc(out)), strides(param.stride.ndim()), @@ -401,76 +435,64 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDAr } } -void DeconvDescCreator::ImposePlainWherePadding(size_t data_size, size_t weight_size, - size_t out_size) { - if (data_size != GetMemDescSize(data_md)) { - CHECK(data_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; +bool DeconvDescCreator::ImposePlainWherePadding(const size_t data_size, const size_t weights_size, + const size_t out_size) { + // Changing only one at a time, so maybe better implementations will be selected (than entirely + // plain one) + if (data_md.data.format_kind == dnnl_format_kind_any && data_size != GetMemDescSize(data_md)) { data_md = GetDesc(data_md, GetDefaultFormat(data_md)); - } else if (out_size != GetMemDescSize(out_md)) { - CHECK(out_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; + return true; + } else if (out_md.data.format_kind == dnnl_format_kind_any && + out_size != GetMemDescSize(out_md)) { out_md = GetDesc(out_md, GetDefaultFormat(out_md)); - } else if (weight_size != GetMemDescSize(weight_md)) { - CHECK(weight_md.data.format_kind == dnnl_format_kind_any) << "No implementation"; - int num_groups = (weight_md.data.ndims > data_md.data.ndims) ? weight_md.data.dims[0] : 1; - weight_md = IOLogicalSwapDesc(weight_md, num_groups); - weight_md = IOLogicalSwapDesc(GetDesc(weight_md, GetDefaultFormat(weight_md)), num_groups); + return true; + } else if (weights_md.data.format_kind == dnnl_format_kind_any && + weights_size != GetMemDescSize(weights_md)) { + const int num_gr = (weights_md.data.ndims > data_md.data.ndims) ? weights_md.data.dims[0] : 1; + weights_md = IOLogicalSwapDesc(weights_md, num_gr); + weights_md = IOLogicalSwapDesc(GetDesc(weights_md, GetDefaultFormat(weights_md)), num_gr); + return true; } + return false; } -bool DeconvDescCreator::CheckImpl(size_t data_size, size_t weight_size, size_t out_size) const { +bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, + const size_t out_size) const { // MKLDNN introduced padded formats since 0.15 which require more memory // compared to the actual size of the tensor. 
Currently, MKLDNN operators // still reuse memory from memory planning, so here we need to accept only a // kernel that has the expected memory size requirements (which is suboptimal) - return (data_size == GetMemDescSize(data_md) && weight_size == GetMemDescSize(weight_md) && + return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && out_size == GetMemDescSize(out_md)); } deconv_fwd_t::desc DeconvDescCreator::MakeFwdDesc() const { - // TODO: check if forward_training should be constant return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::deconvolution_direct, data_md, weight_md, bias_md, + mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md, out_md, strides, dilates, padding, padding); } deconv_bwd_t::desc DeconvDescCreator::MakeBwdDataDesc() const { - return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, out_md, + return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, out_md, strides, dilates, padding, padding); } -deconv_bwd_weight_t::desc DeconvDescCreator::MakeBwdWeightDesc() const { - return deconv_bwd_weight_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weight_md, - bias_md, out_md, strides, dilates, padding, padding); -} - -// Swaps the logical order of dimensions that in plain format would correspond to input and output -// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). -mkldnn::memory::desc IOLogicalSwapDesc(mkldnn::memory::desc desc, int num_groups) { - auto &d = desc.data; - int offset = int(num_groups > 1); - int dim0 = offset + 0; - int dim1 = offset + 1; - std::swap(d.dims[dim0], d.dims[dim1]); - std::swap(d.padded_dims[dim0], d.padded_dims[dim1]); - if (d.format_kind != dnnl_format_kind_any) { - std::swap(d.format_desc.blocking.strides[dim0], d.format_desc.blocking.strides[dim1]); - // as padding is not supported, these are always zeros? 
- std::swap(d.padded_offsets[dim0], d.padded_offsets[dim1]); - // for blocked format: change indices - for (int i = 0; i < d.format_desc.blocking.inner_nblks; ++i) { - auto &val = d.format_desc.blocking.inner_idxs[i]; - if (val == dim0) { - val = dim1; - } else if (val == dim1) { - val = dim0; - } - } - } - return desc; +deconv_bwd_weights_t::desc DeconvDescCreator::MakeBwdWeightsDesc() const { + return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, + bias_md, out_md, strides, dilates, padding, padding); +} + +// Utilities + +mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups) { + std::vector order(desc.data.ndims); + std::iota(std::begin(order), std::end(order), 0); + const int offset = int(num_groups > 1); + std::swap(order[offset + 0], order[offset + 1]); + return desc.permute_axes(order); } -// Applies IOLogicalSwapDesc to MKLDNN memory of arr -void IOLogicalSwapMKLDNNMem(const NDArray &arr, int num_groups) { +void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups) { mkldnn::memory::desc desc; if (arr.IsMKLDNNData()) { desc = arr.GetMKLDNNData()->get_desc(); From 4112d44b5c5e79ba54d18dbccd4985fa2208be13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Fri, 26 Mar 2021 12:20:24 +0100 Subject: [PATCH 5/9] Refactor deconvolution version 3 --- .../nn/mkldnn/mkldnn_deconvolution-inl.h | 261 +++++++++++++++--- .../nn/mkldnn/mkldnn_deconvolution.cc | 234 +++------------- 2 files changed, 257 insertions(+), 238 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index 404cacc500c5..bd5934dcfb07 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -19,19 +19,19 @@ /*! * \file mkldnn_deconvolution-inl.h - * \brief - * ________ - * Data---->|Deconv| - * Weight-->| FWD |--->out - * Bias---->|______| - * ________ - * Data_grad<----|Deconv|<---out_grad - * Weight_grad<--| BWD |<---data - * Bias_grad<----| |<---Weight - * |______|<---Bias + * Naming convention: + * ________ + * (src) data --->|Deconv| + * weights --->| FWD |---> out (dst) + * bias --->|______| + * ________ + * (diff_src) data_grad <---|Deconv|<--- out_grad (diff_dst) + * (diff_weight) weights_grad <---| BWD |<--- data (src) + * (diff_bias) bias_grad <---| |<--- weight + * |______|<--- bias * * "out" in this (and .cc) file will always refer to the output of Deconv FWD and - * "out_grad" to its gradient + * "out_grad" to its gradient. The corresponding MKLDNN names are in parentheses. */ #ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ @@ -47,12 +47,48 @@ namespace op { using deconv_fwd_t = mkldnn::deconvolution_forward; using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc; -using deconv_bwd_t = mkldnn::deconvolution_backward_data; +using deconv_bwd_data_t = mkldnn::deconvolution_backward_data; using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc; using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights; using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc; + + +// Swaps the logical order of dimensions that in plain format would correspond to input and output +// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). 
+inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, + const uint32_t num_group) { + std::vector order(desc.data.ndims); + std::iota(std::begin(order), std::end(order), 0); + const int offset = int(num_group > 1); + std::swap(order[offset + 0], order[offset + 1]); + return desc.permute_axes(order); +} + +// Applies IOLogicalSwapDesc to MKLDNN memory of arr +inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, const uint32_t num_group) { + mkldnn::memory::desc desc; + if (arr.IsMKLDNNData()) { + desc = arr.GetMKLDNNData()->get_desc(); + } else { + // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use + // descriptor from GetWeightDesc but with default format + const auto &temp = GetWeightDesc(arr, num_group); + desc = mkldnn::memory::desc( + temp.dims(), temp.data_type(), + static_cast(GetDefaultFormat(temp.data.ndims))); + } + const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group)); +} + +// Version of GetWeightsDesc for deconvolution (with swap) +inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const uint32_t num_group) { + return IOLogicalSwapDesc(GetWeightDesc(weights, num_group), num_group); +} + + + class MKLDNNDeconvFwd { public: struct Tensors { @@ -68,12 +104,13 @@ class MKLDNNDeconvFwd { }; static MKLDNNDeconvFwd &GetCached(const DeconvolutionParam ¶m, const Tensors &tensors); - static std::shared_ptr MakePD(const DeconvolutionParam ¶m, - const Tensors &tensors); + static std::shared_ptr CreatePrimitiveDesc(const DeconvolutionParam ¶m, + const Tensors &tensors); MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors); - void ControlWeightsFormat(const uint32_t num_group, const bool is_train, const NDArray &weights); - void Execute(const uint32_t num_group, const std::vector &req, const Tensors &tensors); + void ControlWeightsFormat(const uint32_t num_group, const bool is_train, + const NDArray &weights) const; + void Execute(const uint32_t num_group, const OpReqType req, const Tensors &tensors) const; private: const mkldnn::memory *DataMem(const NDArray &data) const; @@ -82,10 +119,47 @@ class MKLDNNDeconvFwd { mkldnn_output_t OutMem(const OpReqType req, const NDArray &out) const; + private: std::shared_ptr fwd; std::shared_ptr fwd_pd; }; + +MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector &inputs, + const std::vector &outputs) + : data(inputs[deconv::kData]), + weights(inputs[deconv::kWeight]), + bias(no_bias ? 
nullptr : &inputs[deconv::kBias]), + out(outputs[deconv::kOut]) {} + +MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights, + const NDArray *const bias, const NDArray &out) + : data(data), weights(weights), bias(bias), out(out) {} + +MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors) + : fwd_pd(CreatePrimitiveDesc(param, tensors)) { + fwd = std::make_shared(*fwd_pd); +} + +inline const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const { + return data.GetMKLDNNDataReorder(fwd_pd->src_desc()); +} + +inline const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, fwd_pd->weights_desc(), num_group); +} + +inline const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { + return bias.GetMKLDNNData(); +} + +inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const { + return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); +} + + + class MKLDNNDeconvBwd { public: struct ReadTensors { @@ -104,30 +178,33 @@ class MKLDNNDeconvBwd { static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam ¶m, const ReadTensors &read_tensors); - static std::shared_ptr MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &read_tensors, - const deconv_fwd_pd_t &fwd_pd); - static std::shared_ptr MakeWeightsPD(const DeconvolutionParam ¶m, - const ReadTensors &read_tensors, - const deconv_fwd_pd_t &fwd_pd); + + static std::shared_ptr CreateDataPrimitiveDesc( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd); + + static std::shared_ptr CreateWeightsPrimitiveDesc( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd); MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors); + void Execute(const uint32_t num_group, const std::vector &req, - const ReadTensors &read_tensors, const WriteTensors &write_tensors); + const ReadTensors &read_tensors, const WriteTensors &write_tensors) const; private: void IOSwapWeightsTensors(const uint32_t num_group, const std::vector &req, - const NDArray &weights, const NDArray &weights_grad); + const NDArray &weights, const NDArray &weights_grad) const; - // returns the output gradient memory used to calculate the data (input) gradient, which - // might be reused when calculating the gradient of weights - const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const std::vector &req, + // returns the output gradient memory used to calculate the data (input) gradient, + // which might be reused when calculating the gradient of weights + const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const OpReqType req, const ReadTensors &read_tensors, - const WriteTensors &write_tensors); + const WriteTensors &write_tensors) const; void ScheduleBwdWeights(const uint32_t num_group, const std::vector &req, const ReadTensors &read_tensors, const WriteTensors &write_tensors, - const mkldnn::memory *const out_grad_mem); + const mkldnn::memory *const out_grad_mem) const; const mkldnn::memory *DataMem(const NDArray &data) const; const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const; @@ -145,12 +222,94 @@ class MKLDNNDeconvBwd { std::shared_ptr bwd_data_pd; std::shared_ptr bwd_weights_pd; - std::shared_ptr bwd_data; + std::shared_ptr bwd_data; std::shared_ptr bwd_weights; }; + +MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool 
no_bias, const std::vector &inputs) + : data(inputs[deconv::kData + 1]), + weights(inputs[deconv::kWeight + 1]), + bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), + out_grad(inputs[deconv::kOut]) {} + +MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector &outputs) + : data_grad(outputs[deconv::kData]), + weights_grad(outputs[deconv::kWeight]), + bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {} + +MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { + const auto &fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc( + param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias, + read_tensors.out_grad)); + bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd); + bwd_weights_pd = CreateWeightsPrimitiveDesc(param, read_tensors, *fwd_pd); + bwd_data = std::make_shared(*bwd_data_pd); + bwd_weights = std::make_shared(*bwd_weights_pd); +} + +inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, + const std::vector &req, + const NDArray &weights, + const NDArray &weights_grad) const { + if (req[deconv::kData]) { + IOLogicalSwapMKLDNNMem(weights, num_group); + } + if (req[deconv::kWeight] || (req.size() < deconv::kBias && req[deconv::kBias])) { + IOLogicalSwapMKLDNNMem(weights_grad, num_group); + } +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { + return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, + const NDArray &weights) const { + return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { + return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); +} + +inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem( + const NDArray &out_grad, const mkldnn::memory *const out_grad_mem) const { + return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc()) + ? out_grad_mem + : out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); +} + +inline mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, + const NDArray &data_grad) const { + return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); +} + +inline mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, + const OpReqType req, + const NDArray &weights_grad) const { + // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because + // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad + // memory (which, when not swapped, is always in default format), so here we check if after a + // swap, weights_md will have a default format + const auto &weights_md = bwd_weights_pd->diff_weights_desc(); + if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(weights_md, num_group))) { + return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(weights_md)}; + } + return CreateMKLDNNWeightGrad(weights_grad, weights_md, req); +} + +inline mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, + const NDArray *const bias) const { + return bias ? 
CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) + : mkldnn_output_t(OutDataOp::Noop, nullptr); +} + + + // Utility class for creating operation descriptors of deconvolution primitives -struct DeconvDescCreator { +class DeconvDescCreator { + public: DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, const NDArray *const bias, const NDArray &out); @@ -166,10 +325,11 @@ struct DeconvDescCreator { bool CheckImplSizeReq(const size_t data_size, const size_t weights_size, const size_t out_size) const; - deconv_fwd_t::desc MakeFwdDesc() const; - deconv_bwd_t::desc MakeBwdDataDesc() const; - deconv_bwd_weights_t::desc MakeBwdWeightsDesc() const; + deconv_fwd_t::desc CreateFwdDesc() const; + deconv_bwd_data_t::desc CreateBwdDataDesc() const; + deconv_bwd_weights_t::desc CreateBwdWeightsDesc() const; + private: mkldnn::memory::desc data_md; mkldnn::memory::desc weights_md; mkldnn::memory::desc bias_md; @@ -178,20 +338,33 @@ struct DeconvDescCreator { mkldnn::memory::dims strides; mkldnn::memory::dims padding; mkldnn::memory::dims dilates; - - mkldnn::engine &engine; }; -// Swaps the logical order of dimensions that in plain format would correspond to input and output -// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw). -mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups); -// Applies IOLogicalSwapDesc to MKLDNN memory of arr -void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups); +inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, + const size_t out_size) const { + // MKLDNN introduced padded formats since 0.15 which require more memory + // compared to the actual size of the tensor. Currently, MKLDNN operators + // still reuse memory from memory planning, so here we need to accept only a + // kernel that has the expected memory size requirements (which is suboptimal) + return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && + out_size == GetMemDescSize(out_md)); +} -// Version of GetWeightsDesc for deconvolution (with swap) -inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const int num_groups) { - return IOLogicalSwapDesc(GetWeightDesc(weights, num_groups), num_groups); +inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const { + return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md, + out_md, strides, dilates, padding, padding); +} + +inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const { + return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, + out_md, strides, dilates, padding, padding); +} + +inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() const { + return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, + bias_md, out_md, strides, dilates, padding, padding); } } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index 699318d9beec..f248259dbd23 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -19,7 +19,6 @@ /*! 
* \file mkldnn_deconvolution.cc - * \brief */ #if MXNET_USE_MKLDNN == 1 @@ -34,7 +33,7 @@ bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms, const NDArray &input) (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16); } -// Forward + void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -42,24 +41,13 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &c const std::vector &outputs) { TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto ¶m = nnvm::get(attrs.parsed); - const auto &tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); - MKLDNNDeconvFwd &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); + const auto tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs); + const auto &fwd = MKLDNNDeconvFwd::GetCached(param, tensors); fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights); - fwd.Execute(param.num_group, req, tensors); + fwd.Execute(param.num_group, req[deconv::kOut], tensors); } -MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights, - const NDArray *const bias, const NDArray &out) - : data(data), weights(weights), bias(bias), out(out) {} - -MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector &inputs, - const std::vector &outputs) - : data(inputs[deconv::kData]), - weights(inputs[deconv::kWeight]), - bias(no_bias ? nullptr : &inputs[deconv::kBias]), - out(outputs[deconv::kOut]) {} - MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, const Tensors &tensors) { using deconv_fwd_map = std::unordered_map; @@ -84,10 +72,11 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam ¶m, return it->second; } -std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionParam ¶m, - const Tensors &tensors) { +std::shared_ptr MKLDNNDeconvFwd::CreatePrimitiveDesc( + const DeconvolutionParam ¶m, const Tensors &tensors) { DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out); - const auto pd = std::make_shared(ddc.MakeFwdDesc(), ddc.engine); + const auto &engine = CpuEngine::Get()->get_engine(); + const auto pd = std::make_shared(ddc.CreateFwdDesc(), engine); const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); }; @@ -98,19 +87,14 @@ std::shared_ptr MKLDNNDeconvFwd::MakePD(const DeconvolutionPara // imposed, meaning there is no implementation with plain formats CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) << "No implementation of deconvolution forward propagation"; - *pd = deconv_fwd_pd_t(ddc.MakeFwdDesc(), ddc.engine); + *pd = deconv_fwd_pd_t(ddc.CreateFwdDesc(), engine); } } return pd; } -MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam ¶m, const Tensors &tensors) - : fwd_pd(MakePD(param, tensors)) { - fwd = std::make_shared(*fwd_pd); -} - void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool is_train, - const NDArray &weights) { + const NDArray &weights) const { if (is_train) { // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it // to the default format for now. 
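The loop in CreatePrimitiveDesc above walks through the candidate implementations with next_impl() until one matches the buffer sizes that MXNet's memory planning already reserved, and only when no candidate is left does it impose a plain format and rebuild the descriptor. Below is a minimal, self-contained sketch of that selection pattern using only the standard library; Impl, the byte counts, and the fallback message are invented stand-ins for pd->next_impl(), GetMemDescSize(), and ImposePlainWherePadding(), not real MKLDNN API.

// Hedged sketch of the implementation-selection loop (no MKLDNN dependency).
#include <cstddef>
#include <iostream>
#include <vector>

struct Impl {                                    // stand-in for one candidate implementation
  std::size_t src_size, weights_size, dst_size;  // bytes each tensor would occupy
};

int main() {
  // Sizes expected by MXNet's memory planning (hypothetical numbers).
  const std::size_t src = 1024, weights = 512, dst = 2048;
  // Candidate implementations, e.g. blocked formats first, a plain one last.
  const std::vector<Impl> impls = {{1280, 512, 2048}, {1024, 640, 2048}, {1024, 512, 2048}};

  std::size_t i = 0;
  const auto fits = [&]() {          // mirrors DeconvDescCreator::CheckImplSizeReq
    return impls[i].src_size == src && impls[i].weights_size == weights &&
           impls[i].dst_size == dst;
  };
  while (!fits()) {
    if (++i == impls.size()) {       // next_impl() exhausted all candidates
      std::cout << "fall back: impose a plain format and rebuild the descriptor\n";
      return 1;
    }
  }
  std::cout << "selected implementation #" << i << '\n';  // prints: selected implementation #2
  return 0;
}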
@@ -132,8 +116,8 @@ void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool } } -void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const std::vector &req, - const Tensors &tensors) { +void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const OpReqType req, + const Tensors &tensors) const { // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives. // For that, we would pass input tensor in place of output and output tensor in place of input // (for appropriate convolution primitives: deconvolution forward = convolution backward data, @@ -157,7 +141,7 @@ void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const std::vectorsrc_desc()); -} -const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group, - const NDArray &weights) const { - return GetWeights(weights, fwd_pd->weights_desc(), num_group); -} - -const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const { - return bias.GetMKLDNNData(); -} - -mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const { - return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req); -} - -// Backward void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -201,24 +168,13 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext & TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); const auto ¶m = nnvm::get(attrs.parsed); - const auto &read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); - const auto &write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); + const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs); + const auto write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs); MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors); bwd.Execute(param.num_group, req, read_tensors, write_tensors); } -MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector &inputs) - : data(inputs[deconv::kData + 1]), - weights(inputs[deconv::kWeight + 1]), - bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]), - out_grad(inputs[deconv::kOut]) {} - -MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector &outputs) - : data_grad(outputs[deconv::kData]), - weights_grad(outputs[deconv::kWeight]), - bias_grad(no_bias ? 
nullptr : &outputs[deconv::kBias]) {} - MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { using deconv_bwd_map = std::unordered_map; @@ -243,12 +199,13 @@ MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam ¶m, return it->second; } -std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const DeconvolutionParam ¶m, - const ReadTensors &read_tensors, - const deconv_fwd_pd_t &fwd_pd) { +std::shared_ptr MKLDNNDeconvBwd::CreateDataPrimitiveDesc( + const DeconvolutionParam ¶m, const ReadTensors &read_tensors, + const deconv_fwd_pd_t &fwd_pd) { DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, nullptr, read_tensors.out_grad); - const auto pd = std::make_shared(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + const auto &engine = CpuEngine::Get()->get_engine(); + const auto pd = std::make_shared(ddc.CreateBwdDataDesc(), engine, fwd_pd); const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); }; const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); }; const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; @@ -259,19 +216,20 @@ std::shared_ptr MKLDNNDeconvBwd::MakeDataPD(const Deconvol // imposed, meaning there is no implementation with plain formats CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) << "No implementation of deconvolution backward propagation"; - *pd = deconv_bwd_data_pd_t(ddc.MakeBwdDataDesc(), ddc.engine, fwd_pd); + *pd = deconv_bwd_data_pd_t(ddc.CreateBwdDataDesc(), engine, fwd_pd); } } return pd; } -std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( +std::shared_ptr MKLDNNDeconvBwd::CreateWeightsPrimitiveDesc( const DeconvolutionParam ¶m, const ReadTensors &read_tensors, const deconv_fwd_pd_t &fwd_pd) { DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad); + const auto &engine = CpuEngine::Get()->get_engine(); const auto pd = - std::make_shared(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + std::make_shared(ddc.CreateBwdWeightsDesc(), engine, fwd_pd); const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); }; const auto get_weights_size = [&pd]() { return pd->diff_weights_desc().get_size(); }; const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); }; @@ -282,53 +240,34 @@ std::shared_ptr MKLDNNDeconvBwd::MakeWeightsPD( // imposed, meaning there is no implementation with plain formats CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size())) << "No implementation of calculating deconvolution weights gradient"; - *pd = deconv_bwd_weights_pd_t(ddc.MakeBwdWeightsDesc(), ddc.engine, fwd_pd); + *pd = deconv_bwd_weights_pd_t(ddc.CreateBwdWeightsDesc(), engine, fwd_pd); } } return pd; } -MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam ¶m, const ReadTensors &read_tensors) { - const auto &fwd_pd = MKLDNNDeconvFwd::MakePD( - param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias, - read_tensors.out_grad)); - bwd_data_pd = MakeDataPD(param, read_tensors, *fwd_pd); - bwd_weights_pd = MakeWeightsPD(param, read_tensors, *fwd_pd); - bwd_data = std::make_shared(*bwd_data_pd); - bwd_weights = std::make_shared(*bwd_weights_pd); -} - void MKLDNNDeconvBwd::Execute(const uint32_t num_group, const std::vector &req, - const ReadTensors &read_tensors, const WriteTensors &write_tensors) { + const ReadTensors &read_tensors, + const WriteTensors 
&write_tensors) const { // swaps are explained in MKLDNNDeconvFwd::Execute IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); { - auto *const out_grad_mem = ScheduleBwdData(num_group, req, read_tensors, write_tensors); + auto *const out_grad_mem = + ScheduleBwdData(num_group, req[deconv::kData], read_tensors, write_tensors); ScheduleBwdWeights(num_group, req, read_tensors, write_tensors, out_grad_mem); MKLDNNStream::Get()->Submit(); } IOSwapWeightsTensors(num_group, req, read_tensors.weights, write_tensors.weights_grad); } -void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group, - const std::vector &req, - const NDArray &weights, const NDArray &weights_grad) { - if (req[deconv::kData]) { - IOLogicalSwapMKLDNNMem(weights, num_group); - } - if (req[deconv::kWeight] || req[deconv::kBias]) { - IOLogicalSwapMKLDNNMem(weights_grad, num_group); - } -} - const mkldnn::memory *MKLDNNDeconvBwd::ScheduleBwdData(const uint32_t num_group, - const std::vector &req, + const OpReqType req, const ReadTensors &read_tensors, - const WriteTensors &write_tensors) { - if (req[deconv::kData]) { + const WriteTensors &write_tensors) const { + if (req) { mkldnn_args_map_t net_args; auto *const out_grad_mem = OutGradMem(read_tensors.out_grad); - const auto &data_grad_mem = DataGradMem(req[deconv::kData], write_tensors.data_grad); + const auto &data_grad_mem = DataGradMem(req, write_tensors.data_grad); net_args.insert({MKLDNN_ARG_DIFF_DST, *out_grad_mem}); net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, read_tensors.weights)}); @@ -346,12 +285,14 @@ void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, const std::vector &req, const ReadTensors &read_tensors, const WriteTensors &write_tensors, - const mkldnn::memory *const out_grad_mem) { - if (req[deconv::kWeight] || req[deconv::kBias]) { + const mkldnn::memory *const out_grad_mem) const { + OpReqType weight_req = req[deconv::kWeight]; + OpReqType bias_req = req.size() > deconv::kBias ? 
req[deconv::kBias] : OpReqType::kNullOp; + if (weight_req || bias_req) { mkldnn_args_map_t net_args; const auto &weights_grad_mem = - WeightsGradMem(num_group, req[deconv::kWeight], write_tensors.weights_grad); - const auto &bias_grad_mem = BiasGradMem(req[deconv::kBias], write_tensors.bias_grad); + WeightsGradMem(num_group, weight_req, write_tensors.weights_grad); + const auto &bias_grad_mem = BiasGradMem(bias_req, write_tensors.bias_grad); net_args.insert({MKLDNN_ARG_DIFF_DST, *OutGradMem(read_tensors.out_grad, out_grad_mem)}); net_args.insert({MKLDNN_ARG_SRC, *DataMem(read_tensors.data)}); @@ -369,50 +310,7 @@ void MKLDNNDeconvBwd::ScheduleBwdWeights(const uint32_t num_group, } } -const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const { - return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc()); -} - -const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group, - const NDArray &weights) const { - return GetWeights(weights, bwd_data_pd->weights_desc(), num_group); -} - -const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const { - return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc()); -} - -const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad, - const mkldnn::memory *const out_grad_mem) const { - if (!out_grad_mem || bwd_data_pd->diff_dst_desc() != bwd_weights_pd->diff_dst_desc()) { - return out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc()); - } - return out_grad_mem; -} - -mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req, const NDArray &data_grad) const { - return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req); -} -mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group, const OpReqType req, - const NDArray &weights_grad) const { - // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because - // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad - // memory (which, when not swapped, is always in default format), so here we check if after a - // swap, wei_md will have a default format - const auto &wei_md = bwd_weights_pd->diff_weights_desc(); - if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(wei_md, num_group))) { - return {OutDataOp::Noop, const_cast(weights_grad).CreateMKLDNNData(wei_md)}; - } - return CreateMKLDNNWeightGrad(weights_grad, wei_md, req); -} - -mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req, const NDArray *const bias) const { - return bias ? 
CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req) - : mkldnn_output_t(OutDataOp::Noop, nullptr); -} - -// DeconvDescCreator DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, const NDArray *const bias, @@ -423,8 +321,7 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDAr out_md(GetMemDesc(out)), strides(param.stride.ndim()), padding(param.pad.ndim()), - dilates(param.dilate.ndim()), - engine(CpuEngine::Get()->get_engine()) { + dilates(param.dilate.ndim()) { // assuming only deconv2D is supported for now CHECK(param.stride.ndim() == param.pad.ndim() && param.stride.ndim() == param.dilate.ndim()); CHECK(param.stride.ndim() == 2); @@ -456,57 +353,6 @@ bool DeconvDescCreator::ImposePlainWherePadding(const size_t data_size, const si return false; } -bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size, - const size_t out_size) const { - // MKLDNN introduced padded formats since 0.15 which require more memory - // compared to the actual size of the tensor. Currently, MKLDNN operators - // still reuse memory from memory planning, so here we need to accept only a - // kernel that has the expected memory size requirements (which is suboptimal) - return (data_size == GetMemDescSize(data_md) && weights_size == GetMemDescSize(weights_md) && - out_size == GetMemDescSize(out_md)); -} - -deconv_fwd_t::desc DeconvDescCreator::MakeFwdDesc() const { - return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md, - out_md, strides, dilates, padding, padding); -} - -deconv_bwd_t::desc DeconvDescCreator::MakeBwdDataDesc() const { - return deconv_bwd_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, out_md, - strides, dilates, padding, padding); -} - -deconv_bwd_weights_t::desc DeconvDescCreator::MakeBwdWeightsDesc() const { - return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md, - bias_md, out_md, strides, dilates, padding, padding); -} - -// Utilities - -mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const int num_groups) { - std::vector order(desc.data.ndims); - std::iota(std::begin(order), std::end(order), 0); - const int offset = int(num_groups > 1); - std::swap(order[offset + 0], order[offset + 1]); - return desc.permute_axes(order); -} - -void IOLogicalSwapMKLDNNMem(const NDArray &arr, const int num_groups) { - mkldnn::memory::desc desc; - if (arr.IsMKLDNNData()) { - desc = arr.GetMKLDNNData()->get_desc(); - } else { - // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use - // descriptor from GetWeightDesc but with default format - const auto &temp = GetWeightDesc(arr, num_groups); - desc = mkldnn::memory::desc( - temp.dims(), temp.data_type(), - static_cast(GetDefaultFormat(temp.data.ndims))); - } - const_cast(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_groups)); -} - } // namespace op } // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 From 73e6d0b07fb2a820b847617d10a8f54a4ea09e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Wed, 31 Mar 2021 09:32:55 +0200 Subject: [PATCH 6/9] Enable Deconvolution2D test --- tests/python/mkl/test_mkldnn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 60ebbfb97477..579826f9b4b9 100644 --- 
a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -469,10 +469,10 @@ def check_convolution_training(stype): @with_seed() -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") +# @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") def test_Deconvolution(): def check_Deconvolution_training(stype): - for shape in [(3, 3, 10), (3, 3, 10, 10)]: + for shape in [(3, 3, 10, 10)]: # testing only 2D for now data_tmp = np.random.randint(256, size=shape) data = mx.symbol.Variable('data', stype=stype) From c79dff98c16e0ec8824620922144602d84b4bea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Thu, 1 Apr 2021 09:22:08 +0200 Subject: [PATCH 7/9] Fix sanity --- src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h | 5 ++++- src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index bd5934dcfb07..db957da056e8 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -37,6 +37,9 @@ #define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_ #if MXNET_USE_MKLDNN == 1 +#include +#include + #include "../deconvolution-inl.h" #include "./mkldnn_base-inl.h" #include "./mkldnn_ops-inl.h" @@ -61,7 +64,7 @@ inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc, const uint32_t num_group) { std::vector order(desc.data.ndims); std::iota(std::begin(order), std::end(order), 0); - const int offset = int(num_group > 1); + const int offset = static_cast(num_group > 1); std::swap(order[offset + 0], order[offset + 1]); return desc.permute_axes(order); } diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc index f248259dbd23..7678567d95c8 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -323,8 +323,9 @@ DeconvDescCreator::DeconvDescCreator(const DeconvolutionParam ¶m, const NDAr padding(param.pad.ndim()), dilates(param.dilate.ndim()) { // assuming only deconv2D is supported for now - CHECK(param.stride.ndim() == param.pad.ndim() && param.stride.ndim() == param.dilate.ndim()); - CHECK(param.stride.ndim() == 2); + CHECK_EQ(param.stride.ndim(), param.pad.ndim()); + CHECK_EQ(param.stride.ndim(), param.dilate.ndim()); + CHECK_EQ(param.stride.ndim(), 2); for (int i = 0; i < param.stride.ndim(); ++i) { strides[i] = param.stride[i]; padding[i] = param.pad[i]; From cab15627b1e963cd2ff7bdaa02d40dd55078ffb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Thu, 1 Apr 2021 11:13:00 +0200 Subject: [PATCH 8/9] Fix windows builds --- src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h | 1 + tests/python/mkl/test_mkldnn.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h index db957da056e8..b51ec2a85650 100644 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h @@ -39,6 +39,7 @@ #if MXNET_USE_MKLDNN == 1 #include #include +#include #include "../deconvolution-inl.h" #include "./mkldnn_base-inl.h" diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 579826f9b4b9..de0c249f52ab 100644 --- a/tests/python/mkl/test_mkldnn.py +++ 
b/tests/python/mkl/test_mkldnn.py @@ -469,7 +469,6 @@ def check_convolution_training(stype): @with_seed() -# @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12579") def test_Deconvolution(): def check_Deconvolution_training(stype): for shape in [(3, 3, 10, 10)]: # testing only 2D for now From 74cc40dc40eac6af307b7cbb825089741a9ffed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20G=C5=82omski?= Date: Mon, 19 Apr 2021 12:42:25 +0200 Subject: [PATCH 9/9] Fix deconvolution with bias test --- tests/python/unittest/test_operator.py | 34 +++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d02ff9537667..29b2f39d5178 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1660,22 +1660,38 @@ def test_deconvolution_forward_with_bias(): def check_deconvolution_forward_with_bias(shape=(1, 16, 5, 5), num_filter=32, num_group=1, kernel=(3, 3), pad=(1, 1)): x = mx.sym.Variable('x') w = mx.sym.Variable('w') - input_data = mx.random.uniform(-5, 5, shape, ctx=mx.cpu()) - y = mx.sym.Deconvolution(data=x, weight=w, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=False, pad=pad) - exe = y.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + b = mx.sym.Variable('b') + y_nb = mx.sym.Deconvolution(data=x, weight=w, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=True, pad=pad) + y_b = mx.sym.Deconvolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=False, pad=pad) + + + exe_nb = y_nb.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + exe_b = y_b.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + + + data = np.random.uniform(-5, 5, size=exe_b.arg_arrays[0].shape) + weights = np.random.normal(size=exe_b.arg_arrays[1].shape) + bias = np.random.normal(size=exe_b.arg_arrays[2].shape) + + def exe_forward(exe): + exe.arg_arrays[0][:] = data + exe.arg_arrays[1][:] = weights + if len(exe.arg_arrays) == 3: + exe.arg_arrays[2][:] = bias + return exe.forward(is_train=False)[0].asnumpy() + + out_nb = exe_forward(exe_nb) + out_b = exe_forward(exe_b) + bias = np.broadcast_to(bias, [np.prod(out_nb.shape[2:])] + [num_filter]).T + bias = np.broadcast_to(bias.reshape((num_filter, *out_nb.shape[2:])), out_b.shape) + assert_almost_equal(out_nb + bias, out_b) - exe.arg_arrays[0][:] = np.random.normal(size=exe.arg_arrays[0].shape) - exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape) - exe.forward(is_train=False) - o = exe.outputs[0] - t = o.asnumpy() check_deconvolution_forward_with_bias((1, 16, 5), 32, 1, (3,), (1,)) check_deconvolution_forward_with_bias((32, 16, 5), 32, 1, (3,), (1,)) check_deconvolution_forward_with_bias((1, 16, 5, 5), 32, 1, (3, 3), (1, 1)) check_deconvolution_forward_with_bias((32, 16, 5, 5), 32, 1, (3, 3), (1, 1)) - def check_nearest_upsampling_with_shape(shapes, scale, root_scale): arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context()) for i, shape in zip(range(len(shapes)), shapes)} arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)}
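The channel swap that recurs throughout these changes (oihw to iohw, or goihw to giohw for grouped weights) reduces to a permutation vector built with std::iota and std::swap, which IOLogicalSwapDesc then hands to mkldnn::memory::desc::permute_axes. Below is a standalone sketch of just that permutation, runnable with the standard library alone; SwapOrder is a hypothetical helper name used here for illustration, not part of the change itself.

// Builds the axis order that swaps the input- and output-channel dimensions of a plain
// weights layout: oihw -> iohw (ndims == 4), goihw -> giohw (grouped, ndims == 5).
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

std::vector<int> SwapOrder(const int ndims, const int num_group) {
  std::vector<int> order(ndims);
  std::iota(order.begin(), order.end(), 0);    // identity permutation 0, 1, ..., ndims-1
  const int offset = num_group > 1 ? 1 : 0;    // skip the leading groups dimension
  std::swap(order[offset], order[offset + 1]);
  return order;
}

int main() {
  for (const int axis : SwapOrder(4, 1)) std::cout << axis << ' ';  // prints: 1 0 2 3
  std::cout << '\n';
  for (const int axis : SwapOrder(5, 2)) std::cout << axis << ' ';  // prints: 0 2 1 3 4
  std::cout << '\n';
  return 0;
}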