diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
index ff47ef35f98f..66cb851c99e1 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
@@ -40,12 +40,9 @@ class MKLDNNConcatFwd {
  public:
   mkldnn::concat::primitive_desc fwd_pd;
 
-  MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
-      : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
-    fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
-  }
+  MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md);
 
-  const mkldnn::concat &GetFwd() const;
+  const mkldnn::concat &GetFwd() const { return *fwd_; }
 
  private:
   std::shared_ptr<mkldnn::concat> fwd_;
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
index aa30ffc557a1..1dd2dc31ee0c 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -29,7 +29,32 @@
 namespace mxnet {
 namespace op {
 
-const mkldnn::concat &MKLDNNConcatFwd::GetFwd() const { return *fwd_; }
+static inline bool IsUsingPadding(const mkldnn::memory::desc &dst_md) {
+  // make sure a blocked format is used (at least one dimension is blocked)
+  bool is_blocked_format = dst_md.data.format_kind == mkldnn_blocked &&
+                           dst_md.data.format_desc.blocking.inner_nblks > 0;
+  return is_blocked_format && !std::equal(dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims,
+                                          dst_md.data.padded_dims);
+}
+
+MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
+    : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
+  // MKL-DNN introduced padded formats since 0.15 which require more memory
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // format that has the expected memory size requirements (a plain format)
+
+  // When fwd_pd uses padding, impose a plain format
+  const auto &dst_md = fwd_pd.dst_desc();
+  if (IsUsingPadding(dst_md)) {
+    auto plain_dst_tag = static_cast<mkldnn::memory::format_tag>(
+        GetDefaultFormat(dst_md.data.ndims));
+    auto plain_dst_md = mkldnn::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag);
+    fwd_pd = mkldnn::concat::primitive_desc(plain_dst_md, concat_dim, data_md,
+                                            CpuEngine::Get()->get_engine());
+  }
+  fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
+}
 
 void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                          const std::vector<NDArray> &in_data,
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 42cbb72cf433..3d361ea9bfe1 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -116,6 +116,10 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
                  &attr](const mkldnn::convolution_forward::desc &desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
+      // MKL-DNN introduced padded formats since 0.15 which require more memory
+      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // still reuse memory from memory planning, so here we need to select a
+      // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd =
           std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
       while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
@@ -216,6 +220,10 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
                  &fwd_pd](const mkldnn::convolution_backward_data::desc &desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
+      // MKL-DNN introduced padded formats since 0.15 which require more memory
+      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // still reuse memory from memory planning, so here we need to select a
+      // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd =
           std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(desc, engine, fwd_pd);
       while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
@@ -299,6 +307,10 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
                  &fwd_pd](const mkldnn::convolution_backward_weights::desc &desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
+      // MKL-DNN introduced padded formats since 0.15 which require more memory
+      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // still reuse memory from memory planning, so here we need to select a
+      // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
           desc, engine, fwd_pd);
       while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index cdf3639cd86f..65bf93298b95 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -53,10 +53,9 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetDeconvBwd_(
     const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
     const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
   // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // suboptimal kernel for computation that has the expected memory size requirements
   if (!has_bias) {
     mkldnn::convolution_forward::desc desc(
         mkldnn::prop_kind::forward_training,
@@ -117,10 +116,9 @@ GetDeconvFwdImpl(const DeconvolutionParam &param, const NDArray &data,
       std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
          desc, engine, *bwd_pd);
   // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // suboptimal kernel for computation that has the expected memory size requirements
   while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) ||
          deconv_pd->diff_src_desc().get_size() != GetMemDescSize(out_md) ||
          deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) {
@@ -176,10 +174,9 @@ GetDeconvBwdWeightsImpl(
   dilate[1] = param.dilate[1] - 1;
 
   // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // suboptimal kernel for computation that has the expected memory size requirements
   if (!has_bias) {
     mkldnn::convolution_backward_weights::desc desc(
         mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md,
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index 2fafc7821b5e..60ebbfb97477 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -692,6 +692,38 @@ def check_concat_training(stype):
     for stype in stypes:
         check_concat_training(stype)
 
+
+@with_seed()
+def test_concat_blocked():
+    ctx = mx.cpu()
+    axis = 1
+    filters = 32  # must be a power of 2 and >= 16
+    kernel = (3, 3)
+    for in_dim_size in range(1, 17):  # check cases with and without padding
+        in_shape = (1, in_dim_size, 64, 64)
+        in_data = mx.nd.random.uniform(-1, 1, in_shape, ctx=ctx)
+        conv_weights = mx.nd.random.uniform(-1, 1, (filters, in_shape[1], kernel[0], kernel[1]), ctx=ctx)
+
+        def calc_output_of_layer(layer):
+            ex = layer.simple_bind(ctx, x=in_shape)
+            in_data.copyto(ex.arg_arrays[0])
+            conv_weights.copyto(ex.arg_arrays[1])
+            return ex.forward()[0].asnumpy()
+
+        x = mx.sym.Variable('x')
+        w = mx.sym.Variable('w')
+        # convolution, so a blocked format is selected
+        conv = mx.sym.Convolution(data=x, weight=w, num_filter=filters, kernel=kernel, pad=(1, 1), no_bias=True)
+        conc = mx.sym.concat(conv, x, dim=axis)
+
+        # first calculate the output of the convolution to determine ref_out
+        conv_out = calc_output_of_layer(conv)
+        ref_out = np.concatenate((conv_out, in_data.asnumpy()), axis=axis)
+
+        out = calc_output_of_layer(conc)
+        assert_almost_equal(out, ref_out)
+
+
 @with_seed()
 def test_elemwise_add():
     def ref_add(a, b):
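
For reference, the scenario the new test exercises can also be reproduced outside the test harness. The snippet below is a minimal standalone sketch, not part of the patch; it assumes an MXNet build with MKL-DNN enabled and uses the same 1.x symbolic API (simple_bind) as the test above. The input channel count of 3 and the run() helper are illustrative choices: with 3 input channels the concatenated output has 32 + 3 = 35 channels, which does not fill a whole block, so a blocked destination layout would need padding, the case IsUsingPadding() now detects.

# Standalone repro sketch (illustrative, not part of the patch);
# requires an MXNet build with MKL-DNN enabled.
import mxnet as mx
import numpy as np

ctx = mx.cpu()
in_shape = (1, 3, 64, 64)  # 3 input channels -> concat output has 35 channels
x = mx.sym.Variable('x')
w = mx.sym.Variable('w')
conv = mx.sym.Convolution(data=x, weight=w, num_filter=32, kernel=(3, 3),
                          pad=(1, 1), no_bias=True)
conc = mx.sym.concat(conv, x, dim=1)  # concat a blocked conv output with the plain input

in_data = mx.nd.random.uniform(-1, 1, in_shape, ctx=ctx)
weights = mx.nd.random.uniform(-1, 1, (32, in_shape[1], 3, 3), ctx=ctx)

def run(sym):
    ex = sym.simple_bind(ctx, x=in_shape)
    in_data.copyto(ex.arg_arrays[0])  # arguments are listed in order: x, then w
    weights.copyto(ex.arg_arrays[1])
    return ex.forward()[0].asnumpy()

# Reference: run the convolution alone and concatenate on the NumPy side.
ref = np.concatenate((run(conv), in_data.asnumpy()), axis=1)
np.testing.assert_allclose(run(conc), ref, rtol=1e-5, atol=1e-5)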