diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
index ff47ef35f98f..66cb851c99e1 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
@@ -40,12 +40,9 @@ class MKLDNNConcatFwd {
  public:
   mkldnn::concat::primitive_desc fwd_pd;
 
-  MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
-      : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
-    fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
-  }
+  MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md);
 
-  const mkldnn::concat &GetFwd() const;
+  const mkldnn::concat &GetFwd() const { return *fwd_; }
 
  private:
   std::shared_ptr<mkldnn::concat> fwd_;
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
index aa30ffc557a1..1dd2dc31ee0c 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -29,7 +29,32 @@
 namespace mxnet {
 namespace op {
 
-const mkldnn::concat &MKLDNNConcatFwd::GetFwd() const { return *fwd_; }
+static inline bool IsUsingPadding(const mkldnn::memory::desc &dst_md) {
+  // make sure a blocked format is used (at least one dimension is blocked)
+  bool is_blocked_format = dst_md.data.format_kind == mkldnn_blocked &&
+                           dst_md.data.format_desc.blocking.inner_nblks > 0;
+  return is_blocked_format && !std::equal(dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims,
+                                          dst_md.data.padded_dims);
+}
+
+MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
+    : fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
+  // MKL-DNN introduced padded formats since 0.15 which require more memory
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // format that has the expected memory size requirements (a plain format)
+
+  // When fwd_pd uses padding, impose a plain format
+  const auto &dst_md = fwd_pd.dst_desc();
+  if (IsUsingPadding(dst_md)) {
+    auto plain_dst_tag = static_cast<mkldnn::memory::format_tag>(
+        GetDefaultFormat(dst_md.data.ndims));
+    auto plain_dst_md = mkldnn::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag);
+    fwd_pd = mkldnn::concat::primitive_desc(plain_dst_md, concat_dim, data_md,
+                                            CpuEngine::Get()->get_engine());
+  }
+  fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
+}
 
 void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                          const std::vector<NDArray> &in_data,
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 42cbb72cf433..3d361ea9bfe1 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -116,6 +116,10 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
                  &attr](const mkldnn::convolution_forward::desc &desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
+      // MKL-DNN introduced padded formats since 0.15 which require more memory
+      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // still reuse memory from memory planning, so here we need to select a
+      // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd =
           std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
       while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
@@ -216,6 +220,10 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
                  &fwd_pd](const mkldnn::convolution_backward_data::desc &desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
+      // MKL-DNN introduced padded formats since 0.15 which require more memory
+      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // still reuse memory from memory planning, so here we need to select a
+      // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd =
           std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(desc, engine, fwd_pd);
       while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
@@ -299,6 +307,10 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
                  &fwd_pd](const mkldnn::convolution_backward_weights::desc &desc) {
     auto engine = CpuEngine::Get()->get_engine();
     try {
+      // MKL-DNN introduced padded formats since 0.15 which require more memory
+      // compared to the actual size of the tensor. Currently, MKL-DNN operators
+      // still reuse memory from memory planning, so here we need to select a
+      // suboptimal kernel for computation that has the expected memory size requirements
       auto conv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
           desc, engine, fwd_pd);
       while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index cdf3639cd86f..65bf93298b95 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -53,10 +53,9 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetDeconvBwd_(
     const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
     const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
   // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // suboptimal kernel for computation that has the expected memory size requirements
   if (!has_bias) {
     mkldnn::convolution_forward::desc desc(
         mkldnn::prop_kind::forward_training,
@@ -117,10 +116,9 @@ GetDeconvFwdImpl(const DeconvolutionParam &param, const NDArray &data,
       std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
          desc, engine, *bwd_pd);
   // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // suboptimal kernel for computation that has the expected memory size requirements
   while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) ||
          deconv_pd->diff_src_desc().get_size() != GetMemDescSize(out_md) ||
          deconv_pd->weights_desc().get_size() != GetMemDescSize(weight_md)) {
@@ -176,10 +174,9 @@ GetDeconvBwdWeightsImpl(
   dilate[1] = param.dilate[1] - 1;
 
   // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
+  // compared to the actual size of the tensor. Currently, MKL-DNN operators
+  // still reuse memory from memory planning, so here we need to select a
+  // suboptimal kernel for computation that has the expected memory size requirements
   if (!has_bias) {
     mkldnn::convolution_backward_weights::desc desc(
         mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md,
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index 2fafc7821b5e..60ebbfb97477 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -692,6 +692,38 @@ def check_concat_training(stype):
     for stype in stypes:
         check_concat_training(stype)
 
+
+@with_seed()
+def test_concat_blocked():
+    ctx = mx.cpu()
+    axis = 1
+    filters = 32  # must be a power of 2 and >= 16
+    kernel = (3, 3)
+    for in_dim_size in range(1, 17):  # check cases with and without padding
+        in_shape = (1, in_dim_size, 64, 64)
+        in_data = mx.nd.random.uniform(-1, 1, in_shape, ctx=ctx)
+        conv_weights = mx.nd.random.uniform(-1, 1, (filters, in_shape[1], kernel[0], kernel[1]), ctx=ctx)
+
+        def calc_output_of_layer(layer):
+            ex = layer.simple_bind(ctx, x=in_shape)
+            in_data.copyto(ex.arg_arrays[0])
+            conv_weights.copyto(ex.arg_arrays[1])
+            return ex.forward()[0].asnumpy()
+
+        x = mx.sym.Variable('x')
+        w = mx.sym.Variable('w')
+        # convolution, so a blocked format is selected
+        conv = mx.sym.Convolution(data=x, weight=w, num_filter=filters, kernel=kernel, pad=(1, 1), no_bias=True)
+        conc = mx.sym.concat(conv, x, dim=axis)
+
+        # first calculate the output of the convolution to determine ref_out
+        conv_out = calc_output_of_layer(conv)
+        ref_out = np.concatenate((conv_out, in_data.asnumpy()), axis=axis)
+
+        out = calc_output_of_layer(conc)
+        assert_almost_equal(out, ref_out)
+
+
 @with_seed()
 def test_elemwise_add():
     def ref_add(a, b):
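
For reference, the scenario the new test exercises can also be reproduced outside the test harness. The snippet below is a minimal standalone sketch, not part of the patch; it assumes an MXNet build with MKL-DNN enabled and uses the same 1.x symbolic API (simple_bind) as the test above. The input channel count of 3 and the run() helper are illustrative choices: with 3 input channels the concatenated output has 32 + 3 = 35 channels, which does not fill a whole block, so a blocked destination layout would need padding, the case IsUsingPadding() now detects.

# Standalone repro sketch (illustrative, not part of the patch);
# requires an MXNet build with MKL-DNN enabled.
import mxnet as mx
import numpy as np

ctx = mx.cpu()
in_shape = (1, 3, 64, 64)  # 3 input channels -> concat output has 35 channels
x = mx.sym.Variable('x')
w = mx.sym.Variable('w')
conv = mx.sym.Convolution(data=x, weight=w, num_filter=32, kernel=(3, 3),
                          pad=(1, 1), no_bias=True)
conc = mx.sym.concat(conv, x, dim=1)  # concat a blocked conv output with the plain input

in_data = mx.nd.random.uniform(-1, 1, in_shape, ctx=ctx)
weights = mx.nd.random.uniform(-1, 1, (32, in_shape[1], 3, 3), ctx=ctx)

def run(sym):
    ex = sym.simple_bind(ctx, x=in_shape)
    in_data.copyto(ex.arg_arrays[0])  # arguments are listed in order: x, then w
    weights.copyto(ex.arg_arrays[1])
    return ex.forward()[0].asnumpy()

# Reference: run the convolution alone and concatenate on the NumPy side.
ref = np.concatenate((run(conv), in_data.asnumpy()), axis=1)
np.testing.assert_allclose(run(conc), ref, rtol=1e-5, atol=1e-5)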