From 9fe0589a754acf376fd2769bd40642901e279e33 Mon Sep 17 00:00:00 2001
From: Lin Yuan
Date: Wed, 30 Jan 2019 12:11:52 -0800
Subject: [PATCH 01/26] replace with im2col/col2im functions

---
 src/operator/nn/deconvolution-inl.h | 55 +++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index d89a489c0183..a9124110cad7 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -36,6 +36,7 @@
 #include
 #include "../operator_common.h"
 #include "../linalg.h"
+#include "./im2col.h"

 namespace mxnet {
@@ -242,6 +243,7 @@ class DeconvolutionOp {
     }
     auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
     auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
+    auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]});
     auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]});
     auto kernel_size = kernel.Size();
@@ -272,13 +274,24 @@ class DeconvolutionOp {
                                            shape_dstunit_[2] * step), s);
       temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
       if (o_pad[0] == 0 && o_pad[1] == 0) {
-        temp_col = unpack_patch2col(out.Slice(i, i + step),
-                                    kernel[0],
-                                    kernel[1],
-                                    stride[0],
-                                    stride[1],
-                                    dilate[0],
-                                    dilate[1]);
+        // temp_col = unpack_patch2col(out.Slice(i, i + step),
+        //                             kernel[0],
+        //                             kernel[1],
+        //                             stride[0],
+        //                             stride[1],
+        //                             dilate[0],
+        //                             dilate[1]);
+        im2col(
+          s,
+          (out.Slice(i, i+step)).dptr_,
+          out.shape_,
+          temp_col.shape_,
+          kernel,
+          padding,
+          stride,
+          dilate,
+          temp_col.dptr_
+        );
       } else {
         temp_col = unpack_patch2col(pad(out.Slice(i, i + step),
                                         o_pad[0], o_pad[1]),
                                     kernel[0],
                                     kernel[1],
                                     stride[0],
                                     stride[1],
                                     dilate[0],
                                     dilate[1]);
@@ -298,14 +311,26 @@ class DeconvolutionOp {
         linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s);
       }
       if (o_pad[0] == 0 && o_pad[1] == 0) {
-        out.Slice(i, i + step) = pack_col2patch(temp_col,
-                                                out.Slice(i, i + step).shape_,
-                                                kernel[0],
-                                                kernel[1],
-                                                stride[0],
-                                                stride[1],
-                                                dilate[0],
-                                                dilate[1]);
+        // out.Slice(i, i + step) = pack_col2patch(temp_col,
+        //                                         out.Slice(i, i + step).shape_,
+        //                                         kernel[0],
+        //                                         kernel[1],
+        //                                         stride[0],
+        //                                         stride[1],
+        //                                         dilate[0],
+        //                                         dilate[1]);
+        col2im(
+          s,
+          temp_col.dptr_,
+          out.Slice(i, i + step).shape_,
+          temp_col.shape_,
+          kernel,
+          padding,
+          stride,
+          dilate,
+          out.Slice(i, i+step).dptr_,
+          req[deconv::kOut]
+        );
       } else {
         Shape<4> pshape = out.Slice(i, i + step).shape_;
         pshape[2] += 2 * o_pad[0];

From 19dfcb539c25bd8de995fdb304e1a0e1e3dba1f8 Mon Sep 17 00:00:00 2001
From: Lin Yuan
Date: Wed, 30 Jan 2019 15:15:24 -0800
Subject: [PATCH 02/26] fixed padding problem in transpose conv forward

---
 src/operator/nn/deconvolution-inl.h | 101 +++++++++-------------------
 1 file changed, 33 insertions(+), 68 deletions(-)

diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index a9124110cad7..62829fc6ad70 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -227,6 +227,10 @@ class DeconvolutionOp {
     CHECK_EQ(in_data.size(), expected);
     CHECK_EQ(out_data.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
+#if defined(__CUDACC__)
+    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+        << "Must init CuBLAS handle in stream";
+#endif
     auto in_data_shape = in_data[deconv::kData].shape_;
     Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s);
     Tensor<xpu, 4, DType> out = TBlobTo4DTensor(out_data[deconv::kOut], s);
@@ -253,55 
@@ class DeconvolutionOp { param_.num_filter / param_.num_group * kernel_size); Tensor wmat = in_data[deconv::kWeight].get_with_shape(wmat_shape, s); -#if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; -#endif const index_t nbatch = data.size(0); Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( Shape1(this->InitTemp(out.shape_, data.shape_)), s); for (index_t i = 0; i < nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); + // temp_col: (N*kernel_size, OW * OH) Tensor temp_col = Tensor( workspace.dptr_, Shape2(shape_colunit_[0], shape_colunit_[1] * step), s); + // temp_dst: (N, N/n_grup, OW * OH) Tensor temp_dst = Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], shape_dstunit_[1], shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); - if (o_pad[0] == 0 && o_pad[1] == 0) { - // temp_col = unpack_patch2col(out.Slice(i, i + step), - // kernel[0], - // kernel[1], - // stride[0], - // stride[1], - // dilate[0], - // dilate[1]); - im2col( - s, - (out.Slice(i, i+step)).dptr_, - out.shape_, - temp_col.shape_, - kernel, - padding, - stride, - dilate, - temp_col.dptr_ - ); - } else { - temp_col = unpack_patch2col(pad(out.Slice(i, i + step), - o_pad[0], o_pad[1]), - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]); - } + + im2col( + s, + (out.Slice(i, i+step)).dptr_, + out.shape_, + temp_col.shape_, + kernel, + padding, + stride, + dilate, + temp_col.dptr_ + ); + const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, @@ -310,42 +296,21 @@ class DeconvolutionOp { // tmpc = dot(wmat[gid].T(), temp_dst[gid]); linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); } - if (o_pad[0] == 0 && o_pad[1] == 0) { - // out.Slice(i, i + step) = pack_col2patch(temp_col, - // out.Slice(i, i + step).shape_, - // kernel[0], - // kernel[1], - // stride[0], - // stride[1], - // dilate[0], - // dilate[1]); - col2im( - s, - temp_col.dptr_, - out.Slice(i, i + step).shape_, - temp_col.shape_, - kernel, - padding, - stride, - dilate, - out.Slice(i, i+step).dptr_, - req[deconv::kOut] - ); - } else { - Shape<4> pshape = out.Slice(i, i + step).shape_; - pshape[2] += 2 * o_pad[0]; - pshape[3] += 2 * o_pad[1]; - out.Slice(i, i + step) = crop(pack_col2patch(temp_col, - pshape, - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]), - out[i][0].shape_); - } + + col2im( + s, + temp_col.dptr_, + out.Slice(i, i + step).shape_, + temp_col.shape_, + kernel, + padding, + stride, + dilate, + out.Slice(i, i+step).dptr_, + req[deconv::kOut] + ); } + if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); From 747df6cdf65afb55bd241093f3188bb111c519ca Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 15:55:01 -0800 Subject: [PATCH 03/26] fix backward deconvolution --- src/operator/nn/deconvolution-inl.h | 45 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 62829fc6ad70..affe5949293f 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -263,7 +263,7 @@ class DeconvolutionOp { Shape1(this->InitTemp(out.shape_, data.shape_)), s); for (index_t i = 0; i < 
nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); - // temp_col: (N*kernel_size, OW * OH) + // temp_col: (N * kernel_size, OW * OH) Tensor temp_col = Tensor( workspace.dptr_, Shape2(shape_colunit_[0], @@ -293,7 +293,7 @@ class DeconvolutionOp { mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // tmpc = dot(wmat[gid].T(), temp_dst[gid]); + // tmpc = dot(wmat[gid].T(), temp_dst[gid]); linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); } @@ -334,6 +334,10 @@ class DeconvolutionOp { CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); // get data Stream *s = ctx.get_stream(); +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif auto in_data_shape = in_data[deconv::kData].shape_; Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); @@ -352,6 +356,7 @@ class DeconvolutionOp { } auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); + auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); @@ -363,10 +368,6 @@ class DeconvolutionOp { in_data[deconv::kWeight].get_with_shape(wmat_shape, s); Tensor gwmat = in_grad[deconv::kWeight].get_with_shape(wmat_shape, s); -#if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; -#endif const index_t nbatch = data.size(0); Tensor workspace = @@ -384,23 +385,19 @@ class DeconvolutionOp { shape_dstunit_[1], shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); - if (o_pad[0] == 0 && o_pad[1] == 0) { - temp_col = unpack_patch2col(grad.Slice(i, i + step), - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]); - } else { - temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), o_pad[0], o_pad[1]), - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]); - } + + im2col( + s, + (grad.Slice(i, i + step)).dptr_, + grad.shape_, + temp_col.shape_, + kernel, + padding, + stride, + dilate, + temp_col.dptr_ + ); + const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); @@ -421,7 +418,7 @@ class DeconvolutionOp { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // temp_dst[gid] = dot(wmat[gid], tmpc); + // temp_dst[gid] = dot(wmat[gid], tmpc); linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); } Assign(gdata.Slice(i, i + step), From 854cff2275a39c89a0ebbd97c6ba5d724da9d814 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 16:31:54 -0800 Subject: [PATCH 04/26] refactor --- src/operator/nn/deconvolution-inl.h | 41 +++++++++++------------------ 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index affe5949293f..155fe71750f8 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -119,7 
+119,7 @@ struct DeconvolutionParam : public dmlc::Parameter { } template - void InferPad(TShape input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim] ) const { + void InferPad(const TShape &input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim]) const { // Modified by Li.bs // Use tag to control the calculation of pad bool bCal = false; @@ -238,16 +238,13 @@ class DeconvolutionOp { if (param_.kernel.ndim() == 2) { param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); } else { - index_t o_pad_1D[1], o_adj_1D[1]; - param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); - o_pad[0] = 0; - o_pad[1] = o_pad_1D[0]; - o_adj[0] = 0; - o_adj[1] = o_adj_1D[0]; + param_.InferPad({in_data_shape[2]}, o_pad, o_adj); } + auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]}); + auto padding = param_.kernel.ndim() == 2 ? + TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); @@ -290,8 +287,7 @@ class DeconvolutionOp { const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, - gstride * (gid + 1)); + Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: // tmpc = dot(wmat[gid].T(), temp_dst[gid]); linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); @@ -314,7 +310,7 @@ class DeconvolutionOp { if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += mshadow::expr::broadcast<1>(bias, out.shape_); + out += broadcast<1>(bias, out.shape_); } } @@ -342,21 +338,16 @@ class DeconvolutionOp { Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); Tensor gdata = TBlobTo4DTensor(in_grad[deconv::kData], s); - index_t o_pad[2], o_adj[2]; if (param_.kernel.ndim() == 2) { param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); } else { - index_t o_pad_1D[1], o_adj_1D[1]; - param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); - o_pad[0] = 0; - o_pad[1] = o_pad_1D[0]; - o_adj[0] = 0; - o_adj[1] = o_adj_1D[0]; + param_.InferPad({in_data_shape[2]}, o_pad, o_adj); } auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]}); + auto padding = param_.kernel.ndim() == 2 ? + TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); @@ -404,11 +395,11 @@ class DeconvolutionOp { if (i == 0) { Tensor tmp_gwmat = gwmat[gid]; // Legacy approach shown here for comparison: - // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); + // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); } else { // Legacy approach shown here for comparison: - // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); } } @@ -424,10 +415,10 @@ class DeconvolutionOp { Assign(gdata.Slice(i, i + step), req[deconv::kData], (swapaxis<1, 0>(reshape(temp_dst, - mshadow::Shape4(gdata.shape_[1], - step, - gdata.size(2), - gdata.size(3)))))); + Shape4(gdata.shape_[1], + step, + gdata.size(2), + gdata.size(3)))))); } } if (!param_.no_bias) { From 20ae427b17a1d8521583c02d71b5c4967345f254 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 20:32:20 -0800 Subject: [PATCH 05/26] fix lint --- src/operator/nn/deconvolution-inl.h | 9 +++------ tests/python/unittest/test_operator.py | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 155fe71750f8..377f5b7a0850 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -282,8 +282,7 @@ class DeconvolutionOp { padding, stride, dilate, - temp_col.dptr_ - ); + temp_col.dptr_); const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { @@ -303,8 +302,7 @@ class DeconvolutionOp { stride, dilate, out.Slice(i, i+step).dptr_, - req[deconv::kOut] - ); + req[deconv::kOut]); } if (!param_.no_bias) { @@ -386,8 +384,7 @@ class DeconvolutionOp { padding, stride, dilate, - temp_col.dptr_ - ); + temp_col.dptr_); const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 146836c28459..864cd8c77833 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1380,7 +1380,6 @@ def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, targ assert out_shapes[0] == (input_shape[0], 5) + target_shape -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. 
tracked at https://github.com/apache/incubator-mxnet/issues/10973") @with_seed() def test_deconvolution(): # 2D From 926cfd7ca1aaef1200bcf6870a550b12ab013898 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 23:36:24 -0800 Subject: [PATCH 06/26] fix unit test, remove step in deconv --- src/operator/nn/deconvolution-inl.h | 50 ++++++++++++-------------- tests/python/unittest/test_operator.py | 2 +- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 377f5b7a0850..db710c4544ab 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -258,24 +258,24 @@ class DeconvolutionOp { Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( Shape1(this->InitTemp(out.shape_, data.shape_)), s); - for (index_t i = 0; i < nbatch; i += nstep_) { - const index_t step = std::min(nstep_, nbatch - i); + for (index_t i = 0; i < nbatch; ++i) { // temp_col: (N * kernel_size, OW * OH) Tensor temp_col = Tensor( workspace.dptr_, - Shape2(shape_colunit_[0], - shape_colunit_[1] * step), s); + Shape2(shape_colunit_[0], shape_colunit_[1]), + s); // temp_dst: (N, N/n_grup, OW * OH) Tensor temp_dst = Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * step), s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); + shape_dstunit_[1], + shape_dstunit_[2]), + s); + temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( s, - (out.Slice(i, i+step)).dptr_, + (out.Slice(i, i + 1)).dptr_, out.shape_, temp_col.shape_, kernel, @@ -295,13 +295,13 @@ class DeconvolutionOp { col2im( s, temp_col.dptr_, - out.Slice(i, i + step).shape_, + out.Slice(i, i + 1).shape_, temp_col.shape_, kernel, padding, stride, dilate, - out.Slice(i, i+step).dptr_, + out.Slice(i, i + 1).dptr_, req[deconv::kOut]); } @@ -362,22 +362,22 @@ class DeconvolutionOp { Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( Shape1(this->InitTemp(grad.shape_, data.shape_)), s); - for (index_t i = 0; i < nbatch; i += nstep_) { - const index_t step = std::min(nstep_, nbatch - i); + for (index_t i = 0; i < nbatch; ++i) { Tensor temp_col = Tensor( workspace.dptr_, - Shape2(shape_colunit_[0], - shape_colunit_[1] * step), s); + Shape2(shape_colunit_[0], shape_colunit_[1]), + s); Tensor temp_dst = Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * step), s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); + shape_dstunit_[1], + shape_dstunit_[2]), + s); + temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( s, - (grad.Slice(i, i + step)).dptr_, + (grad.Slice(i, i + 1)).dptr_, grad.shape_, temp_col.shape_, kernel, @@ -409,11 +409,11 @@ class DeconvolutionOp { // temp_dst[gid] = dot(wmat[gid], tmpc); linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); } - Assign(gdata.Slice(i, i + step), + Assign(gdata.Slice(i, i + 1), req[deconv::kData], (swapaxis<1, 0>(reshape(temp_dst, Shape4(gdata.shape_[1], - step, + 1, gdata.size(2), gdata.size(3)))))); } @@ -433,17 +433,12 @@ class DeconvolutionOp { shape_dstunit_ = mshadow::Shape3(param_.num_group, oshape[1] / param_.num_group, oshape[2] * oshape[3]); - // See convolution for workspace calculations. 
nstep_ will be the effective batch size - nstep_ = std::max( - std::min(static_cast(param_.workspace) / - (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), - 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1] * nstep_); + shape_colunit_[1]); mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], shape_dstunit_[1], - shape_dstunit_[2] * nstep_); + shape_dstunit_[2]); index_t required_size = scol.Size() + sdst.Size(); return required_size; } @@ -460,7 +455,6 @@ class DeconvolutionOp { DeconvolutionParam param_; mshadow::Shape<2> shape_colunit_; mshadow::Shape<3> shape_dstunit_; - index_t nstep_; }; // class DeconvolutionOp template diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 864cd8c77833..8246f95b44ba 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1256,7 +1256,7 @@ def test_abs(): assert_almost_equal(out, npout) out_grad = mx.nd.empty(shape) - out_grad[:] = 2; + out_grad[:] = 2 npout_grad = out_grad.asnumpy() npout_grad = npout_grad * np.sign(data_tmp) exe_test.backward(out_grad) From c49dbe10c4cb02a3025ccc68da2ec5f68082be9d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 31 Jan 2019 00:23:42 -0800 Subject: [PATCH 07/26] add unit test --- tests/python/unittest/test_gluon.py | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index abe6b136fe0c..277bb2ba703c 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -503,6 +503,43 @@ def test_deconv(): # layer = nn.Conv3DTranspose(16, (3, 3, 3), layout='NDHWC', in_channels=4) # # check_layer_forward(layer, (1, 10, 10, 10, 4)) +@with_seed() +def test_deconv_dilation(): + data = mx.nd.array((((0,0,0), + (0,1,0), + (0,0,0) + ), + ((0,0,0), + (0,2,0), + (0,0,0) + ) + ) + ) + kernel = mx.nd.array(((1,2,3), + (4,5,6), + (7,8,9))) + + data_batch = data.expand_dims(1) + weight = kernel.expand_dims(0).expand_dims(0) + layer = nn.Conv2DTranspose(in_channels=1, channels=1, + kernel_size=(3,3), padding=(1,1), + strides=(1,1), dilation=(2,2)) + layer.initialize() + layer.weight.set_data(weight) + out = layer(data_batch).asnumpy() + expected = np.array([[[[1.,0.,2.,0.,3.], + [0.,0.,0.,0.,0.], + [4.,0.,5.,0.,6.], + [0.,0.,0.,0.,0.], + [7.,0.,8.,0.,9.]]], + [[[2.,0.,4.,0.,6.], + [0.,0.,0.,0.,0.], + [8.,0.,10.,0.,12.], + [0.,0.,0.,0.,0.], + [14.,0.,16.,0.,18.]]] + ]) + assert_almost_equal(out, expected) + @with_seed() def test_pool(): From afd75d139fcbefa68edc9b2ddcdafdb38bc9e529 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 31 Jan 2019 00:26:57 -0800 Subject: [PATCH 08/26] refactor --- tests/python/unittest/test_gluon.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 277bb2ba703c..35b6fa899e8e 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -507,14 +507,11 @@ def test_deconv(): def test_deconv_dilation(): data = mx.nd.array((((0,0,0), (0,1,0), - (0,0,0) - ), + (0,0,0)), ((0,0,0), (0,2,0), - (0,0,0) - ) - ) - ) + (0,0,0)))) + kernel = mx.nd.array(((1,2,3), (4,5,6), (7,8,9))) From 5b5909712696554a4592b2ab0b93f9a62aa7927f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 31 Jan 2019 00:46:57 -0800 Subject: [PATCH 09/26] fix build error --- src/operator/nn/deconvolution-inl.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index db710c4544ab..0947b63d5daa 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -308,7 +308,7 @@ class DeconvolutionOp { if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += broadcast<1>(bias, out.shape_); + out += mshadow::expr::broadcast<1>(bias, out.shape_); } } From d1554c1d097baeb47ccc5ead4e9057274effc3b8 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 28 Jan 2019 18:19:35 +0000 Subject: [PATCH 10/26] Revert "Aggregate SGD (#13346)" This reverts commit 0a45e1a222637c7dee29511cbfc43e594571933b. --- cpp-package/scripts/OpWrapperGenerator.py | 4 +- docs/faq/env_var.md | 9 - python/mxnet/gluon/trainer.py | 15 +- python/mxnet/model.py | 10 +- python/mxnet/optimizer/optimizer.py | 231 ++++----------- src/operator/optimizer_op-inl.h | 295 -------------------- src/operator/optimizer_op.cc | 193 +------------ src/operator/optimizer_op.cu | 9 - tests/python/unittest/test_gluon_trainer.py | 8 +- tests/python/unittest/test_module.py | 3 - 10 files changed, 66 insertions(+), 711 deletions(-) diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index 65ba247c25c8..ca430ec99e6e 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,8 +97,7 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional',\ 'Shape or None':'dmlc::optional',\ - 'string':'const std::string&',\ - 'tuple of ':'nnvm::Tuple'} + 'string':'const std::string&'} name = '' type = '' description = '' @@ -408,7 +407,6 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" - "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 99ebae21d61f..98057d0d76d6 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -145,10 +145,6 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. -* MXNET_UPDATE_ON_KVSTORE - - Values: 0(false) or 1(true) ```(default=1)``` - - If true, weight updates are performed during the communication step, if possible. - ## Memonger * MXNET_BACKWARD_DO_MIRROR @@ -222,11 +218,6 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count. - When the array size is less than this threshold, NDArray::Copy(from , to)) is implemented by memcpy in single thread. -* MXNET_OPTIMIZER_AGGREGATION_SIZE - - Values: Int ```(default=4)``` - - Maximum value is 60. - - This variable controls how many weights will be updated in a single call to optimizer (for optimizers that support aggregation, currently limited to SGD). 
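
For reference, the two `MXNET_*` variables deleted in the hunk above interact: per the `SGD` docstring removed later in this revert, aggregation only takes effect when updates run outside the kvstore. A minimal sketch of driving both knobs on a build that still carries the aggregated-SGD code (the variable names and the `aggregate_num` attribute come from the hunks in this series; the surrounding script is illustrative only):

    import os

    # Let updates run outside the kvstore so aggregation can take effect.
    os.environ['MXNET_UPDATE_ON_KVSTORE'] = '0'
    # Update up to 8 weights per optimizer call (default 4, maximum 60).
    os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = '8'

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1)
    print(opt.aggregate_num)  # -> 8 on a pre-revert build
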
- Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 8060f38ac2aa..f6c0a31b52e2 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,8 +60,7 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + suitable option depending on the type of kvstore. Properties ---------- @@ -394,8 +393,6 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): - updates = [[] for _ in self._updaters] - for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -419,17 +416,11 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd.append((i, grad, arr)) + upd(i, grad, arr) arr._fresh_grad = False - if not (self._kvstore and self._update_on_kvstore): - for updater, upd in zip(self._updaters, updates): - if upd: - i, w, g = zip(*upd) - updater(i, w, g) - def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index c08077cc65f4..38fe739154d5 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. 
""" - update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) + update_on_kvstore = True if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device == 1 and 'dist' not in kvstore: + if num_device is 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,7 +162,6 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" - updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -179,10 +178,7 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updates[k].append((index*num_device+k, g, w)) - for dev_updates in updates: - i, w, g = zip(*dev_updates) - updater(i, w, g) + updater(index*num_device+k, g, w) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index cb52ac54fdab..6ffbbcffc384 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,15 +22,12 @@ import math import pickle import warnings -import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update) + signsgd_update, signum_update) from ..ndarray import sparse from ..random import normal @@ -40,8 +37,6 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -110,7 +105,6 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -386,44 +380,13 @@ def _update_count(self, index): Parameters ---------- - index : int or list of int + index : int The index to be updated. """ - if not isinstance(index, (list, tuple)): - index = [index] - for idx in index: - if idx not in self._index_update_count: - self._index_update_count[idx] = self.begin_num_update - self._index_update_count[idx] += 1 - self.num_update = max(self._index_update_count[idx], self.num_update) - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. - - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. 
- """ - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs + if index not in self._index_update_count: + self._index_update_count[index] = self.begin_num_update + self._index_update_count[index] += 1 + self.num_update = max(self._index_update_count[index], self.num_update) def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -438,31 +401,18 @@ def _get_lr(self, index): lr : float Learning rate for this index. """ - return self._get_lrs([index])[0] - - def _get_wds(self, indices): - """Gets weight decays for indices. - Returns 0 for non-weights if the name of weights are provided for `__init__`. - - Parameters - ---------- - indices : list of int - Indices of weights. + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr - Returns - ------- - wds : list of float - Weight decays for those indices. - """ - wds = [self.wd for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - wds[i] *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wds[i] *= self.wd_mult[index] - elif index in self.idx2name: - wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) - return wds + if index in self.param_dict: + lr *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lr *= self.lr_mult[index] + elif index in self.idx2name: + lr *= self.lr_mult.get(self.idx2name[index], 1.0) + return lr def _get_wd(self, index): """Gets weight decay for index. @@ -471,14 +421,21 @@ def _get_wd(self, index): Parameters ---------- index : int - The index of weight. + The index for weight. Returns ------- wd : float Weight decay for this index. """ - return self._get_wds([index])[0] + wd = self.wd + if index in self.param_dict: + wd *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wd *= self.wd_mult[index] + elif index in self.idx2name: + wd *= self.wd_mult.get(self.idx2name[index], 1.0) + return wd def __getstate__(self): ret = self.__dict__.copy() @@ -514,13 +471,6 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. 
- Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -552,7 +502,6 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -573,22 +522,12 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) + def _update_impl(self, index, weight, grad, state, multi_precision=False): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -596,49 +535,26 @@ def _update_impl(self, indices, weights, grads, states, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) else: - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = 
self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -1609,55 +1525,20 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [grad] - weights = [weight] - else: - indices = index - grads = grad - weights = weight - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - self.optimizer.update_multi_precision(i, w, g, self.states[i]) + # convert ctypes.char_p.value back to python str if needed + if isinstance(index, bytes): + index = py_str(index) + if index not in self.states: + self.states[index] = self.optimizer.create_state_multi_precision(index, weight) + self.states_synced[index] = True + elif not self.states_synced[index]: + self.states[index] = \ + self.sync_state_context(self.states[index], weight.context) + self.states_synced[index] = True + self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) def sync_state_context(self, state, context): """sync state context.""" diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 223a1aa6c37d..9251b8614806 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,301 +82,6 @@ struct SGDParam : public dmlc::Parameter { } }; -struct MultiSGDParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. 
" - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -struct MultiSGDMomParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float momentum; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. " - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(momentum) - .set_default(0.0f) - .describe("The decay rate of momentum estimates at each epoch."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -template -inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_shapes = *in_attrs; - auto& output_shapes = *out_attrs; - // Learning rates - CHECK_EQ(param.lrs.ndim(), param.num_weights) - << "Number of learning rates is inconsistent with num_weights " - << "parameter passed. Expected number of learning rates: " - << param.num_weights << ", and got " << param.lrs.ndim(); - // Weight decays - CHECK_EQ(param.wds.ndim(), param.num_weights) - << "Number of weight decays is inconsistent with num_weights " - << "parameter passed. 
Expected number of weight decays: " - << param.num_weights << ", and got " << param.wds.ndim(); - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_shapes[i]}); - for (int j = 0; j < input_stride; ++j) { - input_vec.push_back(input_shapes[i * input_stride + j]); - } - all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); - } - return all_inferred; -} - -template -inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_types = *in_attrs; - auto& output_types = *out_attrs; - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_types[i]}); - for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { - input_vec.push_back(input_types[i * input_stride + j]); - } - all_inferred = all_inferred && - ElemwiseType(attrs, &input_vec, &output_vec); - } - // master copies of weights - for (int i = 0; i < param.num_weights; ++i) { - for (int j = 0; j < num_fp32_inputs; ++j) { - TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); - } - } - return all_inferred; -} - -template -struct MultiSGDKernelParam { - static const int N = 60; - int count; - size_t max_size; - size_t sizes[N]; - DType * weights[N]; - DType * grads[N]; - MPDType * mom[N]; - MPDType * weights32[N]; - DType * out_data[N]; - MPDType lrs[N]; - MPDType wds[N]; - MPDType clip_gradient; - MPDType rescale_grad; - MPDType momentum; -}; - -template -struct MultiSGDKernel { - template - MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, - const OpReqType req) { - for (int index = 0; index < param.count; ++index) { - if ((size_t)i < param.sizes[index]) { - MPDType w = has_mixed_precision ? param.weights32[index][i] : - MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); - if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); - } - if (has_momentum) { - param.mom[index][i] = mom; - } - w = w + mom; - if (has_mixed_precision) { - param.weights32[index][i] = w; - } - KERNEL_ASSIGN(param.out_data[index][i], req, w); - } - } - } -}; - -template -MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const ParamType& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param; - param.clip_gradient = p.clip_gradient; - param.rescale_grad = p.rescale_grad; - param.momentum = 0; - param.count = p.num_weights; - param.max_size = 0; - for (int i = 0; i < param.count; ++i) { - param.sizes[i] = inputs[i * input_stride].shape_.Size(); - if (param.max_size < param.sizes[i]) { - param.max_size = param.sizes[i]; - } - param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; - param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; - // if mixed precision, then the last input in a set - // is 32-bit master copy of the weights - if (!std::is_same::value) { - param.weights32[i] = inputs[i * input_stride + input_stride - 1] - .FlatTo2D(s).dptr_; - } - param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; - param.lrs[i] = p.lrs[i]; - param.wds[i] = p.wds[i]; - } - - return param; -} - - -template -MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const MultiSGDMomParam& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - param.momentum = p.momentum; - for (int i = 0; i < param.count; ++i) { - param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; - } - - return param; -} - -template -class type_identity { - public: - using type = T; -}; - -template -class single_precision { - public: - using type = float; -}; - -template class MPTypeChooser, int input_stride> -inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} - -template class MPTypeChooser, int input_stride> -inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} struct 
SGDKernel { template diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 982995ad2f95..a52a6f32907c 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -31,8 +31,6 @@ namespace op { DMLC_REGISTER_PARAMETER(SGDParam); DMLC_REGISTER_PARAMETER(SGDMomParam); -DMLC_REGISTER_PARAMETER(MultiSGDParam); -DMLC_REGISTER_PARAMETER(MultiSGDMomParam); DMLC_REGISTER_PARAMETER(FTMLParam); DMLC_REGISTER_PARAMETER(AdamParam); DMLC_REGISTER_PARAMETER(RMSPropParam); @@ -54,7 +52,7 @@ It updates the weights using:: weight = weight - learning_rate * sign(gradient) -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(2) @@ -83,7 +81,7 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(3) @@ -315,193 +313,6 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -NNVM_REGISTER_OP(multi_sgd_update) -.describe(R"code(Update function for Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 2); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_sgd_mom_update) -.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
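
The update rule quoted above can be checked by hand; this NumPy sketch mirrors the per-weight arithmetic of `MultiSGDKernel` in the `optimizer_op-inl.h` hunk (`mom = momentum*mom - lr*wd*w - lr*rescale_grad*grad`, then `w += mom`), with made-up values:

    import numpy as np

    momentum, lr, wd, rescale_grad = 0.9, 0.1, 0.0, 1.0
    w = np.array([1.0, -2.0])
    grad = np.array([0.5, 0.5])
    mom = np.zeros_like(w)

    # mom accumulates the (negative) step, exactly as in the kernel.
    mom = momentum * mom - lr * wd * w - lr * rescale_grad * grad
    w = w + mom
    print(mom)  # [-0.05 -0.05]
    print(w)    # [ 0.95 -2.05]
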
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum") -.add_arguments(MultiSGDMomParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_update) -.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
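
The flattened input layout of these multi-weight ops follows from the `FListInputNames` blocks above: `weight_i, grad_i, mom_i[, weight32_i]` repeated per weight, which is exactly what `_flatten_list(zip(...))` builds on the Python side. A hedged sketch of one aggregated call on a pre-revert build (shapes and values are made up):

    import mxnet as mx

    w1, w2 = mx.nd.ones((2,)), mx.nd.ones((3,))
    g1, g2 = mx.nd.ones((2,)) * 0.5, mx.nd.ones((3,)) * 0.5
    m1, m2 = mx.nd.zeros((2,)), mx.nd.zeros((3,))

    # weight/grad/mom triples flattened in order; one lr/wd per weight.
    mx.nd.multi_sgd_mom_update(w1, g1, m1, w2, g2, m2,
                               lrs=(0.1, 0.1), wds=(0.0, 0.0),
                               momentum=0.9, num_weights=2,
                               out=[w1, w2])
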
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 4); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 4 + 2); - ret.push_back(i * 4 + 3); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDMomParam::__FIELDS__()); NNVM_REGISTER_OP(sgd_update) MXNET_ADD_SPARSE_OP_ALIAS(sgd_update) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index c42cf1831c43..0fd2ca83fda4 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -242,15 +242,6 @@ NNVM_REGISTER_OP(mp_sgd_update) NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); -NNVM_REGISTER_OP(multi_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); - NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 9f190a0a88c2..985c38c31356 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,7 +17,6 @@ import mxnet as mx import unittest -import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -99,9 +98,6 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') - x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -116,7 +112,6 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -241,11 +236,10 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] - global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - 
check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) + check_trainer_sparse_kv(kv, 'default', 'default', None, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index ae38a2297ded..144fbeef213f 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -174,8 +174,6 @@ def test_module_layout(): @with_seed() def test_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -213,7 +211,6 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) dict_equ(mod._kvstore._updater.states, mod2._updater.states) - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() From fabc318a0ff7e9b22371e475edf0e3249f4d8b94 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 28 Jan 2019 18:19:35 +0000 Subject: [PATCH 11/26] Revert "Aggregate SGD (#13346)" This reverts commit 0a45e1a222637c7dee29511cbfc43e594571933b. --- cpp-package/scripts/OpWrapperGenerator.py | 4 +- docs/faq/env_var.md | 4 - python/mxnet/gluon/trainer.py | 15 +- python/mxnet/model.py | 10 +- python/mxnet/optimizer/optimizer.py | 231 ++++----------- src/operator/optimizer_op-inl.h | 295 -------------------- src/operator/optimizer_op.cc | 193 +------------ src/operator/optimizer_op.cu | 9 - tests/python/unittest/test_gluon_trainer.py | 8 +- tests/python/unittest/test_module.py | 3 - 10 files changed, 66 insertions(+), 706 deletions(-) diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index 65ba247c25c8..ca430ec99e6e 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,8 +97,7 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional',\ 'Shape or None':'dmlc::optional',\ - 'string':'const std::string&',\ - 'tuple of ':'nnvm::Tuple'} + 'string':'const std::string&'} name = '' type = '' description = '' @@ -408,7 +407,6 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" - "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 83368bf4d0c3..bb29cc410c18 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -145,10 +145,6 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. -* MXNET_UPDATE_ON_KVSTORE - - Values: 0(false) or 1(true) ```(default=1)``` - - If true, weight updates are performed during the communication step, if possible. - ## Memonger * MXNET_BACKWARD_DO_MIRROR diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 8060f38ac2aa..f6c0a31b52e2 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,8 +60,7 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. 
If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + suitable option depending on the type of kvstore. Properties ---------- @@ -394,8 +393,6 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): - updates = [[] for _ in self._updaters] - for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -419,17 +416,11 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd.append((i, grad, arr)) + upd(i, grad, arr) arr._fresh_grad = False - if not (self._kvstore and self._update_on_kvstore): - for updater, upd in zip(self._updaters, updates): - if upd: - i, w, g = zip(*upd) - updater(i, w, g) - def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index c08077cc65f4..38fe739154d5 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. """ - update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) + update_on_kvstore = True if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device == 1 and 'dist' not in kvstore: + if num_device is 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,7 +162,6 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" - updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -179,10 +178,7 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updates[k].append((index*num_device+k, g, w)) - for dev_updates in updates: - i, w, g = zip(*dev_updates) - updater(i, w, g) + updater(index*num_device+k, g, w) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index cb52ac54fdab..6ffbbcffc384 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,15 +22,12 @@ import math import pickle import warnings -import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update) + signsgd_update, signum_update) from ..ndarray 
import sparse from ..random import normal @@ -40,8 +37,6 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -110,7 +105,6 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -386,44 +380,13 @@ def _update_count(self, index): Parameters ---------- - index : int or list of int + index : int The index to be updated. """ - if not isinstance(index, (list, tuple)): - index = [index] - for idx in index: - if idx not in self._index_update_count: - self._index_update_count[idx] = self.begin_num_update - self._index_update_count[idx] += 1 - self.num_update = max(self._index_update_count[idx], self.num_update) - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. - - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. - """ - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs + if index not in self._index_update_count: + self._index_update_count[index] = self.begin_num_update + self._index_update_count[index] += 1 + self.num_update = max(self._index_update_count[index], self.num_update) def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -438,31 +401,18 @@ def _get_lr(self, index): lr : float Learning rate for this index. """ - return self._get_lrs([index])[0] - - def _get_wds(self, indices): - """Gets weight decays for indices. - Returns 0 for non-weights if the name of weights are provided for `__init__`. - - Parameters - ---------- - indices : list of int - Indices of weights. + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr - Returns - ------- - wds : list of float - Weight decays for those indices. - """ - wds = [self.wd for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - wds[i] *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wds[i] *= self.wd_mult[index] - elif index in self.idx2name: - wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) - return wds + if index in self.param_dict: + lr *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lr *= self.lr_mult[index] + elif index in self.idx2name: + lr *= self.lr_mult.get(self.idx2name[index], 1.0) + return lr def _get_wd(self, index): """Gets weight decay for index. @@ -471,14 +421,21 @@ def _get_wd(self, index): Parameters ---------- index : int - The index of weight. + The index for weight. Returns ------- wd : float Weight decay for this index. 
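For illustration (hypothetical parameter names; ``wd_mult`` entries keyed by
name are resolved through ``param_idx2name``)::

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1, wd=1e-4,
                           param_idx2name={0: 'fc1_weight', 1: 'fc1_bias'})
    opt.wd_mult['fc1_bias'] = 0.0      # common practice: no decay on biases
    assert opt._get_wd(1) == 0.0
    assert opt._get_wd(0) == 1e-4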
""" - return self._get_wds([index])[0] + wd = self.wd + if index in self.param_dict: + wd *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wd *= self.wd_mult[index] + elif index in self.idx2name: + wd *= self.wd_mult.get(self.idx2name[index], 1.0) + return wd def __getstate__(self): ret = self.__dict__.copy() @@ -514,13 +471,6 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. - Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -552,7 +502,6 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -573,22 +522,12 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) + def _update_impl(self, index, weight, grad, state, multi_precision=False): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -596,49 +535,26 @@ def _update_impl(self, indices, weights, grads, states, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) else: - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - 
sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -1609,55 +1525,20 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [grad] - weights = [weight] - else: - indices = index - grads = grad - weights = weight - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - self.optimizer.update_multi_precision(i, w, g, self.states[i]) + # convert ctypes.char_p.value back to python str if needed + if isinstance(index, bytes): + index = py_str(index) + if index not in self.states: + self.states[index] = self.optimizer.create_state_multi_precision(index, weight) + self.states_synced[index] = True + elif not self.states_synced[index]: + self.states[index] = \ + self.sync_state_context(self.states[index], weight.context) + self.states_synced[index] = True + self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) def 
sync_state_context(self, state, context): """sync state context.""" diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 223a1aa6c37d..9251b8614806 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,301 +82,6 @@ struct SGDParam : public dmlc::Parameter { } }; -struct MultiSGDParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. " - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -struct MultiSGDMomParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float momentum; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. " - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(momentum) - .set_default(0.0f) - .describe("The decay rate of momentum estimates at each epoch."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -template -inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_shapes = *in_attrs; - auto& output_shapes = *out_attrs; - // Learning rates - CHECK_EQ(param.lrs.ndim(), param.num_weights) - << "Number of learning rates is inconsistent with num_weights " - << "parameter passed. Expected number of learning rates: " - << param.num_weights << ", and got " << param.lrs.ndim(); - // Weight decays - CHECK_EQ(param.wds.ndim(), param.num_weights) - << "Number of weight decays is inconsistent with num_weights " - << "parameter passed. 
Expected number of weight decays: " - << param.num_weights << ", and got " << param.wds.ndim(); - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_shapes[i]}); - for (int j = 0; j < input_stride; ++j) { - input_vec.push_back(input_shapes[i * input_stride + j]); - } - all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); - } - return all_inferred; -} - -template -inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_types = *in_attrs; - auto& output_types = *out_attrs; - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_types[i]}); - for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { - input_vec.push_back(input_types[i * input_stride + j]); - } - all_inferred = all_inferred && - ElemwiseType(attrs, &input_vec, &output_vec); - } - // master copies of weights - for (int i = 0; i < param.num_weights; ++i) { - for (int j = 0; j < num_fp32_inputs; ++j) { - TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); - } - } - return all_inferred; -} - -template -struct MultiSGDKernelParam { - static const int N = 60; - int count; - size_t max_size; - size_t sizes[N]; - DType * weights[N]; - DType * grads[N]; - MPDType * mom[N]; - MPDType * weights32[N]; - DType * out_data[N]; - MPDType lrs[N]; - MPDType wds[N]; - MPDType clip_gradient; - MPDType rescale_grad; - MPDType momentum; -}; - -template -struct MultiSGDKernel { - template - MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, - const OpReqType req) { - for (int index = 0; index < param.count; ++index) { - if ((size_t)i < param.sizes[index]) { - MPDType w = has_mixed_precision ? param.weights32[index][i] : - MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); - if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); - } - if (has_momentum) { - param.mom[index][i] = mom; - } - w = w + mom; - if (has_mixed_precision) { - param.weights32[index][i] = w; - } - KERNEL_ASSIGN(param.out_data[index][i], req, w); - } - } - } -}; - -template -MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const ParamType& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param; - param.clip_gradient = p.clip_gradient; - param.rescale_grad = p.rescale_grad; - param.momentum = 0; - param.count = p.num_weights; - param.max_size = 0; - for (int i = 0; i < param.count; ++i) { - param.sizes[i] = inputs[i * input_stride].shape_.Size(); - if (param.max_size < param.sizes[i]) { - param.max_size = param.sizes[i]; - } - param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; - param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; - // if mixed precision, then the last input in a set - // is 32-bit master copy of the weights - if (!std::is_same::value) { - param.weights32[i] = inputs[i * input_stride + input_stride - 1] - .FlatTo2D(s).dptr_; - } - param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; - param.lrs[i] = p.lrs[i]; - param.wds[i] = p.wds[i]; - } - - return param; -} - - -template -MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const MultiSGDMomParam& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - param.momentum = p.momentum; - for (int i = 0; i < param.count; ++i) { - param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; - } - - return param; -} - -template -class type_identity { - public: - using type = T; -}; - -template -class single_precision { - public: - using type = float; -}; - -template class MPTypeChooser, int input_stride> -inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} - -template class MPTypeChooser, int input_stride> -inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} struct 
SGDKernel { template diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 982995ad2f95..a52a6f32907c 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -31,8 +31,6 @@ namespace op { DMLC_REGISTER_PARAMETER(SGDParam); DMLC_REGISTER_PARAMETER(SGDMomParam); -DMLC_REGISTER_PARAMETER(MultiSGDParam); -DMLC_REGISTER_PARAMETER(MultiSGDMomParam); DMLC_REGISTER_PARAMETER(FTMLParam); DMLC_REGISTER_PARAMETER(AdamParam); DMLC_REGISTER_PARAMETER(RMSPropParam); @@ -54,7 +52,7 @@ It updates the weights using:: weight = weight - learning_rate * sign(gradient) -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(2) @@ -83,7 +81,7 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(3) @@ -315,193 +313,6 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -NNVM_REGISTER_OP(multi_sgd_update) -.describe(R"code(Update function for Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 2); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_sgd_mom_update) -.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
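Equivalently, over the whole set of ``num_weights`` arrays (a simplified
per-array sketch that ignores ``rescale_grad`` and ``clip_gradient``)::

    def multi_sgd_mom_step(weights, grads, moms, lrs, wds, momentum):
        # one fused kernel launch in the real operator; a plain loop here
        for w, g, v, lr, wd in zip(weights, grads, moms, lrs, wds):
            v[:] = momentum * v - lr * wd * w - lr * g
            w += v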
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum") -.add_arguments(MultiSGDMomParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_update) -.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 4); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 4 + 2); - ret.push_back(i * 4 + 3); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDMomParam::__FIELDS__()); NNVM_REGISTER_OP(sgd_update) MXNET_ADD_SPARSE_OP_ALIAS(sgd_update) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index c42cf1831c43..0fd2ca83fda4 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -242,15 +242,6 @@ NNVM_REGISTER_OP(mp_sgd_update) NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); -NNVM_REGISTER_OP(multi_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); - NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 9f190a0a88c2..985c38c31356 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,7 +17,6 @@ import mxnet as mx import unittest -import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -99,9 +98,6 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') - x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -116,7 +112,6 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -241,11 +236,10 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] - global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - 
check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) + check_trainer_sparse_kv(kv, 'default', 'default', None, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index ae38a2297ded..144fbeef213f 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -174,8 +174,6 @@ def test_module_layout(): @with_seed() def test_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -213,7 +211,6 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) dict_equ(mod._kvstore._updater.states, mod2._updater.states) - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() From a649f670be13d85689d3fb1236cfb6cc562436b3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 14 Feb 2019 07:40:12 -0800 Subject: [PATCH 12/26] Revert "Revert "Aggregate SGD (#13346)"" This reverts commit fabc318a0ff7e9b22371e475edf0e3249f4d8b94. --- cpp-package/scripts/OpWrapperGenerator.py | 4 +- docs/faq/env_var.md | 4 + python/mxnet/gluon/trainer.py | 15 +- python/mxnet/model.py | 10 +- python/mxnet/optimizer/optimizer.py | 231 +++++++++++---- src/operator/optimizer_op-inl.h | 295 ++++++++++++++++++++ src/operator/optimizer_op.cc | 193 ++++++++++++- src/operator/optimizer_op.cu | 9 + tests/python/unittest/test_gluon_trainer.py | 8 +- tests/python/unittest/test_module.py | 3 + 10 files changed, 706 insertions(+), 66 deletions(-) diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index ca430ec99e6e..65ba247c25c8 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,7 +97,8 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional',\ 'Shape or None':'dmlc::optional',\ - 'string':'const std::string&'} + 'string':'const std::string&',\ + 'tuple of ':'nnvm::Tuple'} name = '' type = '' description = '' @@ -407,6 +408,7 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" + "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 87882baa4f6b..c35d4e5723a5 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -162,6 +162,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. +* MXNET_UPDATE_ON_KVSTORE + - Values: 0(false) or 1(true) ```(default=1)``` + - If true, weight updates are performed during the communication step, if possible. + ## Memonger * MXNET_BACKWARD_DO_MIRROR diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index f6c0a31b52e2..8060f38ac2aa 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,7 +60,8 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. 
If None, then trainer will choose the more - suitable option depending on the type of kvstore. + suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is + provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. Properties ---------- @@ -393,6 +394,8 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): + updates = [[] for _ in self._updaters] + for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -416,11 +419,17 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd(i, grad, arr) + upd.append((i, grad, arr)) arr._fresh_grad = False + if not (self._kvstore and self._update_on_kvstore): + for updater, upd in zip(self._updaters, updates): + if upd: + i, w, g = zip(*upd) + updater(i, w, g) + def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 38fe739154d5..c08077cc65f4 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. """ - update_on_kvstore = True + update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device is 1 and 'dist' not in kvstore: + if num_device == 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,6 +162,7 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" + updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -178,7 +179,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updater(index*num_device+k, g, w) + updates[k].append((index*num_device+k, g, w)) + for dev_updates in updates: + i, w, g = zip(*dev_updates) + updater(i, w, g) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 6ffbbcffc384..cb52ac54fdab 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,12 +22,15 @@ import math import pickle import warnings +import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update) + signsgd_update, signum_update, + multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, + multi_mp_sgd_mom_update) from ..ndarray 
import sparse from ..random import normal @@ -37,6 +40,8 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] +def _flatten_list(nested_list): + return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -105,6 +110,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision + self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -380,13 +386,44 @@ def _update_count(self, index): Parameters ---------- - index : int + index : int or list of int The index to be updated. """ - if index not in self._index_update_count: - self._index_update_count[index] = self.begin_num_update - self._index_update_count[index] += 1 - self.num_update = max(self._index_update_count[index], self.num_update) + if not isinstance(index, (list, tuple)): + index = [index] + for idx in index: + if idx not in self._index_update_count: + self._index_update_count[idx] = self.begin_num_update + self._index_update_count[idx] += 1 + self.num_update = max(self._index_update_count[idx], self.num_update) + + def _get_lrs(self, indices): + """Gets the learning rates given the indices of the weights. + + Parameters + ---------- + indices : list of int + Indices corresponding to weights. + + Returns + ------- + lrs : list of float + Learning rates for those indices. + """ + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr + + lrs = [lr for _ in indices] + for i, index in enumerate(indices): + if index in self.param_dict: + lrs[i] *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lrs[i] *= self.lr_mult[index] + elif index in self.idx2name: + lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) + return lrs def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -401,18 +438,31 @@ def _get_lr(self, index): lr : float Learning rate for this index. """ - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr + return self._get_lrs([index])[0] - if index in self.param_dict: - lr *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lr *= self.lr_mult[index] - elif index in self.idx2name: - lr *= self.lr_mult.get(self.idx2name[index], 1.0) - return lr + def _get_wds(self, indices): + """Gets weight decays for indices. + Returns 0 for non-weights if the name of weights are provided for `__init__`. + + Parameters + ---------- + indices : list of int + Indices of weights. + + Returns + ------- + wds : list of float + Weight decays for those indices. + """ + wds = [self.wd for _ in indices] + for i, index in enumerate(indices): + if index in self.param_dict: + wds[i] *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wds[i] *= self.wd_mult[index] + elif index in self.idx2name: + wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) + return wds def _get_wd(self, index): """Gets weight decay for index. @@ -421,21 +471,14 @@ def _get_wd(self, index): Parameters ---------- index : int - The index for weight. + The index of weight. Returns ------- wd : float Weight decay for this index. 
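After this change the scalar getters are thin wrappers over the batched ones,
so the two forms agree element-wise; a quick check::

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1, wd=1e-4)
    assert opt._get_lrs([0, 1]) == [opt._get_lr(0), opt._get_lr(1)]
    assert opt._get_wds([0, 1]) == [opt._get_wd(0), opt._get_wd(1)]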
""" - wd = self.wd - if index in self.param_dict: - wd *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wd *= self.wd_mult[index] - elif index in self.idx2name: - wd *= self.wd_mult.get(self.idx2name[index], 1.0) - return wd + return self._get_wds([index])[0] def __getstate__(self): ret = self.__dict__.copy() @@ -471,6 +514,13 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. + In the case when ``update_on_kvstore`` is set to False (either globally via + MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in + :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update + of parameters, which may lead to improved performance. The aggregation size + is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and + defaults to 4. + Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -502,6 +552,7 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update + self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -522,12 +573,22 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, index, weight, grad, state, multi_precision=False): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) + def _update_impl(self, indices, weights, grads, states, multi_precision=False): + aggregate = True + if not isinstance(indices, (tuple, list)): + indices = [indices] + weights = [weights] + grads = [grads] + states = [states] + for weight, grad in zip(weights, grads): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -535,26 +596,49 @@ def _update_impl(self, index, weight, grad, state, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + if aggregate: + if not multi_precision: + if self.momentum > 0: + multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) + if self.momentum > 0: + multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + multi_mp_sgd_update(*_flatten_list(zip(weights, grads, + list(zip(*states))[1])), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - 
mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 + if not isinstance(index, (tuple, list)): + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 + else: + use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -1525,20 +1609,55 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} + self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - # convert ctypes.char_p.value back to python str if needed - if isinstance(index, bytes): - index = py_str(index) - if index not in self.states: - self.states[index] = self.optimizer.create_state_multi_precision(index, weight) - self.states_synced[index] = True - elif not self.states_synced[index]: - self.states[index] = \ - self.sync_state_context(self.states[index], weight.context) - self.states_synced[index] = True - self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) + if not isinstance(index, (list, tuple)): + indices = [index] + grads = [grad] + weights = [weight] + else: + indices = index + grads = grad + weights = weight + for i, idx in enumerate(indices): + # convert ctypes.char_p.value back to python str if needed + if isinstance(idx, bytes): + indices[i] = py_str(idx) + idx = indices[i] + if idx not in self.states: + self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) + self.states_synced[idx] = True + elif not self.states_synced[idx]: + self.states[idx] = \ + self.sync_state_context(self.states[idx], weights[i].context) + self.states_synced[idx] = True + if self.aggregate_updates: + # segregate values based on type + type_map = {} + for i, w, g in zip(indices, weights, grads): + if w.dtype in type_map: + type_map[w.dtype].append((i, w, g)) + else: + type_map[w.dtype] = [(i, w, g)] + for idx in type_map: + current_index = 0 + indices, weights, grads = zip(*type_map[idx]) + while current_index < len(indices): + states = [] + step = min(self.optimizer.aggregate_num, len(indices) - current_index) + for j in range(step): + states.append(self.states[indices[current_index + j]]) + self.optimizer.update_multi_precision( + indices[current_index:current_index + self.optimizer.aggregate_num], + weights[current_index:current_index + self.optimizer.aggregate_num], + grads[current_index:current_index + self.optimizer.aggregate_num], + states) + current_index += self.optimizer.aggregate_num + else: + for i, w, g in zip(indices, weights, grads): + self.optimizer.update_multi_precision(i, w, g, self.states[i]) def 
sync_state_context(self, state, context): """sync state context.""" diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 9251b8614806..223a1aa6c37d 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,6 +82,301 @@ struct SGDParam : public dmlc::Parameter { } }; +struct MultiSGDParam : public dmlc::Parameter { + nnvm::Tuple lrs; + nnvm::Tuple wds; + float rescale_grad; + float clip_gradient; + int num_weights; + DMLC_DECLARE_PARAMETER(MultiSGDParam) { + DMLC_DECLARE_FIELD(lrs) + .describe("Learning rates."); + DMLC_DECLARE_FIELD(wds) + .describe("Weight decay augments the objective function with a " + "regularization term that penalizes large weights. " + "The penalty scales with the square of the magnitude of each weight."); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(num_weights) + .set_default(1) + .describe("Number of updated weights."); + } +}; + +struct MultiSGDMomParam : public dmlc::Parameter { + nnvm::Tuple lrs; + nnvm::Tuple wds; + float momentum; + float rescale_grad; + float clip_gradient; + int num_weights; + DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { + DMLC_DECLARE_FIELD(lrs) + .describe("Learning rates."); + DMLC_DECLARE_FIELD(wds) + .describe("Weight decay augments the objective function with a " + "regularization term that penalizes large weights. " + "The penalty scales with the square of the magnitude of each weight."); + DMLC_DECLARE_FIELD(momentum) + .set_default(0.0f) + .describe("The decay rate of momentum estimates at each epoch."); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(num_weights) + .set_default(1) + .describe("Number of updated weights."); + } +}; + +template +inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = dmlc::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); + CHECK_EQ(out_attrs->size(), param.num_weights); + + bool all_inferred = true; + auto& input_shapes = *in_attrs; + auto& output_shapes = *out_attrs; + // Learning rates + CHECK_EQ(param.lrs.ndim(), param.num_weights) + << "Number of learning rates is inconsistent with num_weights " + << "parameter passed. Expected number of learning rates: " + << param.num_weights << ", and got " << param.lrs.ndim(); + // Weight decays + CHECK_EQ(param.wds.ndim(), param.num_weights) + << "Number of weight decays is inconsistent with num_weights " + << "parameter passed. 
Expected number of weight decays: " + << param.num_weights << ", and got " << param.wds.ndim(); + // Weights and gradients + for (int i = 0; i < param.num_weights; ++i) { + std::vector input_vec; + std::vector output_vec({output_shapes[i]}); + for (int j = 0; j < input_stride; ++j) { + input_vec.push_back(input_shapes[i * input_stride + j]); + } + all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); + } + return all_inferred; +} + +template +inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = dmlc::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); + CHECK_EQ(out_attrs->size(), param.num_weights); + + bool all_inferred = true; + auto& input_types = *in_attrs; + auto& output_types = *out_attrs; + // Weights and gradients + for (int i = 0; i < param.num_weights; ++i) { + std::vector input_vec; + std::vector output_vec({output_types[i]}); + for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { + input_vec.push_back(input_types[i * input_stride + j]); + } + all_inferred = all_inferred && + ElemwiseType(attrs, &input_vec, &output_vec); + } + // master copies of weights + for (int i = 0; i < param.num_weights; ++i) { + for (int j = 0; j < num_fp32_inputs; ++j) { + TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); + } + } + return all_inferred; +} + +template +struct MultiSGDKernelParam { + static const int N = 60; + int count; + size_t max_size; + size_t sizes[N]; + DType * weights[N]; + DType * grads[N]; + MPDType * mom[N]; + MPDType * weights32[N]; + DType * out_data[N]; + MPDType lrs[N]; + MPDType wds[N]; + MPDType clip_gradient; + MPDType rescale_grad; + MPDType momentum; +}; + +template +struct MultiSGDKernel { + template + MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, + const OpReqType req) { + for (int index = 0; index < param.count; ++index) { + if ((size_t)i < param.sizes[index]) { + MPDType w = has_mixed_precision ? param.weights32[index][i] : + MPDType(param.weights[index][i]); + MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); + if (param.clip_gradient >= 0.0f) { + mom = param.momentum*mom + - param.lrs[index]*param.wds[index]*w + - param.lrs[index] + *mshadow_op::clip::Map(param.rescale_grad * + static_cast(param.grads[index][i]), + param.clip_gradient); + } else { + mom = param.momentum*mom + - param.lrs[index]*param.wds[index]*w + - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); + } + if (has_momentum) { + param.mom[index][i] = mom; + } + w = w + mom; + if (has_mixed_precision) { + param.weights32[index][i] = w; + } + KERNEL_ASSIGN(param.out_data[index][i], req, w); + } + } + } +}; + +template +MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { + using namespace mxnet_op; + const ParamType& p = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MultiSGDKernelParam param; + param.clip_gradient = p.clip_gradient; + param.rescale_grad = p.rescale_grad; + param.momentum = 0; + param.count = p.num_weights; + param.max_size = 0; + for (int i = 0; i < param.count; ++i) { + param.sizes[i] = inputs[i * input_stride].shape_.Size(); + if (param.max_size < param.sizes[i]) { + param.max_size = param.sizes[i]; + } + param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; + param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; + // if mixed precision, then the last input in a set + // is 32-bit master copy of the weights + if (!std::is_same::value) { + param.weights32[i] = inputs[i * input_stride + input_stride - 1] + .FlatTo2D(s).dptr_; + } + param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; + param.lrs[i] = p.lrs[i]; + param.wds[i] = p.wds[i]; + } + + return param; +} + + +template +MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { + using namespace mxnet_op; + const MultiSGDMomParam& p = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MultiSGDKernelParam param = + FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); + param.momentum = p.momentum; + for (int i = 0; i < param.count; ++i) { + param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; + } + + return param; +} + +template +class type_identity { + public: + using type = T; +}; + +template +class single_precision { + public: + using type = float; +}; + +template class MPTypeChooser, int input_stride> +inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using MPDType = typename MPTypeChooser::type; + MultiSGDKernelParam param = + FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); + Kernel::value>, + xpu>::Launch(s, param.max_size, param, req[0]); + }); +} + +template class MPTypeChooser, int input_stride> +inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using MPDType = typename MPTypeChooser::type; + MultiSGDKernelParam param = + FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); + Kernel::value>, + xpu>::Launch(s, param.max_size, param, req[0]); + }); +} struct 
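(Before the scalar SGDKernel resumes below, a minimal standalone sketch of the two ideas the multi-SGD code above combines: one index space sized by the largest tensor, applied to every tensor whose length covers the index, and an MPTypeChooser where type_identity<T> keeps the math in T while single_precision<T> promotes it to float. The names and plain-vector interface here are hypothetical, not the MXNet API:)

#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T> struct type_identity { using type = T; };
template <typename T> struct single_precision { using type = float; };

// Illustration of the aggregated update: the outer loop over i plays the
// role of the parallel index that Kernel::Launch spreads across threads.
template <typename DType, template <typename> class MPTypeChooser>
void multi_update(std::vector<std::vector<DType>>* weights,
                  const std::vector<std::vector<DType>>& grads, float lr) {
  using MPDType = typename MPTypeChooser<DType>::type;
  std::size_t max_size = 0;
  for (const auto& w : *weights) max_size = std::max(max_size, w.size());
  for (std::size_t i = 0; i < max_size; ++i) {
    for (std::size_t t = 0; t < weights->size(); ++t) {
      if (i < (*weights)[t].size()) {  // skip tensors shorter than i
        MPDType w = static_cast<MPDType>((*weights)[t][i]);
        w -= static_cast<MPDType>(lr) * static_cast<MPDType>(grads[t][i]);
        (*weights)[t][i] = static_cast<DType>(w);
      }
    }
  }
}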
SGDKernel { template diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index a52a6f32907c..982995ad2f95 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -31,6 +31,8 @@ namespace op { DMLC_REGISTER_PARAMETER(SGDParam); DMLC_REGISTER_PARAMETER(SGDMomParam); +DMLC_REGISTER_PARAMETER(MultiSGDParam); +DMLC_REGISTER_PARAMETER(MultiSGDMomParam); DMLC_REGISTER_PARAMETER(FTMLParam); DMLC_REGISTER_PARAMETER(AdamParam); DMLC_REGISTER_PARAMETER(RMSPropParam); @@ -52,7 +54,7 @@ It updates the weights using:: weight = weight - learning_rate * sign(gradient) -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(2) @@ -81,7 +83,7 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(3) @@ -313,6 +315,193 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } +NNVM_REGISTER_OP(multi_sgd_update) +.describe(R"code(Update function for Stochastic Gradient Descent (SGD) optimizer. + +It updates the weights using:: + + weight = weight - learning_rate * (gradient + wd * weight) + +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 2); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", ElemwiseType<-1, -1>) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FCompute", MultiSGDUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights") +.add_arguments(MultiSGDParam::__FIELDS__()); + +NNVM_REGISTER_OP(multi_sgd_mom_update) +.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer. + +Momentum update has better convergence rates on neural networks. Mathematically it looks +like below: + +.. math:: + + v_1 = \alpha * \nabla J(W_0)\\ + v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ + W_t = W_{t-1} + v_t + +It updates the weights using:: + + v = momentum * v - learning_rate * gradient + weight += v + +Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
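(The four registrations in this file differ mainly in the per-weight input stride: 2 for multi_sgd_update, 3 for multi_sgd_mom_update and multi_mp_sgd_update, and 4 for multi_mp_sgd_mom_update. An illustrative sketch, not part of the patch, of how the flattened input list is laid out and which slots the FMutateInputs lambdas mark for in-place update in the stride-4 case:)

#include <cstdio>

int main() {
  const int num_weights = 3;
  const int stride = 4;  // multi_mp_sgd_mom_update: weight, grad, mom, weight32
  for (int i = 0; i < num_weights; ++i) {
    std::printf("weight_%d -> %d, grad_%d -> %d, mom_%d -> %d (mutated), "
                "weight32_%d -> %d (mutated)\n",
                i, i * stride, i, i * stride + 1,
                i, i * stride + 2, i, i * stride + 3);
  }
  return 0;
}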
+ +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 3); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", ElemwiseType<-1, -1>) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + ret.push_back(std::string("mom_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + std::vector ret; + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + for (int i = 0; i < param.num_weights; ++i) { + ret.push_back(i * 3 + 2); + } + return ret; + }) +.set_attr("FCompute", MultiSGDMomUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum") +.add_arguments(MultiSGDMomParam::__FIELDS__()); + +NNVM_REGISTER_OP(multi_mp_sgd_update) +.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. + +It updates the weights using:: + + weight = weight - learning_rate * (gradient + wd * weight) + +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 3); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", MP_MultiSGD_InferType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + ret.push_back(std::string("weight32_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + std::vector ret; + const MultiSGDParam& param = dmlc::get(attrs.parsed); + for (int i = 0; i < param.num_weights; ++i) { + ret.push_back(i * 3 + 2); + } + return ret; + }) +.set_attr("FCompute", MultiSGDUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights") +.add_arguments(MultiSGDParam::__FIELDS__()); + +NNVM_REGISTER_OP(multi_mp_sgd_mom_update) +.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. + +Momentum update has better convergence rates on neural networks. Mathematically it looks +like below: + +.. math:: + + v_1 = \alpha * \nabla J(W_0)\\ + v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ + W_t = W_{t-1} + v_t + +It updates the weights using:: + + v = momentum * v - learning_rate * gradient + weight += v + +Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
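(For concreteness, a tiny numeric trace of the update rule quoted above, in plain C++ with made-up values and no MXNet dependency:)

#include <cstdio>

int main() {
  float w = 1.0f, v = 0.0f;
  const float lr = 0.1f, momentum = 0.9f;
  const float grads[] = {0.5f, 0.5f};
  for (float g : grads) {
    v = momentum * v - lr * g;  // v: -0.05, then -0.095
    w += v;                     // w: 0.95, then 0.855
  }
  std::printf("w = %f, v = %f\n", w, v);  // w = 0.855000, v = -0.095000
  return 0;
}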
+ +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 4); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", MP_MultiSGD_InferType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + ret.push_back(std::string("mom_") + std::to_string(i)); + ret.push_back(std::string("weight32_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + std::vector ret; + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + for (int i = 0; i < param.num_weights; ++i) { + ret.push_back(i * 4 + 2); + ret.push_back(i * 4 + 3); + } + return ret; + }) +.set_attr("FCompute", MultiSGDMomUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights") +.add_arguments(MultiSGDMomParam::__FIELDS__()); NNVM_REGISTER_OP(sgd_update) MXNET_ADD_SPARSE_OP_ALIAS(sgd_update) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 0fd2ca83fda4..c42cf1831c43 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -242,6 +242,15 @@ NNVM_REGISTER_OP(mp_sgd_update) NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); +NNVM_REGISTER_OP(multi_sgd_update) +.set_attr("FCompute", MultiSGDUpdate); +NNVM_REGISTER_OP(multi_sgd_mom_update) +.set_attr("FCompute", MultiSGDMomUpdate); +NNVM_REGISTER_OP(multi_mp_sgd_update) +.set_attr("FCompute", MultiSGDUpdate); +NNVM_REGISTER_OP(multi_mp_sgd_mom_update) +.set_attr("FCompute", MultiSGDMomUpdate); + NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 985c38c31356..9f190a0a88c2 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,6 +17,7 @@ import mxnet as mx import unittest +import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -98,6 +99,9 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -112,6 +116,7 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -236,10 +241,11 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] + global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - 
check_trainer_sparse_kv(kv, 'default', 'default', None, True) + check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 144fbeef213f..ae38a2297ded 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -174,6 +174,8 @@ def test_module_layout(): @with_seed() def test_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -211,6 +213,7 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) dict_equ(mod._kvstore._updater.states, mod2._updater.states) + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() From a6b0b9ef7d8dd287dc469e5c622031f92311186c Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 15 Feb 2019 16:55:34 -0800 Subject: [PATCH 13/26] add comments --- src/operator/nn/deconvolution-inl.h | 42 ++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 0947b63d5daa..0e436d4d8aa6 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -232,6 +232,19 @@ class DeconvolutionOp { << "Must init CuBLAS handle in stream"; #endif auto in_data_shape = in_data[deconv::kData].shape_; + // G: num of groups + // N: num of batches + // C: num of channels + // IH: input height + // IW: input width + // KH: kernel height + // KW: kernel width + // OH: output height + // OW: output width + // OC: num of output channels + + // 2D case: data (N, C, IH, IW) + // 2D case: out (N, OC, OH, OW) Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor out = TBlobTo4DTensor(out_data[deconv::kOut], s); index_t o_pad[2], o_adj[2]; @@ -252,25 +265,40 @@ class DeconvolutionOp { Shape3(param_.num_group, data.shape_[1] / param_.num_group, param_.num_filter / param_.num_group * kernel_size); + // 2D: wmat (G, C/G, OC/G * KH * KW) Tensor wmat = in_data[deconv::kWeight].get_with_shape(wmat_shape, s); const index_t nbatch = data.size(0); + + // shape_colunit_ : (OC * KH * KW, IH * IW) + shape_colunit_ = mshadow::Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); + // shape_dstunit_ : (G, C/G, IH * IW) + shape_dstunit_ = mshadow::Shape3( + param_.num_group, + data.shape_[1] / param_.num_group, + data.shape_[2] * data.shape_[3] + ); + Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(this->InitTemp(out.shape_, data.shape_)), s); + ctx.requested[deconv::kTempSpace].get_space_typed( + Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); +// Tensor workspace = +// ctx.requested[deconv::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(out.shape_, data.shape_)), s); for (index_t i = 0; i < nbatch; ++i) { - // temp_col: (N * kernel_size, OW * OH) + // temp_col: (OC * KH * KW, IH * IW) Tensor temp_col = Tensor( workspace.dptr_, - Shape2(shape_colunit_[0], shape_colunit_[1]), - s); - // temp_dst: (N, N/n_grup, OW * OH) + // temp_dst : (G, C/G, IH * IW) Shape2(shape_colunit_[0], + shape_colunit_[1]), + s); + // temp_dst : (G, C/G, IH * IW) Tensor temp_dst = 
Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], shape_dstunit_[1], shape_dstunit_[2]), - s); + s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( From 3240833727ed96c46999a9c67d037d90839f75b3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 15 Feb 2019 17:04:52 -0800 Subject: [PATCH 14/26] fix lint --- src/operator/nn/deconvolution-inl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 0e436d4d8aa6..c87345a73601 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -276,8 +276,7 @@ class DeconvolutionOp { shape_dstunit_ = mshadow::Shape3( param_.num_group, data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3] - ); + data.shape_[2] * data.shape_[3]); Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( From 88892d2debff9ca0ef957b0051a249fe8862106d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 15 Mar 2019 14:52:33 -0700 Subject: [PATCH 15/26] fix lint error --- src/operator/nn/deconvolution-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index e18fb0afe57c..b791793e9f40 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -375,7 +375,7 @@ class DeconvolutionOp { TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - + Shape<3> wmat_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, From 0675b3b36cc9e0a4ab7221863a736c48fbd160e8 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 29 Apr 2019 13:51:30 -0700 Subject: [PATCH 16/26] fix a bug in calling im2col (col_shape should be 3) --- src/operator/nn/deconvolution-inl.h | 75 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5d860dd60e3d..198989a30495 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -34,6 +34,7 @@ #include #include #include +#include #include "../operator_common.h" #include "../linalg.h" #include "./im2col.h" @@ -263,69 +264,71 @@ class DeconvolutionOp { auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - Shape<3> wmat_shape = + Shape<3> weight_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, param_.num_filter / param_.num_group * kernel_size); - // 2D: wmat (G, C/G, OC/G * KH * KW) - Tensor wmat = - in_data[deconv::kWeight].get_with_shape(wmat_shape, s); + // 2D case: weight_3d (G, C/G, OC/G * KH * KW) + Tensor weight_3d = + in_data[deconv::kWeight].get_with_shape(weight_shape, s); const index_t nbatch = data.size(0); // shape_colunit_ : (OC * KH * KW, IH * IW) - shape_colunit_ = mshadow::Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); + shape_colunit_ = Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); // shape_dstunit_ : (G, C/G, IH * IW) - shape_dstunit_ = mshadow::Shape3( - param_.num_group, - data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3]); + shape_dstunit_ = Shape3( + param_.num_group, + data.shape_[1] / param_.num_group, + data.shape_[2] * data.shape_[3]); + Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); -// Tensor workspace = -// ctx.requested[deconv::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(out.shape_, data.shape_)), s); + ctx.requested[deconv::kTempSpace].get_space_typed( + Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); + + Tensor col_buffer_3d = Tensor( + workspace.dptr_, + Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), + s); + // temp_col: (N, OC * KH * KW, IH * IW) + // Tensor temp_col = Tensor( + // workspace.dptr_, + // Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), + // s); + for (index_t i = 0; i < nbatch; ++i) { - // temp_col: (OC * KH * KW, IH * IW) - Tensor temp_col = Tensor( - workspace.dptr_, - Shape2(shape_colunit_[0], - shape_colunit_[1]), - s); // temp_dst : (G, C/G, IH * IW) Tensor temp_dst = Tensor( - workspace.dptr_ + temp_col.shape_.Size(), - Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2]), - s); + workspace.dptr_ + shape_colunit_.Size(), + shape_dstunit_, + s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( s, (out.Slice(i, i + 1)).dptr_, out.shape_, - temp_col.shape_, + col_buffer_3d.shape_, kernel, padding, stride, dilate, - temp_col.dptr_); + col_buffer_3d.dptr_); - const index_t gstride = temp_col.size(0) / param_.num_group; + + const index_t gstride = col_buffer_3d.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); + //Tensor tmpc = col_buffer_3d.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // tmpc = dot(wmat[gid].T(), temp_dst[gid]); - linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); + // tmpc = dot(weight_3d[gid].T(), temp_dst[gid]); + linalg_gemm(weight_3d[gid], temp_dst[gid], col_buffer_3d[gid], true, false, s); } col2im( s, - temp_col.dptr_, + col_buffer_3d.dptr_, out.Slice(i, i + 1).shape_, - temp_col.shape_, + col_buffer_3d.shape_, kernel, padding, stride, @@ -378,14 +381,14 @@ class DeconvolutionOp { auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - Shape<3> wmat_shape = + Shape<3> weight_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, param_.num_filter / param_.num_group * kernel_size); Tensor wmat = - in_data[deconv::kWeight].get_with_shape(wmat_shape, s); + in_data[deconv::kWeight].get_with_shape(weight_shape, s); Tensor gwmat = - in_grad[deconv::kWeight].get_with_shape(wmat_shape, s); + in_grad[deconv::kWeight].get_with_shape(weight_shape, s); const index_t nbatch = data.size(0); Tensor workspace = From f403b9c66a8765fa38f8c39cc1cd449d63797ed7 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 3 May 2019 13:58:53 -0700 Subject: [PATCH 17/26] fix im2col parameter mismatch --- src/operator/nn/deconvolution-inl.h | 78 +++++++++++++---------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 198989a30495..7f5083e8c427 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -246,8 +246,8 @@ class DeconvolutionOp { // OW: output height // OC: num of output channels - // 2D case: data (N, C, IH, IW) - // 2D case: out (N, OC, OH, OW) + // data: (N, C, IH, IW) + // out: (N, OC, OH, OW) Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor out = TBlobTo4DTensor(out_data[deconv::kOut], s); index_t o_pad[2], o_adj[2]; @@ -259,71 +259,61 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? - TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); + auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); - auto kernel_size = kernel.Size(); - Shape<3> weight_shape = - Shape3(param_.num_group, - data.shape_[1] / param_.num_group, - param_.num_filter / param_.num_group * kernel_size); - // 2D case: weight_3d (G, C/G, OC/G * KH * KW) - Tensor weight_3d = - in_data[deconv::kWeight].get_with_shape(weight_shape, s); + // C/G * KW * KH + auto kernel_size = data.shape_[1] / param_.num_group * kernel.Size(); + + // OC/G + auto channel_group = out.shape_[1] / param_.num_group; + + // IH*IW + auto data_spatial_size = data.shape_.ProdShape(2, in_data[deconv::kData].ndim()); + + // OH*OW + auto out_spatial_size = out.shape_.ProdShape(2, out_data[deconv::kOut].ndim()); + + // weight_3d: (G, OC/G, KH * KW) + Shape<3> weight_shape = Shape3(param_.num_group, channel_group, kernel_size); + Tensor weight_3d = in_data[deconv::kWeight].get_with_shape(weight_shape, s); + const index_t nbatch = data.size(0); - // shape_colunit_ : (OC * KH * KW, IH * IW) - shape_colunit_ = Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); + auto col_buffer_size = param_.num_group * kernel_size * data_spatial_size; + // shape_dstunit_ : (G, C/G, IH * IW) shape_dstunit_ = Shape3( param_.num_group, data.shape_[1] / param_.num_group, data.shape_[2] * data.shape_[3]); - Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); + Shape1(col_buffer_size + data.shape_.Size()), s); + // col_buffer_3d : (G, KH * KW, IH * IW) Tensor col_buffer_3d = Tensor( - workspace.dptr_, - Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), - s); - // temp_col: (N, OC * KH * KW, IH * IW) - // Tensor temp_col = Tensor( - // workspace.dptr_, - // Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), - // s); + workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); for (index_t i = 0; i < nbatch; ++i) { - // temp_dst : (G, C/G, IH * IW) - Tensor temp_dst = Tensor( - workspace.dptr_ + shape_colunit_.Size(), - shape_dstunit_, - s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); + Tensor data_3d = Tensor( + workspace.dptr_ + col_buffer_size, + Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); - im2col( - s, - (out.Slice(i, i + 1)).dptr_, - out.shape_, - col_buffer_3d.shape_, - kernel, - padding, - stride, - dilate, - col_buffer_3d.dptr_); + // data_3d : (G, C/G, IH * IW) + data_3d = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), data_3d.shape_); + // im2col(s, (out.Slice(i, i + 1)).dptr_, out.shape_, col_buffer_3d.shape_, + // kernel, padding, stride, dilate, col_buffer_3d.dptr_); - const index_t gstride = col_buffer_3d.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - //Tensor tmpc = col_buffer_3d.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // tmpc = dot(weight_3d[gid].T(), temp_dst[gid]); - linalg_gemm(weight_3d[gid], temp_dst[gid], col_buffer_3d[gid], true, false, s); + // tmpc = dot(weight_3d[gid].T(), data_3d[gid]); + linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); } + col2im( s, col_buffer_3d.dptr_, From bdbf81d659469f951e6faea03758f5ea6e735136 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 6 May 2019 15:30:19 -0700 Subject: [PATCH 18/26] add debug --- src/operator/nn/deconvolution-inl.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git 
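(To summarize the forward path that patches 16 and 17 converge on, here is a minimal 1-D sketch with hypothetical shapes, in plain C++ rather than the MXNet API: the gemm(weight^T, data) step fills a column buffer, and col2im scatter-adds the overlapping columns into the output. MXNet's im2col.h additionally handles padding and dilation:)

#include <cstdio>
#include <vector>

int main() {
  const int IW = 3, K = 2, stride = 1;
  const int OW = (IW - 1) * stride + K;    // deconv output width, no padding
  std::vector<double> data = {1, 2, 3};    // one input channel
  std::vector<double> W = {0.5, 0.25};     // one 1x2 kernel
  std::vector<std::vector<double>> col(K, std::vector<double>(IW, 0.0));
  for (int k = 0; k < K; ++k)              // col = gemm(W^T, data)
    for (int x = 0; x < IW; ++x)
      col[k][x] = W[k] * data[x];
  std::vector<double> out(OW, 0.0);
  for (int k = 0; k < K; ++k)              // col2im: overlap-add the columns
    for (int x = 0; x < IW; ++x)
      out[x * stride + k] += col[k][x];
  for (double v : out) std::printf("%g ", v);  // 0.5 1.25 2 0.75
  std::printf("\n");
  return 0;
}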
a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 7f5083e8c427..5204b6e10a60 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -284,22 +284,22 @@ class DeconvolutionOp { // shape_dstunit_ : (G, C/G, IH * IW) shape_dstunit_ = Shape3( - param_.num_group, - data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3]); + param_.num_group, + data.shape_[1] / param_.num_group, + data.shape_[2] * data.shape_[3]); - Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(col_buffer_size + data.shape_.Size()), s); + Tensor workspace = ctx.requested[deconv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size + data.shape_.Size()), s); // col_buffer_3d : (G, KH * KW, IH * IW) Tensor col_buffer_3d = Tensor( - workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); + workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); for (index_t i = 0; i < nbatch; ++i) { + // Tensor data_3d = data[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size, - Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); + workspace.dptr_ + col_buffer_size, + Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); // data_3d : (G, C/G, IH * IW) data_3d = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), data_3d.shape_); @@ -309,10 +309,17 @@ class DeconvolutionOp { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { // Legacy approach shown here for comparison: - // tmpc = dot(weight_3d[gid].T(), data_3d[gid]); + // col_buffer_3d[gid] = dot(weight_3d[gid].T(), data_3d[gid]); linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); } + std::cout << "col buffer: " << std::endl; + for (auto j = 0; j < kernel_size; ++j) { + for (auto k = 0; k < data_spatial_size; ++k) { + std::cout << *(static_cast(col_buffer_3d[0].dptr_ + j * kernel_size + k)) << " "; + } + std::cout << std::endl; + } col2im( s, From 2c929805c8fe84b45f0ed32c28c53f836575ea08 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 10:19:28 -0700 Subject: [PATCH 19/26] set col_buffer_shape --- src/operator/nn/deconvolution-inl.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5204b6e10a60..4a5e8ff37382 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -262,11 +262,14 @@ class DeconvolutionOp { auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
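(A note on the debug loop added in patch 18 above: col_buffer_3d[0] is a (kernel_size x data_spatial_size) slab, so its row pitch is data_spatial_size; indexing with j * kernel_size + k walks the wrong pitch, and on GPU the host-side dereference is invalid until patch 20 copies the buffer out. A corrected form of the dump, illustrative only since the loop is deleted again in patch 25:)

for (index_t j = 0; j < kernel_size; ++j) {
  for (index_t k = 0; k < data_spatial_size; ++k) {
    // advance by the row pitch data_spatial_size, not kernel_size
    std::cout << *(col_buffer_3d[0].dptr_ + j * data_spatial_size + k) << " ";
  }
  std::cout << std::endl;
}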
param_.kernel : TShape({1, param_.kernel[0]}); + auto conv_in_channels = data.shape_[1]; + auto conv_out_channels = out.shape_[1]; + // C/G * KW * KH auto kernel_size = data.shape_[1] / param_.num_group * kernel.Size(); // OC/G - auto channel_group = out.shape_[1] / param_.num_group; + auto channel_group = conv_out_channels / param_.num_group; // IH*IW auto data_spatial_size = data.shape_.ProdShape(2, in_data[deconv::kData].ndim()); @@ -281,6 +284,11 @@ class DeconvolutionOp { const index_t nbatch = data.size(0); auto col_buffer_size = param_.num_group * kernel_size * data_spatial_size; + mxnet::TShape col_buffer_shape(3, 1); + col_buffer_shape[0] = conv_in_channels * kernel.Size(); + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = data.shape_[i+1]; + } // shape_dstunit_ : (G, C/G, IH * IW) shape_dstunit_ = Shape3( @@ -291,9 +299,11 @@ class DeconvolutionOp { Tensor workspace = ctx.requested[deconv::kTempSpace] .get_space_typed(Shape1(col_buffer_size + data.shape_.Size()), s); + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + // col_buffer_3d : (G, KH * KW, IH * IW) - Tensor col_buffer_3d = Tensor( - workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(param_.num_group, kernel_size, data_spatial_size), s); for (index_t i = 0; i < nbatch; ++i) { // Tensor data_3d = data[i]; @@ -313,6 +323,7 @@ class DeconvolutionOp { linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); } + std::cout << "col buffer: " << std::endl; for (auto j = 0; j < kernel_size; ++j) { for (auto k = 0; k < data_spatial_size; ++k) { From 0c44ec86dd56b1dd0fadf7e62fbfe30e11238146 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 11:52:00 -0700 Subject: [PATCH 20/26] dump data from gpu to cpu to debug --- src/operator/nn/deconvolution-inl.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 4a5e8ff37382..9d6e69489c10 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -325,6 +325,15 @@ class DeconvolutionOp { std::cout << "col buffer: " << std::endl; + DType *tmp_data = new DType[col_buffer_size]; + if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { + std::cout << "running on GPU " << std::endl; + NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); + col_data.SyncCopyToCPU(tmp_data, col_buffer_size); + } else { + tmp_data = static_cast(col_buffer_3d[0].dptr_); + } + for (auto j = 0; j < kernel_size; ++j) { for (auto k = 0; k < data_spatial_size; ++k) { std::cout << *(static_cast(col_buffer_3d[0].dptr_ + j * kernel_size + k)) << " "; From 2c868fd93b6facf270b373c95c6a0825ad43599a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 12:00:13 -0700 Subject: [PATCH 21/26] debug --- src/operator/nn/deconvolution-inl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 9d6e69489c10..e545e1012206 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -330,6 +330,7 @@ class DeconvolutionOp { std::cout << "running on GPU " << std::endl; NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); col_data.SyncCopyToCPU(tmp_data, col_buffer_size); + std::cout << "complete " << std::endl; } else { tmp_data = static_cast(col_buffer_3d[0].dptr_); } From 
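(A worked example of the buffer arithmetic patch 19 sets up, with assumed dimensions; this is illustrative arithmetic only, not MXNet code:)

#include <cstdio>

int main() {
  const int G = 2, C = 8, KH = 3, KW = 3, IH = 16, IW = 16;
  const int kernel_size = C / G * KH * KW;      // C/G * KH * KW = 36
  const int data_spatial_size = IH * IW;        // 256
  const int col_buffer_size = G * kernel_size * data_spatial_size;  // 18432
  // col_buffer_shape[0] = conv_in_channels * kernel.Size()
  //                     = C * KH * KW = G * kernel_size
  std::printf("col_buffer_3d: (%d, %d, %d), %d elements\n",
              G, kernel_size, data_spatial_size, col_buffer_size);
  return 0;
}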
5dacddc88425476318e438c7f7d1f1fc5f825b91 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 12:06:00 -0700 Subject: [PATCH 22/26] debug --- src/operator/nn/deconvolution-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index e545e1012206..101da3aeddfd 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -337,7 +337,7 @@ class DeconvolutionOp { for (auto j = 0; j < kernel_size; ++j) { for (auto k = 0; k < data_spatial_size; ++k) { - std::cout << *(static_cast(col_buffer_3d[0].dptr_ + j * kernel_size + k)) << " "; + std::cout << *(tmp_data + j * kernel_size + k) << " "; } std::cout << std::endl; } From 29c4488aebb1005ae7603a2de966869751e2c2a7 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 13 May 2019 10:24:58 -0700 Subject: [PATCH 23/26] update function call to col2im --- src/operator/nn/deconvolution-inl.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 101da3aeddfd..fd91743d9a17 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -342,17 +342,11 @@ class DeconvolutionOp { std::cout << std::endl; } - col2im( - s, - col_buffer_3d.dptr_, - out.Slice(i, i + 1).shape_, - col_buffer_3d.shape_, - kernel, - padding, - stride, - dilate, - out.Slice(i, i + 1).dptr_, - req[deconv::kOut]); + auto input_dim_ = in_data_shape.ProdShape(1, in_data_shape.ndim()); + + col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, + kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); + } if (!param_.no_bias) { From 5f3c8813a53abab9df6db5477a3da18b6fff7292 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 13 May 2019 15:50:13 -0700 Subject: [PATCH 24/26] fix backward pass --- src/operator/nn/deconvolution-inl.h | 263 ++++++++++++++-------------- 1 file changed, 127 insertions(+), 136 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index fd91743d9a17..df797db6cef1 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -229,10 +229,11 @@ class DeconvolutionOp { size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[deconv::kData].shape_, out_data[deconv::kOut].shape_); Stream *s = ctx.get_stream(); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + << "Must init cuBLAS handle in stream"; #endif auto in_data_shape = in_data[deconv::kData].shape_; // G: num of groups // N: num of batches // C: num of channels // IH: input height // IW: input width // KH: kernel height // KW: kernel width // OH: output height // OW: output width // OC: num of output channels - // data: (N, C, IH, IW) - // out: (N, OC, OH, OW) - Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); - Tensor out = TBlobTo4DTensor(out_data[deconv::kOut], s); + // input_4d: (N, C, IH, IW) + // output_4d: (N, OC, OH, OW) + Tensor input_4d = TBlobTo4DTensor(in_data[deconv::kData], s); + Tensor output_4d = TBlobTo4DTensor(out_data[deconv::kOut], s); index_t o_pad[2], o_adj[2]; if (param_.kernel.ndim() == 2) { param_.InferPad(mxnet::TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); @@ -262,97 +263,71 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); auto padding = param_.kernel.ndim() == 2 ? 
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); - auto conv_in_channels = data.shape_[1]; - auto conv_out_channels = out.shape_[1]; - - // C/G * KW * KH - auto kernel_size = data.shape_[1] / param_.num_group * kernel.Size(); - - // OC/G - auto channel_group = conv_out_channels / param_.num_group; - - // IH*IW - auto data_spatial_size = data.shape_.ProdShape(2, in_data[deconv::kData].ndim()); - - // OH*OW - auto out_spatial_size = out.shape_.ProdShape(2, out_data[deconv::kOut].ndim()); - // weight_3d: (G, OC/G, KH * KW) - Shape<3> weight_shape = Shape3(param_.num_group, channel_group, kernel_size); - Tensor weight_3d = in_data[deconv::kWeight].get_with_shape(weight_shape, s); + Tensor weight_3d = in_data[deconv::kWeight].get_with_shape( + Shape3(param_.num_group, conv_out_channels_ / group_, kernel_dim_), s); - const index_t nbatch = data.size(0); - auto col_buffer_size = param_.num_group * kernel_size * data_spatial_size; - mxnet::TShape col_buffer_shape(3, 1); - col_buffer_shape[0] = conv_in_channels * kernel.Size(); + Tensor workspace = ctx.requested[deconv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_ + in_data[deconv::kData].shape_.Size()), s); + + mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1); + col_buffer_shape[0] = conv_in_channels_ * kernel.Size(); for (int i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = data.shape_[i+1]; + col_buffer_shape[i] = in_data[deconv::kData].shape_[i + 1]; } - // shape_dstunit_ : (G, C/G, IH * IW) - shape_dstunit_ = Shape3( - param_.num_group, - data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3]); - - Tensor workspace = ctx.requested[deconv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size + data.shape_.Size()), s); - + // create a colum buffer to hold the matrix product between weight_3d(T) and input_data TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); // col_buffer_3d : (G, KH * KW, IH * IW) Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(param_.num_group, kernel_size, data_spatial_size), s); + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); - for (index_t i = 0; i < nbatch; ++i) { - // Tensor data_3d = data[i]; + for (index_t i = 0; i < num_; ++i) { + // Tensor data_3d = input_4d[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size, - Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); + workspace.dptr_ + col_buffer_size_, + Shape3(param_.num_group, input_4d.shape_[1] / param_.num_group, conv_in_spatial_dim_), s); // data_3d : (G, C/G, IH * IW) - data_3d = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), data_3d.shape_); - - // im2col(s, (out.Slice(i, i + 1)).dptr_, out.shape_, col_buffer_3d.shape_, - // kernel, padding, stride, dilate, col_buffer_3d.dptr_); + data_3d = reshape(swapaxis<1, 0>(input_4d.Slice(i, i + 1)), data_3d.shape_); - for (uint32_t gid = 0; gid < param_.num_group; ++gid) { + for (int g = 0; g < group_; ++g) { // Legacy approach shown here for comparison: - // col_buffer_3d[gid] = dot(weight_3d[gid].T(), data_3d[gid]); - linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); + // col_buffer_3d[g] = dot(weight_3d[g].T(), data_3d[g]); + linalg_gemm(weight_3d[g], data_3d[g], col_buffer_3d[g], true, false, s); } + // TODO: (lnyuan) remove debugging code std::cout << "col buffer: " << std::endl; - DType *tmp_data = new DType[col_buffer_size]; + DType *tmp_data = new 
DType[col_buffer_size_]; if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { std::cout << "running on GPU " << std::endl; NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); - col_data.SyncCopyToCPU(tmp_data, col_buffer_size); + col_data.SyncCopyToCPU(tmp_data, col_buffer_size_); std::cout << "complete " << std::endl; } else { tmp_data = static_cast(col_buffer_3d[0].dptr_); } - for (auto j = 0; j < kernel_size; ++j) { - for (auto k = 0; k < data_spatial_size; ++k) { - std::cout << *(tmp_data + j * kernel_size + k) << " "; + for (auto j = 0; j < kernel_dim_; ++j) { + for (auto k = 0; k < conv_in_spatial_dim_; ++k) { + std::cout << *(tmp_data + j * kernel_dim_ + k) << " "; } std::cout << std::endl; } - auto input_dim_ = in_data_shape.ProdShape(1, in_data_shape.ndim()); - col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); } - if (!param_.no_bias) { + if (bias_term_) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += mshadow::expr::broadcast<1>(bias, out.shape_); + output_4d += mshadow::expr::broadcast<1>(bias, output_4d.shape_); } } @@ -363,21 +338,22 @@ class DeconvolutionOp { const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK_EQ(in_data.size(), expected); CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); - // get data + + LayerSetUp(out_grad[deconv::kOut].shape_, in_grad[deconv::kData].shape_); Stream *s = ctx.get_stream(); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + << "Must init cuBLAS handle in stream"; #endif + auto in_data_shape = in_data[deconv::kData].shape_; - Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); + Tensor data_4d = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); Tensor gdata = TBlobTo4DTensor(in_grad[deconv::kData], s); index_t o_pad[2], o_adj[2]; @@ -388,74 +364,58 @@ class DeconvolutionOp { } auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? - TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); + auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - Shape<3> weight_shape = - Shape3(param_.num_group, - data.shape_[1] / param_.num_group, - param_.num_filter / param_.num_group * kernel_size); - Tensor wmat = - in_data[deconv::kWeight].get_with_shape(weight_shape, s); - Tensor gwmat = - in_grad[deconv::kWeight].get_with_shape(weight_shape, s); - - const index_t nbatch = data.size(0); - Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(this->InitTemp(grad.shape_, data.shape_)), s); - for (index_t i = 0; i < nbatch; ++i) { - Tensor temp_col = Tensor( - workspace.dptr_, - Shape2(shape_colunit_[0], shape_colunit_[1]), - s); - Tensor temp_dst = Tensor( - workspace.dptr_ + temp_col.shape_.Size(), - Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2]), - s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); - - im2col( - s, - (grad.Slice(i, i + 1)).dptr_, - grad.shape_, - temp_col.shape_, - kernel, - padding, - stride, - dilate, - temp_col.dptr_); - - const index_t gstride = temp_col.size(0) / param_.num_group; - for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); - if (i == 0) { - Tensor tmp_gwmat = gwmat[gid]; - // Legacy approach shown here for comparison: - // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); - linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); - } else { - // Legacy approach shown here for comparison: - // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); - linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); - } + Tensor weight_3d = in_data[deconv::kWeight] + .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + Tensor dweight_3d = in_grad[deconv::kWeight] + .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + + Tensor workspace = ctx.requested[deconv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_ + data_4d.shape_.Size()), s); + // calculate shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1, 1); + col_buffer_shape[0] = conv_out_channels_ * kernel_size; + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[deconv::kOut].shape_[i+1]; + } + // create a column buffer to store ograd + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); + + for (index_t i = 0; i < num_; ++i) { + // Tensor data_3d = input_4d[i]; + Tensor data_3d = Tensor( + workspace.dptr_ + col_buffer_size_, + Shape3(group_, data_4d.shape_[1] / group_, conv_in_spatial_dim_), s); + + // data_3d : (G, C/G, IH * IW) + data_3d = reshape(swapaxis<1, 0>(data_4d.Slice(i, i + 1)), data_3d.shape_); + + // convert output gradient array to column buffer + im2col(s, out_grad[deconv::kOut].dptr() + i * output_dim_, out_grad[deconv::kOut].shape_, + col_buffer.shape_, kernel, padding, stride, dilate, col_buffer.dptr()); + + for (int g = 0; g < group_; ++g) { + auto request = (i == 0) ? 
req[deconv::kWeight] : kAddTo; + // Legacy approach shown here for comparison: + // dweight_3d[gid] += dot(temp_dst[gid], tmpc.T()); + linalg_gemm(data_3d[g], col_buffer_3d[g], dweight_3d[g], false, true, s, request); } if (req[deconv::kData] == kWriteTo || req[deconv::kData] == kWriteInplace || req[deconv::kData] == kAddTo) { - for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); + for (int g = 0; g < group_; ++g) { // Legacy approach shown here for comparison: - // temp_dst[gid] = dot(wmat[gid], tmpc); - linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); + // temp_dst[gid] = dot(weight_3d[gid], tmpc); + linalg_gemm(weight_3d[g], col_buffer_3d[g], data_3d[g], false, false, s); } Assign(gdata.Slice(i, i + 1), req[deconv::kData], - (swapaxis<1, 0>(reshape(temp_dst, + (swapaxis<1, 0>(reshape(data_3d, Shape4(gdata.shape_[1], 1, gdata.size(2), @@ -469,22 +429,6 @@ class DeconvolutionOp { } private: - inline index_t InitTemp(const mshadow::Shape<4> &ishape, - const mshadow::Shape<4> &oshape) { - const int ksize = param_.kernel.Size(); - shape_colunit_ = mshadow::Shape2(ishape[1] * ksize, - oshape[2] * oshape[3]); - shape_dstunit_ = mshadow::Shape3(param_.num_group, - oshape[1] / param_.num_group, - oshape[2] * oshape[3]); - mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1]); - mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2]); - index_t required_size = scol.Size() + sdst.Size(); - return required_size; - } inline Tensor TBlobTo4DTensor(const TBlob &tb, Stream *s) { using namespace mshadow; @@ -495,9 +439,56 @@ class DeconvolutionOp { Shape4(tb.shape_[0], tb.shape_[1], 1, tb.shape_[2]), s); } + void LayerSetUp(const mxnet::TShape& ishape, const mxnet::TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const int num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + conv_in_spatial_dim_ = ishape.ProdShape(2, ishape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_in_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + +private: DeconvolutionParam param_; - mshadow::Shape<2> shape_colunit_; - mshadow::Shape<3> shape_dstunit_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images 
per channel + index_t conv_in_spatial_dim_; // number of pixels of input images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + index_t col_buffer_size_; + index_t input_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? }; // class DeconvolutionOp template From db3aaef5c8e175e24d206878abf744f5608fa13c Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 13 May 2019 16:16:21 -0700 Subject: [PATCH 25/26] comment out debug message --- src/operator/nn/deconvolution-inl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index df797db6cef1..48026028be4b 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -301,6 +301,7 @@ class DeconvolutionOp { // TODO: (lnyuan) remove debugging code + /* std::cout << "col buffer: " << std::endl; DType *tmp_data = new DType[col_buffer_size_]; if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { @@ -318,7 +319,7 @@ class DeconvolutionOp { } std::cout << std::endl; } - + */ col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); From 424f36d358994e9d9c2db483eebdbd085409e76a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 15 May 2019 20:47:07 -0700 Subject: [PATCH 26/26] fix bug in backward --- src/operator/nn/deconvolution-inl.h | 117 +++++++++++++++------------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 48026028be4b..8724d4e5a366 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -261,18 +261,16 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); - auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); - // weight_3d: (G, OC/G, KH * KW) + // weight_3d: (G, C/G, OC/G * KH * KW) Tensor weight_3d = in_data[deconv::kWeight].get_with_shape( - Shape3(param_.num_group, conv_out_channels_ / group_, kernel_dim_), s); - + Shape3(group_, conv_in_channels_ / group_, kernel_dim_), s); Tensor workspace = ctx.requested[deconv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_ + in_data[deconv::kData].shape_.Size()), s); + .get_space_typed(Shape1(col_buffer_size_ + in_data[deconv::kData].shape_.Size()), s); mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1); - col_buffer_shape[0] = conv_in_channels_ * kernel.Size(); + col_buffer_shape[0] = conv_out_channels_ * param_.kernel.Size(); for (int i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = in_data[deconv::kData].shape_[i + 1]; } @@ -280,19 +278,37 @@ class DeconvolutionOp { // create a colum buffer to hold the matrix product between weight_3d(T) and input_data TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); - // col_buffer_3d : (G, KH * KW, IH * IW) + // col_buffer_3d : (G, OC/G * KH * KW, IH * IW) Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); for (index_t i = 0; i < num_; ++i) { // Tensor data_3d = input_4d[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size_, - Shape3(param_.num_group, input_4d.shape_[1] / param_.num_group, conv_in_spatial_dim_), s); + workspace.dptr_ + col_buffer_size_, + Shape3(group_, input_4d.shape_[1] / group_, conv_in_spatial_dim_), s); - // data_3d : (G, C/G, IH * IW) + // data_3d : (G, IC/G, IH * IW) data_3d = reshape(swapaxis<1, 0>(input_4d.Slice(i, i + 1)), data_3d.shape_); - + /* + std::cout << "data_3d: " << std::endl; + DType *tmp_data = new DType[data_3d.shape_.Size()]; + if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { + std::cout << "running on GPU " << std::endl; + NDArray data(data_3d, ctx.run_ctx.get_ctx().dev_id); + data.SyncCopyToCPU(tmp_data, data_3d.shape_.Size()); + std::cout << "complete " << std::endl; + } else { + tmp_data = static_cast(data_3d[0].dptr_); + } + + for (auto j = 0; j < data_3d.shape_[1]; ++j) { + for (auto k = 0; k < data_3d.shape_[2]; ++k) { + std::cout << *(tmp_data + j * data_3d.shape_[2] + k) << " "; + } + std::cout << std::endl; + } + */ for (int g = 0; g < group_; ++g) { // Legacy approach shown here for comparison: // col_buffer_3d[g] = dot(weight_3d[g].T(), data_3d[g]); @@ -302,27 +318,27 @@ class DeconvolutionOp { // TODO: (lnyuan) remove debugging code /* - std::cout << "col buffer: " << std::endl; - DType *tmp_data = new DType[col_buffer_size_]; - if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { - std::cout << "running on GPU " << std::endl; - NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); - col_data.SyncCopyToCPU(tmp_data, col_buffer_size_); - std::cout << "complete " << std::endl; - } else { - tmp_data = static_cast(col_buffer_3d[0].dptr_); - } - - for (auto j = 0; j < kernel_dim_; ++j) { - for (auto k = 0; k < conv_in_spatial_dim_; ++k) { - std::cout << *(tmp_data + j * kernel_dim_ + k) << " "; - } - std::cout << std::endl; - } + std::cout << "col buffer: " << std::endl; + DType *tmp_col = new DType[col_buffer_size_]; + if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { + std::cout << "running on GPU " << std::endl; + NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); + 
col_data.SyncCopyToCPU(tmp_col, col_buffer_size_); + std::cout << "complete " << std::endl; + } else { + tmp_col = static_cast(col_buffer_3d[0].dptr_); + } + + for (auto j = 0; j < col_buffer_3d.shape_[1]; ++j) { + for (auto k = 0; k < col_buffer_3d.shape_[2]; ++k) { + std::cout << *(tmp_col + j * col_buffer_3d.shape_[2] + k) << " "; + } + std::cout << std::endl; + } */ col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, - kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); - + param_.kernel, padding, stride, dilate, + out_data[deconv::kOut].dptr() + i * output_dim_, req[deconv::kOut]); } if (bias_term_) { @@ -346,7 +362,7 @@ class DeconvolutionOp { CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); - LayerSetUp(out_grad[deconv::kOut].shape_, in_grad[deconv::kData].shape_); + LayerSetUp(in_grad[deconv::kData].shape_, out_grad[deconv::kOut].shape_); Stream *s = ctx.get_stream(); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) @@ -366,39 +382,44 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); - auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); - auto kernel_size = kernel.Size(); + // weight_3d: (G, C/G, OC * KH * KW) Tensor weight_3d = in_data[deconv::kWeight] - .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + .get_with_shape(Shape3(group_, conv_in_channels_ / group_, kernel_dim_), s); + + // dweight_3d: (G, C/G, OC * KH * KW) Tensor dweight_3d = in_grad[deconv::kWeight] - .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + .get_with_shape(Shape3(group_, conv_in_channels_ / group_, kernel_dim_), s); Tensor workspace = ctx.requested[deconv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_ + data_4d.shape_.Size()), s); + .get_space_typed(Shape1(col_buffer_size_ + data_4d.shape_.Size()), s); + // calculate shape of col_buffer TShape col_buffer_shape(num_spatial_axes_ + 1, 1); - col_buffer_shape[0] = conv_out_channels_ * kernel_size; + col_buffer_shape[0] = conv_out_channels_ * param_.kernel.Size(); for (int i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = out_grad[deconv::kOut].shape_[i+1]; + col_buffer_shape[i] = in_data[deconv::kData].shape_[i+1]; } + // create a column buffer to store ograd TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // col_buffer_3d: (G, OC/G * KH * KW, IH * IW) Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); for (index_t i = 0; i < num_; ++i) { // Tensor data_3d = input_4d[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size_, - Shape3(group_, data_4d.shape_[1] / group_, conv_in_spatial_dim_), s); + workspace.dptr_ + col_buffer_size_, + Shape3(group_, data_4d.shape_[1] / group_, conv_in_spatial_dim_), s); // data_3d : (G, C/G, IH * IW) data_3d = reshape(swapaxis<1, 0>(data_4d.Slice(i, i + 1)), data_3d.shape_); // convert output gradient array to column buffer im2col(s, out_grad[deconv::kOut].dptr() + i * output_dim_, out_grad[deconv::kOut].shape_, - col_buffer.shape_, kernel, padding, 
stride, dilate, col_buffer.dptr()); + col_buffer.shape_, param_.kernel, padding, stride, dilate, col_buffer.dptr()); for (int g = 0; g < group_; ++g) { auto request = (i == 0) ? req[deconv::kWeight] : kAddTo; @@ -454,19 +475,15 @@ class DeconvolutionOp { conv_out_channels_ = param_.num_filter; conv_in_channels_ = channels_; bias_term_ = !param_.no_bias; - kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + kernel_dim_ = conv_out_channels_ / group_ * param_.kernel.Size(); weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); conv_in_spatial_dim_ = ishape.ProdShape(2, ishape.ndim()); - col_offset_ = kernel_dim_ * conv_out_spatial_dim_; - output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; // size of the column buffer used for storing im2col-ed pixels col_buffer_size_ = kernel_dim_ * group_ * conv_in_spatial_dim_; // input/output image size (#channels * height * width) input_dim_ = ishape.ProdShape(1, ishape.ndim()); output_dim_ = oshape.ProdShape(1, oshape.ndim()); - num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; - num_kernels_col2im_ = input_dim_; } private: @@ -482,13 +499,9 @@ class DeconvolutionOp { index_t conv_in_channels_; // number of input channels index_t kernel_dim_; // number of input channels per group * kernel size index_t weight_offset_; // number of output channels per group * kernel_dim_ - index_t col_offset_; - index_t output_offset_; index_t col_buffer_size_; index_t input_dim_; index_t output_dim_; - index_t num_kernels_im2col_; - index_t num_kernels_col2im_; bool bias_term_; // has bias term? }; // class DeconvolutionOp
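(After patch 26 the caffe-style LayerSetUp members fully determine the buffer geometry for both passes: forward runs gemm(weight_3d^T, data_3d) into the column buffer and col2im-accumulates it into the output, while backward im2col's the output gradient and reuses the same buffer for both the dweight and ddata gemms. A compact recap of the derived quantities under assumed dimensions, mirroring the member definitions above:)

#include <cstdio>

int main() {
  // assumed deconvolution: C=8 input channels, num_filter OC=16, G=2 groups,
  // 3x3 kernel, 16x16 input, stride 1, no padding -> 18x18 output
  const int C = 8, OC = 16, G = 2, KH = 3, KW = 3;
  const int IH = 16, IW = 16, OH = (IH - 1) + KH, OW = (IW - 1) + KW;
  const int kernel_dim = OC / G * KH * KW;                 // 72
  const int conv_in_spatial_dim = IH * IW;                 // 256
  const int conv_out_spatial_dim = OH * OW;                // 324
  const int col_buffer_size = kernel_dim * G * conv_in_spatial_dim;  // 36864
  const int input_dim = C * conv_in_spatial_dim;           // 2048 per image
  const int output_dim = OC * conv_out_spatial_dim;        // 5184 per image
  std::printf("kernel_dim_=%d col_buffer_size_=%d input_dim_=%d output_dim_=%d "
              "conv_out_spatial_dim_=%d\n",
              kernel_dim, col_buffer_size, input_dim, output_dim,
              conv_out_spatial_dim);
  return 0;
}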