From 9fe0589a754acf376fd2769bd40642901e279e33 Mon Sep 17 00:00:00 2001
From: Lin Yuan
Date: Wed, 30 Jan 2019 12:11:52 -0800
Subject: [PATCH 01/26] replace with im2col/col2im functions

---
 src/operator/nn/deconvolution-inl.h | 55 +++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index d89a489c0183..a9124110cad7 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -36,6 +36,7 @@
 #include
 #include "../operator_common.h"
 #include "../linalg.h"
+#include "./im2col.h"

 namespace mxnet {
@@ -242,6 +243,7 @@ class DeconvolutionOp {
     }
     auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
     auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
+    auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]});
     auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]});
     auto kernel_size = kernel.Size();
@@ -272,13 +274,24 @@ class DeconvolutionOp {
                                            shape_dstunit_[2] * step), s);
       temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
       if (o_pad[0] == 0 && o_pad[1] == 0) {
-        temp_col = unpack_patch2col(out.Slice(i, i + step),
-                                    kernel[0],
-                                    kernel[1],
-                                    stride[0],
-                                    stride[1],
-                                    dilate[0],
-                                    dilate[1]);
+        // temp_col = unpack_patch2col(out.Slice(i, i + step),
+        //                             kernel[0],
+        //                             kernel[1],
+        //                             stride[0],
+        //                             stride[1],
+        //                             dilate[0],
+        //                             dilate[1]);
+        im2col(
+          s,
+          (out.Slice(i, i+step)).dptr_,
+          out.shape_,
+          temp_col.shape_,
+          kernel,
+          padding,
+          stride,
+          dilate,
+          temp_col.dptr_
+        );
       } else {
         temp_col = unpack_patch2col(pad(out.Slice(i, i + step),
                                         o_pad[0], o_pad[1]),
                                     kernel[0],
                                     kernel[1],
                                     stride[0],
                                     stride[1],
                                     dilate[0],
                                     dilate[1]);
@@ -298,14 +311,26 @@ class DeconvolutionOp {
         linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s);
       }
       if (o_pad[0] == 0 && o_pad[1] == 0) {
-        out.Slice(i, i + step) = pack_col2patch(temp_col,
-                                                out.Slice(i, i + step).shape_,
-                                                kernel[0],
-                                                kernel[1],
-                                                stride[0],
-                                                stride[1],
-                                                dilate[0],
-                                                dilate[1]);
+        // out.Slice(i, i + step) = pack_col2patch(temp_col,
+        //                                         out.Slice(i, i + step).shape_,
+        //                                         kernel[0],
+        //                                         kernel[1],
+        //                                         stride[0],
+        //                                         stride[1],
+        //                                         dilate[0],
+        //                                         dilate[1]);
+        col2im(
+          s,
+          temp_col.dptr_,
+          out.Slice(i, i + step).shape_,
+          temp_col.shape_,
+          kernel,
+          padding,
+          stride,
+          dilate,
+          out.Slice(i, i+step).dptr_,
+          req[deconv::kOut]
+        );
       } else {
         Shape<4> pshape = out.Slice(i, i + step).shape_;
         pshape[2] += 2 * o_pad[0];

From 19dfcb539c25bd8de995fdb304e1a0e1e3dba1f8 Mon Sep 17 00:00:00 2001
From: Lin Yuan
Date: Wed, 30 Jan 2019 15:15:24 -0800
Subject: [PATCH 02/26] fixed padding problem in transpose conv forward

---
 src/operator/nn/deconvolution-inl.h | 101 +++++++++-------------------
 1 file changed, 33 insertions(+), 68 deletions(-)

diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index a9124110cad7..62829fc6ad70 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -227,6 +227,10 @@ class DeconvolutionOp {
     CHECK_EQ(in_data.size(), expected);
     CHECK_EQ(out_data.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
+#if defined(__CUDACC__)
+    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+        << "Must init CuBLAS handle in stream";
+#endif
     auto in_data_shape = in_data[deconv::kData].shape_;
     Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s);
     Tensor<xpu, 4, DType> out = TBlobTo4DTensor(out_data[deconv::kOut], s);
@@ -253,55 
@@ class DeconvolutionOp { param_.num_filter / param_.num_group * kernel_size); Tensor wmat = in_data[deconv::kWeight].get_with_shape(wmat_shape, s); -#if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; -#endif const index_t nbatch = data.size(0); Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( Shape1(this->InitTemp(out.shape_, data.shape_)), s); for (index_t i = 0; i < nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); + // temp_col: (N*kernel_size, OW * OH) Tensor temp_col = Tensor( workspace.dptr_, Shape2(shape_colunit_[0], shape_colunit_[1] * step), s); + // temp_dst: (N, N/n_grup, OW * OH) Tensor temp_dst = Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], shape_dstunit_[1], shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); - if (o_pad[0] == 0 && o_pad[1] == 0) { - // temp_col = unpack_patch2col(out.Slice(i, i + step), - // kernel[0], - // kernel[1], - // stride[0], - // stride[1], - // dilate[0], - // dilate[1]); - im2col( - s, - (out.Slice(i, i+step)).dptr_, - out.shape_, - temp_col.shape_, - kernel, - padding, - stride, - dilate, - temp_col.dptr_ - ); - } else { - temp_col = unpack_patch2col(pad(out.Slice(i, i + step), - o_pad[0], o_pad[1]), - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]); - } + + im2col( + s, + (out.Slice(i, i+step)).dptr_, + out.shape_, + temp_col.shape_, + kernel, + padding, + stride, + dilate, + temp_col.dptr_ + ); + const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, @@ -310,42 +296,21 @@ class DeconvolutionOp { // tmpc = dot(wmat[gid].T(), temp_dst[gid]); linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); } - if (o_pad[0] == 0 && o_pad[1] == 0) { - // out.Slice(i, i + step) = pack_col2patch(temp_col, - // out.Slice(i, i + step).shape_, - // kernel[0], - // kernel[1], - // stride[0], - // stride[1], - // dilate[0], - // dilate[1]); - col2im( - s, - temp_col.dptr_, - out.Slice(i, i + step).shape_, - temp_col.shape_, - kernel, - padding, - stride, - dilate, - out.Slice(i, i+step).dptr_, - req[deconv::kOut] - ); - } else { - Shape<4> pshape = out.Slice(i, i + step).shape_; - pshape[2] += 2 * o_pad[0]; - pshape[3] += 2 * o_pad[1]; - out.Slice(i, i + step) = crop(pack_col2patch(temp_col, - pshape, - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]), - out[i][0].shape_); - } + + col2im( + s, + temp_col.dptr_, + out.Slice(i, i + step).shape_, + temp_col.shape_, + kernel, + padding, + stride, + dilate, + out.Slice(i, i+step).dptr_, + req[deconv::kOut] + ); } + if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); From 747df6cdf65afb55bd241093f3188bb111c519ca Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 15:55:01 -0800 Subject: [PATCH 03/26] fix backward deconvolution --- src/operator/nn/deconvolution-inl.h | 45 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 62829fc6ad70..affe5949293f 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -263,7 +263,7 @@ class DeconvolutionOp { Shape1(this->InitTemp(out.shape_, data.shape_)), s); for (index_t i = 0; i < 
nbatch; i += nstep_) { const index_t step = std::min(nstep_, nbatch - i); - // temp_col: (N*kernel_size, OW * OH) + // temp_col: (N * kernel_size, OW * OH) Tensor temp_col = Tensor( workspace.dptr_, Shape2(shape_colunit_[0], @@ -293,7 +293,7 @@ class DeconvolutionOp { mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // tmpc = dot(wmat[gid].T(), temp_dst[gid]); + // tmpc = dot(wmat[gid].T(), temp_dst[gid]); linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); } @@ -334,6 +334,10 @@ class DeconvolutionOp { CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); // get data Stream *s = ctx.get_stream(); +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif auto in_data_shape = in_data[deconv::kData].shape_; Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); @@ -352,6 +356,7 @@ class DeconvolutionOp { } auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); + auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); @@ -363,10 +368,6 @@ class DeconvolutionOp { in_data[deconv::kWeight].get_with_shape(wmat_shape, s); Tensor gwmat = in_grad[deconv::kWeight].get_with_shape(wmat_shape, s); -#if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; -#endif const index_t nbatch = data.size(0); Tensor workspace = @@ -384,23 +385,19 @@ class DeconvolutionOp { shape_dstunit_[1], shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); - if (o_pad[0] == 0 && o_pad[1] == 0) { - temp_col = unpack_patch2col(grad.Slice(i, i + step), - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]); - } else { - temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), o_pad[0], o_pad[1]), - kernel[0], - kernel[1], - stride[0], - stride[1], - dilate[0], - dilate[1]); - } + + im2col( + s, + (grad.Slice(i, i + step)).dptr_, + grad.shape_, + temp_col.shape_, + kernel, + padding, + stride, + dilate, + temp_col.dptr_ + ); + const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); @@ -421,7 +418,7 @@ class DeconvolutionOp { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // temp_dst[gid] = dot(wmat[gid], tmpc); + // temp_dst[gid] = dot(wmat[gid], tmpc); linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); } Assign(gdata.Slice(i, i + step), From 854cff2275a39c89a0ebbd97c6ba5d724da9d814 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 16:31:54 -0800 Subject: [PATCH 04/26] refactor --- src/operator/nn/deconvolution-inl.h | 41 +++++++++++------------------ 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index affe5949293f..155fe71750f8 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -119,7 
+119,7 @@ struct DeconvolutionParam : public dmlc::Parameter { } template - void InferPad(TShape input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim] ) const { + void InferPad(const TShape &input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim]) const { // Modified by Li.bs // Use tag to control the calculation of pad bool bCal = false; @@ -238,16 +238,13 @@ class DeconvolutionOp { if (param_.kernel.ndim() == 2) { param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); } else { - index_t o_pad_1D[1], o_adj_1D[1]; - param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); - o_pad[0] = 0; - o_pad[1] = o_pad_1D[0]; - o_adj[0] = 0; - o_adj[1] = o_adj_1D[0]; + param_.InferPad({in_data_shape[2]}, o_pad, o_adj); } + auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]}); + auto padding = param_.kernel.ndim() == 2 ? + TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); @@ -290,8 +287,7 @@ class DeconvolutionOp { const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - mshadow::Tensor tmpc = temp_col.Slice(gstride * gid, - gstride * (gid + 1)); + Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: // tmpc = dot(wmat[gid].T(), temp_dst[gid]); linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); @@ -314,7 +310,7 @@ class DeconvolutionOp { if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += mshadow::expr::broadcast<1>(bias, out.shape_); + out += broadcast<1>(bias, out.shape_); } } @@ -342,21 +338,16 @@ class DeconvolutionOp { Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); Tensor gdata = TBlobTo4DTensor(in_grad[deconv::kData], s); - index_t o_pad[2], o_adj[2]; if (param_.kernel.ndim() == 2) { param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); } else { - index_t o_pad_1D[1], o_adj_1D[1]; - param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); - o_pad[0] = 0; - o_pad[1] = o_pad_1D[0]; - o_adj[0] = 0; - o_adj[1] = o_adj_1D[0]; + param_.InferPad({in_data_shape[2]}, o_pad, o_adj); } auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[1]}); + auto padding = param_.kernel.ndim() == 2 ? + TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); @@ -404,11 +395,11 @@ class DeconvolutionOp { if (i == 0) { Tensor tmp_gwmat = gwmat[gid]; // Legacy approach shown here for comparison: - // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); + // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); } else { // Legacy approach shown here for comparison: - // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); + // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); } } @@ -424,10 +415,10 @@ class DeconvolutionOp { Assign(gdata.Slice(i, i + step), req[deconv::kData], (swapaxis<1, 0>(reshape(temp_dst, - mshadow::Shape4(gdata.shape_[1], - step, - gdata.size(2), - gdata.size(3)))))); + Shape4(gdata.shape_[1], + step, + gdata.size(2), + gdata.size(3)))))); } } if (!param_.no_bias) { From 20ae427b17a1d8521583c02d71b5c4967345f254 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 20:32:20 -0800 Subject: [PATCH 05/26] fix lint --- src/operator/nn/deconvolution-inl.h | 9 +++------ tests/python/unittest/test_operator.py | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 155fe71750f8..377f5b7a0850 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -282,8 +282,7 @@ class DeconvolutionOp { padding, stride, dilate, - temp_col.dptr_ - ); + temp_col.dptr_); const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { @@ -303,8 +302,7 @@ class DeconvolutionOp { stride, dilate, out.Slice(i, i+step).dptr_, - req[deconv::kOut] - ); + req[deconv::kOut]); } if (!param_.no_bias) { @@ -386,8 +384,7 @@ class DeconvolutionOp { padding, stride, dilate, - temp_col.dptr_ - ); + temp_col.dptr_); const index_t gstride = temp_col.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 146836c28459..864cd8c77833 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1380,7 +1380,6 @@ def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, targ assert out_shapes[0] == (input_shape[0], 5) + target_shape -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. 
tracked at https://github.com/apache/incubator-mxnet/issues/10973") @with_seed() def test_deconvolution(): # 2D From 926cfd7ca1aaef1200bcf6870a550b12ab013898 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 30 Jan 2019 23:36:24 -0800 Subject: [PATCH 06/26] fix unit test, remove step in deconv --- src/operator/nn/deconvolution-inl.h | 50 ++++++++++++-------------- tests/python/unittest/test_operator.py | 2 +- 2 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 377f5b7a0850..db710c4544ab 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -258,24 +258,24 @@ class DeconvolutionOp { Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( Shape1(this->InitTemp(out.shape_, data.shape_)), s); - for (index_t i = 0; i < nbatch; i += nstep_) { - const index_t step = std::min(nstep_, nbatch - i); + for (index_t i = 0; i < nbatch; ++i) { // temp_col: (N * kernel_size, OW * OH) Tensor temp_col = Tensor( workspace.dptr_, - Shape2(shape_colunit_[0], - shape_colunit_[1] * step), s); + Shape2(shape_colunit_[0], shape_colunit_[1]), + s); // temp_dst: (N, N/n_grup, OW * OH) Tensor temp_dst = Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * step), s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); + shape_dstunit_[1], + shape_dstunit_[2]), + s); + temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( s, - (out.Slice(i, i+step)).dptr_, + (out.Slice(i, i + 1)).dptr_, out.shape_, temp_col.shape_, kernel, @@ -295,13 +295,13 @@ class DeconvolutionOp { col2im( s, temp_col.dptr_, - out.Slice(i, i + step).shape_, + out.Slice(i, i + 1).shape_, temp_col.shape_, kernel, padding, stride, dilate, - out.Slice(i, i+step).dptr_, + out.Slice(i, i + 1).dptr_, req[deconv::kOut]); } @@ -362,22 +362,22 @@ class DeconvolutionOp { Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( Shape1(this->InitTemp(grad.shape_, data.shape_)), s); - for (index_t i = 0; i < nbatch; i += nstep_) { - const index_t step = std::min(nstep_, nbatch - i); + for (index_t i = 0; i < nbatch; ++i) { Tensor temp_col = Tensor( workspace.dptr_, - Shape2(shape_colunit_[0], - shape_colunit_[1] * step), s); + Shape2(shape_colunit_[0], shape_colunit_[1]), + s); Tensor temp_dst = Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * step), s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); + shape_dstunit_[1], + shape_dstunit_[2]), + s); + temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( s, - (grad.Slice(i, i + step)).dptr_, + (grad.Slice(i, i + 1)).dptr_, grad.shape_, temp_col.shape_, kernel, @@ -409,11 +409,11 @@ class DeconvolutionOp { // temp_dst[gid] = dot(wmat[gid], tmpc); linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); } - Assign(gdata.Slice(i, i + step), + Assign(gdata.Slice(i, i + 1), req[deconv::kData], (swapaxis<1, 0>(reshape(temp_dst, Shape4(gdata.shape_[1], - step, + 1, gdata.size(2), gdata.size(3)))))); } @@ -433,17 +433,12 @@ class DeconvolutionOp { shape_dstunit_ = mshadow::Shape3(param_.num_group, oshape[1] / param_.num_group, oshape[2] * oshape[3]); - // See convolution for workspace calculations. 
nstep_ will be the effective batch size - nstep_ = std::max( - std::min(static_cast(param_.workspace) / - (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), - 1); mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1] * nstep_); + shape_colunit_[1]); mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], shape_dstunit_[1], - shape_dstunit_[2] * nstep_); + shape_dstunit_[2]); index_t required_size = scol.Size() + sdst.Size(); return required_size; } @@ -460,7 +455,6 @@ class DeconvolutionOp { DeconvolutionParam param_; mshadow::Shape<2> shape_colunit_; mshadow::Shape<3> shape_dstunit_; - index_t nstep_; }; // class DeconvolutionOp template diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 864cd8c77833..8246f95b44ba 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1256,7 +1256,7 @@ def test_abs(): assert_almost_equal(out, npout) out_grad = mx.nd.empty(shape) - out_grad[:] = 2; + out_grad[:] = 2 npout_grad = out_grad.asnumpy() npout_grad = npout_grad * np.sign(data_tmp) exe_test.backward(out_grad) From c49dbe10c4cb02a3025ccc68da2ec5f68082be9d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 31 Jan 2019 00:23:42 -0800 Subject: [PATCH 07/26] add unit test --- tests/python/unittest/test_gluon.py | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index abe6b136fe0c..277bb2ba703c 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -503,6 +503,43 @@ def test_deconv(): # layer = nn.Conv3DTranspose(16, (3, 3, 3), layout='NDHWC', in_channels=4) # # check_layer_forward(layer, (1, 10, 10, 10, 4)) +@with_seed() +def test_deconv_dilation(): + data = mx.nd.array((((0,0,0), + (0,1,0), + (0,0,0) + ), + ((0,0,0), + (0,2,0), + (0,0,0) + ) + ) + ) + kernel = mx.nd.array(((1,2,3), + (4,5,6), + (7,8,9))) + + data_batch = data.expand_dims(1) + weight = kernel.expand_dims(0).expand_dims(0) + layer = nn.Conv2DTranspose(in_channels=1, channels=1, + kernel_size=(3,3), padding=(1,1), + strides=(1,1), dilation=(2,2)) + layer.initialize() + layer.weight.set_data(weight) + out = layer(data_batch).asnumpy() + expected = np.array([[[[1.,0.,2.,0.,3.], + [0.,0.,0.,0.,0.], + [4.,0.,5.,0.,6.], + [0.,0.,0.,0.,0.], + [7.,0.,8.,0.,9.]]], + [[[2.,0.,4.,0.,6.], + [0.,0.,0.,0.,0.], + [8.,0.,10.,0.,12.], + [0.,0.,0.,0.,0.], + [14.,0.,16.,0.,18.]]] + ]) + assert_almost_equal(out, expected) + @with_seed() def test_pool(): From afd75d139fcbefa68edc9b2ddcdafdb38bc9e529 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 31 Jan 2019 00:26:57 -0800 Subject: [PATCH 08/26] refactor --- tests/python/unittest/test_gluon.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 277bb2ba703c..35b6fa899e8e 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -507,14 +507,11 @@ def test_deconv(): def test_deconv_dilation(): data = mx.nd.array((((0,0,0), (0,1,0), - (0,0,0) - ), + (0,0,0)), ((0,0,0), (0,2,0), - (0,0,0) - ) - ) - ) + (0,0,0)))) + kernel = mx.nd.array(((1,2,3), (4,5,6), (7,8,9))) From 5b5909712696554a4592b2ab0b93f9a62aa7927f Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 31 Jan 2019 00:46:57 -0800 Subject: [PATCH 09/26] fix build error --- src/operator/nn/deconvolution-inl.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index db710c4544ab..0947b63d5daa 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -308,7 +308,7 @@ class DeconvolutionOp { if (!param_.no_bias) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += broadcast<1>(bias, out.shape_); + out += mshadow::expr::broadcast<1>(bias, out.shape_); } } From d1554c1d097baeb47ccc5ead4e9057274effc3b8 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 28 Jan 2019 18:19:35 +0000 Subject: [PATCH 10/26] Revert "Aggregate SGD (#13346)" This reverts commit 0a45e1a222637c7dee29511cbfc43e594571933b. --- cpp-package/scripts/OpWrapperGenerator.py | 4 +- docs/faq/env_var.md | 9 - python/mxnet/gluon/trainer.py | 15 +- python/mxnet/model.py | 10 +- python/mxnet/optimizer/optimizer.py | 231 ++++----------- src/operator/optimizer_op-inl.h | 295 -------------------- src/operator/optimizer_op.cc | 193 +------------ src/operator/optimizer_op.cu | 9 - tests/python/unittest/test_gluon_trainer.py | 8 +- tests/python/unittest/test_module.py | 3 - 10 files changed, 66 insertions(+), 711 deletions(-) diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index 65ba247c25c8..ca430ec99e6e 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,8 +97,7 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional',\ 'Shape or None':'dmlc::optional',\ - 'string':'const std::string&',\ - 'tuple of ':'nnvm::Tuple'} + 'string':'const std::string&'} name = '' type = '' description = '' @@ -408,7 +407,6 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" - "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 99ebae21d61f..98057d0d76d6 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -145,10 +145,6 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. -* MXNET_UPDATE_ON_KVSTORE - - Values: 0(false) or 1(true) ```(default=1)``` - - If true, weight updates are performed during the communication step, if possible. - ## Memonger * MXNET_BACKWARD_DO_MIRROR @@ -222,11 +218,6 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count. - When the array size is less than this threshold, NDArray::Copy(from , to)) is implemented by memcpy in single thread. -* MXNET_OPTIMIZER_AGGREGATION_SIZE - - Values: Int ```(default=4)``` - - Maximum value is 60. - - This variable controls how many weights will be updated in a single call to optimizer (for optimizers that support aggregation, currently limited to SGD). 
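
For reference, the two `MXNET_*` variables deleted in the hunk above interact: per the `SGD` docstring removed later in this revert, aggregation only takes effect when updates run outside the kvstore. A minimal sketch of driving both knobs on a build that still carries the aggregated-SGD code (the variable names and the `aggregate_num` attribute come from the hunks in this series; the surrounding script is illustrative only):

    import os

    # Let updates run outside the kvstore so aggregation can take effect.
    os.environ['MXNET_UPDATE_ON_KVSTORE'] = '0'
    # Update up to 8 weights per optimizer call (default 4, maximum 60).
    os.environ['MXNET_OPTIMIZER_AGGREGATION_SIZE'] = '8'

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1)
    print(opt.aggregate_num)  # -> 8 on a pre-revert build
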
- Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 8060f38ac2aa..f6c0a31b52e2 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,8 +60,7 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + suitable option depending on the type of kvstore. Properties ---------- @@ -394,8 +393,6 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): - updates = [[] for _ in self._updaters] - for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -419,17 +416,11 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd.append((i, grad, arr)) + upd(i, grad, arr) arr._fresh_grad = False - if not (self._kvstore and self._update_on_kvstore): - for updater, upd in zip(self._updaters, updates): - if upd: - i, w, g = zip(*upd) - updater(i, w, g) - def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index c08077cc65f4..38fe739154d5 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. 
""" - update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) + update_on_kvstore = True if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device == 1 and 'dist' not in kvstore: + if num_device is 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,7 +162,6 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" - updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -179,10 +178,7 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updates[k].append((index*num_device+k, g, w)) - for dev_updates in updates: - i, w, g = zip(*dev_updates) - updater(i, w, g) + updater(index*num_device+k, g, w) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index cb52ac54fdab..6ffbbcffc384 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,15 +22,12 @@ import math import pickle import warnings -import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update) + signsgd_update, signum_update) from ..ndarray import sparse from ..random import normal @@ -40,8 +37,6 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -110,7 +105,6 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -386,44 +380,13 @@ def _update_count(self, index): Parameters ---------- - index : int or list of int + index : int The index to be updated. """ - if not isinstance(index, (list, tuple)): - index = [index] - for idx in index: - if idx not in self._index_update_count: - self._index_update_count[idx] = self.begin_num_update - self._index_update_count[idx] += 1 - self.num_update = max(self._index_update_count[idx], self.num_update) - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. - - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. 
- """ - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs + if index not in self._index_update_count: + self._index_update_count[index] = self.begin_num_update + self._index_update_count[index] += 1 + self.num_update = max(self._index_update_count[index], self.num_update) def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -438,31 +401,18 @@ def _get_lr(self, index): lr : float Learning rate for this index. """ - return self._get_lrs([index])[0] - - def _get_wds(self, indices): - """Gets weight decays for indices. - Returns 0 for non-weights if the name of weights are provided for `__init__`. - - Parameters - ---------- - indices : list of int - Indices of weights. + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr - Returns - ------- - wds : list of float - Weight decays for those indices. - """ - wds = [self.wd for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - wds[i] *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wds[i] *= self.wd_mult[index] - elif index in self.idx2name: - wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) - return wds + if index in self.param_dict: + lr *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lr *= self.lr_mult[index] + elif index in self.idx2name: + lr *= self.lr_mult.get(self.idx2name[index], 1.0) + return lr def _get_wd(self, index): """Gets weight decay for index. @@ -471,14 +421,21 @@ def _get_wd(self, index): Parameters ---------- index : int - The index of weight. + The index for weight. Returns ------- wd : float Weight decay for this index. """ - return self._get_wds([index])[0] + wd = self.wd + if index in self.param_dict: + wd *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wd *= self.wd_mult[index] + elif index in self.idx2name: + wd *= self.wd_mult.get(self.idx2name[index], 1.0) + return wd def __getstate__(self): ret = self.__dict__.copy() @@ -514,13 +471,6 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. 
- Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -552,7 +502,6 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -573,22 +522,12 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) + def _update_impl(self, index, weight, grad, state, multi_precision=False): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -596,49 +535,26 @@ def _update_impl(self, indices, weights, grads, states, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) else: - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = 
self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -1609,55 +1525,20 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [grad] - weights = [weight] - else: - indices = index - grads = grad - weights = weight - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - self.optimizer.update_multi_precision(i, w, g, self.states[i]) + # convert ctypes.char_p.value back to python str if needed + if isinstance(index, bytes): + index = py_str(index) + if index not in self.states: + self.states[index] = self.optimizer.create_state_multi_precision(index, weight) + self.states_synced[index] = True + elif not self.states_synced[index]: + self.states[index] = \ + self.sync_state_context(self.states[index], weight.context) + self.states_synced[index] = True + self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) def sync_state_context(self, state, context): """sync state context.""" diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 223a1aa6c37d..9251b8614806 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,301 +82,6 @@ struct SGDParam : public dmlc::Parameter { } }; -struct MultiSGDParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. 
" - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -struct MultiSGDMomParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float momentum; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. " - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(momentum) - .set_default(0.0f) - .describe("The decay rate of momentum estimates at each epoch."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -template -inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_shapes = *in_attrs; - auto& output_shapes = *out_attrs; - // Learning rates - CHECK_EQ(param.lrs.ndim(), param.num_weights) - << "Number of learning rates is inconsistent with num_weights " - << "parameter passed. Expected number of learning rates: " - << param.num_weights << ", and got " << param.lrs.ndim(); - // Weight decays - CHECK_EQ(param.wds.ndim(), param.num_weights) - << "Number of weight decays is inconsistent with num_weights " - << "parameter passed. 
Expected number of weight decays: " - << param.num_weights << ", and got " << param.wds.ndim(); - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_shapes[i]}); - for (int j = 0; j < input_stride; ++j) { - input_vec.push_back(input_shapes[i * input_stride + j]); - } - all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); - } - return all_inferred; -} - -template -inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_types = *in_attrs; - auto& output_types = *out_attrs; - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_types[i]}); - for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { - input_vec.push_back(input_types[i * input_stride + j]); - } - all_inferred = all_inferred && - ElemwiseType(attrs, &input_vec, &output_vec); - } - // master copies of weights - for (int i = 0; i < param.num_weights; ++i) { - for (int j = 0; j < num_fp32_inputs; ++j) { - TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); - } - } - return all_inferred; -} - -template -struct MultiSGDKernelParam { - static const int N = 60; - int count; - size_t max_size; - size_t sizes[N]; - DType * weights[N]; - DType * grads[N]; - MPDType * mom[N]; - MPDType * weights32[N]; - DType * out_data[N]; - MPDType lrs[N]; - MPDType wds[N]; - MPDType clip_gradient; - MPDType rescale_grad; - MPDType momentum; -}; - -template -struct MultiSGDKernel { - template - MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, - const OpReqType req) { - for (int index = 0; index < param.count; ++index) { - if ((size_t)i < param.sizes[index]) { - MPDType w = has_mixed_precision ? param.weights32[index][i] : - MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); - if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); - } - if (has_momentum) { - param.mom[index][i] = mom; - } - w = w + mom; - if (has_mixed_precision) { - param.weights32[index][i] = w; - } - KERNEL_ASSIGN(param.out_data[index][i], req, w); - } - } - } -}; - -template -MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const ParamType& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param; - param.clip_gradient = p.clip_gradient; - param.rescale_grad = p.rescale_grad; - param.momentum = 0; - param.count = p.num_weights; - param.max_size = 0; - for (int i = 0; i < param.count; ++i) { - param.sizes[i] = inputs[i * input_stride].shape_.Size(); - if (param.max_size < param.sizes[i]) { - param.max_size = param.sizes[i]; - } - param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; - param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; - // if mixed precision, then the last input in a set - // is 32-bit master copy of the weights - if (!std::is_same::value) { - param.weights32[i] = inputs[i * input_stride + input_stride - 1] - .FlatTo2D(s).dptr_; - } - param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; - param.lrs[i] = p.lrs[i]; - param.wds[i] = p.wds[i]; - } - - return param; -} - - -template -MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const MultiSGDMomParam& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - param.momentum = p.momentum; - for (int i = 0; i < param.count; ++i) { - param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; - } - - return param; -} - -template -class type_identity { - public: - using type = T; -}; - -template -class single_precision { - public: - using type = float; -}; - -template class MPTypeChooser, int input_stride> -inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} - -template class MPTypeChooser, int input_stride> -inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} struct 
SGDKernel { template diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 982995ad2f95..a52a6f32907c 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -31,8 +31,6 @@ namespace op { DMLC_REGISTER_PARAMETER(SGDParam); DMLC_REGISTER_PARAMETER(SGDMomParam); -DMLC_REGISTER_PARAMETER(MultiSGDParam); -DMLC_REGISTER_PARAMETER(MultiSGDMomParam); DMLC_REGISTER_PARAMETER(FTMLParam); DMLC_REGISTER_PARAMETER(AdamParam); DMLC_REGISTER_PARAMETER(RMSPropParam); @@ -54,7 +52,7 @@ It updates the weights using:: weight = weight - learning_rate * sign(gradient) -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(2) @@ -83,7 +81,7 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(3) @@ -315,193 +313,6 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -NNVM_REGISTER_OP(multi_sgd_update) -.describe(R"code(Update function for Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 2); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_sgd_mom_update) -.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
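
The update rule quoted above can be checked by hand; this NumPy sketch mirrors the per-weight arithmetic of `MultiSGDKernel` in the `optimizer_op-inl.h` hunk (`mom = momentum*mom - lr*wd*w - lr*rescale_grad*grad`, then `w += mom`), with made-up values:

    import numpy as np

    momentum, lr, wd, rescale_grad = 0.9, 0.1, 0.0, 1.0
    w = np.array([1.0, -2.0])
    grad = np.array([0.5, 0.5])
    mom = np.zeros_like(w)

    # mom accumulates the (negative) step, exactly as in the kernel.
    mom = momentum * mom - lr * wd * w - lr * rescale_grad * grad
    w = w + mom
    print(mom)  # [-0.05 -0.05]
    print(w)    # [ 0.95 -2.05]
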
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum") -.add_arguments(MultiSGDMomParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_update) -.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
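
The flattened input layout of these multi-weight ops follows from the `FListInputNames` blocks above: `weight_i, grad_i, mom_i[, weight32_i]` repeated per weight, which is exactly what `_flatten_list(zip(...))` builds on the Python side. A hedged sketch of one aggregated call on a pre-revert build (shapes and values are made up):

    import mxnet as mx

    w1, w2 = mx.nd.ones((2,)), mx.nd.ones((3,))
    g1, g2 = mx.nd.ones((2,)) * 0.5, mx.nd.ones((3,)) * 0.5
    m1, m2 = mx.nd.zeros((2,)), mx.nd.zeros((3,))

    # weight/grad/mom triples flattened in order; one lr/wd per weight.
    mx.nd.multi_sgd_mom_update(w1, g1, m1, w2, g2, m2,
                               lrs=(0.1, 0.1), wds=(0.0, 0.0),
                               momentum=0.9, num_weights=2,
                               out=[w1, w2])
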
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 4); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 4 + 2); - ret.push_back(i * 4 + 3); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDMomParam::__FIELDS__()); NNVM_REGISTER_OP(sgd_update) MXNET_ADD_SPARSE_OP_ALIAS(sgd_update) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index c42cf1831c43..0fd2ca83fda4 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -242,15 +242,6 @@ NNVM_REGISTER_OP(mp_sgd_update) NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); -NNVM_REGISTER_OP(multi_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); - NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 9f190a0a88c2..985c38c31356 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,7 +17,6 @@ import mxnet as mx import unittest -import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -99,9 +98,6 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') - x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -116,7 +112,6 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -241,11 +236,10 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] - global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - 
check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) + check_trainer_sparse_kv(kv, 'default', 'default', None, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index ae38a2297ded..144fbeef213f 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -174,8 +174,6 @@ def test_module_layout(): @with_seed() def test_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -213,7 +211,6 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) dict_equ(mod._kvstore._updater.states, mod2._updater.states) - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() From fabc318a0ff7e9b22371e475edf0e3249f4d8b94 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 28 Jan 2019 18:19:35 +0000 Subject: [PATCH 11/26] Revert "Aggregate SGD (#13346)" This reverts commit 0a45e1a222637c7dee29511cbfc43e594571933b. --- cpp-package/scripts/OpWrapperGenerator.py | 4 +- docs/faq/env_var.md | 4 - python/mxnet/gluon/trainer.py | 15 +- python/mxnet/model.py | 10 +- python/mxnet/optimizer/optimizer.py | 231 ++++----------- src/operator/optimizer_op-inl.h | 295 -------------------- src/operator/optimizer_op.cc | 193 +------------ src/operator/optimizer_op.cu | 9 - tests/python/unittest/test_gluon_trainer.py | 8 +- tests/python/unittest/test_module.py | 3 - 10 files changed, 66 insertions(+), 706 deletions(-) diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index 65ba247c25c8..ca430ec99e6e 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,8 +97,7 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional',\ 'Shape or None':'dmlc::optional',\ - 'string':'const std::string&',\ - 'tuple of ':'nnvm::Tuple'} + 'string':'const std::string&'} name = '' type = '' description = '' @@ -408,7 +407,6 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" - "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 83368bf4d0c3..bb29cc410c18 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -145,10 +145,6 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. -* MXNET_UPDATE_ON_KVSTORE - - Values: 0(false) or 1(true) ```(default=1)``` - - If true, weight updates are performed during the communication step, if possible. - ## Memonger * MXNET_BACKWARD_DO_MIRROR diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 8060f38ac2aa..f6c0a31b52e2 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,8 +60,7 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. 
If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + suitable option depending on the type of kvstore. Properties ---------- @@ -394,8 +393,6 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): - updates = [[] for _ in self._updaters] - for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -419,17 +416,11 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd.append((i, grad, arr)) + upd(i, grad, arr) arr._fresh_grad = False - if not (self._kvstore and self._update_on_kvstore): - for updater, upd in zip(self._updaters, updates): - if upd: - i, w, g = zip(*upd) - updater(i, w, g) - def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index c08077cc65f4..38fe739154d5 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. """ - update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) + update_on_kvstore = True if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device == 1 and 'dist' not in kvstore: + if num_device is 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,7 +162,6 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" - updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -179,10 +178,7 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updates[k].append((index*num_device+k, g, w)) - for dev_updates in updates: - i, w, g = zip(*dev_updates) - updater(i, w, g) + updater(index*num_device+k, g, w) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index cb52ac54fdab..6ffbbcffc384 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,15 +22,12 @@ import math import pickle import warnings -import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update) + signsgd_update, signum_update) from ..ndarray 
import sparse from ..random import normal @@ -40,8 +37,6 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -110,7 +105,6 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -386,44 +380,13 @@ def _update_count(self, index): Parameters ---------- - index : int or list of int + index : int The index to be updated. """ - if not isinstance(index, (list, tuple)): - index = [index] - for idx in index: - if idx not in self._index_update_count: - self._index_update_count[idx] = self.begin_num_update - self._index_update_count[idx] += 1 - self.num_update = max(self._index_update_count[idx], self.num_update) - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. - - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. - """ - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs + if index not in self._index_update_count: + self._index_update_count[index] = self.begin_num_update + self._index_update_count[index] += 1 + self.num_update = max(self._index_update_count[index], self.num_update) def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -438,31 +401,18 @@ def _get_lr(self, index): lr : float Learning rate for this index. """ - return self._get_lrs([index])[0] - - def _get_wds(self, indices): - """Gets weight decays for indices. - Returns 0 for non-weights if the name of weights are provided for `__init__`. - - Parameters - ---------- - indices : list of int - Indices of weights. + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr - Returns - ------- - wds : list of float - Weight decays for those indices. - """ - wds = [self.wd for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - wds[i] *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wds[i] *= self.wd_mult[index] - elif index in self.idx2name: - wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) - return wds + if index in self.param_dict: + lr *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lr *= self.lr_mult[index] + elif index in self.idx2name: + lr *= self.lr_mult.get(self.idx2name[index], 1.0) + return lr def _get_wd(self, index): """Gets weight decay for index. @@ -471,14 +421,21 @@ def _get_wd(self, index): Parameters ---------- index : int - The index of weight. + The index for weight. Returns ------- wd : float Weight decay for this index. 
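For illustration (hypothetical parameter names; ``wd_mult`` entries keyed by
name are resolved through ``param_idx2name``)::

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1, wd=1e-4,
                           param_idx2name={0: 'fc1_weight', 1: 'fc1_bias'})
    opt.wd_mult['fc1_bias'] = 0.0      # common practice: no decay on biases
    assert opt._get_wd(1) == 0.0
    assert opt._get_wd(0) == 1e-4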
""" - return self._get_wds([index])[0] + wd = self.wd + if index in self.param_dict: + wd *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wd *= self.wd_mult[index] + elif index in self.idx2name: + wd *= self.wd_mult.get(self.idx2name[index], 1.0) + return wd def __getstate__(self): ret = self.__dict__.copy() @@ -514,13 +471,6 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. - Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -552,7 +502,6 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -573,22 +522,12 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) + def _update_impl(self, index, weight, grad, state, multi_precision=False): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -596,49 +535,26 @@ def _update_impl(self, indices, weights, grads, states, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) else: - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - 
sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -1609,55 +1525,20 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [grad] - weights = [weight] - else: - indices = index - grads = grad - weights = weight - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - self.optimizer.update_multi_precision(i, w, g, self.states[i]) + # convert ctypes.char_p.value back to python str if needed + if isinstance(index, bytes): + index = py_str(index) + if index not in self.states: + self.states[index] = self.optimizer.create_state_multi_precision(index, weight) + self.states_synced[index] = True + elif not self.states_synced[index]: + self.states[index] = \ + self.sync_state_context(self.states[index], weight.context) + self.states_synced[index] = True + self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) def 
sync_state_context(self, state, context): """sync state context.""" diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 223a1aa6c37d..9251b8614806 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,301 +82,6 @@ struct SGDParam : public dmlc::Parameter { } }; -struct MultiSGDParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. " - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -struct MultiSGDMomParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; - float momentum; - float rescale_grad; - float clip_gradient; - int num_weights; - DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { - DMLC_DECLARE_FIELD(lrs) - .describe("Learning rates."); - DMLC_DECLARE_FIELD(wds) - .describe("Weight decay augments the objective function with a " - "regularization term that penalizes large weights. " - "The penalty scales with the square of the magnitude of each weight."); - DMLC_DECLARE_FIELD(momentum) - .set_default(0.0f) - .describe("The decay rate of momentum estimates at each epoch."); - DMLC_DECLARE_FIELD(rescale_grad) - .set_default(1.0f) - .describe("Rescale gradient to grad = rescale_grad*grad."); - DMLC_DECLARE_FIELD(clip_gradient) - .set_default(-1.0f) - .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " - "If clip_gradient <= 0, gradient clipping is turned off. " - "grad = max(min(grad, clip_gradient), -clip_gradient)."); - DMLC_DECLARE_FIELD(num_weights) - .set_default(1) - .describe("Number of updated weights."); - } -}; - -template -inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_shapes = *in_attrs; - auto& output_shapes = *out_attrs; - // Learning rates - CHECK_EQ(param.lrs.ndim(), param.num_weights) - << "Number of learning rates is inconsistent with num_weights " - << "parameter passed. Expected number of learning rates: " - << param.num_weights << ", and got " << param.lrs.ndim(); - // Weight decays - CHECK_EQ(param.wds.ndim(), param.num_weights) - << "Number of weight decays is inconsistent with num_weights " - << "parameter passed. 
Expected number of weight decays: " - << param.num_weights << ", and got " << param.wds.ndim(); - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_shapes[i]}); - for (int j = 0; j < input_stride; ++j) { - input_vec.push_back(input_shapes[i * input_stride + j]); - } - all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); - } - return all_inferred; -} - -template -inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - const ParamType& param = dmlc::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); - CHECK_EQ(out_attrs->size(), param.num_weights); - - bool all_inferred = true; - auto& input_types = *in_attrs; - auto& output_types = *out_attrs; - // Weights and gradients - for (int i = 0; i < param.num_weights; ++i) { - std::vector input_vec; - std::vector output_vec({output_types[i]}); - for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { - input_vec.push_back(input_types[i * input_stride + j]); - } - all_inferred = all_inferred && - ElemwiseType(attrs, &input_vec, &output_vec); - } - // master copies of weights - for (int i = 0; i < param.num_weights; ++i) { - for (int j = 0; j < num_fp32_inputs; ++j) { - TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); - } - } - return all_inferred; -} - -template -struct MultiSGDKernelParam { - static const int N = 60; - int count; - size_t max_size; - size_t sizes[N]; - DType * weights[N]; - DType * grads[N]; - MPDType * mom[N]; - MPDType * weights32[N]; - DType * out_data[N]; - MPDType lrs[N]; - MPDType wds[N]; - MPDType clip_gradient; - MPDType rescale_grad; - MPDType momentum; -}; - -template -struct MultiSGDKernel { - template - MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, - const OpReqType req) { - for (int index = 0; index < param.count; ++index) { - if ((size_t)i < param.sizes[index]) { - MPDType w = has_mixed_precision ? param.weights32[index][i] : - MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); - if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); - } - if (has_momentum) { - param.mom[index][i] = mom; - } - w = w + mom; - if (has_mixed_precision) { - param.weights32[index][i] = w; - } - KERNEL_ASSIGN(param.out_data[index][i], req, w); - } - } - } -}; - -template -MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const ParamType& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param; - param.clip_gradient = p.clip_gradient; - param.rescale_grad = p.rescale_grad; - param.momentum = 0; - param.count = p.num_weights; - param.max_size = 0; - for (int i = 0; i < param.count; ++i) { - param.sizes[i] = inputs[i * input_stride].shape_.Size(); - if (param.max_size < param.sizes[i]) { - param.max_size = param.sizes[i]; - } - param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; - param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; - // if mixed precision, then the last input in a set - // is 32-bit master copy of the weights - if (!std::is_same::value) { - param.weights32[i] = inputs[i * input_stride + input_stride - 1] - .FlatTo2D(s).dptr_; - } - param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; - param.lrs[i] = p.lrs[i]; - param.wds[i] = p.wds[i]; - } - - return param; -} - - -template -MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &outputs) { - using namespace mxnet_op; - const MultiSGDMomParam& p = nnvm::get(attrs.parsed); - Stream* s = ctx.get_stream(); - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - param.momentum = p.momentum; - for (int i = 0; i < param.count; ++i) { - param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; - } - - return param; -} - -template -class type_identity { - public: - using type = T; -}; - -template -class single_precision { - public: - using type = float; -}; - -template class MPTypeChooser, int input_stride> -inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} - -template class MPTypeChooser, int input_stride> -inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - using namespace mxnet_op; - Stream* s = ctx.get_stream(); - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { - using MPDType = typename MPTypeChooser::type; - MultiSGDKernelParam param = - FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); - Kernel::value>, - xpu>::Launch(s, param.max_size, param, req[0]); - }); -} struct 
SGDKernel { template diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 982995ad2f95..a52a6f32907c 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -31,8 +31,6 @@ namespace op { DMLC_REGISTER_PARAMETER(SGDParam); DMLC_REGISTER_PARAMETER(SGDMomParam); -DMLC_REGISTER_PARAMETER(MultiSGDParam); -DMLC_REGISTER_PARAMETER(MultiSGDMomParam); DMLC_REGISTER_PARAMETER(FTMLParam); DMLC_REGISTER_PARAMETER(AdamParam); DMLC_REGISTER_PARAMETER(RMSPropParam); @@ -54,7 +52,7 @@ It updates the weights using:: weight = weight - learning_rate * sign(gradient) -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(2) @@ -83,7 +81,7 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(3) @@ -315,193 +313,6 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -NNVM_REGISTER_OP(multi_sgd_update) -.describe(R"code(Update function for Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 2); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_sgd_mom_update) -.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
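Equivalently, over the whole set of ``num_weights`` arrays (a simplified
per-array sketch that ignores ``rescale_grad`` and ``clip_gradient``)::

    def multi_sgd_mom_step(weights, grads, moms, lrs, wds, momentum):
        # one fused kernel launch in the real operator; a plain loop here
        for w, g, v, lr, wd in zip(weights, grads, moms, lrs, wds):
            v[:] = momentum * v - lr * wd * w - lr * g
            w += v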
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", ElemwiseType<-1, -1>) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum") -.add_arguments(MultiSGDMomParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_update) -.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SDG) optimizer. - -It updates the weights using:: - - weight = weight - learning_rate * (gradient + wd * weight) - -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 3); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 3 + 2); - } - return ret; - }) -.set_attr("FCompute", MultiSGDUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDParam::__FIELDS__()); - -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. - -Momentum update has better convergence rates on neural networks. Mathematically it looks -like below: - -.. math:: - - v_1 = \alpha * \nabla J(W_0)\\ - v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ - W_t = W_{t-1} + v_t - -It updates the weights using:: - - v = momentum * v - learning_rate * gradient - weight += v - -Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
- -)code" ADD_FILELINE) -.set_num_inputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights * 4); - }) -.set_num_outputs([](const nnvm::NodeAttrs& attrs) { - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - return static_cast(param.num_weights); - }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", MultiSGDShape) -.set_attr("FInferType", MP_MultiSGD_InferType) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - uint32_t num_args = dmlc::get(attrs.parsed).num_weights; - std::vector ret; - for (uint32_t i = 0; i < num_args; ++i) { - ret.push_back(std::string("weight_") + std::to_string(i)); - ret.push_back(std::string("grad_") + std::to_string(i)); - ret.push_back(std::string("mom_") + std::to_string(i)); - ret.push_back(std::string("weight32_") + std::to_string(i)); - } - return ret; - }) -.set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - std::vector ret; - const MultiSGDMomParam& param = dmlc::get(attrs.parsed); - for (int i = 0; i < param.num_weights; ++i) { - ret.push_back(i * 4 + 2); - ret.push_back(i * 4 + 3); - } - return ret; - }) -.set_attr("FCompute", MultiSGDMomUpdate) -.add_argument("data", "NDArray-or-Symbol[]", "Weights") -.add_arguments(MultiSGDMomParam::__FIELDS__()); NNVM_REGISTER_OP(sgd_update) MXNET_ADD_SPARSE_OP_ALIAS(sgd_update) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index c42cf1831c43..0fd2ca83fda4 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -242,15 +242,6 @@ NNVM_REGISTER_OP(mp_sgd_update) NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); -NNVM_REGISTER_OP(multi_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_update) -.set_attr("FCompute", MultiSGDUpdate); -NNVM_REGISTER_OP(multi_mp_sgd_mom_update) -.set_attr("FCompute", MultiSGDMomUpdate); - NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 9f190a0a88c2..985c38c31356 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,7 +17,6 @@ import mxnet as mx import unittest -import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -99,9 +98,6 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') - x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -116,7 +112,6 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -241,11 +236,10 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] - global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - 
check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) + check_trainer_sparse_kv(kv, 'default', 'default', None, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index ae38a2297ded..144fbeef213f 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -174,8 +174,6 @@ def test_module_layout(): @with_seed() def test_save_load(): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -213,7 +211,6 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) dict_equ(mod._kvstore._updater.states, mod2._updater.states) - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() From a649f670be13d85689d3fb1236cfb6cc562436b3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Thu, 14 Feb 2019 07:40:12 -0800 Subject: [PATCH 12/26] Revert "Revert "Aggregate SGD (#13346)"" This reverts commit fabc318a0ff7e9b22371e475edf0e3249f4d8b94. --- cpp-package/scripts/OpWrapperGenerator.py | 4 +- docs/faq/env_var.md | 4 + python/mxnet/gluon/trainer.py | 15 +- python/mxnet/model.py | 10 +- python/mxnet/optimizer/optimizer.py | 231 +++++++++++---- src/operator/optimizer_op-inl.h | 295 ++++++++++++++++++++ src/operator/optimizer_op.cc | 193 ++++++++++++- src/operator/optimizer_op.cu | 9 + tests/python/unittest/test_gluon_trainer.py | 8 +- tests/python/unittest/test_module.py | 3 + 10 files changed, 706 insertions(+), 66 deletions(-) diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index ca430ec99e6e..65ba247c25c8 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,7 +97,8 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional',\ 'Shape or None':'dmlc::optional',\ - 'string':'const std::string&'} + 'string':'const std::string&',\ + 'tuple of ':'nnvm::Tuple'} name = '' type = '' description = '' @@ -407,6 +408,7 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" + "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 87882baa4f6b..c35d4e5723a5 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -162,6 +162,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. +* MXNET_UPDATE_ON_KVSTORE + - Values: 0(false) or 1(true) ```(default=1)``` + - If true, weight updates are performed during the communication step, if possible. + ## Memonger * MXNET_BACKWARD_DO_MIRROR diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index f6c0a31b52e2..8060f38ac2aa 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,7 +60,8 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. 
If None, then trainer will choose the more - suitable option depending on the type of kvstore. + suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is + provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. Properties ---------- @@ -393,6 +394,8 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): + updates = [[] for _ in self._updaters] + for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -416,11 +419,17 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd(i, grad, arr) + upd.append((i, grad, arr)) arr._fresh_grad = False + if not (self._kvstore and self._update_on_kvstore): + for updater, upd in zip(self._updaters, updates): + if upd: + i, w, g = zip(*upd) + updater(i, w, g) + def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 38fe739154d5..c08077cc65f4 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. """ - update_on_kvstore = True + update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device is 1 and 'dist' not in kvstore: + if num_device == 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,6 +162,7 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" + updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -178,7 +179,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updater(index*num_device+k, g, w) + updates[k].append((index*num_device+k, g, w)) + for dev_updates in updates: + i, w, g = zip(*dev_updates) + updater(i, w, g) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 6ffbbcffc384..cb52ac54fdab 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,12 +22,15 @@ import math import pickle import warnings +import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update) + signsgd_update, signum_update, + multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, + multi_mp_sgd_mom_update) from ..ndarray 
import sparse from ..random import normal @@ -37,6 +40,8 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] +def _flatten_list(nested_list): + return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -105,6 +110,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision + self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -380,13 +386,44 @@ def _update_count(self, index): Parameters ---------- - index : int + index : int or list of int The index to be updated. """ - if index not in self._index_update_count: - self._index_update_count[index] = self.begin_num_update - self._index_update_count[index] += 1 - self.num_update = max(self._index_update_count[index], self.num_update) + if not isinstance(index, (list, tuple)): + index = [index] + for idx in index: + if idx not in self._index_update_count: + self._index_update_count[idx] = self.begin_num_update + self._index_update_count[idx] += 1 + self.num_update = max(self._index_update_count[idx], self.num_update) + + def _get_lrs(self, indices): + """Gets the learning rates given the indices of the weights. + + Parameters + ---------- + indices : list of int + Indices corresponding to weights. + + Returns + ------- + lrs : list of float + Learning rates for those indices. + """ + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr + + lrs = [lr for _ in indices] + for i, index in enumerate(indices): + if index in self.param_dict: + lrs[i] *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lrs[i] *= self.lr_mult[index] + elif index in self.idx2name: + lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) + return lrs def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -401,18 +438,31 @@ def _get_lr(self, index): lr : float Learning rate for this index. """ - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr + return self._get_lrs([index])[0] - if index in self.param_dict: - lr *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lr *= self.lr_mult[index] - elif index in self.idx2name: - lr *= self.lr_mult.get(self.idx2name[index], 1.0) - return lr + def _get_wds(self, indices): + """Gets weight decays for indices. + Returns 0 for non-weights if the name of weights are provided for `__init__`. + + Parameters + ---------- + indices : list of int + Indices of weights. + + Returns + ------- + wds : list of float + Weight decays for those indices. + """ + wds = [self.wd for _ in indices] + for i, index in enumerate(indices): + if index in self.param_dict: + wds[i] *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wds[i] *= self.wd_mult[index] + elif index in self.idx2name: + wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) + return wds def _get_wd(self, index): """Gets weight decay for index. @@ -421,21 +471,14 @@ def _get_wd(self, index): Parameters ---------- index : int - The index for weight. + The index of weight. Returns ------- wd : float Weight decay for this index. 
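After this change the scalar getters are thin wrappers over the batched ones,
so the two forms agree element-wise; a quick check::

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1, wd=1e-4)
    assert opt._get_lrs([0, 1]) == [opt._get_lr(0), opt._get_lr(1)]
    assert opt._get_wds([0, 1]) == [opt._get_wd(0), opt._get_wd(1)]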
""" - wd = self.wd - if index in self.param_dict: - wd *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wd *= self.wd_mult[index] - elif index in self.idx2name: - wd *= self.wd_mult.get(self.idx2name[index], 1.0) - return wd + return self._get_wds([index])[0] def __getstate__(self): ret = self.__dict__.copy() @@ -471,6 +514,13 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. + In the case when ``update_on_kvstore`` is set to False (either globally via + MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in + :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update + of parameters, which may lead to improved performance. The aggregation size + is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and + defaults to 4. + Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -502,6 +552,7 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update + self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -522,12 +573,22 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, index, weight, grad, state, multi_precision=False): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) + def _update_impl(self, indices, weights, grads, states, multi_precision=False): + aggregate = True + if not isinstance(indices, (tuple, list)): + indices = [indices] + weights = [weights] + grads = [grads] + states = [states] + for weight, grad in zip(weights, grads): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -535,26 +596,49 @@ def _update_impl(self, index, weight, grad, state, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + if aggregate: + if not multi_precision: + if self.momentum > 0: + multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) + if self.momentum > 0: + multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + multi_mp_sgd_update(*_flatten_list(zip(weights, grads, + list(zip(*states))[1])), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - 
mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 + if not isinstance(index, (tuple, list)): + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 + else: + use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -1525,20 +1609,55 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} + self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - # convert ctypes.char_p.value back to python str if needed - if isinstance(index, bytes): - index = py_str(index) - if index not in self.states: - self.states[index] = self.optimizer.create_state_multi_precision(index, weight) - self.states_synced[index] = True - elif not self.states_synced[index]: - self.states[index] = \ - self.sync_state_context(self.states[index], weight.context) - self.states_synced[index] = True - self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) + if not isinstance(index, (list, tuple)): + indices = [index] + grads = [grad] + weights = [weight] + else: + indices = index + grads = grad + weights = weight + for i, idx in enumerate(indices): + # convert ctypes.char_p.value back to python str if needed + if isinstance(idx, bytes): + indices[i] = py_str(idx) + idx = indices[i] + if idx not in self.states: + self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) + self.states_synced[idx] = True + elif not self.states_synced[idx]: + self.states[idx] = \ + self.sync_state_context(self.states[idx], weights[i].context) + self.states_synced[idx] = True + if self.aggregate_updates: + # segregate values based on type + type_map = {} + for i, w, g in zip(indices, weights, grads): + if w.dtype in type_map: + type_map[w.dtype].append((i, w, g)) + else: + type_map[w.dtype] = [(i, w, g)] + for idx in type_map: + current_index = 0 + indices, weights, grads = zip(*type_map[idx]) + while current_index < len(indices): + states = [] + step = min(self.optimizer.aggregate_num, len(indices) - current_index) + for j in range(step): + states.append(self.states[indices[current_index + j]]) + self.optimizer.update_multi_precision( + indices[current_index:current_index + self.optimizer.aggregate_num], + weights[current_index:current_index + self.optimizer.aggregate_num], + grads[current_index:current_index + self.optimizer.aggregate_num], + states) + current_index += self.optimizer.aggregate_num + else: + for i, w, g in zip(indices, weights, grads): + self.optimizer.update_multi_precision(i, w, g, self.states[i]) def 
sync_state_context(self, state, context): """sync state context.""" diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 9251b8614806..223a1aa6c37d 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,6 +82,301 @@ struct SGDParam : public dmlc::Parameter { } }; +struct MultiSGDParam : public dmlc::Parameter { + nnvm::Tuple lrs; + nnvm::Tuple wds; + float rescale_grad; + float clip_gradient; + int num_weights; + DMLC_DECLARE_PARAMETER(MultiSGDParam) { + DMLC_DECLARE_FIELD(lrs) + .describe("Learning rates."); + DMLC_DECLARE_FIELD(wds) + .describe("Weight decay augments the objective function with a " + "regularization term that penalizes large weights. " + "The penalty scales with the square of the magnitude of each weight."); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(num_weights) + .set_default(1) + .describe("Number of updated weights."); + } +}; + +struct MultiSGDMomParam : public dmlc::Parameter { + nnvm::Tuple lrs; + nnvm::Tuple wds; + float momentum; + float rescale_grad; + float clip_gradient; + int num_weights; + DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { + DMLC_DECLARE_FIELD(lrs) + .describe("Learning rates."); + DMLC_DECLARE_FIELD(wds) + .describe("Weight decay augments the objective function with a " + "regularization term that penalizes large weights. " + "The penalty scales with the square of the magnitude of each weight."); + DMLC_DECLARE_FIELD(momentum) + .set_default(0.0f) + .describe("The decay rate of momentum estimates at each epoch."); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(num_weights) + .set_default(1) + .describe("Number of updated weights."); + } +}; + +template +inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = dmlc::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); + CHECK_EQ(out_attrs->size(), param.num_weights); + + bool all_inferred = true; + auto& input_shapes = *in_attrs; + auto& output_shapes = *out_attrs; + // Learning rates + CHECK_EQ(param.lrs.ndim(), param.num_weights) + << "Number of learning rates is inconsistent with num_weights " + << "parameter passed. Expected number of learning rates: " + << param.num_weights << ", and got " << param.lrs.ndim(); + // Weight decays + CHECK_EQ(param.wds.ndim(), param.num_weights) + << "Number of weight decays is inconsistent with num_weights " + << "parameter passed. 
Expected number of weight decays: " + << param.num_weights << ", and got " << param.wds.ndim(); + // Weights and gradients + for (int i = 0; i < param.num_weights; ++i) { + std::vector input_vec; + std::vector output_vec({output_shapes[i]}); + for (int j = 0; j < input_stride; ++j) { + input_vec.push_back(input_shapes[i * input_stride + j]); + } + all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); + } + return all_inferred; +} + +template +inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = dmlc::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); + CHECK_EQ(out_attrs->size(), param.num_weights); + + bool all_inferred = true; + auto& input_types = *in_attrs; + auto& output_types = *out_attrs; + // Weights and gradients + for (int i = 0; i < param.num_weights; ++i) { + std::vector input_vec; + std::vector output_vec({output_types[i]}); + for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { + input_vec.push_back(input_types[i * input_stride + j]); + } + all_inferred = all_inferred && + ElemwiseType(attrs, &input_vec, &output_vec); + } + // master copies of weights + for (int i = 0; i < param.num_weights; ++i) { + for (int j = 0; j < num_fp32_inputs; ++j) { + TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); + } + } + return all_inferred; +} + +template +struct MultiSGDKernelParam { + static const int N = 60; + int count; + size_t max_size; + size_t sizes[N]; + DType * weights[N]; + DType * grads[N]; + MPDType * mom[N]; + MPDType * weights32[N]; + DType * out_data[N]; + MPDType lrs[N]; + MPDType wds[N]; + MPDType clip_gradient; + MPDType rescale_grad; + MPDType momentum; +}; + +template +struct MultiSGDKernel { + template + MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, + const OpReqType req) { + for (int index = 0; index < param.count; ++index) { + if ((size_t)i < param.sizes[index]) { + MPDType w = has_mixed_precision ? param.weights32[index][i] : + MPDType(param.weights[index][i]); + MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); + if (param.clip_gradient >= 0.0f) { + mom = param.momentum*mom + - param.lrs[index]*param.wds[index]*w + - param.lrs[index] + *mshadow_op::clip::Map(param.rescale_grad * + static_cast(param.grads[index][i]), + param.clip_gradient); + } else { + mom = param.momentum*mom + - param.lrs[index]*param.wds[index]*w + - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); + } + if (has_momentum) { + param.mom[index][i] = mom; + } + w = w + mom; + if (has_mixed_precision) { + param.weights32[index][i] = w; + } + KERNEL_ASSIGN(param.out_data[index][i], req, w); + } + } + } +}; + +template +MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { + using namespace mxnet_op; + const ParamType& p = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MultiSGDKernelParam param; + param.clip_gradient = p.clip_gradient; + param.rescale_grad = p.rescale_grad; + param.momentum = 0; + param.count = p.num_weights; + param.max_size = 0; + for (int i = 0; i < param.count; ++i) { + param.sizes[i] = inputs[i * input_stride].shape_.Size(); + if (param.max_size < param.sizes[i]) { + param.max_size = param.sizes[i]; + } + param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; + param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; + // if mixed precision, then the last input in a set + // is 32-bit master copy of the weights + if (!std::is_same::value) { + param.weights32[i] = inputs[i * input_stride + input_stride - 1] + .FlatTo2D(s).dptr_; + } + param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; + param.lrs[i] = p.lrs[i]; + param.wds[i] = p.wds[i]; + } + + return param; +} + + +template +MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { + using namespace mxnet_op; + const MultiSGDMomParam& p = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MultiSGDKernelParam param = + FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); + param.momentum = p.momentum; + for (int i = 0; i < param.count; ++i) { + param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; + } + + return param; +} + +template +class type_identity { + public: + using type = T; +}; + +template +class single_precision { + public: + using type = float; +}; + +template class MPTypeChooser, int input_stride> +inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using MPDType = typename MPTypeChooser::type; + MultiSGDKernelParam param = + FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); + Kernel::value>, + xpu>::Launch(s, param.max_size, param, req[0]); + }); +} + +template class MPTypeChooser, int input_stride> +inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using MPDType = typename MPTypeChooser::type; + MultiSGDKernelParam param = + FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); + Kernel::value>, + xpu>::Launch(s, param.max_size, param, req[0]); + }); +} struct 
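(Before the scalar SGDKernel resumes below, a minimal standalone sketch of the two ideas the multi-SGD code above combines: one index space sized by the largest tensor, applied to every tensor whose length covers the index, and an MPTypeChooser where type_identity<T> keeps the math in T while single_precision<T> promotes it to float. The names and plain-vector interface here are hypothetical, not the MXNet API:)

#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T> struct type_identity { using type = T; };
template <typename T> struct single_precision { using type = float; };

// Illustration of the aggregated update: the outer loop over i plays the
// role of the parallel index that Kernel::Launch spreads across threads.
template <typename DType, template <typename> class MPTypeChooser>
void multi_update(std::vector<std::vector<DType>>* weights,
                  const std::vector<std::vector<DType>>& grads, float lr) {
  using MPDType = typename MPTypeChooser<DType>::type;
  std::size_t max_size = 0;
  for (const auto& w : *weights) max_size = std::max(max_size, w.size());
  for (std::size_t i = 0; i < max_size; ++i) {
    for (std::size_t t = 0; t < weights->size(); ++t) {
      if (i < (*weights)[t].size()) {  // skip tensors shorter than i
        MPDType w = static_cast<MPDType>((*weights)[t][i]);
        w -= static_cast<MPDType>(lr) * static_cast<MPDType>(grads[t][i]);
        (*weights)[t][i] = static_cast<DType>(w);
      }
    }
  }
}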
SGDKernel { template diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index a52a6f32907c..982995ad2f95 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -31,6 +31,8 @@ namespace op { DMLC_REGISTER_PARAMETER(SGDParam); DMLC_REGISTER_PARAMETER(SGDMomParam); +DMLC_REGISTER_PARAMETER(MultiSGDParam); +DMLC_REGISTER_PARAMETER(MultiSGDMomParam); DMLC_REGISTER_PARAMETER(FTMLParam); DMLC_REGISTER_PARAMETER(AdamParam); DMLC_REGISTER_PARAMETER(RMSPropParam); @@ -52,7 +54,7 @@ It updates the weights using:: weight = weight - learning_rate * sign(gradient) -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(2) @@ -81,7 +83,7 @@ It updates the weights using:: Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. -.. note:: +.. note:: - sparse ndarray not supported for this optimizer yet. )code" ADD_FILELINE) .set_num_inputs(3) @@ -313,6 +315,193 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } +NNVM_REGISTER_OP(multi_sgd_update) +.describe(R"code(Update function for Stochastic Gradient Descent (SGD) optimizer. + +It updates the weights using:: + + weight = weight - learning_rate * (gradient + wd * weight) + +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 2); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", ElemwiseType<-1, -1>) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FCompute", MultiSGDUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights") +.add_arguments(MultiSGDParam::__FIELDS__()); + +NNVM_REGISTER_OP(multi_sgd_mom_update) +.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer. + +Momentum update has better convergence rates on neural networks. Mathematically it looks +like below: + +.. math:: + + v_1 = \alpha * \nabla J(W_0)\\ + v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ + W_t = W_{t-1} + v_t + +It updates the weights using:: + + v = momentum * v - learning_rate * gradient + weight += v + +Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
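(The four registrations in this file differ mainly in the per-weight input stride: 2 for multi_sgd_update, 3 for multi_sgd_mom_update and multi_mp_sgd_update, and 4 for multi_mp_sgd_mom_update. An illustrative sketch, not part of the patch, of how the flattened input list is laid out and which slots the FMutateInputs lambdas mark for in-place update in the stride-4 case:)

#include <cstdio>

int main() {
  const int num_weights = 3;
  const int stride = 4;  // multi_mp_sgd_mom_update: weight, grad, mom, weight32
  for (int i = 0; i < num_weights; ++i) {
    std::printf("weight_%d -> %d, grad_%d -> %d, mom_%d -> %d (mutated), "
                "weight32_%d -> %d (mutated)\n",
                i, i * stride, i, i * stride + 1,
                i, i * stride + 2, i, i * stride + 3);
  }
  return 0;
}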
+ +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 3); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", ElemwiseType<-1, -1>) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + ret.push_back(std::string("mom_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + std::vector ret; + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + for (int i = 0; i < param.num_weights; ++i) { + ret.push_back(i * 3 + 2); + } + return ret; + }) +.set_attr("FCompute", MultiSGDMomUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum") +.add_arguments(MultiSGDMomParam::__FIELDS__()); + +NNVM_REGISTER_OP(multi_mp_sgd_update) +.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. + +It updates the weights using:: + + weight = weight - learning_rate * (gradient + wd * weight) + +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 3); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", MP_MultiSGD_InferType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + ret.push_back(std::string("weight32_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + std::vector ret; + const MultiSGDParam& param = dmlc::get(attrs.parsed); + for (int i = 0; i < param.num_weights; ++i) { + ret.push_back(i * 3 + 2); + } + return ret; + }) +.set_attr("FCompute", MultiSGDUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights") +.add_arguments(MultiSGDParam::__FIELDS__()); + +NNVM_REGISTER_OP(multi_mp_sgd_mom_update) +.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer. + +Momentum update has better convergence rates on neural networks. Mathematically it looks +like below: + +.. math:: + + v_1 = \alpha * \nabla J(W_0)\\ + v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\ + W_t = W_{t-1} + v_t + +It updates the weights using:: + + v = momentum * v - learning_rate * gradient + weight += v + +Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch. 
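(For concreteness, a tiny numeric trace of the update rule quoted above, in plain C++ with made-up values and no MXNet dependency:)

#include <cstdio>

int main() {
  float w = 1.0f, v = 0.0f;
  const float lr = 0.1f, momentum = 0.9f;
  const float grads[] = {0.5f, 0.5f};
  for (float g : grads) {
    v = momentum * v - lr * g;  // v: -0.05, then -0.095
    w += v;                     // w: 0.95, then 0.855
  }
  std::printf("w = %f, v = %f\n", w, v);  // w = 0.855000, v = -0.095000
  return 0;
}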
+ +)code" ADD_FILELINE) +.set_num_inputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights * 4); + }) +.set_num_outputs([](const nnvm::NodeAttrs& attrs) { + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + return static_cast(param.num_weights); + }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", MultiSGDShape) +.set_attr("FInferType", MP_MultiSGD_InferType) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + uint32_t num_args = dmlc::get(attrs.parsed).num_weights; + std::vector ret; + for (uint32_t i = 0; i < num_args; ++i) { + ret.push_back(std::string("weight_") + std::to_string(i)); + ret.push_back(std::string("grad_") + std::to_string(i)); + ret.push_back(std::string("mom_") + std::to_string(i)); + ret.push_back(std::string("weight32_") + std::to_string(i)); + } + return ret; + }) +.set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + std::vector ret; + const MultiSGDMomParam& param = dmlc::get(attrs.parsed); + for (int i = 0; i < param.num_weights; ++i) { + ret.push_back(i * 4 + 2); + ret.push_back(i * 4 + 3); + } + return ret; + }) +.set_attr("FCompute", MultiSGDMomUpdate) +.add_argument("data", "NDArray-or-Symbol[]", "Weights") +.add_arguments(MultiSGDMomParam::__FIELDS__()); NNVM_REGISTER_OP(sgd_update) MXNET_ADD_SPARSE_OP_ALIAS(sgd_update) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 0fd2ca83fda4..c42cf1831c43 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -242,6 +242,15 @@ NNVM_REGISTER_OP(mp_sgd_update) NNVM_REGISTER_OP(mp_sgd_mom_update) .set_attr("FCompute", MP_SGDMomUpdate); +NNVM_REGISTER_OP(multi_sgd_update) +.set_attr("FCompute", MultiSGDUpdate); +NNVM_REGISTER_OP(multi_sgd_mom_update) +.set_attr("FCompute", MultiSGDMomUpdate); +NNVM_REGISTER_OP(multi_mp_sgd_update) +.set_attr("FCompute", MultiSGDUpdate); +NNVM_REGISTER_OP(multi_mp_sgd_mom_update) +.set_attr("FCompute", MultiSGDMomUpdate); + NNVM_REGISTER_OP(ftml_update) .set_attr("FCompute", FTMLUpdate); diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 985c38c31356..9f190a0a88c2 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,6 +17,7 @@ import mxnet as mx import unittest +import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -98,6 +99,9 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -112,6 +116,7 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -236,10 +241,11 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] + global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - 
check_trainer_sparse_kv(kv, 'default', 'default', None, True) + check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 144fbeef213f..ae38a2297ded 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -174,6 +174,8 @@ def test_module_layout(): @with_seed() def test_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -211,6 +213,7 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) dict_equ(mod._kvstore._updater.states, mod2._updater.states) + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() From a6b0b9ef7d8dd287dc469e5c622031f92311186c Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 15 Feb 2019 16:55:34 -0800 Subject: [PATCH 13/26] add comments --- src/operator/nn/deconvolution-inl.h | 42 ++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 0947b63d5daa..0e436d4d8aa6 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -232,6 +232,19 @@ class DeconvolutionOp { << "Must init CuBLAS handle in stream"; #endif auto in_data_shape = in_data[deconv::kData].shape_; + // G: num of groups + // N: num of batches + // C: num of channels + // IH: input height + // IW: input width + // KH: kernel height + // KW: kernel width + // OH: output height + // OW: output width + // OC: num of output channels + + // 2D case: data (N, C, IH, IW) + // 2D case: out (N, OC, OH, OW) Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor out = TBlobTo4DTensor(out_data[deconv::kOut], s); index_t o_pad[2], o_adj[2]; @@ -252,25 +265,40 @@ class DeconvolutionOp { Shape3(param_.num_group, data.shape_[1] / param_.num_group, param_.num_filter / param_.num_group * kernel_size); + // 2D: wmat (G, C/G, OC/G * KH * KW) Tensor wmat = in_data[deconv::kWeight].get_with_shape(wmat_shape, s); const index_t nbatch = data.size(0); + + // shape_colunit_ : (OC * KH * KW, IH * IW) + shape_colunit_ = mshadow::Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); + // shape_dstunit_ : (G, C/G, IH * IW) + shape_dstunit_ = mshadow::Shape3( + param_.num_group, + data.shape_[1] / param_.num_group, + data.shape_[2] * data.shape_[3] + ); + Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(this->InitTemp(out.shape_, data.shape_)), s); + ctx.requested[deconv::kTempSpace].get_space_typed( + Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); +// Tensor workspace = +// ctx.requested[deconv::kTempSpace].get_space_typed( +// Shape1(this->InitTemp(out.shape_, data.shape_)), s); for (index_t i = 0; i < nbatch; ++i) { - // temp_col: (N * kernel_size, OW * OH) + // temp_col: (OC * KH * KW, IH * IW) Tensor temp_col = Tensor( workspace.dptr_, - Shape2(shape_colunit_[0], shape_colunit_[1]), - s); - // temp_dst: (N, N/n_grup, OW * OH) + // temp_dst : (G, C/G, IH * IW) Shape2(shape_colunit_[0], + shape_colunit_[1]), + s); + // temp_dst : (G, C/G, IH * IW) Tensor temp_dst = 
Tensor( workspace.dptr_ + temp_col.shape_.Size(), Shape3(shape_dstunit_[0], shape_dstunit_[1], shape_dstunit_[2]), - s); + s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( From 3240833727ed96c46999a9c67d037d90839f75b3 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 15 Feb 2019 17:04:52 -0800 Subject: [PATCH 14/26] fix lint --- src/operator/nn/deconvolution-inl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 0e436d4d8aa6..c87345a73601 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -276,8 +276,7 @@ class DeconvolutionOp { shape_dstunit_ = mshadow::Shape3( param_.num_group, data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3] - ); + data.shape_[2] * data.shape_[3]); Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( From 88892d2debff9ca0ef957b0051a249fe8862106d Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 15 Mar 2019 14:52:33 -0700 Subject: [PATCH 15/26] fix lint error --- src/operator/nn/deconvolution-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index e18fb0afe57c..b791793e9f40 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -375,7 +375,7 @@ class DeconvolutionOp { TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - + Shape<3> wmat_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, From 0675b3b36cc9e0a4ab7221863a736c48fbd160e8 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 29 Apr 2019 13:51:30 -0700 Subject: [PATCH 16/26] fix a bug in calling im2col (col_shape should be 3) --- src/operator/nn/deconvolution-inl.h | 75 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5d860dd60e3d..198989a30495 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -34,6 +34,7 @@ #include #include #include +#include #include "../operator_common.h" #include "../linalg.h" #include "./im2col.h" @@ -263,69 +264,71 @@ class DeconvolutionOp { auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - Shape<3> wmat_shape = + Shape<3> weight_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, param_.num_filter / param_.num_group * kernel_size); - // 2D: wmat (G, C/G, OC/G * KH * KW) - Tensor wmat = - in_data[deconv::kWeight].get_with_shape(wmat_shape, s); + // 2D case: weight_3d (G, C/G, OC/G * KH * KW) + Tensor weight_3d = + in_data[deconv::kWeight].get_with_shape(weight_shape, s); const index_t nbatch = data.size(0); // shape_colunit_ : (OC * KH * KW, IH * IW) - shape_colunit_ = mshadow::Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); + shape_colunit_ = Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); // shape_dstunit_ : (G, C/G, IH * IW) - shape_dstunit_ = mshadow::Shape3( - param_.num_group, - data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3]); + shape_dstunit_ = Shape3( + param_.num_group, + data.shape_[1] / param_.num_group, + data.shape_[2] * data.shape_[3]); + Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); -// Tensor workspace = -// ctx.requested[deconv::kTempSpace].get_space_typed( -// Shape1(this->InitTemp(out.shape_, data.shape_)), s); + ctx.requested[deconv::kTempSpace].get_space_typed( + Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); + + Tensor col_buffer_3d = Tensor( + workspace.dptr_, + Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), + s); + // temp_col: (N, OC * KH * KW, IH * IW) + // Tensor temp_col = Tensor( + // workspace.dptr_, + // Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), + // s); + for (index_t i = 0; i < nbatch; ++i) { - // temp_col: (OC * KH * KW, IH * IW) - Tensor temp_col = Tensor( - workspace.dptr_, - Shape2(shape_colunit_[0], - shape_colunit_[1]), - s); // temp_dst : (G, C/G, IH * IW) Tensor temp_dst = Tensor( - workspace.dptr_ + temp_col.shape_.Size(), - Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2]), - s); + workspace.dptr_ + shape_colunit_.Size(), + shape_dstunit_, + s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); im2col( s, (out.Slice(i, i + 1)).dptr_, out.shape_, - temp_col.shape_, + col_buffer_3d.shape_, kernel, padding, stride, dilate, - temp_col.dptr_); + col_buffer_3d.dptr_); - const index_t gstride = temp_col.size(0) / param_.num_group; + + const index_t gstride = col_buffer_3d.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); + //Tensor tmpc = col_buffer_3d.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // tmpc = dot(wmat[gid].T(), temp_dst[gid]); - linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); + // tmpc = dot(weight_3d[gid].T(), temp_dst[gid]); + linalg_gemm(weight_3d[gid], temp_dst[gid], col_buffer_3d[gid], true, false, s); } col2im( s, - temp_col.dptr_, + col_buffer_3d.dptr_, out.Slice(i, i + 1).shape_, - temp_col.shape_, + col_buffer_3d.shape_, kernel, padding, stride, @@ -378,14 +381,14 @@ class DeconvolutionOp { auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - Shape<3> wmat_shape = + Shape<3> weight_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, param_.num_filter / param_.num_group * kernel_size); Tensor wmat = - in_data[deconv::kWeight].get_with_shape(wmat_shape, s); + in_data[deconv::kWeight].get_with_shape(weight_shape, s); Tensor gwmat = - in_grad[deconv::kWeight].get_with_shape(wmat_shape, s); + in_grad[deconv::kWeight].get_with_shape(weight_shape, s); const index_t nbatch = data.size(0); Tensor workspace = From f403b9c66a8765fa38f8c39cc1cd449d63797ed7 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 3 May 2019 13:58:53 -0700 Subject: [PATCH 17/26] fix im2col parameter mismatch --- src/operator/nn/deconvolution-inl.h | 78 +++++++++++++---------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 198989a30495..7f5083e8c427 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -246,8 +246,8 @@ class DeconvolutionOp { // OW: output height // OC: num of output channels - // 2D case: data (N, C, IH, IW) - // 2D case: out (N, OC, OH, OW) + // data: (N, C, IH, IW) + // out: (N, OC, OH, OW) Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor out = TBlobTo4DTensor(out_data[deconv::kOut], s); index_t o_pad[2], o_adj[2]; @@ -259,71 +259,61 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? - TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); + auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); - auto kernel_size = kernel.Size(); - Shape<3> weight_shape = - Shape3(param_.num_group, - data.shape_[1] / param_.num_group, - param_.num_filter / param_.num_group * kernel_size); - // 2D case: weight_3d (G, C/G, OC/G * KH * KW) - Tensor weight_3d = - in_data[deconv::kWeight].get_with_shape(weight_shape, s); + // C/G * KW * KH + auto kernel_size = data.shape_[1] / param_.num_group * kernel.Size(); + + // OC/G + auto channel_group = out.shape_[1] / param_.num_group; + + // IH*IW + auto data_spatial_size = data.shape_.ProdShape(2, in_data[deconv::kData].ndim()); + + // OH*OW + auto out_spatial_size = out.shape_.ProdShape(2, out_data[deconv::kOut].ndim()); + + // weight_3d: (G, OC/G, KH * KW) + Shape<3> weight_shape = Shape3(param_.num_group, channel_group, kernel_size); + Tensor weight_3d = in_data[deconv::kWeight].get_with_shape(weight_shape, s); + const index_t nbatch = data.size(0); - // shape_colunit_ : (OC * KH * KW, IH * IW) - shape_colunit_ = Shape2(out.shape_[1] * kernel_size, data.shape_[2] * data.shape_[3]); + auto col_buffer_size = param_.num_group * kernel_size * data_spatial_size; + // shape_dstunit_ : (G, C/G, IH * IW) shape_dstunit_ = Shape3( param_.num_group, data.shape_[1] / param_.num_group, data.shape_[2] * data.shape_[3]); - Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(shape_colunit_.Size() + shape_dstunit_.Size()), s); + Shape1(col_buffer_size + data.shape_.Size()), s); + // col_buffer_3d : (G, KH * KW, IH * IW) Tensor col_buffer_3d = Tensor( - workspace.dptr_, - Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), - s); - // temp_col: (N, OC * KH * KW, IH * IW) - // Tensor temp_col = Tensor( - // workspace.dptr_, - // Shape3(nbatch, shape_colunit_[0], shape_colunit_[1]), - // s); + workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); for (index_t i = 0; i < nbatch; ++i) { - // temp_dst : (G, C/G, IH * IW) - Tensor temp_dst = Tensor( - workspace.dptr_ + shape_colunit_.Size(), - shape_dstunit_, - s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); + Tensor data_3d = Tensor( + workspace.dptr_ + col_buffer_size, + Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); - im2col( - s, - (out.Slice(i, i + 1)).dptr_, - out.shape_, - col_buffer_3d.shape_, - kernel, - padding, - stride, - dilate, - col_buffer_3d.dptr_); + // data_3d : (G, C/G, IH * IW) + data_3d = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), data_3d.shape_); + // im2col(s, (out.Slice(i, i + 1)).dptr_, out.shape_, col_buffer_3d.shape_, + // kernel, padding, stride, dilate, col_buffer_3d.dptr_); - const index_t gstride = col_buffer_3d.size(0) / param_.num_group; for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - //Tensor tmpc = col_buffer_3d.Slice(gstride * gid, gstride * (gid + 1)); // Legacy approach shown here for comparison: - // tmpc = dot(weight_3d[gid].T(), temp_dst[gid]); - linalg_gemm(weight_3d[gid], temp_dst[gid], col_buffer_3d[gid], true, false, s); + // tmpc = dot(weight_3d[gid].T(), data_3d[gid]); + linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); } + col2im( s, col_buffer_3d.dptr_, From bdbf81d659469f951e6faea03758f5ea6e735136 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 6 May 2019 15:30:19 -0700 Subject: [PATCH 18/26] add debug --- src/operator/nn/deconvolution-inl.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git 
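(To summarize the forward path that patches 16 and 17 converge on, here is a minimal 1-D sketch with hypothetical shapes, in plain C++ rather than the MXNet API: the gemm(weight^T, data) step fills a column buffer, and col2im scatter-adds the overlapping columns into the output. MXNet's im2col.h additionally handles padding and dilation:)

#include <cstdio>
#include <vector>

int main() {
  const int IW = 3, K = 2, stride = 1;
  const int OW = (IW - 1) * stride + K;    // deconv output width, no padding
  std::vector<double> data = {1, 2, 3};    // one input channel
  std::vector<double> W = {0.5, 0.25};     // one 1x2 kernel
  std::vector<std::vector<double>> col(K, std::vector<double>(IW, 0.0));
  for (int k = 0; k < K; ++k)              // col = gemm(W^T, data)
    for (int x = 0; x < IW; ++x)
      col[k][x] = W[k] * data[x];
  std::vector<double> out(OW, 0.0);
  for (int k = 0; k < K; ++k)              // col2im: overlap-add the columns
    for (int x = 0; x < IW; ++x)
      out[x * stride + k] += col[k][x];
  for (double v : out) std::printf("%g ", v);  // 0.5 1.25 2 0.75
  std::printf("\n");
  return 0;
}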
a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 7f5083e8c427..5204b6e10a60 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -284,22 +284,22 @@ class DeconvolutionOp { // shape_dstunit_ : (G, C/G, IH * IW) shape_dstunit_ = Shape3( - param_.num_group, - data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3]); + param_.num_group, + data.shape_[1] / param_.num_group, + data.shape_[2] * data.shape_[3]); - Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(col_buffer_size + data.shape_.Size()), s); + Tensor workspace = ctx.requested[deconv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size + data.shape_.Size()), s); // col_buffer_3d : (G, KH * KW, IH * IW) Tensor col_buffer_3d = Tensor( - workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); + workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); for (index_t i = 0; i < nbatch; ++i) { + // Tensor data_3d = data[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size, - Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); + workspace.dptr_ + col_buffer_size, + Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); // data_3d : (G, C/G, IH * IW) data_3d = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), data_3d.shape_); @@ -309,10 +309,17 @@ class DeconvolutionOp { for (uint32_t gid = 0; gid < param_.num_group; ++gid) { // Legacy approach shown here for comparison: - // tmpc = dot(weight_3d[gid].T(), data_3d[gid]); + // col_buffer_3d[gid] = dot(weight_3d[gid].T(), data_3d[gid]); linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); } + std::cout << "col buffer: " << std::endl; + for (auto j = 0; j < kernel_size; ++j) { + for (auto k = 0; k < data_spatial_size; ++k) { + std::cout << *(static_cast(col_buffer_3d[0].dptr_ + j * kernel_size + k)) << " "; + } + std::cout << std::endl; + } col2im( s, From 2c929805c8fe84b45f0ed32c28c53f836575ea08 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 10:19:28 -0700 Subject: [PATCH 19/26] set col_buffer_shape --- src/operator/nn/deconvolution-inl.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5204b6e10a60..4a5e8ff37382 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -262,11 +262,14 @@ class DeconvolutionOp { auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
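(A note on the debug loop added in patch 18 above: col_buffer_3d[0] is a (kernel_size x data_spatial_size) slab, so its row pitch is data_spatial_size; indexing with j * kernel_size + k walks the wrong pitch, and on GPU the host-side dereference is invalid until patch 20 copies the buffer out. A corrected form of the dump, illustrative only since the loop is deleted again in patch 25:)

for (index_t j = 0; j < kernel_size; ++j) {
  for (index_t k = 0; k < data_spatial_size; ++k) {
    // advance by the row pitch data_spatial_size, not kernel_size
    std::cout << *(col_buffer_3d[0].dptr_ + j * data_spatial_size + k) << " ";
  }
  std::cout << std::endl;
}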
param_.kernel : TShape({1, param_.kernel[0]}); + auto conv_in_channels = data.shape_[1]; + auto conv_out_channels = out.shape_[1]; + // C/G * KW * KH auto kernel_size = data.shape_[1] / param_.num_group * kernel.Size(); // OC/G - auto channel_group = out.shape_[1] / param_.num_group; + auto channel_group = conv_out_channels / param_.num_group; // IH*IW auto data_spatial_size = data.shape_.ProdShape(2, in_data[deconv::kData].ndim()); @@ -281,6 +284,11 @@ class DeconvolutionOp { const index_t nbatch = data.size(0); auto col_buffer_size = param_.num_group * kernel_size * data_spatial_size; + mxnet::TShape col_buffer_shape(3, 1); + col_buffer_shape[0] = conv_in_channels * kernel.Size(); + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = data.shape_[i+1]; + } // shape_dstunit_ : (G, C/G, IH * IW) shape_dstunit_ = Shape3( @@ -291,9 +299,11 @@ class DeconvolutionOp { Tensor workspace = ctx.requested[deconv::kTempSpace] .get_space_typed(Shape1(col_buffer_size + data.shape_.Size()), s); + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + // col_buffer_3d : (G, KH * KW, IH * IW) - Tensor col_buffer_3d = Tensor( - workspace.dptr_, Shape3(param_.num_group, kernel_size, data_spatial_size), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(param_.num_group, kernel_size, data_spatial_size), s); for (index_t i = 0; i < nbatch; ++i) { // Tensor data_3d = data[i]; @@ -313,6 +323,7 @@ class DeconvolutionOp { linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); } + std::cout << "col buffer: " << std::endl; for (auto j = 0; j < kernel_size; ++j) { for (auto k = 0; k < data_spatial_size; ++k) { From 0c44ec86dd56b1dd0fadf7e62fbfe30e11238146 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 11:52:00 -0700 Subject: [PATCH 20/26] dump data from gpu to cpu to debug --- src/operator/nn/deconvolution-inl.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 4a5e8ff37382..9d6e69489c10 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -325,6 +325,15 @@ class DeconvolutionOp { std::cout << "col buffer: " << std::endl; + DType *tmp_data = new DType[col_buffer_size]; + if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { + std::cout << "running on GPU " << std::endl; + NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); + col_data.SyncCopyToCPU(tmp_data, col_buffer_size); + } else { + tmp_data = static_cast(col_buffer_3d[0].dptr_); + } + for (auto j = 0; j < kernel_size; ++j) { for (auto k = 0; k < data_spatial_size; ++k) { std::cout << *(static_cast(col_buffer_3d[0].dptr_ + j * kernel_size + k)) << " "; From 2c868fd93b6facf270b373c95c6a0825ad43599a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 12:00:13 -0700 Subject: [PATCH 21/26] debug --- src/operator/nn/deconvolution-inl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 9d6e69489c10..e545e1012206 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -330,6 +330,7 @@ class DeconvolutionOp { std::cout << "running on GPU " << std::endl; NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); col_data.SyncCopyToCPU(tmp_data, col_buffer_size); + std::cout << "complete " << std::endl; } else { tmp_data = static_cast(col_buffer_3d[0].dptr_); } From 
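(A worked example of the buffer arithmetic patch 19 sets up, with assumed dimensions; this is illustrative arithmetic only, not MXNet code:)

#include <cstdio>

int main() {
  const int G = 2, C = 8, KH = 3, KW = 3, IH = 16, IW = 16;
  const int kernel_size = C / G * KH * KW;      // C/G * KH * KW = 36
  const int data_spatial_size = IH * IW;        // 256
  const int col_buffer_size = G * kernel_size * data_spatial_size;  // 18432
  // col_buffer_shape[0] = conv_in_channels * kernel.Size()
  //                     = C * KH * KW = G * kernel_size
  std::printf("col_buffer_3d: (%d, %d, %d), %d elements\n",
              G, kernel_size, data_spatial_size, col_buffer_size);
  return 0;
}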
5dacddc88425476318e438c7f7d1f1fc5f825b91 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Fri, 10 May 2019 12:06:00 -0700 Subject: [PATCH 22/26] debug --- src/operator/nn/deconvolution-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index e545e1012206..101da3aeddfd 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -337,7 +337,7 @@ class DeconvolutionOp { for (auto j = 0; j < kernel_size; ++j) { for (auto k = 0; k < data_spatial_size; ++k) { - std::cout << *(static_cast(col_buffer_3d[0].dptr_ + j * kernel_size + k)) << " "; + std::cout << *(tmp_data + j * kernel_size + k) << " "; } std::cout << std::endl; } From 29c4488aebb1005ae7603a2de966869751e2c2a7 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 13 May 2019 10:24:58 -0700 Subject: [PATCH 23/26] update function call to col2im --- src/operator/nn/deconvolution-inl.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 101da3aeddfd..fd91743d9a17 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -342,17 +342,11 @@ class DeconvolutionOp { std::cout << std::endl; } - col2im( - s, - col_buffer_3d.dptr_, - out.Slice(i, i + 1).shape_, - col_buffer_3d.shape_, - kernel, - padding, - stride, - dilate, - out.Slice(i, i + 1).dptr_, - req[deconv::kOut]); + auto input_dim_ = in_data_shape.ProdShape(1, in_data_shape.ndim()); + + col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, + kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); + } if (!param_.no_bias) { From 5f3c8813a53abab9df6db5477a3da18b6fff7292 Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 13 May 2019 15:50:13 -0700 Subject: [PATCH 24/26] fix backward pass --- src/operator/nn/deconvolution-inl.h | 263 ++++++++++++++-------------- 1 file changed, 127 insertions(+), 136 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index fd91743d9a17..df797db6cef1 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -229,10 +229,11 @@ class DeconvolutionOp { size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[deconv::kData].shape_, out_data[deconv::kOut].shape_); Stream *s = ctx.get_stream(); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + << "Must init cuBLAS handle in stream"; #endif auto in_data_shape = in_data[deconv::kData].shape_; // G: num of groups // N: num of batches // C: num of channels // IH: input height // IW: input width // KH: kernel height // KW: kernel width // OH: output height // OW: output width // OC: num of output channels - // data: (N, C, IH, IW) - // out: (N, OC, OH, OW) - Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); - Tensor out = TBlobTo4DTensor(out_data[deconv::kOut], s); + // input_4d: (N, C, IH, IW) + // output_4d: (N, OC, OH, OW) + Tensor input_4d = TBlobTo4DTensor(in_data[deconv::kData], s); + Tensor output_4d = TBlobTo4DTensor(out_data[deconv::kOut], s); index_t o_pad[2], o_adj[2]; if (param_.kernel.ndim() == 2) { param_.InferPad(mxnet::TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); @@ -262,97 +263,71 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); auto padding = param_.kernel.ndim() == 2 ? 
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); - auto conv_in_channels = data.shape_[1]; - auto conv_out_channels = out.shape_[1]; - - // C/G * KW * KH - auto kernel_size = data.shape_[1] / param_.num_group * kernel.Size(); - - // OC/G - auto channel_group = conv_out_channels / param_.num_group; - - // IH*IW - auto data_spatial_size = data.shape_.ProdShape(2, in_data[deconv::kData].ndim()); - - // OH*OW - auto out_spatial_size = out.shape_.ProdShape(2, out_data[deconv::kOut].ndim()); - // weight_3d: (G, OC/G, KH * KW) - Shape<3> weight_shape = Shape3(param_.num_group, channel_group, kernel_size); - Tensor weight_3d = in_data[deconv::kWeight].get_with_shape(weight_shape, s); + Tensor weight_3d = in_data[deconv::kWeight].get_with_shape( + Shape3(param_.num_group, conv_out_channels_ / group_, kernel_dim_), s); - const index_t nbatch = data.size(0); - auto col_buffer_size = param_.num_group * kernel_size * data_spatial_size; - mxnet::TShape col_buffer_shape(3, 1); - col_buffer_shape[0] = conv_in_channels * kernel.Size(); + Tensor workspace = ctx.requested[deconv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_ + in_data[deconv::kData].shape_.Size()), s); + + mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1); + col_buffer_shape[0] = conv_in_channels_ * kernel.Size(); for (int i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = data.shape_[i+1]; + col_buffer_shape[i] = in_data[deconv::kData].shape_[i + 1]; } - // shape_dstunit_ : (G, C/G, IH * IW) - shape_dstunit_ = Shape3( - param_.num_group, - data.shape_[1] / param_.num_group, - data.shape_[2] * data.shape_[3]); - - Tensor workspace = ctx.requested[deconv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size + data.shape_.Size()), s); - + // create a colum buffer to hold the matrix product between weight_3d(T) and input_data TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); // col_buffer_3d : (G, KH * KW, IH * IW) Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(param_.num_group, kernel_size, data_spatial_size), s); + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); - for (index_t i = 0; i < nbatch; ++i) { - // Tensor data_3d = data[i]; + for (index_t i = 0; i < num_; ++i) { + // Tensor data_3d = input_4d[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size, - Shape3(param_.num_group, data.shape_[1] / param_.num_group, data_spatial_size), s); + workspace.dptr_ + col_buffer_size_, + Shape3(param_.num_group, input_4d.shape_[1] / param_.num_group, conv_in_spatial_dim_), s); // data_3d : (G, C/G, IH * IW) - data_3d = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), data_3d.shape_); - - // im2col(s, (out.Slice(i, i + 1)).dptr_, out.shape_, col_buffer_3d.shape_, - // kernel, padding, stride, dilate, col_buffer_3d.dptr_); + data_3d = reshape(swapaxis<1, 0>(input_4d.Slice(i, i + 1)), data_3d.shape_); - for (uint32_t gid = 0; gid < param_.num_group; ++gid) { + for (int g = 0; g < group_; ++g) { // Legacy approach shown here for comparison: - // col_buffer_3d[gid] = dot(weight_3d[gid].T(), data_3d[gid]); - linalg_gemm(weight_3d[gid], data_3d[gid], col_buffer_3d[gid], true, false, s); + // col_buffer_3d[g] = dot(weight_3d[g].T(), data_3d[g]); + linalg_gemm(weight_3d[g], data_3d[g], col_buffer_3d[g], true, false, s); } + // TODO: (lnyuan) remove debugging code std::cout << "col buffer: " << std::endl; - DType *tmp_data = new DType[col_buffer_size]; + DType *tmp_data = new 
DType[col_buffer_size_]; if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { std::cout << "running on GPU " << std::endl; NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); - col_data.SyncCopyToCPU(tmp_data, col_buffer_size); + col_data.SyncCopyToCPU(tmp_data, col_buffer_size_); std::cout << "complete " << std::endl; } else { tmp_data = static_cast(col_buffer_3d[0].dptr_); } - for (auto j = 0; j < kernel_size; ++j) { - for (auto k = 0; k < data_spatial_size; ++k) { - std::cout << *(tmp_data + j * kernel_size + k) << " "; + for (auto j = 0; j < kernel_dim_; ++j) { + for (auto k = 0; k < conv_in_spatial_dim_; ++k) { + std::cout << *(tmp_data + j * kernel_dim_ + k) << " "; } std::cout << std::endl; } - auto input_dim_ = in_data_shape.ProdShape(1, in_data_shape.ndim()); - col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); } - if (!param_.no_bias) { + if (bias_term_) { // add bias, broadcast bias to dim 1: channel Tensor bias = in_data[deconv::kBias].get(s); - out += mshadow::expr::broadcast<1>(bias, out.shape_); + output_4d += mshadow::expr::broadcast<1>(bias, output_4d.shape_); } } @@ -363,21 +338,22 @@ class DeconvolutionOp { const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK_EQ(in_data.size(), expected); CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); - // get data + + LayerSetUp(out_grad[deconv::kOut].shape_, in_grad[deconv::kData].shape_); Stream *s = ctx.get_stream(); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + << "Must init cuBLAS handle in stream"; #endif + auto in_data_shape = in_data[deconv::kData].shape_; - Tensor data = TBlobTo4DTensor(in_data[deconv::kData], s); + Tensor data_4d = TBlobTo4DTensor(in_data[deconv::kData], s); Tensor grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); Tensor gdata = TBlobTo4DTensor(in_grad[deconv::kData], s); index_t o_pad[2], o_adj[2]; @@ -388,74 +364,58 @@ class DeconvolutionOp { } auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); - auto padding = param_.kernel.ndim() == 2 ? - TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); + auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); auto kernel_size = kernel.Size(); - Shape<3> weight_shape = - Shape3(param_.num_group, - data.shape_[1] / param_.num_group, - param_.num_filter / param_.num_group * kernel_size); - Tensor wmat = - in_data[deconv::kWeight].get_with_shape(weight_shape, s); - Tensor gwmat = - in_grad[deconv::kWeight].get_with_shape(weight_shape, s); - - const index_t nbatch = data.size(0); - Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - Shape1(this->InitTemp(grad.shape_, data.shape_)), s); - for (index_t i = 0; i < nbatch; ++i) { - Tensor temp_col = Tensor( - workspace.dptr_, - Shape2(shape_colunit_[0], shape_colunit_[1]), - s); - Tensor temp_dst = Tensor( - workspace.dptr_ + temp_col.shape_.Size(), - Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2]), - s); - temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); - - im2col( - s, - (grad.Slice(i, i + 1)).dptr_, - grad.shape_, - temp_col.shape_, - kernel, - padding, - stride, - dilate, - temp_col.dptr_); - - const index_t gstride = temp_col.size(0) / param_.num_group; - for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); - if (i == 0) { - Tensor tmp_gwmat = gwmat[gid]; - // Legacy approach shown here for comparison: - // Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); - linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); - } else { - // Legacy approach shown here for comparison: - // gwmat[gid] += dot(temp_dst[gid], tmpc.T()); - linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); - } + Tensor weight_3d = in_data[deconv::kWeight] + .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + Tensor dweight_3d = in_grad[deconv::kWeight] + .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + + Tensor workspace = ctx.requested[deconv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_ + data_4d.shape_.Size()), s); + // calculate shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1, 1); + col_buffer_shape[0] = conv_out_channels_ * kernel_size; + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[deconv::kOut].shape_[i+1]; + } + // create a column buffer to store ograd + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); + + for (index_t i = 0; i < num_; ++i) { + // Tensor data_3d = input_4d[i]; + Tensor data_3d = Tensor( + workspace.dptr_ + col_buffer_size_, + Shape3(group_, data_4d.shape_[1] / group_, conv_in_spatial_dim_), s); + + // data_3d : (G, C/G, IH * IW) + data_3d = reshape(swapaxis<1, 0>(data_4d.Slice(i, i + 1)), data_3d.shape_); + + // convert output gradient array to column buffer + im2col(s, out_grad[deconv::kOut].dptr() + i * output_dim_, out_grad[deconv::kOut].shape_, + col_buffer.shape_, kernel, padding, stride, dilate, col_buffer.dptr()); + + for (int g = 0; g < group_; ++g) { + auto request = (i == 0) ? 
req[deconv::kWeight] : kAddTo; + // Legacy approach shown here for comparison: + // dweight_3d[gid] += dot(temp_dst[gid], tmpc.T()); + linalg_gemm(data_3d[g], col_buffer_3d[g], dweight_3d[g], false, true, s, request); } if (req[deconv::kData] == kWriteTo || req[deconv::kData] == kWriteInplace || req[deconv::kData] == kAddTo) { - for (uint32_t gid = 0; gid < param_.num_group; ++gid) { - Tensor tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); + for (int g = 0; g < group_; ++g) { // Legacy approach shown here for comparison: - // temp_dst[gid] = dot(wmat[gid], tmpc); - linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); + // temp_dst[gid] = dot(weight_3d[gid], tmpc); + linalg_gemm(weight_3d[g], col_buffer_3d[g], data_3d[g], false, false, s); } Assign(gdata.Slice(i, i + 1), req[deconv::kData], - (swapaxis<1, 0>(reshape(temp_dst, + (swapaxis<1, 0>(reshape(data_3d, Shape4(gdata.shape_[1], 1, gdata.size(2), @@ -469,22 +429,6 @@ class DeconvolutionOp { } private: - inline index_t InitTemp(const mshadow::Shape<4> &ishape, - const mshadow::Shape<4> &oshape) { - const int ksize = param_.kernel.Size(); - shape_colunit_ = mshadow::Shape2(ishape[1] * ksize, - oshape[2] * oshape[3]); - shape_dstunit_ = mshadow::Shape3(param_.num_group, - oshape[1] / param_.num_group, - oshape[2] * oshape[3]); - mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1]); - mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2]); - index_t required_size = scol.Size() + sdst.Size(); - return required_size; - } inline Tensor TBlobTo4DTensor(const TBlob &tb, Stream *s) { using namespace mshadow; @@ -495,9 +439,56 @@ class DeconvolutionOp { Shape4(tb.shape_[0], tb.shape_[1], 1, tb.shape_[2]), s); } + void LayerSetUp(const mxnet::TShape& ishape, const mxnet::TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const int num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + conv_in_spatial_dim_ = ishape.ProdShape(2, ishape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_in_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + +private: DeconvolutionParam param_; - mshadow::Shape<2> shape_colunit_; - mshadow::Shape<3> shape_dstunit_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images 
per channel + index_t conv_in_spatial_dim_; // number of pixels of input images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + index_t col_buffer_size_; + index_t input_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? }; // class DeconvolutionOp template From db3aaef5c8e175e24d206878abf744f5608fa13c Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Mon, 13 May 2019 16:16:21 -0700 Subject: [PATCH 25/26] comment out debug message --- src/operator/nn/deconvolution-inl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index df797db6cef1..48026028be4b 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -301,6 +301,7 @@ class DeconvolutionOp { // TODO: (lnyuan) remove debugging code + /* std::cout << "col buffer: " << std::endl; DType *tmp_data = new DType[col_buffer_size_]; if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { @@ -318,7 +319,7 @@ class DeconvolutionOp { } std::cout << std::endl; } - + */ col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); From 424f36d358994e9d9c2db483eebdbd085409e76a Mon Sep 17 00:00:00 2001 From: Lin Yuan Date: Wed, 15 May 2019 20:47:07 -0700 Subject: [PATCH 26/26] fix bug in backward --- src/operator/nn/deconvolution-inl.h | 117 +++++++++++++++------------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 48026028be4b..8724d4e5a366 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -261,18 +261,16 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); - auto kernel = param_.kernel.ndim() == 2 ? 
param_.kernel : TShape({1, param_.kernel[0]}); - // weight_3d: (G, OC/G, KH * KW) + // weight_3d: (G, C/G, OC/G * KH * KW) Tensor weight_3d = in_data[deconv::kWeight].get_with_shape( - Shape3(param_.num_group, conv_out_channels_ / group_, kernel_dim_), s); - + Shape3(group_, conv_in_channels_ / group_, kernel_dim_), s); Tensor workspace = ctx.requested[deconv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_ + in_data[deconv::kData].shape_.Size()), s); + .get_space_typed(Shape1(col_buffer_size_ + in_data[deconv::kData].shape_.Size()), s); mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1); - col_buffer_shape[0] = conv_in_channels_ * kernel.Size(); + col_buffer_shape[0] = conv_out_channels_ * param_.kernel.Size(); for (int i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = in_data[deconv::kData].shape_[i + 1]; } @@ -280,19 +278,37 @@ class DeconvolutionOp { // create a colum buffer to hold the matrix product between weight_3d(T) and input_data TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); - // col_buffer_3d : (G, KH * KW, IH * IW) + // col_buffer_3d : (G, OC/G * KH * KW, IH * IW) Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); for (index_t i = 0; i < num_; ++i) { // Tensor data_3d = input_4d[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size_, - Shape3(param_.num_group, input_4d.shape_[1] / param_.num_group, conv_in_spatial_dim_), s); + workspace.dptr_ + col_buffer_size_, + Shape3(group_, input_4d.shape_[1] / group_, conv_in_spatial_dim_), s); - // data_3d : (G, C/G, IH * IW) + // data_3d : (G, IC/G, IH * IW) data_3d = reshape(swapaxis<1, 0>(input_4d.Slice(i, i + 1)), data_3d.shape_); - + /* + std::cout << "data_3d: " << std::endl; + DType *tmp_data = new DType[data_3d.shape_.Size()]; + if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { + std::cout << "running on GPU " << std::endl; + NDArray data(data_3d, ctx.run_ctx.get_ctx().dev_id); + data.SyncCopyToCPU(tmp_data, data_3d.shape_.Size()); + std::cout << "complete " << std::endl; + } else { + tmp_data = static_cast(data_3d[0].dptr_); + } + + for (auto j = 0; j < data_3d.shape_[1]; ++j) { + for (auto k = 0; k < data_3d.shape_[2]; ++k) { + std::cout << *(tmp_data + j * data_3d.shape_[2] + k) << " "; + } + std::cout << std::endl; + } + */ for (int g = 0; g < group_; ++g) { // Legacy approach shown here for comparison: // col_buffer_3d[g] = dot(weight_3d[g].T(), data_3d[g]); @@ -302,27 +318,27 @@ class DeconvolutionOp { // TODO: (lnyuan) remove debugging code /* - std::cout << "col buffer: " << std::endl; - DType *tmp_data = new DType[col_buffer_size_]; - if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { - std::cout << "running on GPU " << std::endl; - NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); - col_data.SyncCopyToCPU(tmp_data, col_buffer_size_); - std::cout << "complete " << std::endl; - } else { - tmp_data = static_cast(col_buffer_3d[0].dptr_); - } - - for (auto j = 0; j < kernel_dim_; ++j) { - for (auto k = 0; k < conv_in_spatial_dim_; ++k) { - std::cout << *(tmp_data + j * kernel_dim_ + k) << " "; - } - std::cout << std::endl; - } + std::cout << "col buffer: " << std::endl; + DType *tmp_col = new DType[col_buffer_size_]; + if (ctx.run_ctx.get_ctx().dev_mask() == gpu::kDevMask) { + std::cout << "running on GPU " << std::endl; + NDArray col_data(col_buffer, ctx.run_ctx.get_ctx().dev_id); + 
col_data.SyncCopyToCPU(tmp_col, col_buffer_size_); + std::cout << "complete " << std::endl; + } else { + tmp_col = static_cast(col_buffer_3d[0].dptr_); + } + + for (auto j = 0; j < col_buffer_3d.shape_[1]; ++j) { + for (auto k = 0; k < col_buffer_3d.shape_[2]; ++k) { + std::cout << *(tmp_col + j * col_buffer_3d.shape_[2] + k) << " "; + } + std::cout << std::endl; + } */ col2im(s, col_buffer.dptr(), out_data[deconv::kOut].shape_, col_buffer.shape_, - kernel, padding, stride, dilate, out_data[deconv::kOut].dptr() + i * input_dim_, req[deconv::kOut]); - + param_.kernel, padding, stride, dilate, + out_data[deconv::kOut].dptr() + i * output_dim_, req[deconv::kOut]); } if (bias_term_) { @@ -346,7 +362,7 @@ class DeconvolutionOp { CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); - LayerSetUp(out_grad[deconv::kOut].shape_, in_grad[deconv::kData].shape_); + LayerSetUp(in_grad[deconv::kData].shape_, out_grad[deconv::kOut].shape_); Stream *s = ctx.get_stream(); #if defined(__CUDACC__) CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) @@ -366,39 +382,44 @@ class DeconvolutionOp { auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); auto padding = param_.kernel.ndim() == 2 ? TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); - auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); - auto kernel_size = kernel.Size(); + // weight_3d: (G, C/G, OC * KH * KW) Tensor weight_3d = in_data[deconv::kWeight] - .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + .get_with_shape(Shape3(group_, conv_in_channels_ / group_, kernel_dim_), s); + + // dweight_3d: (G, C/G, OC * KH * KW) Tensor dweight_3d = in_grad[deconv::kWeight] - .get_with_shape(Shape3(group_, conv_out_channels_ / group_, kernel_dim_), s); + .get_with_shape(Shape3(group_, conv_in_channels_ / group_, kernel_dim_), s); Tensor workspace = ctx.requested[deconv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_ + data_4d.shape_.Size()), s); + .get_space_typed(Shape1(col_buffer_size_ + data_4d.shape_.Size()), s); + // calculate shape of col_buffer TShape col_buffer_shape(num_spatial_axes_ + 1, 1); - col_buffer_shape[0] = conv_out_channels_ * kernel_size; + col_buffer_shape[0] = conv_out_channels_ * param_.kernel.Size(); for (int i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = out_grad[deconv::kOut].shape_[i+1]; + col_buffer_shape[i] = in_data[deconv::kData].shape_[i+1]; } + // create a column buffer to store ograd TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // col_buffer_3d: (G, OC/G * KH * KW, IH * IW) Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); + Shape3(group_, kernel_dim_, conv_in_spatial_dim_), s); for (index_t i = 0; i < num_; ++i) { // Tensor data_3d = input_4d[i]; Tensor data_3d = Tensor( - workspace.dptr_ + col_buffer_size_, - Shape3(group_, data_4d.shape_[1] / group_, conv_in_spatial_dim_), s); + workspace.dptr_ + col_buffer_size_, + Shape3(group_, data_4d.shape_[1] / group_, conv_in_spatial_dim_), s); // data_3d : (G, C/G, IH * IW) data_3d = reshape(swapaxis<1, 0>(data_4d.Slice(i, i + 1)), data_3d.shape_); // convert output gradient array to column buffer im2col(s, out_grad[deconv::kOut].dptr() + i * output_dim_, out_grad[deconv::kOut].shape_, - col_buffer.shape_, kernel, padding, 
stride, dilate, col_buffer.dptr()); + col_buffer.shape_, param_.kernel, padding, stride, dilate, col_buffer.dptr()); for (int g = 0; g < group_; ++g) { auto request = (i == 0) ? req[deconv::kWeight] : kAddTo; @@ -454,19 +475,15 @@ class DeconvolutionOp { conv_out_channels_ = param_.num_filter; conv_in_channels_ = channels_; bias_term_ = !param_.no_bias; - kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + kernel_dim_ = conv_out_channels_ / group_ * param_.kernel.Size(); weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); conv_in_spatial_dim_ = ishape.ProdShape(2, ishape.ndim()); - col_offset_ = kernel_dim_ * conv_out_spatial_dim_; - output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; // size of the column buffer used for storing im2col-ed pixels col_buffer_size_ = kernel_dim_ * group_ * conv_in_spatial_dim_; // input/output image size (#channels * height * width) input_dim_ = ishape.ProdShape(1, ishape.ndim()); output_dim_ = oshape.ProdShape(1, oshape.ndim()); - num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; - num_kernels_col2im_ = input_dim_; } private: @@ -482,13 +499,9 @@ class DeconvolutionOp { index_t conv_in_channels_; // number of input channels index_t kernel_dim_; // number of input channels per group * kernel size index_t weight_offset_; // number of output channels per group * kernel_dim_ - index_t col_offset_; - index_t output_offset_; index_t col_buffer_size_; index_t input_dim_; index_t output_dim_; - index_t num_kernels_im2col_; - index_t num_kernels_col2im_; bool bias_term_; // has bias term? }; // class DeconvolutionOp
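(After patch 26 the caffe-style LayerSetUp members fully determine the buffer geometry for both passes: forward runs gemm(weight_3d^T, data_3d) into the column buffer and col2im-accumulates it into the output, while backward im2col's the output gradient and reuses the same buffer for both the dweight and ddata gemms. A compact recap of the derived quantities under assumed dimensions, mirroring the member definitions above:)

#include <cstdio>

int main() {
  // assumed deconvolution: C=8 input channels, num_filter OC=16, G=2 groups,
  // 3x3 kernel, 16x16 input, stride 1, no padding -> 18x18 output
  const int C = 8, OC = 16, G = 2, KH = 3, KW = 3;
  const int IH = 16, IW = 16, OH = (IH - 1) + KH, OW = (IW - 1) + KW;
  const int kernel_dim = OC / G * KH * KW;                 // 72
  const int conv_in_spatial_dim = IH * IW;                 // 256
  const int conv_out_spatial_dim = OH * OW;                // 324
  const int col_buffer_size = kernel_dim * G * conv_in_spatial_dim;  // 36864
  const int input_dim = C * conv_in_spatial_dim;           // 2048 per image
  const int output_dim = OC * conv_out_spatial_dim;        // 5184 per image
  std::printf("kernel_dim_=%d col_buffer_size_=%d input_dim_=%d output_dim_=%d "
              "conv_out_spatial_dim_=%d\n",
              kernel_dim, col_buffer_size, input_dim, output_dim,
              conv_out_spatial_dim);
  return 0;
}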