From 2863a1f3cf8bedfcc3a856e6820a23b620cc6ac7 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 16 May 2018 06:12:24 +0000 Subject: [PATCH 01/21] + rsp parameter --- python/mxnet/gluon/block.py | 8 +++ python/mxnet/gluon/contrib/block.py | 62 ++++++++++++++++++++++ python/mxnet/gluon/parameter.py | 81 ++++++++++++++++++++++++++--- tests/python/unittest/test_gluon.py | 50 +++++++++++++++--- 4 files changed, 187 insertions(+), 14 deletions(-) create mode 100644 python/mxnet/gluon/contrib/block.py diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 4779484ec3ee..09863f00e23c 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -713,6 +713,14 @@ def __init__(self, outputs, inputs, params=None): "Input symbols must be variable, but %s is an output of operators"%str(i) input_names.add(i.name) + # check if any symbol is row_sparse + row_sparse_storage = ndarray.ndarray._STORAGE_TYPE_STR_TO_ID['row_sparse'] + for i in out: + for j in i.get_internals(): + assert(j.attr("__storage_type__") != str(row_sparse_storage)), \ + "SymbolBlock doesn't support Parameter '%s' because its storage " \ + "type is 'row_sparse'." % j.name + for i in out.list_arguments(): if i not in input_names: self.params.get(i, allow_deferred_init=True) diff --git a/python/mxnet/gluon/contrib/block.py b/python/mxnet/gluon/contrib/block.py new file mode 100644 index 000000000000..335aa585db10 --- /dev/null +++ b/python/mxnet/gluon/contrib/block.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ +"""Base container class for contrib neural network models.""" +__all__ = ['SparseBlock'] + +from .. import Block + +class SparseBlock(Block): + """`SparseBlock` only supports forwarding with NDArray. + """ + def __init__(self, prefix=None, params=None): + super(SparseBlock, self).__init__(prefix=prefix, params=params) + + def forward(self, x, *args): + """Defines the forward computation. Arguments has to be :py:class:`NDArray`.""" + assert isinstance(x, NDArray), \ + "SparseBlock requires the first argument to forward be NDArray, " \ + "but got %s"%type(x) + with x.context as ctx: + params = {} + for name, param in self._reg_params.items(): + if param._stype != 'default': + params[name] = param + else: + params[name] = param.data(ctx) + ret = self.sparse_forward(x, *args, **params) + for name, param in params.items(): + if param._dirty: + raise RuntimeError("sparse Parameter '%s' was not pulled during " + "sparse_forward(). Please call param.row_sparse_data() " + "with indices to pull sparse parameters in " + "sparse_forward()" % name) + + def sparse_forward(self, F, x, *args, **kwargs): + """Overrides to define sparse forward computation for this `SparseBlock`. 
+ + Parameters + ---------- + x : NDArray + The first input tensor. + *args : list of Parameter or NDArray + Additional input tensors or sparse parameters. + """ + # pylint: disable= invalid-name + raise NotImplementedError diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index c7cbcccc95ec..ed3fc86a885f 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -81,6 +81,8 @@ class Parameter(object): Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult. init : Initializer, default None Initializer of this parameter. Will use the global initializer by default. + stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'. + The storage type of the parameter. grad_stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'. The storage type of the parameter's gradient. @@ -99,12 +101,13 @@ class Parameter(object): """ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, lr_mult=1.0, wd_mult=1.0, init=None, allow_deferred_init=False, - differentiable=True, grad_stype='default'): + differentiable=True, stype='default', grad_stype='default'): self._var = None self._data = None self._grad = None self._ctx_list = None self._ctx_map = None + self._trainer = None self._deferred_init = () self._differentiable = differentiable self._allow_deferred_init = allow_deferred_init @@ -116,10 +119,13 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self.wd_mult = wd_mult self.grad_req = grad_req self.init = init - assert grad_stype in ['default', 'row_sparse', 'csr'], \ - "grad_stype for Parameter '%s' must be one of 'default', 'row_sparse', or 'csr'," \ - " but got '%s'" % (name, grad_stype) + valid_stypes = ['default', 'row_sparse', 'csr'] + assert grad_stype in valid_stypes, "grad_stype for Parameter %s must be " \ + "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, grad_stype) + assert stype in valid_stypes, "stype for Parameter %s must be " \ + "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, stype) self._grad_stype = grad_stype + self._stype = stype def __repr__(self): @@ -243,7 +249,7 @@ def _finish_deferred_init(self): with autograd.pause(): if data is None: data = ndarray.zeros(shape=self.shape, dtype=self.dtype, - ctx=context.cpu()) + ctx=context.cpu(), stype=self._stype) initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) @@ -383,9 +389,54 @@ def set_data(self, data): for arr in self.list_data(): arr[:] = data + def row_sparse_data(self, ctx, row_id): + """Returns a copy of the 'row_sparse' parameter on one context. The copy only + retains rows whose ids occur in provided row ids. + The parameter must have been initialized on this context before. + + Parameters + ---------- + ctx : Context + Desired context. + row_id: NDArray + Row ids to retain for the 'row_sparse' parameter. + + Returns + ------- + NDArray on ctx + """ + if self._stype != 'row_sparse': + raise ValueError("Cannot return a copy of Parameter %s on ctx %s via " \ + "row_sparse_data() because its storage type is %s. Please " \ + "use data() instead." % (self.name, str(ctx), self._stype)) + return self._get_row_sparse(self._data, ctx, row_id) + + def list_row_sparse_data(self, row_id): + """Returns copies of the 'row_sparse' parameter on all contexts, in the same order + as creation. The copy only retains rows whose ids occur in provided row ids. + The parameter must have been initialized before. 
+ + Parameters + ---------- + ctx : Context + Desired context. + row_id: NDArray + Row ids to retain for the 'row_sparse' parameter. + + Returns + ------- + list of NDArrays + """ + if self._stype != 'row_sparse': + raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_row_sparse_data() because its storage type is %s. Please " \ + "use data() instead." % (self.name, self._stype)) + return self._get_row_sparse(self._data, list, row_id) + def data(self, ctx=None): """Returns a copy of this parameter on one context. Must have been - initialized on this context before. + initialized on this context before. For sparse parameters, use + :py:meth:`Parameter.row_sparse_data` instead. Parameters ---------- @@ -396,11 +447,25 @@ def data(self, ctx=None): ------- NDArray on ctx """ + if self._stype != 'default': + raise ValueError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ + "because its storage type is %s. Please use row_sparse_data() " \ + "instead." % (self.name, str(ctx), self._stype)) return self._check_and_get(self._data, ctx) def list_data(self): """Returns copies of this parameter on all contexts, in the same order - as creation.""" + as creation. For sparse parameters, use :py:meth:`Parameter.list_row_sparse_data` + instead. + + Returns + ------- + list of NDArrays + """ + if self._stype != 'default': + raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_data() because its storage type is %s. Please use " \ + "row_sparse_data() instead." % (self.name, self._stype)) return self._check_and_get(self._data, list) def grad(self, ctx=None): @@ -447,7 +512,7 @@ def var(self): if self._var is None: self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, - init=self.init) + init=self.init, stype=self._stype) return self._var def cast(self, dtype): diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index e3026743ba35..9640fc0d4473 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -39,33 +39,61 @@ def test_parameter(): assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' assert p.grad(mx.cpu(0)).stype == 'default' + assert p.data().stype == 'default' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] @with_seed() def test_sparse_parameter(): - p = gluon.Parameter('weight', shape=(10, 10), grad_stype='row_sparse') + p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assert len(p.list_data()) == 2 + row_id = mx.nd.arange(0, 10) + assert len(p.list_row_sparse_data(row_id)) == 2 assert len(p.list_grad()) == 2 - assert p.data(mx.cpu(1)).context == mx.cpu(1) - assert p.data(mx.cpu(0)).shape == (10, 10) + weight = p.row_sparse_data(mx.cpu(1), row_id) + assert weight.context == mx.cpu(1) + assert weight.shape == (10, 10) + assert weight.stype == 'row_sparse' assert p.var().name == 'weight' + assert p.var().attr('__storage_type__') == str(_STORAGE_TYPE_STR_TO_ID['row_sparse']) assert p.grad(mx.cpu(0)).stype == 'row_sparse' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] +@with_seed() +def test_parameter_invalid_access(): + # cannot call data on row_sparse parameters + p0 = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') + p0.initialize(init='xavier', 
ctx=[mx.cpu(0), mx.cpu(1)]) + assertRaises(ValueError, p0.data) + assertRaises(ValueError, p0.list_data) + row_id = mx.nd.arange(0, 10) + # cannot call row_sparse_data on dense parameters + p1 = gluon.Parameter('weight', shape=(10, 10)) + p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) + assertRaises(ValueError, p1.row_sparse_data, mx.cpu(0), row_id) + assertRaises(ValueError, p1.list_row_sparse_data, row_id) @with_seed() def test_paramdict(): params = gluon.ParameterDict('net_') - params.get('weight', shape=(10, 10)) - assert list(params.keys()) == ['net_weight'] + params.get('w0', shape=(10, 10)) + params.get('w1', shape=(10, 10), stype='row_sparse') + all_row_ids = mx.nd.arange(0, 10) + assert list(params.keys()) == ['net_w0', 'net_w1'] params.initialize(ctx=mx.cpu()) + prev_w0 = params.get('w0').data(mx.cpu()) + prev_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + params.save('test.params') params.load('test.params', mx.cpu()) + # compare the values before and after save/load + cur_w0 = params.get('w0').data(mx.cpu()) + cur_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) + mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) @with_seed() @@ -246,6 +274,16 @@ def hybrid_forward(self, F, x): net.hybridize() assert isinstance(net(mx.nd.zeros((16, 10))), mx.nd.NDArray) +@with_seed() +@raises(AssertionError) +def test_symbol_sparse_block(): + data = mx.sym.var('data') + weight = mx.sym.var('weight', stype='row_sparse') + bias = mx.sym.var('bias') + out = mx.sym.broadcast_add(mx.sym.dot(data, weight), bias) + # an exception is expected + net = gluon.SymbolBlock(out, data) + def check_layer_forward(layer, dshape): layer.collect_params().initialize() From e3d20c760e2b4919b71e5ee1b7a72911dd003c05 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 16 May 2018 18:05:41 +0000 Subject: [PATCH 02/21] draft --- python/mxnet/gluon/block.py | 7 ++ python/mxnet/gluon/contrib/__init__.py | 2 + python/mxnet/gluon/contrib/nn/basic_layers.py | 1 + python/mxnet/gluon/parameter.py | 47 +++++++++- python/mxnet/gluon/trainer.py | 94 ++++++++++++++----- python/mxnet/model.py | 19 ++++ tests/python/unittest/test_gluon.py | 45 ++++++++- 7 files changed, 184 insertions(+), 31 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 09863f00e23c..1a06d2ab3340 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -445,6 +445,13 @@ class HybridBlock(Block): the end-to-end usage. """ def __init__(self, prefix=None, params=None): + if isinstance(params, ParameterDict): + for param in params.values(): + stype = param._stype + if stype != 'default': + raise ValueError("Cannot create a HybridBlock with Parameter '%s' " \ + "because its storage type is %s. Please consider " \ + "using a SparseBlock instead."%(param.name, stype)) super(HybridBlock, self).__init__(prefix=prefix, params=params) self._cached_graph = () self._cached_op = None diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index f708fb900227..a1d96e5ad408 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -23,3 +23,5 @@ from . import rnn from . 
import data + +from .block import * diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index eccdf18c1bbb..3e68bd6cb9e5 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -22,6 +22,7 @@ from .... import nd from ...block import HybridBlock +from ..block import SparseBlock from ...nn import Sequential, HybridSequential class Concurrent(Sequential): diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index ed3fc86a885f..33a1051b93b4 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -168,6 +168,15 @@ def shape(self, new_shape): self._shape = new_shape + def _set_trainer(self, trainer): + """ Set the trainer this parameter is associated with. """ + if self._trainer and self._trainer is not trainer: + raise RuntimeError( + "Failed to set the trainer for Parameter %s to %s because it was set to %s. " \ + "More than one trainers for a single parameter is not supported." %( + self.name, str(trainer), str(self._trainer))) + self._trainer = trainer + def _check_and_get(self, arr_list, ctx): if arr_list is not None: if ctx is list: @@ -200,6 +209,31 @@ def _check_and_get(self, arr_list, ctx): "because the later does not include Parameters of " \ "nested child Blocks"%(self.name)) + def _get_row_sparse(self, arr_list, ctx, row_id): + results = self._check_and_get(arr_list, ctx) + + # get row sparse params based on row ids + if not isinstance(row_id, ndarray.NDArray): + raise TypeError("Cannot get 'row_sparse' Parameter %s with row_id = %s. " + "NDArray type is expected." % (self.name, row_id)) + if not self._trainer: + # the sparse param is not yet associated with any trainer. + # and create new buffers to hold the outputs. 
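+            # e.g. if the parameter has 4 rows and row_id is [0, 2], retain()
+            # returns a copy holding only rows 0 and 2; rows 1 and 3 are dropped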
+ ret = [] + if isinstance(results, ndarray.NDArray): + rows = row_id.as_in_context(results.context) + ret = ndarray.sparse.retain(results, rows) + else: + # list of NDArrays + for result in results: + rows = row_id.as_in_context(result.context) + ret.append(ndarray.sparse.retain(result, rows)) + return ret; + + # fetch row sparse params from the trainer + self._trainer._row_sparse_pull(self, results, row_id) + return results + def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" if self.shape: @@ -277,12 +311,17 @@ def _init_grad(self): self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, stype=self._grad_stype) for i in self._data] - autograd.mark_variables(self.list_data(), self.list_grad(), self.grad_req) + autograd.mark_variables(self._check_and_get(self._data, list), + self._grad, self.grad_req) def _reduce(self): """Reduce data from multiple context.""" - block = self.list_data() - data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) + if self._stype == 'default': + block = self.list_data() + data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) + else: + all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64') + data = self.row_sparse_data(context.cpu(), all_row_ids) return data def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), @@ -386,7 +425,7 @@ def set_data(self, data): self._deferred_init = self._deferred_init[:3] + (data,) return - for arr in self.list_data(): + for arr in self._check_and_get(self._data, list): arr[:] = data def row_sparse_data(self, ctx, row_id): diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 39c4a1fd6104..23665ce5070a 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -21,7 +21,7 @@ __all__ = ['Trainer'] from .. import optimizer as opt -from ..model import _create_kvstore +from ..model import _create_kvstore, _create_sparse_kvstore from .parameter import ParameterDict, Parameter class Trainer(object): @@ -68,12 +68,20 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got %s."%(type(params))) self._params = [] - for param in params: + self._params_to_init = [] + self._contains_sparse = False + self._param2idx = {} + for i, param in enumerate(params): if not isinstance(param, Parameter): raise ValueError( "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) + self._param2idx[param.name] = i self._params.append(param) + self._params_to_init.append(param) + param._set_trainer(self) + if param._stype != 'default': + self._contains_sparse = True self._compression_params = compression_params optimizer_params = optimizer_params if optimizer_params else {} self._scale = float(optimizer_params.get('rescale_grad', 1.0)) @@ -109,38 +117,55 @@ def _init_optimizer(self, optimizer, optimizer_params): self._updaters = [opt.get_updater(self._optimizer) \ for _ in self._contexts] + def _init_params(self): + """ Initialize parameters in the KVStore. Parameters whose + intiailization is incomplete are ignored. + """ + assert self._kv_initialized, "Cannot initialize parameters in KVStore " \ + "when KVStore is not initialized." 
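+        # Split the pending list: parameters whose deferred initialization has
+        # completed are pushed to the KVStore now; the rest stay queued and are
+        # retried on the next call, once their shapes are known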
+ params_to_init = [] + if self._kvstore: + params = [param for param in self._params_to_init \ + if not param._deferred_init] + params_to_init = [param for param in self._params_to_init \ + if param._deferred_init] + print('initing ', params) + for param in params: + param_arrays = param._check_and_get(param._data, list) + idx = self._param2idx[param.name] + self._kvstore.init(idx, param_arrays[0]) + if param._stype == 'default': + self._kvstore.pull(idx, param_arrays, priority=-idx) + + self._params_to_init = params_to_init + def _init_kvstore(self): + """ Create kvstore """ arg_arrays = {} - contains_sparse = False - for param in self._params: - arg_arrays[param.name] = param.data(self._contexts[0]) - if param._grad_stype != 'default': - contains_sparse = True - # update_on_kvstore is set to False by the user - if self._update_on_kvstore is False: - raise RuntimeError("Cannot set update_on_kvstore to False when sparse " - "gradients and/or sparse weights are present for " - "Parameter %s." % param.name) - kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), - arg_arrays) - update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \ - else update_on_kvstore + if self._contains_sparse: + kvstore, update_on_kvstore = _create_kvstore(self._kvstore) + # update_on_kvstore is set to False by the user + if self._update_on_kvstore is False: + raise RuntimeError("Cannot set update_on_kvstore to False when sparse " + "gradients and/or sparse weights are present for " + "Parameter %s." % param.name) + else: + kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), + arg_arrays) + update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \ + else update_on_kvstore if kvstore: if self._compression_params: kvstore.set_gradient_compression(self._compression_params) # kv.pull(row_sparse_grad) is not supported - if contains_sparse: + if self._contains_sparse: update_on_kvstore = True else: if 'dist' in kvstore.type: update_on_kvstore = False if update_on_kvstore: + # optimizer preferably needs to be set before init for multiprecision kvstore.set_optimizer(self._optimizer) - # optimizer preferably needs to be set before init for multiprecision - for i, param in enumerate(self._params): - param_arrays = param.list_data() - kvstore.init(i, param_arrays[0]) - kvstore.pull(i, param_arrays, priority=-i) self._kvstore = kvstore self._update_on_kvstore = update_on_kvstore else: @@ -171,6 +196,17 @@ def set_learning_rate(self, lr): else: self._optimizer.set_learning_rate(lr) + def _row_sparse_pull(self, parameter, out, row_id): + # initialize kv and params if not already + assert self._kvstore, "Cannot perform row_sparse_pull() when KVStore is not set" + if not self._kv_initialized: + self._init_kvstore() + if self._params_to_init: + self._init_params() + # TODO check state + self._kvstore.row_sparse_pull(self._param2idx[parameter.name], \ + out=out, row_ids=row_id) + def step(self, batch_size, ignore_stale_grad=False): """Makes one step of parameter update. Should be called after `autograd.backward()` and outside of `record()` scope. 
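Taken together with the `_get_row_sparse` change in parameter.py above, `_row_sparse_pull` completes a lazy pull path for sparse weights. A minimal sketch of how the pieces chain once a trainer owns a 'row_sparse' parameter, using the signatures as of this patch (the snippet is illustrative and not part of the diff):

    import mxnet as mx
    from mxnet import gluon

    w = gluon.Parameter('w', shape=(8, 2), stype='row_sparse')
    w.initialize(ctx=mx.cpu())
    trainer = gluon.Trainer([w], 'sgd')         # registers itself via w._set_trainer()
    row_id = mx.nd.array([0, 3])
    data = w.row_sparse_data(mx.cpu(), row_id)  # -> Parameter._get_row_sparse()
                                                # -> Trainer._row_sparse_pull()
                                                # -> kvstore.row_sparse_pull(idx, ...)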
@@ -191,6 +227,8 @@ def step(self, batch_size, ignore_stale_grad=False): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() self._optimizer.rescale_grad = self._scale / batch_size @@ -210,6 +248,8 @@ def allreduce_grads(self): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() assert not (self._kvstore and self._update_on_kvstore), \ 'allreduce_grads() when parameters are updated on kvstore ' \ 'is not supported. Try setting `update_on_kvstore` ' \ @@ -250,6 +290,8 @@ def update(self, batch_size, ignore_stale_grad=False): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() assert not (self._kvstore and self._update_on_kvstore), \ 'update() when parameters are updated on kvstore ' \ 'is not supported. Try setting `update_on_kvstore` ' \ @@ -264,7 +306,7 @@ def _update(self, ignore_stale_grad=False): continue if not ignore_stale_grad: - for data in param.list_data(): + for data in param._check_and_get(param._data, list): if not data._fresh_grad: raise UserWarning( "Gradient of Parameter `%s` on context %s has not been updated " @@ -275,7 +317,7 @@ def _update(self, ignore_stale_grad=False): "warning and skip updating of Parameters with stale gradient" \ %(param.name, str(data.context))) - if self._kvstore and self._update_on_kvstore: + if self._kvstore and self._update_on_kvstore and param._stype == 'default': self._kvstore.pull(i, param.list_data(), priority=-i) continue @@ -296,6 +338,8 @@ def save_states(self, fname): if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() if self._update_on_kvstore: self._kvstore.save_optimizer_states(fname, dump_optimizer=True) @@ -313,6 +357,8 @@ def load_states(self, fname): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() if self._update_on_kvstore: self._kvstore.load_optimizer_states(fname) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index ae7726d76a7a..3a50553a615c 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -55,6 +55,25 @@ 'eval_metric', 'locals']) +def _create_sparse_kvstore(kvstore): + """Create kvstore assuming some parameters' storage types are row_sparse. + + Parameters + ---------- + kvstore : KVStore or str + The kvstore. + """ + # always update on kvstore + update_on_kvstore = True + if isinstance(kvstore, kvs.KVStore): + kv = kvstore + elif isinstance(kvstore, str): + kv = kvs.create(kvstore) + else: + raise TypeError("Cannot create '%s' KVStore with row_sparse parameters. " + "The type must be KVStore or str." % kvstore) + return (kv, update_on_kvstore) + def _create_kvstore(kvstore, num_device, arg_params): """Create kvstore This function select and create a proper kvstore if given the kvstore type. 
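The new `_create_sparse_kvstore` helper has a deliberately narrow contract: it accepts a `KVStore` instance or a kvstore name, and always forces updates onto the kvstore. A quick illustration of that contract as defined above (hypothetical snippet, not part of the diff):

    import mxnet as mx
    from mxnet.model import _create_sparse_kvstore

    kv, update_on_kvstore = _create_sparse_kvstore('device')
    assert update_on_kvstore                    # always True for sparse parameters
    kv, update_on_kvstore = _create_sparse_kvstore(mx.kv.create('local'))
    assert update_on_kvstore
    # _create_sparse_kvstore(None)  ->  TypeError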
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 9640fc0d4473..857b56510f4d 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -19,7 +19,8 @@ from mxnet import gluon from mxnet.gluon import nn from mxnet.test_utils import assert_almost_equal -from common import setup_module, with_seed +from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID +from common import setup_module, with_seed, assertRaises import numpy as np from nose.tools import raises, assert_raises from copy import deepcopy @@ -95,6 +96,35 @@ def test_paramdict(): mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) +@with_seed() +def test_parameter_row_sparse_data(): + def check_parameter_row_sparse_data(with_trainer): + ctx0 = mx.cpu(1) + ctx1 = mx.cpu(2) + dim0 = 4 + x = gluon.Parameter('x', shape=(dim0, 2), stype='row_sparse') + x.initialize(init='xavier', ctx=[ctx0, ctx1]) + if with_trainer: + trainer = gluon.Trainer([x], 'sgd') + x_param = x._data[0].copy() + assert x_param.stype == 'row_sparse' + row_id_0 = mx.nd.array([0,1]) + retained_0 = x.row_sparse_data(ctx0, row_id_0) + retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) + assert retained_0.context == ctx0 + row_id_1 = mx.nd.arange(0, dim0) + retained_1 = x.row_sparse_data(ctx1, row_id_1) + retained_target_1 = x_param + mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) + assert retained_1.context == ctx1 + row_id_2 = mx.nd.array([0,1,2]) + retained_2 = x.list_row_sparse_data(row_id_2) + retained_target_2 = mx.nd.sparse.retain(x_param, row_id_2.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) + + check_parameter_row_sparse_data(True) + check_parameter_row_sparse_data(False) @with_seed() def test_constant(): @@ -276,15 +306,24 @@ def hybrid_forward(self, F, x): @with_seed() @raises(AssertionError) -def test_symbol_sparse_block(): +def test_sparse_symbol_block(): data = mx.sym.var('data') weight = mx.sym.var('weight', stype='row_sparse') bias = mx.sym.var('bias') out = mx.sym.broadcast_add(mx.sym.dot(data, weight), bias) - # an exception is expected + # an exception is expected when creating a SparseBlock w/ sparse param net = gluon.SymbolBlock(out, data) +@with_seed() +@raises(ValueError) +def test_sparse_hybrid_block(): + params = gluon.ParameterDict('net_') + params.get('weight', shape=(5, 5), stype='row_sparse') + params.get('bias', shape=(5,)) + # an exception is expected when creating a HybridBlock w/ sparse param + net = gluon.nn.Dense(5, params=params) +@with_seed() def check_layer_forward(layer, dshape): layer.collect_params().initialize() x = mx.nd.ones(shape=dshape) From ad672a79dda1ec8414a59715aaadaf5d0d5f15af Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 17 May 2018 05:13:57 +0000 Subject: [PATCH 03/21] Fix optimizer pickle --- python/mxnet/gluon/trainer.py | 2 ++ python/mxnet/optimizer.py | 11 +++++++++++ tests/python/unittest/test_gluon.py | 20 ++++++++++++++++++-- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 23665ce5070a..d0eb0327ae08 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -363,6 +363,8 @@ def load_states(self, fname): if 
self._update_on_kvstore: self._kvstore.load_optimizer_states(fname) self._optimizer = self._kvstore._updater.optimizer + param_dict = {i: param for i, param in enumerate(self._params)} + self._optimizer.param_dict = param_dict else: with open(fname, 'rb') as f: states = f.read() diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 1d2fd2e73dfd..0c3fc904fb1f 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -426,6 +426,17 @@ def _get_wd(self, index): wd *= self.wd_mult.get(self.idx2name[index], 1.0) return wd + def __getstate__(self): + ret = self.__dict__.copy() + # do not include param_dict in the state + del ret['param_dict'] + return ret + + def __setstate__(self, state): + self.__dict__ = state + # param_dict needs to be explicitly set by the trainer + self.param_dict = {} + # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 857b56510f4d..8b546a9c6588 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -601,10 +601,10 @@ def dict_equ(a, b): assert (x.data(mx.cpu(1)).asnumpy() == -4).all() - trainer.save_states('test.states') + trainer.save_states('test_trainer.states') states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \ else deepcopy(trainer._updaters[0].states) - trainer.load_states('test.states') + trainer.load_states('test_trainer.states') if trainer._update_on_kvstore: dict_equ(trainer._kvstore._updater.states, states) assert trainer._optimizer == trainer._kvstore._updater.optimizer @@ -630,6 +630,22 @@ def dict_equ(a, b): assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy() +@with_seed() +def test_trainer_save_load(): + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1 + trainer.save_states('test_trainer_save_load.states') + trainer.load_states('test_trainer_save_load.states') + x.lr_mult = 2.0 + # check if parameter dict is correctly associated with optimizer after load_state + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 @with_seed() def test_block_attr_hidden(): From 674d374b937f3c9711051a2830e57a0fb5e0ea2a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 17 May 2018 16:58:44 +0000 Subject: [PATCH 04/21] refactor and document --- python/mxnet/gluon/block.py | 1 + python/mxnet/gluon/contrib/block.py | 20 ++++++++++-------- python/mxnet/gluon/parameter.py | 32 ++++++++++++++--------------- python/mxnet/gluon/trainer.py | 3 ++- tests/python/unittest/test_gluon.py | 26 +++++++++++++++-------- 5 files changed, 48 insertions(+), 34 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 1a06d2ab3340..52b385daa14f 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -445,6 +445,7 @@ class HybridBlock(Block): the end-to-end usage. 
""" def __init__(self, prefix=None, params=None): + # check if any parameter is row_sparse if isinstance(params, ParameterDict): for param in params.values(): stype = param._stype diff --git a/python/mxnet/gluon/contrib/block.py b/python/mxnet/gluon/contrib/block.py index 335aa585db10..036cd58e39d6 100644 --- a/python/mxnet/gluon/contrib/block.py +++ b/python/mxnet/gluon/contrib/block.py @@ -31,25 +31,29 @@ def __init__(self, prefix=None, params=None): def forward(self, x, *args): """Defines the forward computation. Arguments has to be :py:class:`NDArray`.""" assert isinstance(x, NDArray), \ - "SparseBlock requires the first argument to forward be NDArray, " \ + "SparseBlock requires the first argument to forward to be an NDArray, " \ "but got %s"%type(x) with x.context as ctx: params = {} for name, param in self._reg_params.items(): + # If a parameter is not dense, instead of passing the NDArray to + # sparse_forward(), the parameter itself is passed upon which + # row_sparse_data() will be performed. if param._stype != 'default': params[name] = param else: params[name] = param.data(ctx) - ret = self.sparse_forward(x, *args, **params) - for name, param in params.items(): - if param._dirty: - raise RuntimeError("sparse Parameter '%s' was not pulled during " - "sparse_forward(). Please call param.row_sparse_data() " - "with indices to pull sparse parameters in " - "sparse_forward()" % name) + return self.sparse_forward(x, *args, **params) def sparse_forward(self, F, x, *args, **kwargs): """Overrides to define sparse forward computation for this `SparseBlock`. + Note that the *args for :py:meth:`SparseBlock.sparse_forward` is a list of + :py:class:`NDArray`s and :py:class:`Parameter`s. If the storage type of any + Parameter is sparse, the Parameter is passed as :py:class:`Parameter` by itself. + Otherwise, the Parameter is passed as a :py:class:`NDArray`. + + When overridding sparse_forward, typically one needs to invoke + :py:meth:`Parameter.row_sparse_data` to access the data of the Parameter. Parameters ---------- diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 33a1051b93b4..306e725acb7c 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -119,10 +119,11 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self.wd_mult = wd_mult self.grad_req = grad_req self.init = init + # sparse related storage type information valid_stypes = ['default', 'row_sparse', 'csr'] - assert grad_stype in valid_stypes, "grad_stype for Parameter %s must be " \ + assert grad_stype in valid_stypes, "grad_stype for Parameter '%s' must be " \ "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, grad_stype) - assert stype in valid_stypes, "stype for Parameter %s must be " \ + assert stype in valid_stypes, "stype for Parameter '%s' must be " \ "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, stype) self._grad_stype = grad_stype self._stype = stype @@ -172,8 +173,8 @@ def _set_trainer(self, trainer): """ Set the trainer this parameter is associated with. """ if self._trainer and self._trainer is not trainer: raise RuntimeError( - "Failed to set the trainer for Parameter %s to %s because it was set to %s. " \ - "More than one trainers for a single parameter is not supported." %( + "Failed to set the trainer for Parameter '%s' to %s because it was set to %s. " \ + "More than one trainers for a single Parameter is not supported." 
%( self.name, str(trainer), str(self._trainer))) self._trainer = trainer @@ -210,6 +211,7 @@ def _check_and_get(self, arr_list, ctx): "nested child Blocks"%(self.name)) def _get_row_sparse(self, arr_list, ctx, row_id): + """ Get row_sparse data from row_sparse parameters based on row_id. """ results = self._check_and_get(arr_list, ctx) # get row sparse params based on row ids @@ -320,8 +322,8 @@ def _reduce(self): block = self.list_data() data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) else: - all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64') - data = self.row_sparse_data(context.cpu(), all_row_ids) + all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=context.cpu()) + data = self.row_sparse_data(all_row_ids) return data def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), @@ -428,27 +430,25 @@ def set_data(self, data): for arr in self._check_and_get(self._data, list): arr[:] = data - def row_sparse_data(self, ctx, row_id): - """Returns a copy of the 'row_sparse' parameter on one context. The copy only - retains rows whose ids occur in provided row ids. + def row_sparse_data(self, row_id): + """Returns a copy of the 'row_sparse' parameter on the same context as row_id's. + The copy only retains rows whose ids occur in provided row ids. The parameter must have been initialized on this context before. Parameters ---------- - ctx : Context - Desired context. row_id: NDArray Row ids to retain for the 'row_sparse' parameter. Returns ------- - NDArray on ctx + NDArray on row_id's context """ if self._stype != 'row_sparse': - raise ValueError("Cannot return a copy of Parameter %s on ctx %s via " \ - "row_sparse_data() because its storage type is %s. Please " \ - "use data() instead." % (self.name, str(ctx), self._stype)) - return self._get_row_sparse(self._data, ctx, row_id) + raise ValueError("Cannot return a copy of Parameter %s via row_sparse_data() " \ + "because its storage type is %s. Please use data() instead." \ + %(self.name, self._stype)) + return self._get_row_sparse(self._data, row_id.context, row_id) def list_row_sparse_data(self, row_id): """Returns copies of the 'row_sparse' parameter on all contexts, in the same order diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index d0eb0327ae08..d07e6e75fdf5 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -143,7 +143,7 @@ def _init_kvstore(self): """ Create kvstore """ arg_arrays = {} if self._contains_sparse: - kvstore, update_on_kvstore = _create_kvstore(self._kvstore) + kvstore, update_on_kvstore = _create_sparse_kvstore(self._kvstore) # update_on_kvstore is set to False by the user if self._update_on_kvstore is False: raise RuntimeError("Cannot set update_on_kvstore to False when sparse " @@ -342,6 +342,7 @@ def save_states(self, fname): self._init_params() if self._update_on_kvstore: + assert not self._params_to_init, "Cannot save trainer states when some parameters are not yet initialized in kvstore." 
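+            # dump_optimizer=True pickles the optimizer along with its states;
+            # param_dict is kept out of that pickle (Optimizer.__getstate__,
+            # patch 03) and is re-attached by load_states()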
self._kvstore.save_optimizer_states(fname, dump_optimizer=True) else: with open(fname, 'wb') as fout: diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 8b546a9c6588..8164d2fac5d5 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -49,10 +49,10 @@ def test_parameter(): def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - row_id = mx.nd.arange(0, 10) + row_id = mx.nd.arange(0, 10, ctx=mx.cpu(1)) assert len(p.list_row_sparse_data(row_id)) == 2 assert len(p.list_grad()) == 2 - weight = p.row_sparse_data(mx.cpu(1), row_id) + weight = p.row_sparse_data(row_id) assert weight.context == mx.cpu(1) assert weight.shape == (10, 10) assert weight.stype == 'row_sparse' @@ -74,7 +74,7 @@ def test_parameter_invalid_access(): # cannot call row_sparse_data on dense parameters p1 = gluon.Parameter('weight', shape=(10, 10)) p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assertRaises(ValueError, p1.row_sparse_data, mx.cpu(0), row_id) + assertRaises(ValueError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) assertRaises(ValueError, p1.list_row_sparse_data, row_id) @with_seed() @@ -86,13 +86,13 @@ def test_paramdict(): assert list(params.keys()) == ['net_w0', 'net_w1'] params.initialize(ctx=mx.cpu()) prev_w0 = params.get('w0').data(mx.cpu()) - prev_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + prev_w1 = params.get('w1').row_sparse_data(all_row_ids) params.save('test.params') params.load('test.params', mx.cpu()) # compare the values before and after save/load cur_w0 = params.get('w0').data(mx.cpu()) - cur_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + cur_w1 = params.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) @@ -108,13 +108,13 @@ def check_parameter_row_sparse_data(with_trainer): trainer = gluon.Trainer([x], 'sgd') x_param = x._data[0].copy() assert x_param.stype == 'row_sparse' - row_id_0 = mx.nd.array([0,1]) - retained_0 = x.row_sparse_data(ctx0, row_id_0) + row_id_0 = mx.nd.array([0,1], ctx=ctx0) + retained_0 = x.row_sparse_data(row_id_0) retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) assert retained_0.context == ctx0 - row_id_1 = mx.nd.arange(0, dim0) - retained_1 = x.row_sparse_data(ctx1, row_id_1) + row_id_1 = mx.nd.arange(0, dim0, ctx=ctx1) + retained_1 = x.row_sparse_data(row_id_1) retained_target_1 = x_param mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) assert retained_1.context == ctx1 @@ -573,6 +573,14 @@ def test_flatten(): x = mx.nd.zeros((3,)) assert flatten(x).shape == (3, 1) +@with_seed() +@raises(RuntimeError) +def test_multi_trainer(): + x = gluon.Parameter('x', shape=(10,)) + x.initialize() + trainer0 = gluon.Trainer([x], 'sgd') + # multiple trainers for a single Parameter is not allowed + trainer1 = gluon.Trainer([x], 'sgd') @with_seed() def test_trainer(): From 6db6e291504ae9beb6f3bd7b6ae207b1758013ee Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 17 May 2018 18:08:17 +0000 Subject: [PATCH 05/21] add test for save load with cast_stype --- python/mxnet/gluon/parameter.py | 17 +++++++++---- python/mxnet/gluon/trainer.py | 7 ++++-- 
tests/python/unittest/test_gluon.py | 37 ++++++++++++++++++++--------- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 306e725acb7c..224759f6c66a 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -230,13 +230,13 @@ def _get_row_sparse(self, arr_list, ctx, row_id): for result in results: rows = row_id.as_in_context(result.context) ret.append(ndarray.sparse.retain(result, rows)) - return ret; + return ret # fetch row sparse params from the trainer self._trainer._row_sparse_pull(self, results, row_id) return results - def _load_init(self, data, ctx): + def _load_init(self, data, ctx, cast_stype=False): """(Re)initializes by loading from data.""" if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): @@ -250,6 +250,14 @@ def _load_init(self, data, ctx): "Failed loading Parameter '%s' from saved params: " \ "dtype incompatible expected %s vs saved %s"%( self.name, str(self.dtype), str(data.dtype)) + if self._stype != data.stype: + if not cast_stype: + raise RuntimeError("Failed loading Parameter '%s' from saved params: storage " \ + "type incompatible expected %s vs saved %s. Set " \ + "cast_stype=True to cast saved params to the same stype " \ + "as '%s'."%(self.name, self._stype, data.stype, self.name)) + else: + data = data.tostype(self._stype) if isinstance(ctx, Context): ctx = [ctx] if self._data is None: @@ -322,6 +330,7 @@ def _reduce(self): block = self.list_data() data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) else: + # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=context.cpu()) data = self.row_sparse_data(all_row_ids) return data @@ -870,7 +879,7 @@ def save(self, filename, strip_prefix=''): ndarray.save(filename, arg_dict) def load(self, filename, ctx=None, allow_missing=False, - ignore_extra=False, restore_prefix=''): + ignore_extra=False, restore_prefix='', cast_stype=False): """Load parameters from file. 
filename : str @@ -908,4 +917,4 @@ def load(self, filename, ctx=None, allow_missing=False, "Please make sure source and target networks have the same prefix."%( name[lprefix:], filename, _brief_print_list(self._params.keys())) continue - self[name]._load_init(arg_dict[name], ctx) + self[name]._load_init(arg_dict[name], ctx, cast_stype=cast_stype) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index d07e6e75fdf5..5bd9f2012595 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -317,8 +317,11 @@ def _update(self, ignore_stale_grad=False): "warning and skip updating of Parameters with stale gradient" \ %(param.name, str(data.context))) - if self._kvstore and self._update_on_kvstore and param._stype == 'default': - self._kvstore.pull(i, param.list_data(), priority=-i) + if self._kvstore and self._update_on_kvstore: + if param._stype == 'default': + # 'row_sparse' parameters are not pulled immediately - they're pulled + # in `SparseBlock.sparse_forward` + self._kvstore.pull(i, param.list_data(), priority=-i) continue for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 8164d2fac5d5..350cfff85e31 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -79,23 +79,37 @@ def test_parameter_invalid_access(): @with_seed() def test_paramdict(): - params = gluon.ParameterDict('net_') - params.get('w0', shape=(10, 10)) - params.get('w1', shape=(10, 10), stype='row_sparse') + params0 = gluon.ParameterDict('net_') + params0.get('w0', shape=(10, 10)) + params0.get('w1', shape=(10, 10), stype='row_sparse') all_row_ids = mx.nd.arange(0, 10) - assert list(params.keys()) == ['net_w0', 'net_w1'] - params.initialize(ctx=mx.cpu()) - prev_w0 = params.get('w0').data(mx.cpu()) - prev_w1 = params.get('w1').row_sparse_data(all_row_ids) + assert list(params0.keys()) == ['net_w0', 'net_w1'] + params0.initialize(ctx=mx.cpu()) + prev_w0 = params0.get('w0').data(mx.cpu()) + prev_w1 = params0.get('w1').row_sparse_data(all_row_ids) - params.save('test.params') - params.load('test.params', mx.cpu()) + params0.save('test.params') + params0.load('test.params', mx.cpu()) # compare the values before and after save/load - cur_w0 = params.get('w0').data(mx.cpu()) - cur_w1 = params.get('w1').row_sparse_data(all_row_ids) + cur_w0 = params0.get('w0').data(mx.cpu()) + cur_w1 = params0.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) + # create a new param dict with dense params, and load from the checkpoint + # of sparse & dense params + params1 = gluon.ParameterDict('net_') + params1.get('w0', shape=(10, 10)) + params1.get('w1', shape=(10, 10)) + assertRaises(RuntimeError, params1.load, 'test.params', mx.cpu()) + params1.load('test.params', mx.cpu(), cast_stype=True) + # compare the values before and after save/load + cur_w0 = params1.get('w0').data(mx.cpu()) + cur_w1 = params1.get('w1').data(mx.cpu()) + mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) + mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) + + @with_seed() def test_parameter_row_sparse_data(): def check_parameter_row_sparse_data(with_trainer): @@ -1105,6 +1119,7 @@ def test_save_load(): net.load_params('test.params') +@with_seed() def test_symbol_block_save_load(): class 
Net(gluon.HybridBlock): def __init__(self): From 6f0f403e5c99a31e2100d3b23063ace397b0182c Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 00:46:23 +0000 Subject: [PATCH 06/21] refactor trainer tests --- python/mxnet/gluon/contrib/__init__.py | 2 - python/mxnet/gluon/contrib/block.py | 66 -------- python/mxnet/gluon/contrib/nn/basic_layers.py | 1 - python/mxnet/gluon/trainer.py | 1 - src/operator/tensor/indexing_op.h | 6 + tests/python/unittest/test_gluon.py | 84 ---------- tests/python/unittest/test_gluon_trainer.py | 150 ++++++++++++++++++ 7 files changed, 156 insertions(+), 154 deletions(-) delete mode 100644 python/mxnet/gluon/contrib/block.py create mode 100644 tests/python/unittest/test_gluon_trainer.py diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index a1d96e5ad408..f708fb900227 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -23,5 +23,3 @@ from . import rnn from . import data - -from .block import * diff --git a/python/mxnet/gluon/contrib/block.py b/python/mxnet/gluon/contrib/block.py deleted file mode 100644 index 036cd58e39d6..000000000000 --- a/python/mxnet/gluon/contrib/block.py +++ /dev/null @@ -1,66 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable= arguments-differ -"""Base container class for contrib neural network models.""" -__all__ = ['SparseBlock'] - -from .. import Block - -class SparseBlock(Block): - """`SparseBlock` only supports forwarding with NDArray. - """ - def __init__(self, prefix=None, params=None): - super(SparseBlock, self).__init__(prefix=prefix, params=params) - - def forward(self, x, *args): - """Defines the forward computation. Arguments has to be :py:class:`NDArray`.""" - assert isinstance(x, NDArray), \ - "SparseBlock requires the first argument to forward to be an NDArray, " \ - "but got %s"%type(x) - with x.context as ctx: - params = {} - for name, param in self._reg_params.items(): - # If a parameter is not dense, instead of passing the NDArray to - # sparse_forward(), the parameter itself is passed upon which - # row_sparse_data() will be performed. - if param._stype != 'default': - params[name] = param - else: - params[name] = param.data(ctx) - return self.sparse_forward(x, *args, **params) - - def sparse_forward(self, F, x, *args, **kwargs): - """Overrides to define sparse forward computation for this `SparseBlock`. - Note that the *args for :py:meth:`SparseBlock.sparse_forward` is a list of - :py:class:`NDArray`s and :py:class:`Parameter`s. If the storage type of any - Parameter is sparse, the Parameter is passed as :py:class:`Parameter` by itself. - Otherwise, the Parameter is passed as a :py:class:`NDArray`. 
- - When overridding sparse_forward, typically one needs to invoke - :py:meth:`Parameter.row_sparse_data` to access the data of the Parameter. - - Parameters - ---------- - x : NDArray - The first input tensor. - *args : list of Parameter or NDArray - Additional input tensors or sparse parameters. - """ - # pylint: disable= invalid-name - raise NotImplementedError diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index 3e68bd6cb9e5..eccdf18c1bbb 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -22,7 +22,6 @@ from .... import nd from ...block import HybridBlock -from ..block import SparseBlock from ...nn import Sequential, HybridSequential class Concurrent(Sequential): diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 5bd9f2012595..453055e09472 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -203,7 +203,6 @@ def _row_sparse_pull(self, parameter, out, row_id): self._init_kvstore() if self._params_to_init: self._init_params() - # TODO check state self._kvstore.row_sparse_pull(self._param2idx[parameter.name], \ out=out, row_ids=row_id) diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 28827db0e635..23a866d75af2 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -270,6 +270,12 @@ inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, dispatched = dispatch_mode_assign(dispatch_mode, target_mode); } } + // Print user friendly error message to notify misuses of sparse_grad + if (weight_grad_stype != target_stype) { + LOG(FATAL) << "Cannot use sparse_grad = " << sparse_grad + << ", while stype of gradients w.r.t embedding weight is " + << common::stype_string(weight_grad_stype); + } return dispatched; } diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 350cfff85e31..cc55644db68e 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -28,8 +28,6 @@ import json import unittest - - @with_seed() def test_parameter(): p = gluon.Parameter('weight', shape=(10, 10)) @@ -587,88 +585,6 @@ def test_flatten(): x = mx.nd.zeros((3,)) assert flatten(x).shape == (3, 1) -@with_seed() -@raises(RuntimeError) -def test_multi_trainer(): - x = gluon.Parameter('x', shape=(10,)) - x.initialize() - trainer0 = gluon.Trainer([x], 'sgd') - # multiple trainers for a single Parameter is not allowed - trainer1 = gluon.Trainer([x], 'sgd') - -@with_seed() -def test_trainer(): - def dict_equ(a, b): - assert set(a) == set(b) - for k in a: - assert (a[k].asnumpy() == b[k].asnumpy()).all() - x = gluon.Parameter('x', shape=(10,)) - x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') - trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}) - with mx.autograd.record(): - for w in x.list_data(): - y = w + 1 - y.backward() - trainer.step(1) - - assert (x.data(mx.cpu(1)).asnumpy() == -2).all() - - x.lr_mult = 0.5 - - with mx.autograd.record(): - for w in x.list_data(): - y = w + 1 - y.backward() - trainer.step(1) - - assert (x.data(mx.cpu(1)).asnumpy() == -4).all() - - trainer.save_states('test_trainer.states') - states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \ - else deepcopy(trainer._updaters[0].states) - trainer.load_states('test_trainer.states') - if trainer._update_on_kvstore: - 
dict_equ(trainer._kvstore._updater.states, states) - assert trainer._optimizer == trainer._kvstore._updater.optimizer - else: - for updater in trainer._updaters: - dict_equ(updater.states, states) - assert trainer._optimizer == trainer._updaters[0].optimizer - assert_raises(AssertionError, trainer.update, 1) - assert_raises(AssertionError, trainer.allreduce_grads) - - x = gluon.Parameter('x', shape=(10,)) - x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') - trainer2 = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}, - update_on_kvstore=False) - with mx.autograd.record(): - for i, w in enumerate(x.list_data()): - y = i*w - y.backward() - assert (x.grad(mx.cpu(0)).asnumpy() != x.grad(mx.cpu(1)).asnumpy()).all() - trainer2.allreduce_grads() - assert (x.grad(mx.cpu(0)).asnumpy() == x.grad(mx.cpu(1)).asnumpy()).all() - trainer2.update(1) - - assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy() - -@with_seed() -def test_trainer_save_load(): - x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) - x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') - trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) - with mx.autograd.record(): - for w in x.list_data(): - y = w + 1 - y.backward() - trainer.step(1) - assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1 - trainer.save_states('test_trainer_save_load.states') - trainer.load_states('test_trainer_save_load.states') - x.lr_mult = 2.0 - # check if parameter dict is correctly associated with optimizer after load_state - assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 - @with_seed() def test_block_attr_hidden(): b = gluon.Block() diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py new file mode 100644 index 000000000000..2d7e97b9dd3f --- /dev/null +++ b/tests/python/unittest/test_gluon_trainer.py @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
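+#
+# Tests for gluon.Trainer: trainer/parameter association, optimizer state
+# save/load, and lazy KVStore initialization of 'row_sparse' parameters.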
+ +import mxnet as mx +import unittest +import numpy as np +from mxnet import gluon +from mxnet.gluon import nn +from mxnet.test_utils import assert_almost_equal +from common import setup_module, with_seed, assertRaises +from copy import deepcopy +from nose.tools import raises, assert_raises + +@with_seed() +@raises(RuntimeError) +def test_multi_trainer(): + x = gluon.Parameter('x', shape=(10,)) + x.initialize() + trainer0 = gluon.Trainer([x], 'sgd') + # multiple trainers for a single Parameter is not allowed + trainer1 = gluon.Trainer([x], 'sgd') + +@with_seed() +def test_trainer(): + def dict_equ(a, b): + assert set(a) == set(b) + for k in a: + assert (a[k].asnumpy() == b[k].asnumpy()).all() + x = gluon.Parameter('x', shape=(10,)) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}) + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + + assert (x.data(mx.cpu(1)).asnumpy() == -2).all() + + x.lr_mult = 0.5 + + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + + assert (x.data(mx.cpu(1)).asnumpy() == -4).all() + + trainer.save_states('test_trainer.states') + states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \ + else deepcopy(trainer._updaters[0].states) + trainer.load_states('test_trainer.states') + if trainer._update_on_kvstore: + dict_equ(trainer._kvstore._updater.states, states) + assert trainer._optimizer == trainer._kvstore._updater.optimizer + else: + for updater in trainer._updaters: + dict_equ(updater.states, states) + assert trainer._optimizer == trainer._updaters[0].optimizer + assert_raises(AssertionError, trainer.update, 1) + assert_raises(AssertionError, trainer.allreduce_grads) + + x = gluon.Parameter('x', shape=(10,)) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer2 = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}, + update_on_kvstore=False) + with mx.autograd.record(): + for i, w in enumerate(x.list_data()): + y = i*w + y.backward() + assert (x.grad(mx.cpu(0)).asnumpy() != x.grad(mx.cpu(1)).asnumpy()).all() + trainer2.allreduce_grads() + assert (x.grad(mx.cpu(0)).asnumpy() == x.grad(mx.cpu(1)).asnumpy()).all() + trainer2.update(1) + + assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy() + +@with_seed() +def test_trainer_save_load(): + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1 + trainer.save_states('test_trainer_save_load.states') + trainer.load_states('test_trainer_save_load.states') + x.lr_mult = 2.0 + # check if parameter dict is correctly associated with optimizer after load_state + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 + +@with_seed() +def test_trainer_multi_layer_init(): + class Net(gluon.Block): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # sparse param + self.embed_weight = self.params.get('embed_weight', stype='row_sparse', + shape=(4,3), grad_stype='row_sparse') + # dense param from a hybrid block + self.dense0 = nn.Dense(2) + + def forward(self, x): + embed_weight = self.embed_weight.row_sparse_data(x) + embed = 
mx.nd.Embedding(data=x, weight=embed_weight, + input_dim=4, output_dim=3, sparse_grad=True) + return self.dense0(embed) + + net = Net(prefix='net_') + ctxes = [mx.cpu(1), mx.cpu(2)] + net.initialize(mx.init.One(), ctx=ctxes) + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) + data = mx.nd.array([[0,2], [1,2]]) + xs = gluon.utils.split_and_load(data, ctxes) + ys = [] + with mx.autograd.record(): + for x in xs: + y = net(x) + ys.append(y) + for y in ys: + y.backward() + trainer.step(1) + # all parameters should be initialized + assert not trainer._params_to_init + all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) + # check the updated weights + weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() + assert (weight[0] == -1).all() + assert (weight[1] == -1).all() + assert (weight[2] == -3).all() + assert (weight[3] == 1).all() From 8db0499bc585616e8f2df5da1842aab3ca22abf8 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 18:04:51 +0000 Subject: [PATCH 07/21] add test --- tests/python/unittest/test_gluon_trainer.py | 49 +++++++++++---------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 2d7e97b9dd3f..0f0f6c8dca75 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -125,26 +125,29 @@ def forward(self, x): input_dim=4, output_dim=3, sparse_grad=True) return self.dense0(embed) - net = Net(prefix='net_') - ctxes = [mx.cpu(1), mx.cpu(2)] - net.initialize(mx.init.One(), ctx=ctxes) - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) - data = mx.nd.array([[0,2], [1,2]]) - xs = gluon.utils.split_and_load(data, ctxes) - ys = [] - with mx.autograd.record(): - for x in xs: - y = net(x) - ys.append(y) - for y in ys: - y.backward() - trainer.step(1) - # all parameters should be initialized - assert not trainer._params_to_init - all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) - # check the updated weights - weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() - assert (weight[0] == -1).all() - assert (weight[1] == -1).all() - assert (weight[2] == -3).all() - assert (weight[3] == 1).all() + def check_init(ctxes): + net = Net(prefix='net_') + net.initialize(mx.init.One(), ctx=ctxes) + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) + data = mx.nd.array([[0,2], [1,2]]) + xs = gluon.utils.split_and_load(data, ctxes) + ys = [] + with mx.autograd.record(): + for x in xs: + y = net(x) + ys.append(y) + for y in ys: + y.backward() + trainer.step(1) + # all parameters should be initialized + assert not trainer._params_to_init + all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) + # check the updated weights + weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() + assert (weight[0] == -1).all() + assert (weight[1] == -1).all() + assert (weight[2] == -3).all() + assert (weight[3] == 1).all() + + check_init([mx.cpu(1), mx.cpu(2)]) + check_init([mx.cpu(1)]) From 83009bc655610bc9f85495cca4a7eea768d96a55 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 18:16:47 +0000 Subject: [PATCH 08/21] add back test --- tests/python/unittest/test_gluon.py | 500 ++++++++++++++++++++++++++++ 1 file changed, 500 insertions(+) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 4c60492718d0..59f3c0ffbe8b 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -585,6 
+585,506 @@ def test_flatten(): x = mx.nd.zeros((3,)) assert flatten(x).shape == (3, 1) +@with_seed() +def test_block_attr_hidden(): + b = gluon.Block() + + # regular attributes can change types + b.a = None + b.a = 1 + + +@raises(TypeError) +@with_seed() +def test_block_attr_block(): + b = gluon.Block() + + # regular variables can't change types + b.b = gluon.Block() + b.b = (2,) + + +@raises(TypeError) +@with_seed() +def test_block_attr_param(): + b = gluon.Block() + + # regular variables can't change types + b.b = gluon.Parameter() + b.b = (2,) + + +@with_seed() +def test_block_attr_regular(): + b = gluon.Block() + + # set block attribute also sets _children + b.c = gluon.Block() + c2 = gluon.Block() + b.c = c2 + assert b.c is c2 and list(b._children.values())[0] is c2 + + +@with_seed() +def test_block_attr_list_of_block(): + class Model1(gluon.Block): + def __init__(self, **kwargs): + super(Model1, self).__init__(**kwargs) + with self.name_scope(): + self.layers = [nn.Dense(i * 10) for i in range(6)] + + class Model2(gluon.Block): + def __init__(self, **kwargs): + super(Model2, self).__init__(**kwargs) + with self.name_scope(): + self.layers = dict() + self.layers['a'] = [nn.Dense(10), nn.Dense(10)] + + class Model3(gluon.Block): + def __init__(self, **kwargs): + super(Model3, self).__init__(**kwargs) + with self.name_scope(): + self.layers = nn.Sequential() + self.layers.add(*[nn.Dense(i * 10) for i in range(6)]) + + class Model4(gluon.Block): + def __init__(self, **kwargs): + super(Model4, self).__init__(**kwargs) + with self.name_scope(): + self.data = {'a': '4', 'b': 123} + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model1() + model.collect_params() + assert len(w) > 0 + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model2() + model.collect_params() + assert len(w) > 0 + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model3() + model.collect_params() + assert len(w) == 0 + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model4() + model.collect_params() + assert len(w) == 0 + +def check_sequential(net): + dense1 = gluon.nn.Dense(10) + net.add(dense1) + dense2 = gluon.nn.Dense(10) + net.add(dense2) + dense3 = gluon.nn.Dense(10) + net.add(dense3) + + assert net[1] is dense2 + assert net[-1] is dense3 + slc = net[1:3] + assert len(slc) == 2 and slc[0] is dense2 and slc[1] is dense3 + assert isinstance(slc, type(net)) + +@with_seed() +def test_sequential(): + check_sequential(gluon.nn.Sequential()) + check_sequential(gluon.nn.HybridSequential()) + +@with_seed() +def test_sequential_warning(): + with warnings.catch_warnings(record=True) as w: + # The following line permits the test to pass if run multiple times + warnings.simplefilter('always') + b = gluon.nn.Sequential() + b.add(gluon.nn.Dense(20)) + b.hybridize() + assert len(w) == 1 + + +@with_seed() +def test_global_norm_clip(): + x1 = mx.nd.ones((3,3)) + x2 = mx.nd.ones((4,4)) + norm = gluon.utils.clip_global_norm([x1, x2], 1.0) + assert norm == 5.0 + assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5) + assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5) + + x3 = mx.nd.array([1.0, 2.0, float('nan')]) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + gluon.utils.clip_global_norm([x1, x3], 2.0) + assert len(w) == 1 + + +@with_seed() +def test_embedding(): + def check_embedding(sparse_grad): + layer = 
gluon.nn.Embedding(10, 100, sparse_grad=sparse_grad) + layer.initialize() + x = mx.nd.array([3,4,2,0,1]) + with mx.autograd.record(): + y = layer(x) + y.backward() + assert (layer.weight.grad().asnumpy()[:5] == 1).all() + assert (layer.weight.grad().asnumpy()[5:] == 0).all() + check_embedding(True) + check_embedding(False) + +@with_seed() +def test_export(): + ctx = mx.context.current_context() + model = gluon.model_zoo.vision.resnet18_v1( + prefix='resnet', ctx=ctx, pretrained=True) + model.hybridize() + data = mx.nd.random.normal(shape=(1, 3, 224, 224)) + out = model(data) + + model.export('gluon') + + module = mx.mod.Module.load('gluon', 0, label_names=None, context=ctx) + module.bind(data_shapes=[('data', data.shape)]) + module.forward(mx.io.DataBatch([data], None), is_train=False) + mod_out, = module.get_outputs() + + assert_almost_equal(out.asnumpy(), mod_out.asnumpy()) + + model2 = gluon.model_zoo.vision.resnet18_v1(prefix='resnet', ctx=ctx) + model2.collect_params().load('gluon-0000.params', ctx) + out2 = model2(data) + + assert_almost_equal(out.asnumpy(), out2.asnumpy()) + + +@with_seed() +def test_hybrid_stale_cache(): + net = mx.gluon.nn.HybridSequential() + with net.name_scope(): + net.add(mx.gluon.nn.Dense(10, weight_initializer='zeros', bias_initializer='ones', flatten=False)) + + net.hybridize() + net.initialize() + net(mx.nd.ones((2,3,5))) + + net.add(mx.gluon.nn.Flatten()) + assert net(mx.nd.ones((2,3,5))).shape == (2, 30) + + net = mx.gluon.nn.HybridSequential() + with net.name_scope(): + net.fc1 = mx.gluon.nn.Dense(10, weight_initializer='zeros', + bias_initializer='ones', flatten=False) + net.fc2 = mx.gluon.nn.Dense(10, weight_initializer='zeros', + bias_initializer='ones', flatten=False) + net.hybridize() + net.initialize() + net(mx.nd.ones((2,3,5))) + + net.fc2 = mx.gluon.nn.Dense(10, weight_initializer='zeros', + bias_initializer='ones', flatten=True) + net.initialize() + assert net(mx.nd.ones((2,3,5))).shape == (2, 10) + + +@with_seed() +def test_lambda(): + net1 = mx.gluon.nn.HybridSequential() + net1.add(nn.Activation('tanh'), + nn.LeakyReLU(0.1)) + + net2 = mx.gluon.nn.HybridSequential() + op3 = lambda F, x, *args: F.LeakyReLU(x, *args, slope=0.1) + net2.add(nn.HybridLambda('tanh'), + nn.HybridLambda(op3)) + + op4 = lambda x: mx.nd.LeakyReLU(x, slope=0.1) + net3 = mx.gluon.nn.Sequential() + net3.add(nn.Lambda('tanh'), + nn.Lambda(op4)) + + input_data = mx.nd.random.uniform(shape=(2, 3, 5, 7)) + out1, out2, out3 = net1(input_data), net2(input_data), net3(input_data) + assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-3, atol=1e-3) + assert_almost_equal(out1.asnumpy(), out3.asnumpy(), rtol=1e-3, atol=1e-3) + + +@with_seed() +def test_fill_shape_deferred(): + net = nn.HybridSequential() + with net.name_scope(): + net.add(nn.Conv2D(64, kernel_size=2, padding=1), + nn.BatchNorm(), + nn.Dense(10)) + net.hybridize() + net.initialize() + net(mx.nd.ones((2,3,5,7))) + assert net[0].weight.shape[1] == 3, net[0].weight.shape[1] + assert net[1].gamma.shape[0] == 64, net[1].gamma.shape[0] + assert net[2].weight.shape[1] == 3072, net[2].weight.shape[1] + + +@with_seed() +def test_dtype(): + net = mx.gluon.model_zoo.vision.resnet18_v1() + net.initialize() + net.cast('float64') + with mx.autograd.record(): + y = net(mx.nd.ones((16, 3, 32, 32), dtype='float64')) + y.backward() + + net = mx.gluon.model_zoo.vision.resnet18_v1() + net.initialize() + net.hybridize() + net(mx.nd.ones((16, 3, 32, 32), dtype='float32')) + + net.cast('float64') + net(mx.nd.ones((16, 3, 32, 32), 
dtype='float64'))
+
+    mx.nd.waitall()
+
+    class Net(gluon.Block):
+        def __init__(self, in_dim, output_dim):
+            super(Net, self).__init__()
+            with self.name_scope():
+                self.embed = gluon.nn.Embedding(input_dim=in_dim, output_dim=output_dim, dtype=np.float64)
+                self.dense = gluon.nn.Dense(2, dtype=np.float64)
+
+        def forward(self, x):
+            e = self.embed(x)
+            assert(e.dtype == np.float64)
+            y = self.dense(e)
+            assert(y.dtype == np.float64)
+            return y
+
+    net = Net(5, 10)
+    net.initialize()
+    out = net(mx.nd.ones((3,), dtype=np.float64))
+    mx.nd.waitall()
+
+@with_seed()
+def test_fill_shape_load():
+    ctx = mx.context.current_context()
+    net1 = nn.HybridSequential()
+    with net1.name_scope():
+        net1.add(nn.Conv2D(64, kernel_size=2, padding=1),
+                 nn.BatchNorm(),
+                 nn.Dense(10))
+    net1.hybridize()
+    net1.initialize(ctx=ctx)
+    net1(mx.nd.ones((2,3,5,7), ctx))
+    net1.save_params('net_fill.params')
+
+    net2 = nn.HybridSequential()
+    with net2.name_scope():
+        net2.add(nn.Conv2D(64, kernel_size=2, padding=1),
+                 nn.BatchNorm(),
+                 nn.Dense(10))
+    net2.hybridize()
+    net2.initialize()
+    net2.load_params('net_fill.params', ctx)
+    assert net2[0].weight.shape[1] == 3, net2[0].weight.shape[1]
+    assert net2[1].gamma.shape[0] == 64, net2[1].gamma.shape[0]
+    assert net2[2].weight.shape[1] == 3072, net2[2].weight.shape[1]
+
+
+@with_seed()
+def test_inline():
+    net = mx.gluon.nn.HybridSequential()
+    with net.name_scope():
+        net.add(mx.gluon.nn.Dense(10))
+        net.add(mx.gluon.nn.Dense(10))
+        net.add(mx.gluon.nn.Dense(10))
+
+    net.initialize()
+    net.hybridize(inline_limit=3)
+    with mx.autograd.record():
+        y = net(mx.nd.zeros((1,10)))
+
+    len_1 = len(json.loads(mx.autograd.get_symbol(y).tojson())['nodes'])
+    y.backward()
+
+    net.hybridize(inline_limit=0)
+    with mx.autograd.record():
+        y = net(mx.nd.zeros((1,10)))
+
+    len_2 = len(json.loads(mx.autograd.get_symbol(y).tojson())['nodes'])
+    y.backward()
+
+    assert len_1 == len_2 + 2
+
+
+@with_seed()
+def test_activations():
+    point_to_validate = mx.nd.array([-0.1, 0.1] * 3)
+
+    swish = mx.gluon.nn.Swish()
+    def swish_test(x):
+        return x * mx.nd.sigmoid(x)
+
+    for test_point, ref_point in zip(swish_test(point_to_validate), swish(point_to_validate)):
+        assert test_point == ref_point
+
+    elu = mx.gluon.nn.ELU()
+    def elu_test(x):
+        def elu(x):
+            return 1.0 * (mx.nd.exp(x) - 1) if x < 0 else x
+        return [elu(x_i) for x_i in x]
+
+    for test_point, ref_point in zip(elu_test(point_to_validate), elu(point_to_validate)):
+        assert test_point == ref_point
+
+    selu = mx.gluon.nn.SELU()
+    def selu_test(x):
+        def selu(x):
+            scale, alpha = 1.0507009873554804934193349852946, 1.6732632423543772848170429916717
+            return scale * x if x >= 0 else alpha * mx.nd.exp(x) - alpha
+        return [selu(x_i) for x_i in x]
+
+    for test_point, ref_point in zip(selu_test(point_to_validate), selu(point_to_validate)):
+        assert test_point == ref_point
+
+    prelu = mx.gluon.nn.PReLU()
+    prelu.initialize()
+    x = point_to_validate.reshape((1, 3, 2))
+    assert_almost_equal(prelu(x).asnumpy(), mx.nd.where(x >= 0, x, 0.25 * x).asnumpy())
+
+@with_seed()
+def test_dropout():
+    def get_slice(x, axis, idx):
+        ix = ()
+        for i in range(x.ndim):
+            if i == axis:
+                ix += (idx,)
+            else:
+                ix += (slice(None, None, None),)
+        return x[ix]
+
+    def check_dropout_axes(ratio, shape, axes):
+        compactshape = list(shape)
+        for axis in axes:
+            compactshape[axis] = 1
+        compactx = mx.random.uniform(shape=tuple(compactshape))
+        broadcastx = compactx.broadcast_to(shape)
+        dropouty = mx.gluon.nn.Dropout(rate=ratio, axes=axes)(broadcastx)
+        for axis in 
axes: + target = get_slice(dropouty, axis, 0).asnumpy() + for i in range(1, shape[axis]): + assert(get_slice(dropouty, axis, i).asnumpy() == target).all() + + nshape = (10, 10, 10, 10) + with mx.autograd.train_mode(): + check_dropout_axes(0.25, nshape, axes = (0,)) + check_dropout_axes(0.25, nshape, axes = (1,)) + check_dropout_axes(0.25, nshape, axes = (2,)) + check_dropout_axes(0.25, nshape, axes = (3,)) + check_dropout_axes(0.25, nshape, axes = (0, 1)) + check_dropout_axes(0.25, nshape, axes = (0, 2)) + check_dropout_axes(0.25, nshape, axes = (0, 3)) + check_dropout_axes(0.25, nshape, axes = (1, 2)) + check_dropout_axes(0.25, nshape, axes = (1, 3)) + check_dropout_axes(0.25, nshape, axes = (2, 3)) + check_dropout_axes(0.25, nshape, axes = (0, 1, 2)) + check_dropout_axes(0.25, nshape, axes = (0, 2, 3)) + check_dropout_axes(0.25, nshape, axes = (1, 2, 3)) + +@with_seed() +def test_req(): + data = mx.nd.random.uniform(shape=(1,3,224,224)) + label = mx.nd.random.uniform(shape=(1)) + label[:] = 1 + loss = gluon.loss.SoftmaxCrossEntropyLoss() + + net = nn.HybridSequential() + net1 = nn.HybridSequential() + net1.add(nn.Dense(4)) + net2 = nn.HybridSequential() + net2.add(nn.Dense(3)) + net2.add(nn.Dense(2)) + net.add(net1) + net.add(net2) + net.initialize() + + net.hybridize() + + for v in net.collect_params().values(): + v.grad_req = 'add' + + net.collect_params().zero_grad() + with mx.autograd.record(): + pred = net(data) + l = loss(pred, label) + l.backward() + grad = net[0][0].weight.grad().mean().asnumpy() + # run twice to check req = add + pred = net(data) + l = loss(pred, label) + l.backward() + + grad_double = net[0][0].weight.grad().mean().asnumpy() + assert_almost_equal(grad * 2, grad_double) + + +@with_seed() +def test_save_load(): + net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True) + net.save_params('test.params') + + net = mx.gluon.model_zoo.vision.get_resnet(1, 18) + net.output = mx.gluon.nn.Dense(1000) + + net.load_params('test.params') + +@with_seed() +def test_symbol_block_save_load(): + class Net(gluon.HybridBlock): + def __init__(self): + super(Net, self).__init__() + with self.name_scope(): + backbone = gluon.model_zoo.vision.resnet18_v1() + data = mx.sym.var('data') + featnames = ['stage1_activation0', 'stage2_activation0', 'stage3_activation0'] + out_names = ['_'.join([backbone.name, featname, 'output']) for featname in featnames] + internals = backbone(data).get_internals() + outs = [internals[out_name] for out_name in out_names] + self.backbone = gluon.SymbolBlock(outs, data, params=backbone.collect_params()) + self.body = nn.Conv2D(3, 1) + + def hybrid_forward(self, F, x): + x = self.body(x) + return self.backbone(x) + + net1 = Net() + net1.initialize(mx.init.Normal()) + net1.hybridize() + net1(mx.nd.random.normal(shape=(1, 3, 32, 32))) + net1.save_params('./test.params') + + net2 = Net() + net2.load_params('./test.params', ctx=mx.cpu()) + + +@with_seed() +def test_hybrid_multi_context(): + net = mx.gluon.model_zoo.vision.get_resnet(1, 18) + net.initialize(ctx=[mx.cpu(0), mx.cpu(1)]) + net.hybridize() + net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() + + +@with_seed() +def test_zero_grad(): + data = mx.nd.random.uniform(shape=(3,3)) + net = nn.Embedding(3, 4, sparse_grad=True, prefix='test_zero_grad_') + net.initialize() + with mx.autograd.record(): + l = net(data) + l.backward() + net.collect_params().zero_grad() + grad = net.collect_params()['test_zero_grad_weight'].grad() + assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0) + if 
__name__ == '__main__': import nose From cf006c8036b240d6e77577de6c40a00ffc6862ba Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 23:48:35 +0000 Subject: [PATCH 09/21] raise error for load params --- python/mxnet/gluon/parameter.py | 20 ++---- python/mxnet/gluon/trainer.py | 1 - tests/python/unittest/test_gluon.py | 94 +++++++++++++++-------------- 3 files changed, 56 insertions(+), 59 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 224759f6c66a..6360135b439c 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -212,25 +212,14 @@ def _check_and_get(self, arr_list, ctx): def _get_row_sparse(self, arr_list, ctx, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ - results = self._check_and_get(arr_list, ctx) - # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): raise TypeError("Cannot get 'row_sparse' Parameter %s with row_id = %s. " "NDArray type is expected." % (self.name, row_id)) if not self._trainer: - # the sparse param is not yet associated with any trainer. - # and create new buffers to hold the outputs. - ret = [] - if isinstance(results, ndarray.NDArray): - rows = row_id.as_in_context(results.context) - ret = ndarray.sparse.retain(results, rows) - else: - # list of NDArrays - for result in results: - rows = row_id.as_in_context(result.context) - ret.append(ndarray.sparse.retain(result, rows)) - return ret + raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ + "Trainer is created with it."%self.name) + results = self._check_and_get(arr_list, ctx) # fetch row sparse params from the trainer self._trainer._row_sparse_pull(self, results, row_id) @@ -238,6 +227,9 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx, cast_stype=False): """(Re)initializes by loading from data.""" + if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: + raise RuntimeError("Cannot (Re)initialize Parameter '%s' when its Trainer " \ + "already initialized the parameter on KVStore."%(self.name)) if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): assert self_dim == 0 or self_dim == data_dim, \ diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 453055e09472..188ecedcecec 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -198,7 +198,6 @@ def set_learning_rate(self, lr): def _row_sparse_pull(self, parameter, out, row_id): # initialize kv and params if not already - assert self._kvstore, "Cannot perform row_sparse_pull() when KVStore is not set" if not self._kv_initialized: self._init_kvstore() if self._params_to_init: diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 59f3c0ffbe8b..0285fac8b544 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -48,8 +48,11 @@ def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) row_id = mx.nd.arange(0, 10, ctx=mx.cpu(1)) - assert len(p.list_row_sparse_data(row_id)) == 2 assert len(p.list_grad()) == 2 + # getting row_sparse data without trainer throws an exception + assertRaises(RuntimeError, p.list_row_sparse_data, row_id) + trainer = mx.gluon.Trainer([p], 'sgd') + assert len(p.list_row_sparse_data(row_id)) == 2 weight = 
p.row_sparse_data(row_id) assert weight.context == mx.cpu(1) assert weight.shape == (10, 10) @@ -83,60 +86,63 @@ def test_paramdict(): all_row_ids = mx.nd.arange(0, 10) assert list(params0.keys()) == ['net_w0', 'net_w1'] params0.initialize(ctx=mx.cpu()) + trainer0 = mx.gluon.Trainer(params0, 'sgd') prev_w0 = params0.get('w0').data(mx.cpu()) prev_w1 = params0.get('w1').row_sparse_data(all_row_ids) + params0.save('test_paramdict.params') + # Cannot load parameters if they are already initialized on trainer's kvstore + assertRaises(RuntimeError, params0.load, 'test_paramdict.params', mx.cpu()) + + params1 = gluon.ParameterDict('net_') + params1.get('w0', shape=(10, 10)) + params1.get('w1', shape=(10, 10), stype='row_sparse') + params1.load('test_paramdict.params', mx.cpu()) + trainer1 = mx.gluon.Trainer(params1, 'sgd') - params0.save('test.params') - params0.load('test.params', mx.cpu()) # compare the values before and after save/load - cur_w0 = params0.get('w0').data(mx.cpu()) - cur_w1 = params0.get('w1').row_sparse_data(all_row_ids) + cur_w0 = params1.get('w0').data(mx.cpu()) + cur_w1 = params1.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) - # create a new param dict with dense params, and load from the checkpoint # of sparse & dense params - params1 = gluon.ParameterDict('net_') - params1.get('w0', shape=(10, 10)) - params1.get('w1', shape=(10, 10)) - assertRaises(RuntimeError, params1.load, 'test.params', mx.cpu()) - params1.load('test.params', mx.cpu(), cast_stype=True) + params2 = gluon.ParameterDict('net_') + params2.get('w0', shape=(10, 10)) + params2.get('w1', shape=(10, 10)) + assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) + params2.load('test_paramdict.params', mx.cpu(), cast_stype=True) # compare the values before and after save/load - cur_w0 = params1.get('w0').data(mx.cpu()) - cur_w1 = params1.get('w1').data(mx.cpu()) + cur_w0 = params2.get('w0').data(mx.cpu()) + cur_w1 = params2.get('w1').data(mx.cpu()) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) @with_seed() def test_parameter_row_sparse_data(): - def check_parameter_row_sparse_data(with_trainer): - ctx0 = mx.cpu(1) - ctx1 = mx.cpu(2) - dim0 = 4 - x = gluon.Parameter('x', shape=(dim0, 2), stype='row_sparse') - x.initialize(init='xavier', ctx=[ctx0, ctx1]) - if with_trainer: - trainer = gluon.Trainer([x], 'sgd') - x_param = x._data[0].copy() - assert x_param.stype == 'row_sparse' - row_id_0 = mx.nd.array([0,1], ctx=ctx0) - retained_0 = x.row_sparse_data(row_id_0) - retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) - mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) - assert retained_0.context == ctx0 - row_id_1 = mx.nd.arange(0, dim0, ctx=ctx1) - retained_1 = x.row_sparse_data(row_id_1) - retained_target_1 = x_param - mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) - assert retained_1.context == ctx1 - row_id_2 = mx.nd.array([0,1,2]) - retained_2 = x.list_row_sparse_data(row_id_2) - retained_target_2 = mx.nd.sparse.retain(x_param, row_id_2.as_in_context(ctx0)) - mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) - - check_parameter_row_sparse_data(True) - check_parameter_row_sparse_data(False) + ctx0 = mx.cpu(1) + ctx1 = mx.cpu(2) + 
dim0 = 4 + x = gluon.Parameter('x', shape=(dim0, 2), stype='row_sparse') + x.initialize(init='xavier', ctx=[ctx0, ctx1]) + trainer = gluon.Trainer([x], 'sgd') + x_param = x._data[0].copy() + assert x_param.stype == 'row_sparse' + row_id_0 = mx.nd.array([0,1], ctx=ctx0) + retained_0 = x.row_sparse_data(row_id_0) + retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) + assert retained_0.context == ctx0 + row_id_1 = mx.nd.arange(0, dim0, ctx=ctx1) + retained_1 = x.row_sparse_data(row_id_1) + retained_target_1 = x_param + mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) + assert retained_1.context == ctx1 + row_id_2 = mx.nd.array([0,1,2]) + retained_2 = x.list_row_sparse_data(row_id_2) + retained_target_2 = mx.nd.sparse.retain(x_param, row_id_2.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) + @with_seed() def test_constant(): @@ -1029,12 +1035,12 @@ def test_req(): @with_seed() def test_save_load(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True) - net.save_params('test.params') + net.save_params('test_save_load.params') net = mx.gluon.model_zoo.vision.get_resnet(1, 18) net.output = mx.gluon.nn.Dense(1000) - net.load_params('test.params') + net.load_params('test_save_load.params') @with_seed() def test_symbol_block_save_load(): @@ -1059,10 +1065,10 @@ def hybrid_forward(self, F, x): net1.initialize(mx.init.Normal()) net1.hybridize() net1(mx.nd.random.normal(shape=(1, 3, 32, 32))) - net1.save_params('./test.params') + net1.save_params('./test_symbol_block_save_load.params') net2 = Net() - net2.load_params('./test.params', ctx=mx.cpu()) + net2.load_params('./test_symbol_block_save_load.params', ctx=mx.cpu()) @with_seed() From 4e9ab9c8424bcbda8c424712a293de269bc663ca Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sat, 19 May 2018 00:02:12 +0000 Subject: [PATCH 10/21] add comment --- python/mxnet/gluon/parameter.py | 4 ++-- python/mxnet/gluon/trainer.py | 3 ++- tests/python/unittest/test_gluon.py | 7 ++++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 6360135b439c..8e6cfed94d22 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -214,8 +214,8 @@ def _get_row_sparse(self, arr_list, ctx, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): - raise TypeError("Cannot get 'row_sparse' Parameter %s with row_id = %s. " - "NDArray type is expected." % (self.name, row_id)) + raise TypeError("Cannot get 'row_sparse' Parameter %s with %s type. " + "NDArray type is expected." % (self.name, type(row_id))) if not self._trainer: raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ "Trainer is created with it."%self.name) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 188ecedcecec..b2f29acf0b66 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -343,7 +343,8 @@ def save_states(self, fname): self._init_params() if self._update_on_kvstore: - assert not self._params_to_init, "Cannot save trainer states when some parameters are not yet initialized in kvstore." 
+ assert not self._params_to_init, "Cannot save trainer states when some " \ + "parameters are not yet initialized in kvstore." self._kvstore.save_optimizer_states(fname, dump_optimizer=True) else: with open(fname, 'wb') as fout: diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 0285fac8b544..fcd04298e886 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -84,15 +84,18 @@ def test_paramdict(): params0.get('w0', shape=(10, 10)) params0.get('w1', shape=(10, 10), stype='row_sparse') all_row_ids = mx.nd.arange(0, 10) + # check param names assert list(params0.keys()) == ['net_w0', 'net_w1'] params0.initialize(ctx=mx.cpu()) trainer0 = mx.gluon.Trainer(params0, 'sgd') prev_w0 = params0.get('w0').data(mx.cpu()) prev_w1 = params0.get('w1').row_sparse_data(all_row_ids) + # save params params0.save('test_paramdict.params') - # Cannot load parameters if they are already initialized on trainer's kvstore + # cannot load parameters if they are already initialized on trainer's kvstore assertRaises(RuntimeError, params0.load, 'test_paramdict.params', mx.cpu()) + # load params params1 = gluon.ParameterDict('net_') params1.get('w0', shape=(10, 10)) params1.get('w1', shape=(10, 10), stype='row_sparse') @@ -104,6 +107,7 @@ def test_paramdict(): cur_w1 = params1.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) + # create a new param dict with dense params, and load from the checkpoint # of sparse & dense params params2 = gluon.ParameterDict('net_') @@ -111,6 +115,7 @@ def test_paramdict(): params2.get('w1', shape=(10, 10)) assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) params2.load('test_paramdict.params', mx.cpu(), cast_stype=True) + # compare the values before and after save/load cur_w0 = params2.get('w0').data(mx.cpu()) cur_w1 = params2.get('w1').data(mx.cpu()) From a991e988f3701567ccab9b6e60c801c831f0758a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sat, 19 May 2018 00:10:38 +0000 Subject: [PATCH 11/21] remove print --- python/mxnet/gluon/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index b2f29acf0b66..5422295d45db 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -129,7 +129,6 @@ def _init_params(self): if not param._deferred_init] params_to_init = [param for param in self._params_to_init \ if param._deferred_init] - print('initing ', params) for param in params: param_arrays = param._check_and_get(param._data, list) idx = self._param2idx[param.name] From 468b5999f9787a4088bf48737bb68f99b41d196c Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sat, 19 May 2018 00:19:37 +0000 Subject: [PATCH 12/21] fix doc --- python/mxnet/gluon/block.py | 1 + python/mxnet/gluon/parameter.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 52b385daa14f..206991ccae3e 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -443,6 +443,7 @@ class HybridBlock(Block): Refer `Hybrid tutorial `_ to see the end-to-end usage. 
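+
+    A rough usage sketch (the layer size and input shape here are
+    illustrative only)::
+
+        net = mx.gluon.nn.HybridSequential()
+        net.add(mx.gluon.nn.Dense(10))
+        net.initialize()
+        net.hybridize()  # the first forward call builds and caches the graph
+        out = net(mx.nd.ones((2, 5)))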
+ """ def __init__(self, prefix=None, params=None): # check if any parameter is row_sparse diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 8e6cfed94d22..3b0ecdd56530 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -458,8 +458,6 @@ def list_row_sparse_data(self, row_id): Parameters ---------- - ctx : Context - Desired context. row_id: NDArray Row ids to retain for the 'row_sparse' parameter. @@ -885,6 +883,9 @@ def load(self, filename, ctx=None, allow_missing=False, present in this ParameterDict. restore_prefix : str, default '' prepend prefix to names of stored parameters before loading. + cast_stype: bool, default False + Whether to cast the storage type of parameters from the file to current + Parameter's storage type if stypes mismatch. """ if restore_prefix: for name in self.keys(): From 0f703440f1ca9dafdecc36727aad0fbb80be38c7 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Mon, 21 May 2018 21:19:11 +0000 Subject: [PATCH 13/21] CR comments --- python/mxnet/gluon/block.py | 4 ++-- python/mxnet/gluon/parameter.py | 2 +- tests/python/unittest/test_gluon.py | 14 ++++++++++++-- tests/python/unittest/test_gluon_trainer.py | 6 ++++++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 206991ccae3e..9e5ecfc4ea31 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -452,8 +452,8 @@ def __init__(self, prefix=None, params=None): stype = param._stype if stype != 'default': raise ValueError("Cannot create a HybridBlock with Parameter '%s' " \ - "because its storage type is %s. Please consider " \ - "using a SparseBlock instead."%(param.name, stype)) + "because its storage type is %s. Please use " \ + "a SparseBlock instead."%(param.name, stype)) super(HybridBlock, self).__init__(prefix=prefix, params=params) self._cached_graph = () self._cached_op = None diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 3b0ecdd56530..c323c0c2f6b0 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -171,7 +171,7 @@ def shape(self, new_shape): def _set_trainer(self, trainer): """ Set the trainer this parameter is associated with. """ - if self._trainer and self._trainer is not trainer: + if self._trainer and trainer and self._trainer is not trainer: raise RuntimeError( "Failed to set the trainer for Parameter '%s' to %s because it was set to %s. " \ "More than one trainers for a single Parameter is not supported." 
%( diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index fcd04298e886..c7e348eef522 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -38,11 +38,21 @@ def test_parameter(): assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' assert p.grad(mx.cpu(0)).stype == 'default' - assert p.data().stype == 'default' + assert p.data(mx.cpu(0)).stype == 'default' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] +@with_seed() +@raises(AssertionError) +def test_invalid_parameter_stype(): + p = gluon.Parameter('weight', shape=(10, 10), stype='invalid') + +@with_seed() +@raises(AssertionError) +def test_invalid_parameter_grad_stype(): + p = gluon.Parameter('weight', shape=(10, 10), grad_stype='invalid') + @with_seed() def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') @@ -83,7 +93,7 @@ def test_paramdict(): params0 = gluon.ParameterDict('net_') params0.get('w0', shape=(10, 10)) params0.get('w1', shape=(10, 10), stype='row_sparse') - all_row_ids = mx.nd.arange(0, 10) + all_row_ids = mx.nd.arange(0, 10, ctx=mx.cpu()) # check param names assert list(params0.keys()) == ['net_w0', 'net_w1'] params0.initialize(ctx=mx.cpu()) diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index fca2cb442563..3af0aa8734c5 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -30,7 +30,13 @@ def test_multi_trainer(): x = gluon.Parameter('x', shape=(10,)) x.initialize() + # test set trainer trainer0 = gluon.Trainer([x], 'sgd') + assert(x._trainer is trainer0) + # test unset trainer + x._set_trainer(None) + assert(x._trainer is None) + x._set_trainer(trainer0) # multiple trainers for a single Parameter is not allowed trainer1 = gluon.Trainer([x], 'sgd') From ff9bf84c76567de58679cea80fb58234e0a569e6 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 00:56:04 +0000 Subject: [PATCH 14/21] CR comments --- python/mxnet/gluon/block.py | 8 ------ python/mxnet/gluon/parameter.py | 19 ++++++++------ python/mxnet/gluon/trainer.py | 28 +++++++++++---------- tests/python/unittest/test_gluon.py | 9 ++++--- tests/python/unittest/test_gluon_trainer.py | 4 +-- 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 9e5ecfc4ea31..483c06bccc26 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -446,14 +446,6 @@ class HybridBlock(Block): """ def __init__(self, prefix=None, params=None): - # check if any parameter is row_sparse - if isinstance(params, ParameterDict): - for param in params.values(): - stype = param._stype - if stype != 'default': - raise ValueError("Cannot create a HybridBlock with Parameter '%s' " \ - "because its storage type is %s. Please use " \ - "a SparseBlock instead."%(param.name, stype)) super(HybridBlock, self).__init__(prefix=prefix, params=params) self._cached_graph = () self._cached_op = None diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index c323c0c2f6b0..adbe0243c8ba 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -171,11 +171,12 @@ def shape(self, new_shape): def _set_trainer(self, trainer): """ Set the trainer this parameter is associated with. 
""" - if self._trainer and trainer and self._trainer is not trainer: + # trainer cannot be replaced for sparse params + if self._stype != 'default' and self._trainer and trainer and self._trainer is not trainer: raise RuntimeError( - "Failed to set the trainer for Parameter '%s' to %s because it was set to %s. " \ - "More than one trainers for a single Parameter is not supported." %( - self.name, str(trainer), str(self._trainer))) + "Failed to set the trainer for Parameter '%s' because it was already set. " \ + "More than one trainers for a %s Parameter is not supported." \ + %(self.name, self._stype)) self._trainer = trainer def _check_and_get(self, arr_list, ctx): @@ -214,8 +215,7 @@ def _get_row_sparse(self, arr_list, ctx, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): - raise TypeError("Cannot get 'row_sparse' Parameter %s with %s type. " - "NDArray type is expected." % (self.name, type(row_id))) + raise TypeError("row_id must have NDArray type, but %s is given"%(type(row_id))) if not self._trainer: raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ "Trainer is created with it."%self.name) @@ -228,8 +228,11 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx, cast_stype=False): """(Re)initializes by loading from data.""" if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: - raise RuntimeError("Cannot (Re)initialize Parameter '%s' when its Trainer " \ - "already initialized the parameter on KVStore."%(self.name)) + if self not in self._trainer._params_to_init: + raise RuntimeError("Cannot (re)initialize Parameter '%s' because it was " \ + "already initialized on the trainer's KVStore, and " \ + "trainer._update_on_kvstore is True. Please create a " \ + "new Trainer with this Parameter."%(self.name)) if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): assert self_dim == 0 or self_dim == data_dim, \ diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 5422295d45db..50a6526ef831 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -68,6 +68,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got %s."%(type(params))) self._params = [] + # parameters to initialize on the kvstore self._params_to_init = [] self._contains_sparse = False self._param2idx = {} @@ -118,28 +119,29 @@ def _init_optimizer(self, optimizer, optimizer_params): for _ in self._contexts] def _init_params(self): - """ Initialize parameters in the KVStore. Parameters whose - intiailization is incomplete are ignored. + """Initialize parameters in the KVStore. + + Parameters with incomplete initialization are ignored. + """ assert self._kv_initialized, "Cannot initialize parameters in KVStore " \ "when KVStore is not initialized." 
params_to_init = [] if self._kvstore: - params = [param for param in self._params_to_init \ - if not param._deferred_init] - params_to_init = [param for param in self._params_to_init \ - if param._deferred_init] - for param in params: - param_arrays = param._check_and_get(param._data, list) - idx = self._param2idx[param.name] - self._kvstore.init(idx, param_arrays[0]) - if param._stype == 'default': - self._kvstore.pull(idx, param_arrays, priority=-idx) + for param in self._params_to_init: + if param._deferred_init: + params_to_init.append(param) + else: + param_arrays = param._check_and_get(param._data, list) + idx = self._param2idx[param.name] + self._kvstore.init(idx, param_arrays[0]) + if param._stype == 'default': + self._kvstore.pull(idx, param_arrays, priority=-idx) self._params_to_init = params_to_init def _init_kvstore(self): - """ Create kvstore """ + """Create kvstore.""" arg_arrays = {} if self._contains_sparse: kvstore, update_on_kvstore = _create_sparse_kvstore(self._kvstore) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index c7e348eef522..6c8ff8677aba 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -351,10 +351,13 @@ def test_sparse_symbol_block(): @raises(ValueError) def test_sparse_hybrid_block(): params = gluon.ParameterDict('net_') - params.get('weight', shape=(5, 5), stype='row_sparse') - params.get('bias', shape=(5,)) - # an exception is expected when creating a HybridBlock w/ sparse param + params.get('weight', shape=(5,5), stype='row_sparse', dtype='float32') + params.get('bias', shape=(5,), dtype='float32') net = gluon.nn.Dense(5, params=params) + net.initialize() + x = mx.nd.ones((2,5)) + # an exception is expected when forwarding a HybridBlock w/ sparse param + y = net(x) @with_seed() def check_layer_forward(layer, dshape): diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 3af0aa8734c5..4923149f8cfa 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -28,7 +28,7 @@ @with_seed() @raises(RuntimeError) def test_multi_trainer(): - x = gluon.Parameter('x', shape=(10,)) + x = gluon.Parameter('x', shape=(10,), stype='row_sparse') x.initialize() # test set trainer trainer0 = gluon.Trainer([x], 'sgd') @@ -37,7 +37,7 @@ def test_multi_trainer(): x._set_trainer(None) assert(x._trainer is None) x._set_trainer(trainer0) - # multiple trainers for a single Parameter is not allowed + # multiple trainers for a sparse Parameter is not allowed trainer1 = gluon.Trainer([x], 'sgd') @with_seed() From bee6774da08f169f4f4a1823bdb390a0e99b1c6a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 01:01:51 +0000 Subject: [PATCH 15/21] change error --- python/mxnet/gluon/parameter.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index adbe0243c8ba..e7ba794c9cda 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -449,9 +449,9 @@ def row_sparse_data(self, row_id): NDArray on row_id's context """ if self._stype != 'row_sparse': - raise ValueError("Cannot return a copy of Parameter %s via row_sparse_data() " \ - "because its storage type is %s. Please use data() instead." \ - %(self.name, self._stype)) + raise RuntimeError("Cannot return a copy of Parameter %s via row_sparse_data() " \ + "because its storage type is %s. 
Please use data() instead." \ + %(self.name, self._stype)) return self._get_row_sparse(self._data, row_id.context, row_id) def list_row_sparse_data(self, row_id): @@ -469,9 +469,9 @@ def list_row_sparse_data(self, row_id): list of NDArrays """ if self._stype != 'row_sparse': - raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ - "list_row_sparse_data() because its storage type is %s. Please " \ - "use data() instead." % (self.name, self._stype)) + raise RuntimeError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_row_sparse_data() because its storage type is %s. Please " \ + "use data() instead." % (self.name, self._stype)) return self._get_row_sparse(self._data, list, row_id) def data(self, ctx=None): @@ -489,9 +489,9 @@ def data(self, ctx=None): NDArray on ctx """ if self._stype != 'default': - raise ValueError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ - "because its storage type is %s. Please use row_sparse_data() " \ - "instead." % (self.name, str(ctx), self._stype)) + raise RuntimeError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ + "because its storage type is %s. Please use row_sparse_data() " \ + "instead." % (self.name, str(ctx), self._stype)) return self._check_and_get(self._data, ctx) def list_data(self): @@ -504,9 +504,9 @@ def list_data(self): list of NDArrays """ if self._stype != 'default': - raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ - "list_data() because its storage type is %s. Please use " \ - "row_sparse_data() instead." % (self.name, self._stype)) + raise RuntimeError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_data() because its storage type is %s. Please use " \ + "row_sparse_data() instead." % (self.name, self._stype)) return self._check_and_get(self._data, list) def grad(self, ctx=None): From 077b7a5ea15169c00f35d5df75fb2d9e7c651548 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 03:12:18 +0000 Subject: [PATCH 16/21] remove cast stype --- python/mxnet/gluon/parameter.py | 17 ++++------------- tests/python/unittest/test_gluon.py | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index e7ba794c9cda..10d2123295d5 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -225,7 +225,7 @@ def _get_row_sparse(self, arr_list, ctx, row_id): self._trainer._row_sparse_pull(self, results, row_id) return results - def _load_init(self, data, ctx, cast_stype=False): + def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: if self not in self._trainer._params_to_init: @@ -246,13 +246,7 @@ def _load_init(self, data, ctx, cast_stype=False): "dtype incompatible expected %s vs saved %s"%( self.name, str(self.dtype), str(data.dtype)) if self._stype != data.stype: - if not cast_stype: - raise RuntimeError("Failed loading Parameter '%s' from saved params: storage " \ - "type incompatible expected %s vs saved %s. 
Set " \ - "cast_stype=True to cast saved params to the same stype " \ - "as '%s'."%(self.name, self._stype, data.stype, self.name)) - else: - data = data.tostype(self._stype) + data = data.tostype(self._stype) if isinstance(ctx, Context): ctx = [ctx] if self._data is None: @@ -872,7 +866,7 @@ def save(self, filename, strip_prefix=''): ndarray.save(filename, arg_dict) def load(self, filename, ctx=None, allow_missing=False, - ignore_extra=False, restore_prefix='', cast_stype=False): + ignore_extra=False, restore_prefix=''): """Load parameters from file. filename : str @@ -886,9 +880,6 @@ def load(self, filename, ctx=None, allow_missing=False, present in this ParameterDict. restore_prefix : str, default '' prepend prefix to names of stored parameters before loading. - cast_stype: bool, default False - Whether to cast the storage type of parameters from the file to current - Parameter's storage type if stypes mismatch. """ if restore_prefix: for name in self.keys(): @@ -913,4 +904,4 @@ def load(self, filename, ctx=None, allow_missing=False, "Please make sure source and target networks have the same prefix."%( name[lprefix:], filename, _brief_print_list(self._params.keys())) continue - self[name]._load_init(arg_dict[name], ctx, cast_stype=cast_stype) + self[name]._load_init(arg_dict[name], ctx) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 6c8ff8677aba..1d3aaf6036a5 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -124,7 +124,7 @@ def test_paramdict(): params2.get('w0', shape=(10, 10)) params2.get('w1', shape=(10, 10)) assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) - params2.load('test_paramdict.params', mx.cpu(), cast_stype=True) + params2.load('test_paramdict.params', mx.cpu()) # compare the values before and after save/load cur_w0 = params2.get('w0').data(mx.cpu()) From 6038fe9513b4bbdc80c9850fc24a3e3a0a8e5afc Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 03:15:34 +0000 Subject: [PATCH 17/21] fix test --- tests/python/unittest/test_gluon.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 1d3aaf6036a5..88eddcdc1456 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -79,14 +79,14 @@ def test_parameter_invalid_access(): # cannot call data on row_sparse parameters p0 = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p0.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assertRaises(ValueError, p0.data) - assertRaises(ValueError, p0.list_data) + assertRaises(RuntimeError, p0.data) + assertRaises(RuntimeError, p0.list_data) row_id = mx.nd.arange(0, 10) # cannot call row_sparse_data on dense parameters p1 = gluon.Parameter('weight', shape=(10, 10)) p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assertRaises(ValueError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) - assertRaises(ValueError, p1.list_row_sparse_data, row_id) + assertRaises(RuntimeError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) + assertRaises(RuntimeError, p1.list_row_sparse_data, row_id) @with_seed() def test_paramdict(): @@ -123,7 +123,6 @@ def test_paramdict(): params2 = gluon.ParameterDict('net_') params2.get('w0', shape=(10, 10)) params2.get('w1', shape=(10, 10)) - assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) params2.load('test_paramdict.params', 
mx.cpu()) # compare the values before and after save/load @@ -348,7 +347,7 @@ def test_sparse_symbol_block(): net = gluon.SymbolBlock(out, data) @with_seed() -@raises(ValueError) +@raises(RuntimeError) def test_sparse_hybrid_block(): params = gluon.ParameterDict('net_') params.get('weight', shape=(5,5), stype='row_sparse', dtype='float32') From 70de567d1cf496f8b359b7fc9bdd382f9907de55 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 24 May 2018 21:24:31 +0000 Subject: [PATCH 18/21] add reset kvstore to trainer --- python/mxnet/gluon/parameter.py | 11 +++--- python/mxnet/gluon/trainer.py | 38 +++++++++++++-------- tests/python/unittest/test_gluon.py | 2 -- tests/python/unittest/test_gluon_trainer.py | 24 +++++++++++++ 4 files changed, 52 insertions(+), 23 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 10d2123295d5..3265fef2b6c0 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -227,12 +227,6 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" - if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: - if self not in self._trainer._params_to_init: - raise RuntimeError("Cannot (re)initialize Parameter '%s' because it was " \ - "already initialized on the trainer's KVStore, and " \ - "trainer._update_on_kvstore is True. Please create a " \ - "new Trainer with this Parameter."%(self.name)) if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): assert self_dim == 0 or self_dim == data_dim, \ @@ -425,6 +419,11 @@ def set_data(self, data): self._deferred_init = self._deferred_init[:3] + (data,) return + # if update_on_kvstore, we need to make sure the copy stored in kvstore is in sync + if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: + if self not in self._trainer._params_to_init: + self._trainer._reset_kvstore() + for arr in self._check_and_get(self._data, list): arr[:] = data diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 50a6526ef831..ef20109021aa 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -69,7 +69,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "got %s."%(type(params))) self._params = [] # parameters to initialize on the kvstore - self._params_to_init = [] self._contains_sparse = False self._param2idx = {} for i, param in enumerate(params): @@ -79,7 +78,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "got list of %s."%(type(param))) self._param2idx[param.name] = i self._params.append(param) - self._params_to_init.append(param) param._set_trainer(self) if param._stype != 'default': self._contains_sparse = True @@ -88,9 +86,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', self._scale = float(optimizer_params.get('rescale_grad', 1.0)) self._contexts = self._check_contexts() self._init_optimizer(optimizer, optimizer_params) + self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore} self._kv_initialized = False - self._kvstore = kvstore - self._update_on_kvstore = update_on_kvstore + self._kvstore = None + self._update_on_kvstore = None + self._params_to_init = [] + self._reset_kvstore() def _check_contexts(self): contexts = None @@ -140,30 +141,37 @@ def _init_params(self): self._params_to_init = params_to_init + def 
From 70de567d1cf496f8b359b7fc9bdd382f9907de55 Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Thu, 24 May 2018 21:24:31 +0000
Subject: [PATCH 18/21] add reset kvstore to trainer

---
 python/mxnet/gluon/parameter.py             | 11 +++---
 python/mxnet/gluon/trainer.py               | 38 +++++++++++++--------
 tests/python/unittest/test_gluon.py         |  2 --
 tests/python/unittest/test_gluon_trainer.py | 24 +++++++++++++
 4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index 10d2123295d5..3265fef2b6c0 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -227,12 +227,6 @@ def _get_row_sparse(self, arr_list, ctx, row_id):
 
     def _load_init(self, data, ctx):
         """(Re)initializes by loading from data."""
-        if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore:
-            if self not in self._trainer._params_to_init:
-                raise RuntimeError("Cannot (re)initialize Parameter '%s' because it was " \
-                                   "already initialized on the trainer's KVStore, and " \
-                                   "trainer._update_on_kvstore is True. Please create a " \
-                                   "new Trainer with this Parameter."%(self.name))
         if self.shape:
             for self_dim, data_dim in zip(self.shape, data.shape):
                 assert self_dim == 0 or self_dim == data_dim, \
@@ -425,6 +419,11 @@ def set_data(self, data):
             self._deferred_init = self._deferred_init[:3] + (data,)
             return
 
+        # if update_on_kvstore, we need to make sure the copy stored in kvstore is in sync
+        if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore:
+            if self not in self._trainer._params_to_init:
+                self._trainer._reset_kvstore()
+
         for arr in self._check_and_get(self._data, list):
             arr[:] = data
 
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index 50a6526ef831..ef20109021aa 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -69,7 +69,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
                             "got %s."%(type(params)))
         self._params = []
         # parameters to initialize on the kvstore
-        self._params_to_init = []
         self._contains_sparse = False
         self._param2idx = {}
         for i, param in enumerate(params):
@@ -79,7 +78,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
                                 "got list of %s."%(type(param)))
             self._param2idx[param.name] = i
             self._params.append(param)
-            self._params_to_init.append(param)
             param._set_trainer(self)
             if param._stype != 'default':
                 self._contains_sparse = True
@@ -88,9 +86,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
         self._scale = float(optimizer_params.get('rescale_grad', 1.0))
         self._contexts = self._check_contexts()
         self._init_optimizer(optimizer, optimizer_params)
+        self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore}
         self._kv_initialized = False
-        self._kvstore = kvstore
-        self._update_on_kvstore = update_on_kvstore
+        self._kvstore = None
+        self._update_on_kvstore = None
+        self._params_to_init = []
+        self._reset_kvstore()
 
     def _check_contexts(self):
         contexts = None
@@ -140,30 +141,37 @@ def _init_params(self):
         self._params_to_init = params_to_init
 
+    def _reset_kvstore(self):
+        """Reset kvstore."""
+        if self._kvstore and 'dist' in self._kvstore.type:
+            raise RuntimeError("Cannot reset distributed KVStore.")
+        self._kv_initialized = False
+        self._kvstore = None
+        self._update_on_kvstore = None
+        self._params_to_init = [param for param in self._params]
+
     def _init_kvstore(self):
         """Create kvstore."""
         arg_arrays = {}
+        config = self._kvstore_params
         if self._contains_sparse:
-            kvstore, update_on_kvstore = _create_sparse_kvstore(self._kvstore)
+            kvstore, update_on_kvstore = _create_sparse_kvstore(config['kvstore'])
             # update_on_kvstore is set to False by the user
-            if self._update_on_kvstore is False:
+            if config['update_on_kvstore'] is False:
                 raise RuntimeError("Cannot set update_on_kvstore to False when sparse "
                                    "gradients and/or sparse weights are present for "
-                                   "Parameter %s." % param.name)
+                                   "one or more Parameters.")
         else:
-            kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts),
+            kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._contexts),
                                                          arg_arrays)
-            update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \
-                                else update_on_kvstore
+            if config['update_on_kvstore'] is not None:
+                update_on_kvstore = config['update_on_kvstore']
         if kvstore:
             if self._compression_params:
                 kvstore.set_gradient_compression(self._compression_params)
             # kv.pull(row_sparse_grad) is not supported
-            if self._contains_sparse:
-                update_on_kvstore = True
-            else:
-                if 'dist' in kvstore.type:
-                    update_on_kvstore = False
+            if 'dist' in kvstore.type and not self._contains_sparse:
+                update_on_kvstore = False
             if update_on_kvstore:
                 # optimizer preferably needs to be set before init for multiprecision
                 kvstore.set_optimizer(self._optimizer)
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 88eddcdc1456..82bad5baeb74 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -102,8 +102,6 @@ def test_paramdict():
     prev_w1 = params0.get('w1').row_sparse_data(all_row_ids)
     # save params
     params0.save('test_paramdict.params')
-    # cannot load parameters if they are already initialized on trainer's kvstore
-    assertRaises(RuntimeError, params0.load, 'test_paramdict.params', mx.cpu())
 
     # load params
     params1 = gluon.ParameterDict('net_')
diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py
index 4923149f8cfa..c2e11ebb18ee 100644
--- a/tests/python/unittest/test_gluon_trainer.py
+++ b/tests/python/unittest/test_gluon_trainer.py
@@ -174,3 +174,27 @@ def test_trainer_save_load():
     x.lr_mult = 2.0
     # check if parameter dict is correctly associated with optimizer after load_state
     assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2
+
+@with_seed()
+def test_trainer_reset_kv():
+    params = gluon.ParameterDict()
+    x = params.get('x', shape=(10,), lr_mult=1.0)
+    params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
+    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1})
+    params.save('test_trainer_reset_kv.params')
+    with mx.autograd.record():
+        for w in x.list_data():
+            y = w + 1
+            y.backward()
+    trainer.step(1)
+    # load would reset kvstore
+    params.load('test_trainer_reset_kv.params')
+    assert trainer._kvstore is None
+    assert trainer._kv_initialized is False
+    with mx.autograd.record():
+        for w in x.list_data():
+            y = w + 1
+            y.backward()
+    trainer.step(1)
+    # the updated parameter should be based on the loaded checkpoint
+    assert (x.data(mx.cpu()) == -0.2).asnumpy().all()
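Design note on this commit: instead of refusing to re-load parameters that already live on a trainer's kvstore (the RuntimeError dropped from _load_init above), set_data() now asks the trainer to reset its kvstore, and the kvstore is rebuilt lazily on the next step() from the freshly loaded values. Distilled from test_trainer_reset_kv (a sketch for a local, non-distributed kvstore; the checkpoint name is arbitrary):

    import mxnet as mx
    from mxnet import gluon

    params = gluon.ParameterDict()
    x = params.get('x', shape=(10,))
    params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1})
    params.save('ckpt.params')

    with mx.autograd.record():
        for w in x.list_data():
            (w + 1).backward()
    trainer.step(1)             # the first step creates and fills the kvstore

    params.load('ckpt.params')  # set_data() -> trainer._reset_kvstore()
    assert trainer._kvstore is None
    assert trainer._kv_initialized is False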
From 12a8b59913e9a30044d4b982bdc6ea74f45c74f9 Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Thu, 24 May 2018 23:09:02 +0000
Subject: [PATCH 19/21] lint

---
 tests/nightly/dist_sync_kvstore.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index 3bf5cbffa13a..de49b5822fa4 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -357,13 +357,13 @@ def check_init(kv, cur_keys, cur_shape, device=False):
     parser.add_argument('--no-gpu', dest='gpu', action='store_false')
     parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false')
     opt = parser.parse_args()
-    if opt.type == 'all' or opt.type == 'init': 
+    if opt.type == 'all' or opt.type == 'init':
         test_sync_init(opt.gpu)
     kv = init_kv()
-    if opt.type == 'all' or opt.type == 'default': 
+    if opt.type == 'all' or opt.type == 'default':
         kv = set_optimizer(use_multiprecision=opt.multiprecision)
         test_sync_push_pull(opt.nrepeat)
     # dont run non compressed tests after this as kvstore compression will be set here
-    if opt.type == 'all' or opt.type == 'compressed': 
+    if opt.type == 'all' or opt.type == 'compressed':
         kv, threshold = init_kv_compressed(kv)
         test_sync_2bit_compression(threshold, opt.nrepeat)

From 2a06884657ade6b69bfbceb8ab618739f5bca370 Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Thu, 24 May 2018 23:33:16 +0000
Subject: [PATCH 20/21] add test to CI

---
 ci/docker/runtime_functions.sh     |  1 +
 tests/nightly/dist_sync_kvstore.py | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7abe767c8691..10bca17b5ff4 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -544,6 +544,7 @@ integrationtest_ubuntu_gpu_dist_kvstore() {
     ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py
     ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision
     ../../tools/launch.py -n 7 --launcher local python dist_device_sync_kvstore.py
+    ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon
 }
 
 test_ubuntu_cpu_python2() {
diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index de49b5822fa4..32fb6a5ff077 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -24,7 +24,7 @@
 import mxnet as mx
 import numpy as np
 import numpy.random as rnd
-from mxnet.test_utils import assert_almost_equal
+from mxnet.test_utils import assert_almost_equal, assert_exception
 from test_kvstore import compute_expected_2bit_quantization
 
 def check_diff(A, x, rank=None):
@@ -350,6 +350,23 @@ def check_init(kv, cur_keys, cur_shape, device=False):
         check_init(kv, init_test_keys_device_big, big_shape, device=True)
     print('worker ' + str(kv.rank) + ' is initialized')
 
+def test_gluon_trainer_reset():
+    params = mx.gluon.ParameterDict()
+    x = params.get('x', shape=(4, 2), lr_mult=1.0, stype='row_sparse')
+    params.initialize(ctx=mx.cpu(0), init='zeros')
+    trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
+    params.save('test_gluon_trainer_reset_' + str(my_rank) + '.params')
+    row_id = mx.nd.arange(0, 4)
+    w = x.row_sparse_data(row_id)
+    with mx.autograd.record():
+        y = w + 1
+        y.backward()
+    trainer.step(1)
+    # load would fail to reset kvstore since update_on_kvstore is True
+    assert_exception(params.load, RuntimeError, 'test_gluon_trainer_reset_' + str(my_rank) + '.params')
+    print('worker ' + str(my_rank) + ' passed test_gluon_trainer_reset')
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='test distributed kvstore in dist_sync mode')
     parser.add_argument('--nrepeat', type=int, default=7)
@@ -357,13 +374,18 @@ def check_init(kv, cur_keys, cur_shape, device=False):
     parser.add_argument('--no-gpu', dest='gpu', action='store_false')
     parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false')
     opt = parser.parse_args()
+    if opt.type == 'gluon':
+        kv = init_kv()
+        my_rank = kv.rank
+        test_gluon_trainer_reset()
     if opt.type == 'all' or opt.type == 'init':
         test_sync_init(opt.gpu)
-    kv = init_kv()
     if opt.type == 'all' or opt.type == 'default':
+        kv = init_kv()
         kv = set_optimizer(use_multiprecision=opt.multiprecision)
         test_sync_push_pull(opt.nrepeat)
     # dont run non compressed tests after this as kvstore compression will be set here
     if opt.type == 'all' or opt.type == 'compressed':
+        kv = init_kv()
         kv, threshold = init_kv_compressed(kv)
         test_sync_2bit_compression(threshold, opt.nrepeat)
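The distributed variant exercises the guard in _reset_kvstore(): a kvstore whose type contains 'dist' cannot be torn down safely, so params.load() on a worker raises instead of silently desynchronizing the cluster. Roughly, on one worker of a launched dist_sync job (a sketch only; it assumes the tools/launch.py setup from the CI hunk above, and the checkpoint path is arbitrary):

    import mxnet as mx

    kv = mx.kv.create('dist_sync')
    params = mx.gluon.ParameterDict()
    x = params.get('x', shape=(4, 2), stype='row_sparse')
    params.initialize(ctx=mx.cpu(0), init='zeros')
    trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
    params.save('ckpt_%d.params' % kv.rank)

    # Pulling row_sparse data through the trainer initializes the kvstore
    # with update_on_kvstore=True.
    w = x.row_sparse_data(mx.nd.arange(0, 4))

    try:
        params.load('ckpt_%d.params' % kv.rank)  # set_data() tries to reset
    except RuntimeError:
        print('worker %d: distributed kvstore cannot be reset' % kv.rank)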
From 01b3e4d1b6d8d8157157e08adaa5436f1a03fc1a Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Sat, 26 May 2018 03:32:27 +0000
Subject: [PATCH 21/21] add more checks

---
 tests/nightly/dist_sync_kvstore.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index 32fb6a5ff077..32ed2dddb6fb 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -358,10 +358,7 @@ def test_gluon_trainer_reset():
     params.save('test_gluon_trainer_reset_' + str(my_rank) + '.params')
     row_id = mx.nd.arange(0, 4)
     w = x.row_sparse_data(row_id)
-    with mx.autograd.record():
-        y = w + 1
-        y.backward()
-    trainer.step(1)
+    assert trainer._kv_initialized and trainer._update_on_kvstore
     # load would fail to reset kvstore since update_on_kvstore is True
     assert_exception(params.load, RuntimeError, 'test_gluon_trainer_reset_' + str(my_rank) + '.params')
     print('worker ' + str(my_rank) + ' passed test_gluon_trainer_reset')
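Taken together, the series leaves Trainer._init_kvstore() resolving update_on_kvstore as follows. The helper below is a paraphrase for reference, not code from the patch; framework_default stands for whatever _create_kvstore() would pick for the given device configuration:

    def resolve_update_on_kvstore(contains_sparse, user_setting, kvstore_type,
                                  framework_default):
        # Sparse weights or sparse gradients force updates onto the kvstore;
        # _create_sparse_kvstore() always returns update_on_kvstore=True.
        if contains_sparse:
            if user_setting is False:
                raise RuntimeError("update_on_kvstore=False is incompatible "
                                   "with sparse weights or sparse gradients")
            return True
        update = framework_default if user_setting is None else user_setting
        # Dense distributed training keeps updates local
        # (kv.pull of row_sparse gradients is not supported).
        if 'dist' in kvstore_type:
            update = False
        return update

For example, resolve_update_on_kvstore(True, None, 'device', True) yields True, while resolve_update_on_kvstore(False, None, 'dist_sync', True) yields False, matching the branches in the final _init_kvstore().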