From f9e160cf6c953177ebc7cfd8e7fb46bff0f82d79 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sun, 13 May 2018 17:10:39 +0000 Subject: [PATCH 1/7] draft --- python/mxnet/gluon/nn/basic_layers.py | 7 +++--- python/mxnet/gluon/parameter.py | 15 ++++++++++--- src/operator/tensor/init_op.h | 4 ++-- tests/python/unittest/test_gluon.py | 31 +++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index d86c3e6ce4f3..b3ad32df3494 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -390,13 +390,14 @@ class Embedding(HybridBlock): - **out**: N-D tensor with shape: `(x1, x2, ..., xN-1, output_dim)`. """ def __init__(self, input_dim, output_dim, dtype='float32', - weight_initializer=None, **kwargs): + weight_initializer=None, sparse_grad=False, **kwargs): super(Embedding, self).__init__(**kwargs) + grad_stype = 'row_sparse' if sparse_grad else 'default' self._kwargs = {'input_dim': input_dim, 'output_dim': output_dim, - 'dtype': dtype} + 'dtype': dtype, 'sparse_grad': sparse_grad} self.weight = self.params.get('weight', shape=(input_dim, output_dim), init=weight_initializer, dtype=dtype, - allow_deferred_init=True) + allow_deferred_init=True, grad_stype=grad_stype) def hybrid_forward(self, F, x, weight): return F.Embedding(x, weight, name='fwd', **self._kwargs) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 320b376fe0b0..cb8313239478 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -81,6 +81,8 @@ class Parameter(object): Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult. init : Initializer, default None Initializer of this parameter. Will use the global initializer by default. + grad_stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'. + The storage type of the parameter's gradient. 
Attributes ---------- @@ -97,7 +99,7 @@ class Parameter(object): """ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, lr_mult=1.0, wd_mult=1.0, init=None, allow_deferred_init=False, - differentiable=True): + differentiable=True, grad_stype=None): self._var = None self._data = None self._grad = None @@ -114,6 +116,11 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self.wd_mult = wd_mult self.grad_req = grad_req self.init = init + grad_stype = 'default' if grad_stype is None else grad_stype + assert grad_stype in ['default', 'row_sparse', 'csr'], \ + "grad_stype must be one of 'default', 'row_sparse', or 'csr', but got '%s'"%grad_stype + self._grad_stype = grad_stype + def __repr__(self): s = 'Parameter {name} (shape={shape}, dtype={dtype})' @@ -261,7 +268,9 @@ def _init_grad(self): self._grad = None return - self._grad = [ndarray.zeros_like(i) for i in self._data] + self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, + stype=self._grad_stype) for i in self._data] + autograd.mark_variables(self.list_data(), self.list_grad(), self.grad_req) def _reduce(self): @@ -431,7 +440,7 @@ def zero_grad(self): if self._grad is None: return for i in self._grad: - i[:] = 0 + ndarray.zeros_like(i, out=i) def var(self): """Returns a symbol representing this parameter.""" diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 0c74cac2dca5..6c9660556371 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -383,8 +383,8 @@ void FillComputeZerosEx(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); CHECK_EQ(outputs.size(), 1); auto stype = outputs[0].storage_type(); - if (req[0] == kNullOp) return; - CHECK_EQ(req[0], kWriteTo) << "kWriteTo is expected for FillComputeZerosEx"; + // x + 0 == x + if (req[0] == kNullOp || req[0] == kAddTo) return; if (stype == kRowSparseStorage) { FillZerosRspImpl(s, outputs[0]); } else if (stype == kCSRStorage) { diff --git 
a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 350f88564360..ccc378afd6ee 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -38,6 +38,21 @@ def test_parameter(): assert p.data(mx.cpu(1)).context == mx.cpu(1) assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' + assert p.grad().stype == 'default' + + p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) + assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] + +@with_seed() +def test_sparse_parameter(): + p = gluon.Parameter('weight', shape=(10, 10), grad_stype='row_sparse') + p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) + assert len(p.list_data()) == 2 + assert len(p.list_grad()) == 2 + assert p.data(mx.cpu(1)).context == mx.cpu(1) + assert p.data(mx.cpu(0)).shape == (10, 10) + assert p.var().name == 'weight' + assert p.grad().stype == 'row_sparse' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] @@ -960,6 +975,7 @@ def test_req(): assert_almost_equal(grad * 2, grad_double) +@with_seed() def test_save_load(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True) net.save_params('test.params') @@ -970,6 +986,7 @@ def test_save_load(): net.load_params('test.params') +@with_seed() def test_hybrid_multi_context(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18) net.initialize(ctx=[mx.cpu(0), mx.cpu(1)]) @@ -977,6 +994,20 @@ def test_hybrid_multi_context(): net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() +@with_seed() +def test_zero_grad(): + data = mx.nd.random.uniform(shape=(3,3)) + net = nn.HybridSequential() + net.add(nn.Embedding(3, 4, sparse_grad=True)) + net.initialize() + with mx.autograd.record(): + l = net(data) + l.backward() + net.collect_params().zero_grad() + grad = net.collect_params()['embedding0_weight'].grad(mx.cpu()) + assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0) + + if __name__ == '__main__': import nose nose.runmodule() From 
200dc6fe837e3ba038aba82969c79ebbc6e0a3b8 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sun, 13 May 2018 17:35:37 +0000 Subject: [PATCH 2/7] update test --- tests/python/unittest/test_gluon.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index ccc378afd6ee..ef3c56dfbc15 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -38,7 +38,7 @@ def test_parameter(): assert p.data(mx.cpu(1)).context == mx.cpu(1) assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' - assert p.grad().stype == 'default' + assert p.grad(mx.cpu(0)).stype == 'default' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] @@ -52,7 +52,7 @@ def test_sparse_parameter(): assert p.data(mx.cpu(1)).context == mx.cpu(1) assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' - assert p.grad().stype == 'row_sparse' + assert p.grad(mx.cpu(0)).stype == 'row_sparse' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] @@ -674,15 +674,17 @@ def test_global_norm_clip(): @with_seed() def test_embedding(): - layer = gluon.nn.Embedding(10, 100) - layer.initialize() - x = mx.nd.array([3,4,2,0,1]) - with mx.autograd.record(): - y = layer(x) - y.backward() - assert (layer.weight.grad()[:5] == 1).asnumpy().all() - assert (layer.weight.grad()[5:] == 0).asnumpy().all() - + def check_embedding(sparse_grad): + layer = gluon.nn.Embedding(10, 100, sparse_grad=sparse_grad) + layer.initialize() + x = mx.nd.array([3,4,2,0,1]) + with mx.autograd.record(): + y = layer(x) + y.backward() + assert (layer.weight.grad().asnumpy()[:5] == 1).all() + assert (layer.weight.grad().asnumpy()[5:] == 0).all() + check_embedding(True) + check_embedding(False) @@ -960,6 +975,7 @@ def test_req(): assert_almost_equal(grad * 2, grad_double) +@with_seed() def test_save_load(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True) net.save_params('test.params') @@ -970,6 +986,7 @@ def test_save_load(): net.load_params('test.params') +@with_seed() def test_hybrid_multi_context(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18) net.initialize(ctx=[mx.cpu(0), mx.cpu(1)]) @@ -977,6 +994,20 @@ def test_hybrid_multi_context(): net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() @with_seed() def test_export(): @@ -1004,7 +1006,7 @@ def test_zero_grad(): l = net(data) l.backward() 
net.collect_params().zero_grad() - grad = net.collect_params()['embedding0_weight'].grad(mx.cpu()) + grad = net.collect_params()['embedding0_weight'].grad() assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0) From 9c84de813ec4a1cf3eeca5977dd1538b786a1f48 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sun, 13 May 2018 18:11:44 +0000 Subject: [PATCH 3/7] fix kvstore --- python/mxnet/gluon/trainer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 5ae0e46b7dc3..a1a698b7e709 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -106,14 +106,23 @@ def _init_optimizer(self, optimizer, optimizer_params): for _ in self._contexts] def _init_kvstore(self): - arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} + arg_arrays = {} + contains_sparse = False + for param in self._params: + arg_arrays[param.name] = param.data(self._contexts[0]) + if param._grad_stype != 'default': + contains_sparse = True kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), arg_arrays) if kvstore: if self._compression_params: kvstore.set_gradient_compression(self._compression_params) - if 'dist' in kvstore.type: - update_on_kvstore = False + # kv.pull(row_sparse_grad) is not supported + if contains_sparse: + update_on_kvstore = True + else: + if 'dist' in kvstore.type: + update_on_kvstore = False if update_on_kvstore: kvstore.set_optimizer(self._optimizer) # optimizer preferably needs to be set before init for multiprecision From 943e04eadccca6cb3bd43ea71ff356534f2ecb6e Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sun, 13 May 2018 18:15:16 +0000 Subject: [PATCH 4/7] fix lint --- python/mxnet/gluon/parameter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index cb8313239478..5c04493a6eb1 100644 --- 
a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -269,7 +269,7 @@ def _init_grad(self): return self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, - stype=self._grad_stype) for i in self._data] + stype=self._grad_stype) for i in self._data] autograd.mark_variables(self.list_data(), self.list_grad(), self.grad_req) From 6a64ac5d62ad8cb546d343492aa7b990481551bb Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sun, 13 May 2018 21:58:07 +0000 Subject: [PATCH 5/7] fix test --- tests/python/unittest/test_gluon.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index ef3c56dfbc15..1ada6d4b2c74 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -999,14 +999,13 @@ def test_hybrid_multi_context(): @with_seed() def test_zero_grad(): data = mx.nd.random.uniform(shape=(3,3)) - net = nn.HybridSequential() - net.add(nn.Embedding(3, 4, sparse_grad=True)) + net = nn.Embedding(3, 4, sparse_grad=True, prefix='test_zero_grad_') net.initialize() with mx.autograd.record(): l = net(data) l.backward() net.collect_params().zero_grad() - grad = net.collect_params()['embedding0_weight'].grad() + grad = net.collect_params()['test_zero_grad_weight'].grad() assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0) From 9c1aba37dfdd6f7630397617e9fa1c290679ec45 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Tue, 15 May 2018 00:15:42 +0000 Subject: [PATCH 6/7] add proper error msg --- python/mxnet/gluon/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 232712297959..e1e06be88b9a 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -116,6 +116,10 @@ def _init_kvstore(self): arg_arrays[param.name] = param.data(self._contexts[0]) if param._grad_stype != 'default': contains_sparse = True + # 
update_on_kvstore is set to False by the user + if self._update_on_kvstore is False: + raise RuntimeError("Cannot set update_on_kvstore when sparse gradients " + "and/or parameters are present.") kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), arg_arrays) update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \ From 605bba686af82adab355268e2097e244c90c521a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Tue, 15 May 2018 18:32:38 +0000 Subject: [PATCH 7/7] CR comment --- python/mxnet/gluon/nn/basic_layers.py | 3 ++- python/mxnet/gluon/parameter.py | 6 +++--- python/mxnet/gluon/trainer.py | 5 +++-- src/kvstore/kvstore_local.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index b3ad32df3494..abde51b433af 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -381,7 +381,8 @@ class Embedding(HybridBlock): Data type of output embeddings. weight_initializer : Initializer Initializer for the `embeddings` matrix. - + sparse_grad: bool + If True, gradient w.r.t. weight will be a 'row_sparse' NDArray. Inputs: - **data**: (N-1)-D tensor with shape: `(x1, x2, ..., xN-1)`. 
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 5c04493a6eb1..c7cbcccc95ec 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -99,7 +99,7 @@ class Parameter(object): """ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, lr_mult=1.0, wd_mult=1.0, init=None, allow_deferred_init=False, - differentiable=True, grad_stype=None): + differentiable=True, grad_stype='default'): self._var = None self._data = None self._grad = None @@ -116,9 +116,9 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self.wd_mult = wd_mult self.grad_req = grad_req self.init = init - grad_stype = 'default' if grad_stype is None else grad_stype assert grad_stype in ['default', 'row_sparse', 'csr'], \ - "grad_stype must be one of 'default', 'row_sparse', or 'csr', but got '%s'"%grad_stype + "grad_stype for Parameter '%s' must be one of 'default', 'row_sparse', or 'csr'," \ + " but got '%s'" % (name, grad_stype) self._grad_stype = grad_stype diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index e1e06be88b9a..39c4a1fd6104 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -118,8 +118,9 @@ def _init_kvstore(self): contains_sparse = True # update_on_kvstore is set to False by the user if self._update_on_kvstore is False: - raise RuntimeError("Cannot set update_on_kvstore when sparse gradients " - "and/or parameters are present.") + raise RuntimeError("Cannot set update_on_kvstore to False when sparse " + "gradients and/or sparse weights are present for " + "Parameter %s." 
% param.name) kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), arg_arrays) update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \ diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 2ac6c11a1675..38ecf121dfeb 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -276,7 +276,7 @@ class KVStoreLocal : public KVStore { // invalid, print warning messages once if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) { LOG(INFO) << "Warning: non-default weights detected during kvstore pull. " - "This call has been ignored. Please make sure to use" + "This call has been ignored. Please make sure to use " "kv.row_sparse_pull() or module.prepare() with row_ids."; this->warnings_printed_.insert(key); }