From 2863a1f3cf8bedfcc3a856e6820a23b620cc6ac7 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 16 May 2018 06:12:24 +0000 Subject: [PATCH 01/21] + rsp parameter --- python/mxnet/gluon/block.py | 8 +++ python/mxnet/gluon/contrib/block.py | 62 ++++++++++++++++++++++ python/mxnet/gluon/parameter.py | 81 ++++++++++++++++++++++++++--- tests/python/unittest/test_gluon.py | 50 +++++++++++++++--- 4 files changed, 187 insertions(+), 14 deletions(-) create mode 100644 python/mxnet/gluon/contrib/block.py diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 4779484ec3ee..09863f00e23c 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -713,6 +713,14 @@ def __init__(self, outputs, inputs, params=None): "Input symbols must be variable, but %s is an output of operators"%str(i) input_names.add(i.name) + # check if any symbol is row_sparse + row_sparse_storage = ndarray.ndarray._STORAGE_TYPE_STR_TO_ID['row_sparse'] + for i in out: + for j in i.get_internals(): + assert(j.attr("__storage_type__") != str(row_sparse_storage)), \ + "SymbolBlock doesn't support Parameter '%s' because its storage " \ + "type is 'row_sparse'." % j.name + for i in out.list_arguments(): if i not in input_names: self.params.get(i, allow_deferred_init=True) diff --git a/python/mxnet/gluon/contrib/block.py b/python/mxnet/gluon/contrib/block.py new file mode 100644 index 000000000000..335aa585db10 --- /dev/null +++ b/python/mxnet/gluon/contrib/block.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ +"""Base container class for contrib neural network models.""" +__all__ = ['SparseBlock'] + +from .. import Block + +class SparseBlock(Block): + """`SparseBlock` only supports forwarding with NDArray. + """ + def __init__(self, prefix=None, params=None): + super(SparseBlock, self).__init__(prefix=prefix, params=params) + + def forward(self, x, *args): + """Defines the forward computation. Arguments has to be :py:class:`NDArray`.""" + assert isinstance(x, NDArray), \ + "SparseBlock requires the first argument to forward be NDArray, " \ + "but got %s"%type(x) + with x.context as ctx: + params = {} + for name, param in self._reg_params.items(): + if param._stype != 'default': + params[name] = param + else: + params[name] = param.data(ctx) + ret = self.sparse_forward(x, *args, **params) + for name, param in params.items(): + if param._dirty: + raise RuntimeError("sparse Parameter '%s' was not pulled during " + "sparse_forward(). Please call param.row_sparse_data() " + "with indices to pull sparse parameters in " + "sparse_forward()" % name) + + def sparse_forward(self, F, x, *args, **kwargs): + """Overrides to define sparse forward computation for this `SparseBlock`. 
+ + Parameters + ---------- + x : NDArray + The first input tensor. + *args : list of Parameter or NDArray + Additional input tensors or sparse parameters. + """ + # pylint: disable= invalid-name + raise NotImplementedError diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index c7cbcccc95ec..ed3fc86a885f 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -81,6 +81,8 @@ class Parameter(object): Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult. init : Initializer, default None Initializer of this parameter. Will use the global initializer by default. + stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'. + The storage type of the parameter. grad_stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'. The storage type of the parameter's gradient. @@ -99,12 +101,13 @@ class Parameter(object): """ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, lr_mult=1.0, wd_mult=1.0, init=None, allow_deferred_init=False, - differentiable=True, grad_stype='default'): + differentiable=True, stype='default', grad_stype='default'): self._var = None self._data = None self._grad = None self._ctx_list = None self._ctx_map = None + self._trainer = None self._deferred_init = () self._differentiable = differentiable self._allow_deferred_init = allow_deferred_init @@ -116,10 +119,13 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self.wd_mult = wd_mult self.grad_req = grad_req self.init = init - assert grad_stype in ['default', 'row_sparse', 'csr'], \ - "grad_stype for Parameter '%s' must be one of 'default', 'row_sparse', or 'csr'," \ - " but got '%s'" % (name, grad_stype) + valid_stypes = ['default', 'row_sparse', 'csr'] + assert grad_stype in valid_stypes, "grad_stype for Parameter %s must be " \ + "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, grad_stype) + assert stype in valid_stypes, "stype for Parameter %s must be " \ + "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, stype) self._grad_stype = grad_stype + self._stype = stype def __repr__(self): @@ -243,7 +249,7 @@ def _finish_deferred_init(self): with autograd.pause(): if data is None: data = ndarray.zeros(shape=self.shape, dtype=self.dtype, - ctx=context.cpu()) + ctx=context.cpu(), stype=self._stype) initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) @@ -383,9 +389,54 @@ def set_data(self, data): for arr in self.list_data(): arr[:] = data + def row_sparse_data(self, ctx, row_id): + """Returns a copy of the 'row_sparse' parameter on one context. The copy only + retains rows whose ids occur in provided row ids. + The parameter must have been initialized on this context before. + + Parameters + ---------- + ctx : Context + Desired context. + row_id: NDArray + Row ids to retain for the 'row_sparse' parameter. + + Returns + ------- + NDArray on ctx + """ + if self._stype != 'row_sparse': + raise ValueError("Cannot return a copy of Parameter %s on ctx %s via " \ + "row_sparse_data() because its storage type is %s. Please " \ + "use data() instead." % (self.name, str(ctx), self._stype)) + return self._get_row_sparse(self._data, ctx, row_id) + + def list_row_sparse_data(self, row_id): + """Returns copies of the 'row_sparse' parameter on all contexts, in the same order + as creation. The copy only retains rows whose ids occur in provided row ids. + The parameter must have been initialized before. 
+ + Parameters + ---------- + ctx : Context + Desired context. + row_id: NDArray + Row ids to retain for the 'row_sparse' parameter. + + Returns + ------- + list of NDArrays + """ + if self._stype != 'row_sparse': + raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_row_sparse_data() because its storage type is %s. Please " \ + "use data() instead." % (self.name, self._stype)) + return self._get_row_sparse(self._data, list, row_id) + def data(self, ctx=None): """Returns a copy of this parameter on one context. Must have been - initialized on this context before. + initialized on this context before. For sparse parameters, use + :py:meth:`Parameter.row_sparse_data` instead. Parameters ---------- @@ -396,11 +447,25 @@ def data(self, ctx=None): ------- NDArray on ctx """ + if self._stype != 'default': + raise ValueError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ + "because its storage type is %s. Please use row_sparse_data() " \ + "instead." % (self.name, str(ctx), self._stype)) return self._check_and_get(self._data, ctx) def list_data(self): """Returns copies of this parameter on all contexts, in the same order - as creation.""" + as creation. For sparse parameters, use :py:meth:`Parameter.list_row_sparse_data` + instead. + + Returns + ------- + list of NDArrays + """ + if self._stype != 'default': + raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_data() because its storage type is %s. Please use " \ + "row_sparse_data() instead." % (self.name, self._stype)) return self._check_and_get(self._data, list) def grad(self, ctx=None): @@ -447,7 +512,7 @@ def var(self): if self._var is None: self._var = symbol.var(self.name, shape=self.shape, dtype=self.dtype, lr_mult=self.lr_mult, wd_mult=self.wd_mult, - init=self.init) + init=self.init, stype=self._stype) return self._var def cast(self, dtype): diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index e3026743ba35..9640fc0d4473 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -39,33 +39,61 @@ def test_parameter(): assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' assert p.grad(mx.cpu(0)).stype == 'default' + assert p.data().stype == 'default' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] @with_seed() def test_sparse_parameter(): - p = gluon.Parameter('weight', shape=(10, 10), grad_stype='row_sparse') + p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assert len(p.list_data()) == 2 + row_id = mx.nd.arange(0, 10) + assert len(p.list_row_sparse_data(row_id)) == 2 assert len(p.list_grad()) == 2 - assert p.data(mx.cpu(1)).context == mx.cpu(1) - assert p.data(mx.cpu(0)).shape == (10, 10) + weight = p.row_sparse_data(mx.cpu(1), row_id) + assert weight.context == mx.cpu(1) + assert weight.shape == (10, 10) + assert weight.stype == 'row_sparse' assert p.var().name == 'weight' + assert p.var().attr('__storage_type__') == str(_STORAGE_TYPE_STR_TO_ID['row_sparse']) assert p.grad(mx.cpu(0)).stype == 'row_sparse' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] +@with_seed() +def test_parameter_invalid_access(): + # cannot call data on row_sparse parameters + p0 = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') + p0.initialize(init='xavier', 
ctx=[mx.cpu(0), mx.cpu(1)]) + assertRaises(ValueError, p0.data) + assertRaises(ValueError, p0.list_data) + row_id = mx.nd.arange(0, 10) + # cannot call row_sparse_data on dense parameters + p1 = gluon.Parameter('weight', shape=(10, 10)) + p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) + assertRaises(ValueError, p1.row_sparse_data, mx.cpu(0), row_id) + assertRaises(ValueError, p1.list_row_sparse_data, row_id) @with_seed() def test_paramdict(): params = gluon.ParameterDict('net_') - params.get('weight', shape=(10, 10)) - assert list(params.keys()) == ['net_weight'] + params.get('w0', shape=(10, 10)) + params.get('w1', shape=(10, 10), stype='row_sparse') + all_row_ids = mx.nd.arange(0, 10) + assert list(params.keys()) == ['net_w0', 'net_w1'] params.initialize(ctx=mx.cpu()) + prev_w0 = params.get('w0').data(mx.cpu()) + prev_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + params.save('test.params') params.load('test.params', mx.cpu()) + # compare the values before and after save/load + cur_w0 = params.get('w0').data(mx.cpu()) + cur_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) + mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) @with_seed() @@ -246,6 +274,16 @@ def hybrid_forward(self, F, x): net.hybridize() assert isinstance(net(mx.nd.zeros((16, 10))), mx.nd.NDArray) +@with_seed() +@raises(AssertionError) +def test_symbol_sparse_block(): + data = mx.sym.var('data') + weight = mx.sym.var('weight', stype='row_sparse') + bias = mx.sym.var('bias') + out = mx.sym.broadcast_add(mx.sym.dot(data, weight), bias) + # an exception is expected + net = gluon.SymbolBlock(out, data) + def check_layer_forward(layer, dshape): layer.collect_params().initialize() From e3d20c760e2b4919b71e5ee1b7a72911dd003c05 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 16 May 2018 18:05:41 +0000 Subject: [PATCH 02/21] draft --- python/mxnet/gluon/block.py | 7 ++ python/mxnet/gluon/contrib/__init__.py | 2 + python/mxnet/gluon/contrib/nn/basic_layers.py | 1 + python/mxnet/gluon/parameter.py | 47 +++++++++- python/mxnet/gluon/trainer.py | 94 ++++++++++++++----- python/mxnet/model.py | 19 ++++ tests/python/unittest/test_gluon.py | 45 ++++++++- 7 files changed, 184 insertions(+), 31 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 09863f00e23c..1a06d2ab3340 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -445,6 +445,13 @@ class HybridBlock(Block): the end-to-end usage. """ def __init__(self, prefix=None, params=None): + if isinstance(params, ParameterDict): + for param in params.values(): + stype = param._stype + if stype != 'default': + raise ValueError("Cannot create a HybridBlock with Parameter '%s' " \ + "because its storage type is %s. Please consider " \ + "using a SparseBlock instead."%(param.name, stype)) super(HybridBlock, self).__init__(prefix=prefix, params=params) self._cached_graph = () self._cached_op = None diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index f708fb900227..a1d96e5ad408 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -23,3 +23,5 @@ from . import rnn from . 
import data + +from .block import * diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index eccdf18c1bbb..3e68bd6cb9e5 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -22,6 +22,7 @@ from .... import nd from ...block import HybridBlock +from ..block import SparseBlock from ...nn import Sequential, HybridSequential class Concurrent(Sequential): diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index ed3fc86a885f..33a1051b93b4 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -168,6 +168,15 @@ def shape(self, new_shape): self._shape = new_shape + def _set_trainer(self, trainer): + """ Set the trainer this parameter is associated with. """ + if self._trainer and self._trainer is not trainer: + raise RuntimeError( + "Failed to set the trainer for Parameter %s to %s because it was set to %s. " \ + "More than one trainers for a single parameter is not supported." %( + self.name, str(trainer), str(self._trainer))) + self._trainer = trainer + def _check_and_get(self, arr_list, ctx): if arr_list is not None: if ctx is list: @@ -200,6 +209,31 @@ def _check_and_get(self, arr_list, ctx): "because the later does not include Parameters of " \ "nested child Blocks"%(self.name)) + def _get_row_sparse(self, arr_list, ctx, row_id): + results = self._check_and_get(arr_list, ctx) + + # get row sparse params based on row ids + if not isinstance(row_id, ndarray.NDArray): + raise TypeError("Cannot get 'row_sparse' Parameter %s with row_id = %s. " + "NDArray type is expected." % (self.name, row_id)) + if not self._trainer: + # the sparse param is not yet associated with any trainer. + # and create new buffers to hold the outputs. 
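+            # e.g. if the parameter has 4 rows and row_id is [0, 2], retain()
+            # returns a copy holding only rows 0 and 2; rows 1 and 3 are dropped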
+ ret = [] + if isinstance(results, ndarray.NDArray): + rows = row_id.as_in_context(results.context) + ret = ndarray.sparse.retain(results, rows) + else: + # list of NDArrays + for result in results: + rows = row_id.as_in_context(result.context) + ret.append(ndarray.sparse.retain(result, rows)) + return ret; + + # fetch row sparse params from the trainer + self._trainer._row_sparse_pull(self, results, row_id) + return results + def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" if self.shape: @@ -277,12 +311,17 @@ def _init_grad(self): self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, stype=self._grad_stype) for i in self._data] - autograd.mark_variables(self.list_data(), self.list_grad(), self.grad_req) + autograd.mark_variables(self._check_and_get(self._data, list), + self._grad, self.grad_req) def _reduce(self): """Reduce data from multiple context.""" - block = self.list_data() - data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) + if self._stype == 'default': + block = self.list_data() + data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) + else: + all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64') + data = self.row_sparse_data(context.cpu(), all_row_ids) return data def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), @@ -386,7 +425,7 @@ def set_data(self, data): self._deferred_init = self._deferred_init[:3] + (data,) return - for arr in self.list_data(): + for arr in self._check_and_get(self._data, list): arr[:] = data def row_sparse_data(self, ctx, row_id): diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 39c4a1fd6104..23665ce5070a 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -21,7 +21,7 @@ __all__ = ['Trainer'] from .. import optimizer as opt -from ..model import _create_kvstore +from ..model import _create_kvstore, _create_sparse_kvstore from .parameter import ParameterDict, Parameter class Trainer(object): @@ -68,12 +68,20 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got %s."%(type(params))) self._params = [] - for param in params: + self._params_to_init = [] + self._contains_sparse = False + self._param2idx = {} + for i, param in enumerate(params): if not isinstance(param, Parameter): raise ValueError( "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) + self._param2idx[param.name] = i self._params.append(param) + self._params_to_init.append(param) + param._set_trainer(self) + if param._stype != 'default': + self._contains_sparse = True self._compression_params = compression_params optimizer_params = optimizer_params if optimizer_params else {} self._scale = float(optimizer_params.get('rescale_grad', 1.0)) @@ -109,38 +117,55 @@ def _init_optimizer(self, optimizer, optimizer_params): self._updaters = [opt.get_updater(self._optimizer) \ for _ in self._contexts] + def _init_params(self): + """ Initialize parameters in the KVStore. Parameters whose + intiailization is incomplete are ignored. + """ + assert self._kv_initialized, "Cannot initialize parameters in KVStore " \ + "when KVStore is not initialized." 
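+        # Split the pending list: parameters whose deferred initialization has
+        # completed are pushed to the KVStore now; the rest stay queued and are
+        # retried on the next call, once their shapes are known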
+ params_to_init = [] + if self._kvstore: + params = [param for param in self._params_to_init \ + if not param._deferred_init] + params_to_init = [param for param in self._params_to_init \ + if param._deferred_init] + print('initing ', params) + for param in params: + param_arrays = param._check_and_get(param._data, list) + idx = self._param2idx[param.name] + self._kvstore.init(idx, param_arrays[0]) + if param._stype == 'default': + self._kvstore.pull(idx, param_arrays, priority=-idx) + + self._params_to_init = params_to_init + def _init_kvstore(self): + """ Create kvstore """ arg_arrays = {} - contains_sparse = False - for param in self._params: - arg_arrays[param.name] = param.data(self._contexts[0]) - if param._grad_stype != 'default': - contains_sparse = True - # update_on_kvstore is set to False by the user - if self._update_on_kvstore is False: - raise RuntimeError("Cannot set update_on_kvstore to False when sparse " - "gradients and/or sparse weights are present for " - "Parameter %s." % param.name) - kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), - arg_arrays) - update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \ - else update_on_kvstore + if self._contains_sparse: + kvstore, update_on_kvstore = _create_kvstore(self._kvstore) + # update_on_kvstore is set to False by the user + if self._update_on_kvstore is False: + raise RuntimeError("Cannot set update_on_kvstore to False when sparse " + "gradients and/or sparse weights are present for " + "Parameter %s." % param.name) + else: + kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), + arg_arrays) + update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \ + else update_on_kvstore if kvstore: if self._compression_params: kvstore.set_gradient_compression(self._compression_params) # kv.pull(row_sparse_grad) is not supported - if contains_sparse: + if self._contains_sparse: update_on_kvstore = True else: if 'dist' in kvstore.type: update_on_kvstore = False if update_on_kvstore: + # optimizer preferably needs to be set before init for multiprecision kvstore.set_optimizer(self._optimizer) - # optimizer preferably needs to be set before init for multiprecision - for i, param in enumerate(self._params): - param_arrays = param.list_data() - kvstore.init(i, param_arrays[0]) - kvstore.pull(i, param_arrays, priority=-i) self._kvstore = kvstore self._update_on_kvstore = update_on_kvstore else: @@ -171,6 +196,17 @@ def set_learning_rate(self, lr): else: self._optimizer.set_learning_rate(lr) + def _row_sparse_pull(self, parameter, out, row_id): + # initialize kv and params if not already + assert self._kvstore, "Cannot perform row_sparse_pull() when KVStore is not set" + if not self._kv_initialized: + self._init_kvstore() + if self._params_to_init: + self._init_params() + # TODO check state + self._kvstore.row_sparse_pull(self._param2idx[parameter.name], \ + out=out, row_ids=row_id) + def step(self, batch_size, ignore_stale_grad=False): """Makes one step of parameter update. Should be called after `autograd.backward()` and outside of `record()` scope. 
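Taken together with the `_get_row_sparse` change in parameter.py above, `_row_sparse_pull` completes a lazy pull path for sparse weights. A minimal sketch of how the pieces chain once a trainer owns a 'row_sparse' parameter, using the signatures as of this patch (the snippet is illustrative and not part of the diff):

    import mxnet as mx
    from mxnet import gluon

    w = gluon.Parameter('w', shape=(8, 2), stype='row_sparse')
    w.initialize(ctx=mx.cpu())
    trainer = gluon.Trainer([w], 'sgd')         # registers itself via w._set_trainer()
    row_id = mx.nd.array([0, 3])
    data = w.row_sparse_data(mx.cpu(), row_id)  # -> Parameter._get_row_sparse()
                                                # -> Trainer._row_sparse_pull()
                                                # -> kvstore.row_sparse_pull(idx, ...)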
@@ -191,6 +227,8 @@ def step(self, batch_size, ignore_stale_grad=False): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() self._optimizer.rescale_grad = self._scale / batch_size @@ -210,6 +248,8 @@ def allreduce_grads(self): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() assert not (self._kvstore and self._update_on_kvstore), \ 'allreduce_grads() when parameters are updated on kvstore ' \ 'is not supported. Try setting `update_on_kvstore` ' \ @@ -250,6 +290,8 @@ def update(self, batch_size, ignore_stale_grad=False): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() assert not (self._kvstore and self._update_on_kvstore), \ 'update() when parameters are updated on kvstore ' \ 'is not supported. Try setting `update_on_kvstore` ' \ @@ -264,7 +306,7 @@ def _update(self, ignore_stale_grad=False): continue if not ignore_stale_grad: - for data in param.list_data(): + for data in param._check_and_get(param._data, list): if not data._fresh_grad: raise UserWarning( "Gradient of Parameter `%s` on context %s has not been updated " @@ -275,7 +317,7 @@ def _update(self, ignore_stale_grad=False): "warning and skip updating of Parameters with stale gradient" \ %(param.name, str(data.context))) - if self._kvstore and self._update_on_kvstore: + if self._kvstore and self._update_on_kvstore and param._stype == 'default': self._kvstore.pull(i, param.list_data(), priority=-i) continue @@ -296,6 +338,8 @@ def save_states(self, fname): if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() if self._update_on_kvstore: self._kvstore.save_optimizer_states(fname, dump_optimizer=True) @@ -313,6 +357,8 @@ def load_states(self, fname): """ if not self._kv_initialized: self._init_kvstore() + if self._params_to_init: + self._init_params() if self._update_on_kvstore: self._kvstore.load_optimizer_states(fname) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index ae7726d76a7a..3a50553a615c 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -55,6 +55,25 @@ 'eval_metric', 'locals']) +def _create_sparse_kvstore(kvstore): + """Create kvstore assuming some parameters' storage types are row_sparse. + + Parameters + ---------- + kvstore : KVStore or str + The kvstore. + """ + # always update on kvstore + update_on_kvstore = True + if isinstance(kvstore, kvs.KVStore): + kv = kvstore + elif isinstance(kvstore, str): + kv = kvs.create(kvstore) + else: + raise TypeError("Cannot create '%s' KVStore with row_sparse parameters. " + "The type must be KVStore or str." % kvstore) + return (kv, update_on_kvstore) + def _create_kvstore(kvstore, num_device, arg_params): """Create kvstore This function select and create a proper kvstore if given the kvstore type. 
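The new `_create_sparse_kvstore` helper has a deliberately narrow contract: it accepts a `KVStore` instance or a kvstore name, and always forces updates onto the kvstore. A quick illustration of that contract as defined above (hypothetical snippet, not part of the diff):

    import mxnet as mx
    from mxnet.model import _create_sparse_kvstore

    kv, update_on_kvstore = _create_sparse_kvstore('device')
    assert update_on_kvstore                    # always True for sparse parameters
    kv, update_on_kvstore = _create_sparse_kvstore(mx.kv.create('local'))
    assert update_on_kvstore
    # _create_sparse_kvstore(None)  ->  TypeError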
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 9640fc0d4473..857b56510f4d 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -19,7 +19,8 @@ from mxnet import gluon from mxnet.gluon import nn from mxnet.test_utils import assert_almost_equal -from common import setup_module, with_seed +from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID +from common import setup_module, with_seed, assertRaises import numpy as np from nose.tools import raises, assert_raises from copy import deepcopy @@ -95,6 +96,35 @@ def test_paramdict(): mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) +@with_seed() +def test_parameter_row_sparse_data(): + def check_parameter_row_sparse_data(with_trainer): + ctx0 = mx.cpu(1) + ctx1 = mx.cpu(2) + dim0 = 4 + x = gluon.Parameter('x', shape=(dim0, 2), stype='row_sparse') + x.initialize(init='xavier', ctx=[ctx0, ctx1]) + if with_trainer: + trainer = gluon.Trainer([x], 'sgd') + x_param = x._data[0].copy() + assert x_param.stype == 'row_sparse' + row_id_0 = mx.nd.array([0,1]) + retained_0 = x.row_sparse_data(ctx0, row_id_0) + retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) + assert retained_0.context == ctx0 + row_id_1 = mx.nd.arange(0, dim0) + retained_1 = x.row_sparse_data(ctx1, row_id_1) + retained_target_1 = x_param + mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) + assert retained_1.context == ctx1 + row_id_2 = mx.nd.array([0,1,2]) + retained_2 = x.list_row_sparse_data(row_id_2) + retained_target_2 = mx.nd.sparse.retain(x_param, row_id_2.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) + + check_parameter_row_sparse_data(True) + check_parameter_row_sparse_data(False) @with_seed() def test_constant(): @@ -276,15 +306,24 @@ def hybrid_forward(self, F, x): @with_seed() @raises(AssertionError) -def test_symbol_sparse_block(): +def test_sparse_symbol_block(): data = mx.sym.var('data') weight = mx.sym.var('weight', stype='row_sparse') bias = mx.sym.var('bias') out = mx.sym.broadcast_add(mx.sym.dot(data, weight), bias) - # an exception is expected + # an exception is expected when creating a SparseBlock w/ sparse param net = gluon.SymbolBlock(out, data) +@with_seed() +@raises(ValueError) +def test_sparse_hybrid_block(): + params = gluon.ParameterDict('net_') + params.get('weight', shape=(5, 5), stype='row_sparse') + params.get('bias', shape=(5,)) + # an exception is expected when creating a HybridBlock w/ sparse param + net = gluon.nn.Dense(5, params=params) +@with_seed() def check_layer_forward(layer, dshape): layer.collect_params().initialize() x = mx.nd.ones(shape=dshape) From ad672a79dda1ec8414a59715aaadaf5d0d5f15af Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 17 May 2018 05:13:57 +0000 Subject: [PATCH 03/21] Fix optimizer pickle --- python/mxnet/gluon/trainer.py | 2 ++ python/mxnet/optimizer.py | 11 +++++++++++ tests/python/unittest/test_gluon.py | 20 ++++++++++++++++++-- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 23665ce5070a..d0eb0327ae08 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -363,6 +363,8 @@ def load_states(self, fname): if 
self._update_on_kvstore: self._kvstore.load_optimizer_states(fname) self._optimizer = self._kvstore._updater.optimizer + param_dict = {i: param for i, param in enumerate(self._params)} + self._optimizer.param_dict = param_dict else: with open(fname, 'rb') as f: states = f.read() diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 1d2fd2e73dfd..0c3fc904fb1f 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -426,6 +426,17 @@ def _get_wd(self, index): wd *= self.wd_mult.get(self.idx2name[index], 1.0) return wd + def __getstate__(self): + ret = self.__dict__.copy() + # do not include param_dict in the state + del ret['param_dict'] + return ret + + def __setstate__(self, state): + self.__dict__ = state + # param_dict needs to be explicitly set by the trainer + self.param_dict = {} + # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 857b56510f4d..8b546a9c6588 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -601,10 +601,10 @@ def dict_equ(a, b): assert (x.data(mx.cpu(1)).asnumpy() == -4).all() - trainer.save_states('test.states') + trainer.save_states('test_trainer.states') states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \ else deepcopy(trainer._updaters[0].states) - trainer.load_states('test.states') + trainer.load_states('test_trainer.states') if trainer._update_on_kvstore: dict_equ(trainer._kvstore._updater.states, states) assert trainer._optimizer == trainer._kvstore._updater.optimizer @@ -630,6 +630,22 @@ def dict_equ(a, b): assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy() +@with_seed() +def test_trainer_save_load(): + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1 + trainer.save_states('test_trainer_save_load.states') + trainer.load_states('test_trainer_save_load.states') + x.lr_mult = 2.0 + # check if parameter dict is correctly associated with optimizer after load_state + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 @with_seed() def test_block_attr_hidden(): From 674d374b937f3c9711051a2830e57a0fb5e0ea2a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 17 May 2018 16:58:44 +0000 Subject: [PATCH 04/21] refactor and document --- python/mxnet/gluon/block.py | 1 + python/mxnet/gluon/contrib/block.py | 20 ++++++++++-------- python/mxnet/gluon/parameter.py | 32 ++++++++++++++--------------- python/mxnet/gluon/trainer.py | 3 ++- tests/python/unittest/test_gluon.py | 26 +++++++++++++++-------- 5 files changed, 48 insertions(+), 34 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 1a06d2ab3340..52b385daa14f 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -445,6 +445,7 @@ class HybridBlock(Block): the end-to-end usage. 
""" def __init__(self, prefix=None, params=None): + # check if any parameter is row_sparse if isinstance(params, ParameterDict): for param in params.values(): stype = param._stype diff --git a/python/mxnet/gluon/contrib/block.py b/python/mxnet/gluon/contrib/block.py index 335aa585db10..036cd58e39d6 100644 --- a/python/mxnet/gluon/contrib/block.py +++ b/python/mxnet/gluon/contrib/block.py @@ -31,25 +31,29 @@ def __init__(self, prefix=None, params=None): def forward(self, x, *args): """Defines the forward computation. Arguments has to be :py:class:`NDArray`.""" assert isinstance(x, NDArray), \ - "SparseBlock requires the first argument to forward be NDArray, " \ + "SparseBlock requires the first argument to forward to be an NDArray, " \ "but got %s"%type(x) with x.context as ctx: params = {} for name, param in self._reg_params.items(): + # If a parameter is not dense, instead of passing the NDArray to + # sparse_forward(), the parameter itself is passed upon which + # row_sparse_data() will be performed. if param._stype != 'default': params[name] = param else: params[name] = param.data(ctx) - ret = self.sparse_forward(x, *args, **params) - for name, param in params.items(): - if param._dirty: - raise RuntimeError("sparse Parameter '%s' was not pulled during " - "sparse_forward(). Please call param.row_sparse_data() " - "with indices to pull sparse parameters in " - "sparse_forward()" % name) + return self.sparse_forward(x, *args, **params) def sparse_forward(self, F, x, *args, **kwargs): """Overrides to define sparse forward computation for this `SparseBlock`. + Note that the *args for :py:meth:`SparseBlock.sparse_forward` is a list of + :py:class:`NDArray`s and :py:class:`Parameter`s. If the storage type of any + Parameter is sparse, the Parameter is passed as :py:class:`Parameter` by itself. + Otherwise, the Parameter is passed as a :py:class:`NDArray`. + + When overridding sparse_forward, typically one needs to invoke + :py:meth:`Parameter.row_sparse_data` to access the data of the Parameter. Parameters ---------- diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 33a1051b93b4..306e725acb7c 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -119,10 +119,11 @@ def __init__(self, name, grad_req='write', shape=None, dtype=mx_real_t, self.wd_mult = wd_mult self.grad_req = grad_req self.init = init + # sparse related storage type information valid_stypes = ['default', 'row_sparse', 'csr'] - assert grad_stype in valid_stypes, "grad_stype for Parameter %s must be " \ + assert grad_stype in valid_stypes, "grad_stype for Parameter '%s' must be " \ "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, grad_stype) - assert stype in valid_stypes, "stype for Parameter %s must be " \ + assert stype in valid_stypes, "stype for Parameter '%s' must be " \ "one of 'default', 'row_sparse', or 'csr', but got '%s'" % (name, stype) self._grad_stype = grad_stype self._stype = stype @@ -172,8 +173,8 @@ def _set_trainer(self, trainer): """ Set the trainer this parameter is associated with. """ if self._trainer and self._trainer is not trainer: raise RuntimeError( - "Failed to set the trainer for Parameter %s to %s because it was set to %s. " \ - "More than one trainers for a single parameter is not supported." %( + "Failed to set the trainer for Parameter '%s' to %s because it was set to %s. " \ + "More than one trainers for a single Parameter is not supported." 
%( self.name, str(trainer), str(self._trainer))) self._trainer = trainer @@ -210,6 +211,7 @@ def _check_and_get(self, arr_list, ctx): "nested child Blocks"%(self.name)) def _get_row_sparse(self, arr_list, ctx, row_id): + """ Get row_sparse data from row_sparse parameters based on row_id. """ results = self._check_and_get(arr_list, ctx) # get row sparse params based on row ids @@ -320,8 +322,8 @@ def _reduce(self): block = self.list_data() data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) else: - all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64') - data = self.row_sparse_data(context.cpu(), all_row_ids) + all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=context.cpu()) + data = self.row_sparse_data(all_row_ids) return data def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), @@ -428,27 +430,25 @@ def set_data(self, data): for arr in self._check_and_get(self._data, list): arr[:] = data - def row_sparse_data(self, ctx, row_id): - """Returns a copy of the 'row_sparse' parameter on one context. The copy only - retains rows whose ids occur in provided row ids. + def row_sparse_data(self, row_id): + """Returns a copy of the 'row_sparse' parameter on the same context as row_id's. + The copy only retains rows whose ids occur in provided row ids. The parameter must have been initialized on this context before. Parameters ---------- - ctx : Context - Desired context. row_id: NDArray Row ids to retain for the 'row_sparse' parameter. Returns ------- - NDArray on ctx + NDArray on row_id's context """ if self._stype != 'row_sparse': - raise ValueError("Cannot return a copy of Parameter %s on ctx %s via " \ - "row_sparse_data() because its storage type is %s. Please " \ - "use data() instead." % (self.name, str(ctx), self._stype)) - return self._get_row_sparse(self._data, ctx, row_id) + raise ValueError("Cannot return a copy of Parameter %s via row_sparse_data() " \ + "because its storage type is %s. Please use data() instead." \ + %(self.name, self._stype)) + return self._get_row_sparse(self._data, row_id.context, row_id) def list_row_sparse_data(self, row_id): """Returns copies of the 'row_sparse' parameter on all contexts, in the same order diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index d0eb0327ae08..d07e6e75fdf5 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -143,7 +143,7 @@ def _init_kvstore(self): """ Create kvstore """ arg_arrays = {} if self._contains_sparse: - kvstore, update_on_kvstore = _create_kvstore(self._kvstore) + kvstore, update_on_kvstore = _create_sparse_kvstore(self._kvstore) # update_on_kvstore is set to False by the user if self._update_on_kvstore is False: raise RuntimeError("Cannot set update_on_kvstore to False when sparse " @@ -342,6 +342,7 @@ def save_states(self, fname): self._init_params() if self._update_on_kvstore: + assert not self._params_to_init, "Cannot save trainer states when some parameters are not yet initialized in kvstore." 
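+            # dump_optimizer=True pickles the optimizer along with its states;
+            # param_dict is kept out of that pickle (Optimizer.__getstate__,
+            # patch 03) and is re-attached by load_states()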
self._kvstore.save_optimizer_states(fname, dump_optimizer=True) else: with open(fname, 'wb') as fout: diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 8b546a9c6588..8164d2fac5d5 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -49,10 +49,10 @@ def test_parameter(): def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - row_id = mx.nd.arange(0, 10) + row_id = mx.nd.arange(0, 10, ctx=mx.cpu(1)) assert len(p.list_row_sparse_data(row_id)) == 2 assert len(p.list_grad()) == 2 - weight = p.row_sparse_data(mx.cpu(1), row_id) + weight = p.row_sparse_data(row_id) assert weight.context == mx.cpu(1) assert weight.shape == (10, 10) assert weight.stype == 'row_sparse' @@ -74,7 +74,7 @@ def test_parameter_invalid_access(): # cannot call row_sparse_data on dense parameters p1 = gluon.Parameter('weight', shape=(10, 10)) p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assertRaises(ValueError, p1.row_sparse_data, mx.cpu(0), row_id) + assertRaises(ValueError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) assertRaises(ValueError, p1.list_row_sparse_data, row_id) @with_seed() @@ -86,13 +86,13 @@ def test_paramdict(): assert list(params.keys()) == ['net_w0', 'net_w1'] params.initialize(ctx=mx.cpu()) prev_w0 = params.get('w0').data(mx.cpu()) - prev_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + prev_w1 = params.get('w1').row_sparse_data(all_row_ids) params.save('test.params') params.load('test.params', mx.cpu()) # compare the values before and after save/load cur_w0 = params.get('w0').data(mx.cpu()) - cur_w1 = params.get('w1').row_sparse_data(mx.cpu(), all_row_ids) + cur_w1 = params.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) @@ -108,13 +108,13 @@ def check_parameter_row_sparse_data(with_trainer): trainer = gluon.Trainer([x], 'sgd') x_param = x._data[0].copy() assert x_param.stype == 'row_sparse' - row_id_0 = mx.nd.array([0,1]) - retained_0 = x.row_sparse_data(ctx0, row_id_0) + row_id_0 = mx.nd.array([0,1], ctx=ctx0) + retained_0 = x.row_sparse_data(row_id_0) retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) assert retained_0.context == ctx0 - row_id_1 = mx.nd.arange(0, dim0) - retained_1 = x.row_sparse_data(ctx1, row_id_1) + row_id_1 = mx.nd.arange(0, dim0, ctx=ctx1) + retained_1 = x.row_sparse_data(row_id_1) retained_target_1 = x_param mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) assert retained_1.context == ctx1 @@ -573,6 +573,14 @@ def test_flatten(): x = mx.nd.zeros((3,)) assert flatten(x).shape == (3, 1) +@with_seed() +@raises(RuntimeError) +def test_multi_trainer(): + x = gluon.Parameter('x', shape=(10,)) + x.initialize() + trainer0 = gluon.Trainer([x], 'sgd') + # multiple trainers for a single Parameter is not allowed + trainer1 = gluon.Trainer([x], 'sgd') @with_seed() def test_trainer(): From 6db6e291504ae9beb6f3bd7b6ae207b1758013ee Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 17 May 2018 18:08:17 +0000 Subject: [PATCH 05/21] add test for save load with cast_stype --- python/mxnet/gluon/parameter.py | 17 +++++++++---- python/mxnet/gluon/trainer.py | 7 ++++-- 
tests/python/unittest/test_gluon.py | 37 ++++++++++++++++++++--------- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 306e725acb7c..224759f6c66a 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -230,13 +230,13 @@ def _get_row_sparse(self, arr_list, ctx, row_id): for result in results: rows = row_id.as_in_context(result.context) ret.append(ndarray.sparse.retain(result, rows)) - return ret; + return ret # fetch row sparse params from the trainer self._trainer._row_sparse_pull(self, results, row_id) return results - def _load_init(self, data, ctx): + def _load_init(self, data, ctx, cast_stype=False): """(Re)initializes by loading from data.""" if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): @@ -250,6 +250,14 @@ def _load_init(self, data, ctx): "Failed loading Parameter '%s' from saved params: " \ "dtype incompatible expected %s vs saved %s"%( self.name, str(self.dtype), str(data.dtype)) + if self._stype != data.stype: + if not cast_stype: + raise RuntimeError("Failed loading Parameter '%s' from saved params: storage " \ + "type incompatible expected %s vs saved %s. Set " \ + "cast_stype=True to cast saved params to the same stype " \ + "as '%s'."%(self.name, self._stype, data.stype, self.name)) + else: + data = data.tostype(self._stype) if isinstance(ctx, Context): ctx = [ctx] if self._data is None: @@ -322,6 +330,7 @@ def _reduce(self): block = self.list_data() data = ndarray.add_n(*(w.copyto(context.cpu()) for w in block)) / len(block) else: + # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=context.cpu()) data = self.row_sparse_data(all_row_ids) return data @@ -870,7 +879,7 @@ def save(self, filename, strip_prefix=''): ndarray.save(filename, arg_dict) def load(self, filename, ctx=None, allow_missing=False, - ignore_extra=False, restore_prefix=''): + ignore_extra=False, restore_prefix='', cast_stype=False): """Load parameters from file. 
filename : str @@ -908,4 +917,4 @@ def load(self, filename, ctx=None, allow_missing=False, "Please make sure source and target networks have the same prefix."%( name[lprefix:], filename, _brief_print_list(self._params.keys())) continue - self[name]._load_init(arg_dict[name], ctx) + self[name]._load_init(arg_dict[name], ctx, cast_stype=cast_stype) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index d07e6e75fdf5..5bd9f2012595 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -317,8 +317,11 @@ def _update(self, ignore_stale_grad=False): "warning and skip updating of Parameters with stale gradient" \ %(param.name, str(data.context))) - if self._kvstore and self._update_on_kvstore and param._stype == 'default': - self._kvstore.pull(i, param.list_data(), priority=-i) + if self._kvstore and self._update_on_kvstore: + if param._stype == 'default': + # 'row_sparse' parameters are not pulled immediately - they're pulled + # in `SparseBlock.sparse_forward` + self._kvstore.pull(i, param.list_data(), priority=-i) continue for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 8164d2fac5d5..350cfff85e31 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -79,23 +79,37 @@ def test_parameter_invalid_access(): @with_seed() def test_paramdict(): - params = gluon.ParameterDict('net_') - params.get('w0', shape=(10, 10)) - params.get('w1', shape=(10, 10), stype='row_sparse') + params0 = gluon.ParameterDict('net_') + params0.get('w0', shape=(10, 10)) + params0.get('w1', shape=(10, 10), stype='row_sparse') all_row_ids = mx.nd.arange(0, 10) - assert list(params.keys()) == ['net_w0', 'net_w1'] - params.initialize(ctx=mx.cpu()) - prev_w0 = params.get('w0').data(mx.cpu()) - prev_w1 = params.get('w1').row_sparse_data(all_row_ids) + assert list(params0.keys()) == ['net_w0', 'net_w1'] + params0.initialize(ctx=mx.cpu()) + prev_w0 = params0.get('w0').data(mx.cpu()) + prev_w1 = params0.get('w1').row_sparse_data(all_row_ids) - params.save('test.params') - params.load('test.params', mx.cpu()) + params0.save('test.params') + params0.load('test.params', mx.cpu()) # compare the values before and after save/load - cur_w0 = params.get('w0').data(mx.cpu()) - cur_w1 = params.get('w1').row_sparse_data(all_row_ids) + cur_w0 = params0.get('w0').data(mx.cpu()) + cur_w1 = params0.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) + # create a new param dict with dense params, and load from the checkpoint + # of sparse & dense params + params1 = gluon.ParameterDict('net_') + params1.get('w0', shape=(10, 10)) + params1.get('w1', shape=(10, 10)) + assertRaises(RuntimeError, params1.load, 'test.params', mx.cpu()) + params1.load('test.params', mx.cpu(), cast_stype=True) + # compare the values before and after save/load + cur_w0 = params1.get('w0').data(mx.cpu()) + cur_w1 = params1.get('w1').data(mx.cpu()) + mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) + mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) + + @with_seed() def test_parameter_row_sparse_data(): def check_parameter_row_sparse_data(with_trainer): @@ -1105,6 +1119,7 @@ def test_save_load(): net.load_params('test.params') +@with_seed() def test_symbol_block_save_load(): class 
Net(gluon.HybridBlock): def __init__(self): From 6f0f403e5c99a31e2100d3b23063ace397b0182c Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 00:46:23 +0000 Subject: [PATCH 06/21] refactor trainer tests --- python/mxnet/gluon/contrib/__init__.py | 2 - python/mxnet/gluon/contrib/block.py | 66 -------- python/mxnet/gluon/contrib/nn/basic_layers.py | 1 - python/mxnet/gluon/trainer.py | 1 - src/operator/tensor/indexing_op.h | 6 + tests/python/unittest/test_gluon.py | 84 ---------- tests/python/unittest/test_gluon_trainer.py | 150 ++++++++++++++++++ 7 files changed, 156 insertions(+), 154 deletions(-) delete mode 100644 python/mxnet/gluon/contrib/block.py create mode 100644 tests/python/unittest/test_gluon_trainer.py diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index a1d96e5ad408..f708fb900227 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -23,5 +23,3 @@ from . import rnn from . import data - -from .block import * diff --git a/python/mxnet/gluon/contrib/block.py b/python/mxnet/gluon/contrib/block.py deleted file mode 100644 index 036cd58e39d6..000000000000 --- a/python/mxnet/gluon/contrib/block.py +++ /dev/null @@ -1,66 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable= arguments-differ -"""Base container class for contrib neural network models.""" -__all__ = ['SparseBlock'] - -from .. import Block - -class SparseBlock(Block): - """`SparseBlock` only supports forwarding with NDArray. - """ - def __init__(self, prefix=None, params=None): - super(SparseBlock, self).__init__(prefix=prefix, params=params) - - def forward(self, x, *args): - """Defines the forward computation. Arguments has to be :py:class:`NDArray`.""" - assert isinstance(x, NDArray), \ - "SparseBlock requires the first argument to forward to be an NDArray, " \ - "but got %s"%type(x) - with x.context as ctx: - params = {} - for name, param in self._reg_params.items(): - # If a parameter is not dense, instead of passing the NDArray to - # sparse_forward(), the parameter itself is passed upon which - # row_sparse_data() will be performed. - if param._stype != 'default': - params[name] = param - else: - params[name] = param.data(ctx) - return self.sparse_forward(x, *args, **params) - - def sparse_forward(self, F, x, *args, **kwargs): - """Overrides to define sparse forward computation for this `SparseBlock`. - Note that the *args for :py:meth:`SparseBlock.sparse_forward` is a list of - :py:class:`NDArray`s and :py:class:`Parameter`s. If the storage type of any - Parameter is sparse, the Parameter is passed as :py:class:`Parameter` by itself. - Otherwise, the Parameter is passed as a :py:class:`NDArray`. 
- - When overridding sparse_forward, typically one needs to invoke - :py:meth:`Parameter.row_sparse_data` to access the data of the Parameter. - - Parameters - ---------- - x : NDArray - The first input tensor. - *args : list of Parameter or NDArray - Additional input tensors or sparse parameters. - """ - # pylint: disable= invalid-name - raise NotImplementedError diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index 3e68bd6cb9e5..eccdf18c1bbb 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -22,7 +22,6 @@ from .... import nd from ...block import HybridBlock -from ..block import SparseBlock from ...nn import Sequential, HybridSequential class Concurrent(Sequential): diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 5bd9f2012595..453055e09472 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -203,7 +203,6 @@ def _row_sparse_pull(self, parameter, out, row_id): self._init_kvstore() if self._params_to_init: self._init_params() - # TODO check state self._kvstore.row_sparse_pull(self._param2idx[parameter.name], \ out=out, row_ids=row_id) diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 28827db0e635..23a866d75af2 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -270,6 +270,12 @@ inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, dispatched = dispatch_mode_assign(dispatch_mode, target_mode); } } + // Print user friendly error message to notify misuses of sparse_grad + if (weight_grad_stype != target_stype) { + LOG(FATAL) << "Cannot use sparse_grad = " << sparse_grad + << ", while stype of gradients w.r.t embedding weight is " + << common::stype_string(weight_grad_stype); + } return dispatched; } diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 350cfff85e31..cc55644db68e 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -28,8 +28,6 @@ import json import unittest - - @with_seed() def test_parameter(): p = gluon.Parameter('weight', shape=(10, 10)) @@ -587,88 +585,6 @@ def test_flatten(): x = mx.nd.zeros((3,)) assert flatten(x).shape == (3, 1) -@with_seed() -@raises(RuntimeError) -def test_multi_trainer(): - x = gluon.Parameter('x', shape=(10,)) - x.initialize() - trainer0 = gluon.Trainer([x], 'sgd') - # multiple trainers for a single Parameter is not allowed - trainer1 = gluon.Trainer([x], 'sgd') - -@with_seed() -def test_trainer(): - def dict_equ(a, b): - assert set(a) == set(b) - for k in a: - assert (a[k].asnumpy() == b[k].asnumpy()).all() - x = gluon.Parameter('x', shape=(10,)) - x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') - trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}) - with mx.autograd.record(): - for w in x.list_data(): - y = w + 1 - y.backward() - trainer.step(1) - - assert (x.data(mx.cpu(1)).asnumpy() == -2).all() - - x.lr_mult = 0.5 - - with mx.autograd.record(): - for w in x.list_data(): - y = w + 1 - y.backward() - trainer.step(1) - - assert (x.data(mx.cpu(1)).asnumpy() == -4).all() - - trainer.save_states('test_trainer.states') - states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \ - else deepcopy(trainer._updaters[0].states) - trainer.load_states('test_trainer.states') - if trainer._update_on_kvstore: - 
dict_equ(trainer._kvstore._updater.states, states) - assert trainer._optimizer == trainer._kvstore._updater.optimizer - else: - for updater in trainer._updaters: - dict_equ(updater.states, states) - assert trainer._optimizer == trainer._updaters[0].optimizer - assert_raises(AssertionError, trainer.update, 1) - assert_raises(AssertionError, trainer.allreduce_grads) - - x = gluon.Parameter('x', shape=(10,)) - x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') - trainer2 = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}, - update_on_kvstore=False) - with mx.autograd.record(): - for i, w in enumerate(x.list_data()): - y = i*w - y.backward() - assert (x.grad(mx.cpu(0)).asnumpy() != x.grad(mx.cpu(1)).asnumpy()).all() - trainer2.allreduce_grads() - assert (x.grad(mx.cpu(0)).asnumpy() == x.grad(mx.cpu(1)).asnumpy()).all() - trainer2.update(1) - - assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy() - -@with_seed() -def test_trainer_save_load(): - x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) - x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') - trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) - with mx.autograd.record(): - for w in x.list_data(): - y = w + 1 - y.backward() - trainer.step(1) - assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1 - trainer.save_states('test_trainer_save_load.states') - trainer.load_states('test_trainer_save_load.states') - x.lr_mult = 2.0 - # check if parameter dict is correctly associated with optimizer after load_state - assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 - @with_seed() def test_block_attr_hidden(): b = gluon.Block() diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py new file mode 100644 index 000000000000..2d7e97b9dd3f --- /dev/null +++ b/tests/python/unittest/test_gluon_trainer.py @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
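+#
+# Tests for gluon.Trainer: trainer/parameter association, optimizer state
+# save/load, and lazy KVStore initialization of 'row_sparse' parameters.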
+ +import mxnet as mx +import unittest +import numpy as np +from mxnet import gluon +from mxnet.gluon import nn +from mxnet.test_utils import assert_almost_equal +from common import setup_module, with_seed, assertRaises +from copy import deepcopy +from nose.tools import raises, assert_raises + +@with_seed() +@raises(RuntimeError) +def test_multi_trainer(): + x = gluon.Parameter('x', shape=(10,)) + x.initialize() + trainer0 = gluon.Trainer([x], 'sgd') + # multiple trainers for a single Parameter is not allowed + trainer1 = gluon.Trainer([x], 'sgd') + +@with_seed() +def test_trainer(): + def dict_equ(a, b): + assert set(a) == set(b) + for k in a: + assert (a[k].asnumpy() == b[k].asnumpy()).all() + x = gluon.Parameter('x', shape=(10,)) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}) + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + + assert (x.data(mx.cpu(1)).asnumpy() == -2).all() + + x.lr_mult = 0.5 + + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + + assert (x.data(mx.cpu(1)).asnumpy() == -4).all() + + trainer.save_states('test_trainer.states') + states = deepcopy(trainer._kvstore._updater.states) if trainer._update_on_kvstore \ + else deepcopy(trainer._updaters[0].states) + trainer.load_states('test_trainer.states') + if trainer._update_on_kvstore: + dict_equ(trainer._kvstore._updater.states, states) + assert trainer._optimizer == trainer._kvstore._updater.optimizer + else: + for updater in trainer._updaters: + dict_equ(updater.states, states) + assert trainer._optimizer == trainer._updaters[0].optimizer + assert_raises(AssertionError, trainer.update, 1) + assert_raises(AssertionError, trainer.allreduce_grads) + + x = gluon.Parameter('x', shape=(10,)) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer2 = gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'momentum': 0.5}, + update_on_kvstore=False) + with mx.autograd.record(): + for i, w in enumerate(x.list_data()): + y = i*w + y.backward() + assert (x.grad(mx.cpu(0)).asnumpy() != x.grad(mx.cpu(1)).asnumpy()).all() + trainer2.allreduce_grads() + assert (x.grad(mx.cpu(0)).asnumpy() == x.grad(mx.cpu(1)).asnumpy()).all() + trainer2.update(1) + + assert (x.data(mx.cpu(1)).asnumpy() == -1).all(), x.data(mx.cpu(1)).asnumpy() + +@with_seed() +def test_trainer_save_load(): + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) + x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') + trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) + with mx.autograd.record(): + for w in x.list_data(): + y = w + 1 + y.backward() + trainer.step(1) + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.1 + trainer.save_states('test_trainer_save_load.states') + trainer.load_states('test_trainer_save_load.states') + x.lr_mult = 2.0 + # check if parameter dict is correctly associated with optimizer after load_state + assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 + +@with_seed() +def test_trainer_multi_layer_init(): + class Net(gluon.Block): + def __init__(self, **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + # sparse param + self.embed_weight = self.params.get('embed_weight', stype='row_sparse', + shape=(4,3), grad_stype='row_sparse') + # dense param from a hybrid block + self.dense0 = nn.Dense(2) + + def forward(self, x): + embed_weight = self.embed_weight.row_sparse_data(x) + embed = 
mx.nd.Embedding(data=x, weight=embed_weight, + input_dim=4, output_dim=3, sparse_grad=True) + return self.dense0(embed) + + net = Net(prefix='net_') + ctxes = [mx.cpu(1), mx.cpu(2)] + net.initialize(mx.init.One(), ctx=ctxes) + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) + data = mx.nd.array([[0,2], [1,2]]) + xs = gluon.utils.split_and_load(data, ctxes) + ys = [] + with mx.autograd.record(): + for x in xs: + y = net(x) + ys.append(y) + for y in ys: + y.backward() + trainer.step(1) + # all parameters should be initialized + assert not trainer._params_to_init + all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) + # check the updated weights + weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() + assert (weight[0] == -1).all() + assert (weight[1] == -1).all() + assert (weight[2] == -3).all() + assert (weight[3] == 1).all() From 8db0499bc585616e8f2df5da1842aab3ca22abf8 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 18:04:51 +0000 Subject: [PATCH 07/21] add test --- tests/python/unittest/test_gluon_trainer.py | 49 +++++++++++---------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 2d7e97b9dd3f..0f0f6c8dca75 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -125,26 +125,29 @@ def forward(self, x): input_dim=4, output_dim=3, sparse_grad=True) return self.dense0(embed) - net = Net(prefix='net_') - ctxes = [mx.cpu(1), mx.cpu(2)] - net.initialize(mx.init.One(), ctx=ctxes) - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) - data = mx.nd.array([[0,2], [1,2]]) - xs = gluon.utils.split_and_load(data, ctxes) - ys = [] - with mx.autograd.record(): - for x in xs: - y = net(x) - ys.append(y) - for y in ys: - y.backward() - trainer.step(1) - # all parameters should be initialized - assert not trainer._params_to_init - all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) - # check the updated weights - weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() - assert (weight[0] == -1).all() - assert (weight[1] == -1).all() - assert (weight[2] == -3).all() - assert (weight[3] == 1).all() + def check_init(ctxes): + net = Net(prefix='net_') + net.initialize(mx.init.One(), ctx=ctxes) + trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) + data = mx.nd.array([[0,2], [1,2]]) + xs = gluon.utils.split_and_load(data, ctxes) + ys = [] + with mx.autograd.record(): + for x in xs: + y = net(x) + ys.append(y) + for y in ys: + y.backward() + trainer.step(1) + # all parameters should be initialized + assert not trainer._params_to_init + all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) + # check the updated weights + weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() + assert (weight[0] == -1).all() + assert (weight[1] == -1).all() + assert (weight[2] == -3).all() + assert (weight[3] == 1).all() + + check_init([mx.cpu(1), mx.cpu(2)]) + check_init([mx.cpu(1)]) From 83009bc655610bc9f85495cca4a7eea768d96a55 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 18:16:47 +0000 Subject: [PATCH 08/21] add back test --- tests/python/unittest/test_gluon.py | 500 ++++++++++++++++++++++++++++ 1 file changed, 500 insertions(+) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 4c60492718d0..59f3c0ffbe8b 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -585,6 
+585,506 @@ def test_flatten(): x = mx.nd.zeros((3,)) assert flatten(x).shape == (3, 1) +@with_seed() +def test_block_attr_hidden(): + b = gluon.Block() + + # regular attributes can change types + b.a = None + b.a = 1 + + +@raises(TypeError) +@with_seed() +def test_block_attr_block(): + b = gluon.Block() + + # regular variables can't change types + b.b = gluon.Block() + b.b = (2,) + + +@raises(TypeError) +@with_seed() +def test_block_attr_param(): + b = gluon.Block() + + # regular variables can't change types + b.b = gluon.Parameter() + b.b = (2,) + + +@with_seed() +def test_block_attr_regular(): + b = gluon.Block() + + # set block attribute also sets _children + b.c = gluon.Block() + c2 = gluon.Block() + b.c = c2 + assert b.c is c2 and list(b._children.values())[0] is c2 + + +@with_seed() +def test_block_attr_list_of_block(): + class Model1(gluon.Block): + def __init__(self, **kwargs): + super(Model1, self).__init__(**kwargs) + with self.name_scope(): + self.layers = [nn.Dense(i * 10) for i in range(6)] + + class Model2(gluon.Block): + def __init__(self, **kwargs): + super(Model2, self).__init__(**kwargs) + with self.name_scope(): + self.layers = dict() + self.layers['a'] = [nn.Dense(10), nn.Dense(10)] + + class Model3(gluon.Block): + def __init__(self, **kwargs): + super(Model3, self).__init__(**kwargs) + with self.name_scope(): + self.layers = nn.Sequential() + self.layers.add(*[nn.Dense(i * 10) for i in range(6)]) + + class Model4(gluon.Block): + def __init__(self, **kwargs): + super(Model4, self).__init__(**kwargs) + with self.name_scope(): + self.data = {'a': '4', 'b': 123} + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model1() + model.collect_params() + assert len(w) > 0 + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model2() + model.collect_params() + assert len(w) > 0 + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model3() + model.collect_params() + assert len(w) == 0 + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + model = Model4() + model.collect_params() + assert len(w) == 0 + +def check_sequential(net): + dense1 = gluon.nn.Dense(10) + net.add(dense1) + dense2 = gluon.nn.Dense(10) + net.add(dense2) + dense3 = gluon.nn.Dense(10) + net.add(dense3) + + assert net[1] is dense2 + assert net[-1] is dense3 + slc = net[1:3] + assert len(slc) == 2 and slc[0] is dense2 and slc[1] is dense3 + assert isinstance(slc, type(net)) + +@with_seed() +def test_sequential(): + check_sequential(gluon.nn.Sequential()) + check_sequential(gluon.nn.HybridSequential()) + +@with_seed() +def test_sequential_warning(): + with warnings.catch_warnings(record=True) as w: + # The following line permits the test to pass if run multiple times + warnings.simplefilter('always') + b = gluon.nn.Sequential() + b.add(gluon.nn.Dense(20)) + b.hybridize() + assert len(w) == 1 + + +@with_seed() +def test_global_norm_clip(): + x1 = mx.nd.ones((3,3)) + x2 = mx.nd.ones((4,4)) + norm = gluon.utils.clip_global_norm([x1, x2], 1.0) + assert norm == 5.0 + assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5) + assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5) + + x3 = mx.nd.array([1.0, 2.0, float('nan')]) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + gluon.utils.clip_global_norm([x1, x3], 2.0) + assert len(w) == 1 + + +@with_seed() +def test_embedding(): + def check_embedding(sparse_grad): + layer = 
gluon.nn.Embedding(10, 100, sparse_grad=sparse_grad) + layer.initialize() + x = mx.nd.array([3,4,2,0,1]) + with mx.autograd.record(): + y = layer(x) + y.backward() + assert (layer.weight.grad().asnumpy()[:5] == 1).all() + assert (layer.weight.grad().asnumpy()[5:] == 0).all() + check_embedding(True) + check_embedding(False) + +@with_seed() +def test_export(): + ctx = mx.context.current_context() + model = gluon.model_zoo.vision.resnet18_v1( + prefix='resnet', ctx=ctx, pretrained=True) + model.hybridize() + data = mx.nd.random.normal(shape=(1, 3, 224, 224)) + out = model(data) + + model.export('gluon') + + module = mx.mod.Module.load('gluon', 0, label_names=None, context=ctx) + module.bind(data_shapes=[('data', data.shape)]) + module.forward(mx.io.DataBatch([data], None), is_train=False) + mod_out, = module.get_outputs() + + assert_almost_equal(out.asnumpy(), mod_out.asnumpy()) + + model2 = gluon.model_zoo.vision.resnet18_v1(prefix='resnet', ctx=ctx) + model2.collect_params().load('gluon-0000.params', ctx) + out2 = model2(data) + + assert_almost_equal(out.asnumpy(), out2.asnumpy()) + + +@with_seed() +def test_hybrid_stale_cache(): + net = mx.gluon.nn.HybridSequential() + with net.name_scope(): + net.add(mx.gluon.nn.Dense(10, weight_initializer='zeros', bias_initializer='ones', flatten=False)) + + net.hybridize() + net.initialize() + net(mx.nd.ones((2,3,5))) + + net.add(mx.gluon.nn.Flatten()) + assert net(mx.nd.ones((2,3,5))).shape == (2, 30) + + net = mx.gluon.nn.HybridSequential() + with net.name_scope(): + net.fc1 = mx.gluon.nn.Dense(10, weight_initializer='zeros', + bias_initializer='ones', flatten=False) + net.fc2 = mx.gluon.nn.Dense(10, weight_initializer='zeros', + bias_initializer='ones', flatten=False) + net.hybridize() + net.initialize() + net(mx.nd.ones((2,3,5))) + + net.fc2 = mx.gluon.nn.Dense(10, weight_initializer='zeros', + bias_initializer='ones', flatten=True) + net.initialize() + assert net(mx.nd.ones((2,3,5))).shape == (2, 10) + + +@with_seed() +def test_lambda(): + net1 = mx.gluon.nn.HybridSequential() + net1.add(nn.Activation('tanh'), + nn.LeakyReLU(0.1)) + + net2 = mx.gluon.nn.HybridSequential() + op3 = lambda F, x, *args: F.LeakyReLU(x, *args, slope=0.1) + net2.add(nn.HybridLambda('tanh'), + nn.HybridLambda(op3)) + + op4 = lambda x: mx.nd.LeakyReLU(x, slope=0.1) + net3 = mx.gluon.nn.Sequential() + net3.add(nn.Lambda('tanh'), + nn.Lambda(op4)) + + input_data = mx.nd.random.uniform(shape=(2, 3, 5, 7)) + out1, out2, out3 = net1(input_data), net2(input_data), net3(input_data) + assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-3, atol=1e-3) + assert_almost_equal(out1.asnumpy(), out3.asnumpy(), rtol=1e-3, atol=1e-3) + + +@with_seed() +def test_fill_shape_deferred(): + net = nn.HybridSequential() + with net.name_scope(): + net.add(nn.Conv2D(64, kernel_size=2, padding=1), + nn.BatchNorm(), + nn.Dense(10)) + net.hybridize() + net.initialize() + net(mx.nd.ones((2,3,5,7))) + assert net[0].weight.shape[1] == 3, net[0].weight.shape[1] + assert net[1].gamma.shape[0] == 64, net[1].gamma.shape[0] + assert net[2].weight.shape[1] == 3072, net[2].weight.shape[1] + + +@with_seed() +def test_dtype(): + net = mx.gluon.model_zoo.vision.resnet18_v1() + net.initialize() + net.cast('float64') + with mx.autograd.record(): + y = net(mx.nd.ones((16, 3, 32, 32), dtype='float64')) + y.backward() + + net = mx.gluon.model_zoo.vision.resnet18_v1() + net.initialize() + net.hybridize() + net(mx.nd.ones((16, 3, 32, 32), dtype='float32')) + + net.cast('float64') + net(mx.nd.ones((16, 3, 32, 32), 
dtype='float64'))
+
+    mx.nd.waitall()
+
+    class Net(gluon.Block):
+        def __init__(self, in_dim, output_dim):
+            super(Net, self).__init__()
+            with self.name_scope():
+                self.embed = gluon.nn.Embedding(input_dim=in_dim, output_dim=output_dim, dtype=np.float64)
+                self.dense = gluon.nn.Dense(2, dtype=np.float64)
+
+        def forward(self, x):
+            e = self.embed(x)
+            assert(e.dtype == np.float64)
+            y = self.dense(e)
+            assert(y.dtype == np.float64)
+            return y
+
+    net = Net(5, 10)
+    net.initialize()
+    out = net(mx.nd.ones((3,), dtype=np.float64))
+    mx.nd.waitall()
+
+@with_seed()
+def test_fill_shape_load():
+    ctx = mx.context.current_context()
+    net1 = nn.HybridSequential()
+    with net1.name_scope():
+        net1.add(nn.Conv2D(64, kernel_size=2, padding=1),
+                 nn.BatchNorm(),
+                 nn.Dense(10))
+    net1.hybridize()
+    net1.initialize(ctx=ctx)
+    net1(mx.nd.ones((2,3,5,7), ctx))
+    net1.save_params('net_fill.params')
+
+    net2 = nn.HybridSequential()
+    with net2.name_scope():
+        net2.add(nn.Conv2D(64, kernel_size=2, padding=1),
+                 nn.BatchNorm(),
+                 nn.Dense(10))
+    net2.hybridize()
+    net2.initialize()
+    net2.load_params('net_fill.params', ctx)
+    assert net2[0].weight.shape[1] == 3, net2[0].weight.shape[1]
+    assert net2[1].gamma.shape[0] == 64, net2[1].gamma.shape[0]
+    assert net2[2].weight.shape[1] == 3072, net2[2].weight.shape[1]
+
+
+@with_seed()
+def test_inline():
+    net = mx.gluon.nn.HybridSequential()
+    with net.name_scope():
+        net.add(mx.gluon.nn.Dense(10))
+        net.add(mx.gluon.nn.Dense(10))
+        net.add(mx.gluon.nn.Dense(10))
+
+    net.initialize()
+    net.hybridize(inline_limit=3)
+    with mx.autograd.record():
+        y = net(mx.nd.zeros((1,10)))
+
+    len_1 = len(json.loads(mx.autograd.get_symbol(y).tojson())['nodes'])
+    y.backward()
+
+    net.hybridize(inline_limit=0)
+    with mx.autograd.record():
+        y = net(mx.nd.zeros((1,10)))
+
+    len_2 = len(json.loads(mx.autograd.get_symbol(y).tojson())['nodes'])
+    y.backward()
+
+    assert len_1 == len_2 + 2
+
+
+@with_seed()
+def test_activations():
+    point_to_validate = mx.nd.array([-0.1, 0.1] * 3)
+
+    swish = mx.gluon.nn.Swish()
+    def swish_test(x):
+        return x * mx.nd.sigmoid(x)
+
+    for test_point, ref_point in zip(swish_test(point_to_validate), swish(point_to_validate)):
+        assert test_point == ref_point
+
+    elu = mx.gluon.nn.ELU()
+    def elu_test(x):
+        def elu(x):
+            return 1.0 * (mx.nd.exp(x) - 1) if x < 0 else x
+        return [elu(x_i) for x_i in x]
+
+    for test_point, ref_point in zip(elu_test(point_to_validate), elu(point_to_validate)):
+        assert test_point == ref_point
+
+    selu = mx.gluon.nn.SELU()
+    def selu_test(x):
+        def selu(x):
+            scale, alpha = 1.0507009873554804934193349852946, 1.6732632423543772848170429916717
+            return scale * x if x >= 0 else alpha * mx.nd.exp(x) - alpha
+        return [selu(x_i) for x_i in x]
+
+    for test_point, ref_point in zip(selu_test(point_to_validate), selu(point_to_validate)):
+        assert test_point == ref_point
+
+    prelu = mx.gluon.nn.PReLU()
+    prelu.initialize()
+    x = point_to_validate.reshape((1, 3, 2))
+    assert_almost_equal(prelu(x).asnumpy(), mx.nd.where(x >= 0, x, 0.25 * x).asnumpy())
+
+@with_seed()
+def test_dropout():
+    def get_slice(x, axis, idx):
+        ix = ()
+        for i in range(x.ndim):
+            if i == axis:
+                ix += (idx,)
+            else:
+                ix += (slice(None, None, None),)
+        return x[ix]
+
+    def check_dropout_axes(ratio, shape, axes):
+        compactshape = list(shape)
+        for axis in axes:
+            compactshape[axis] = 1
+        compactx = mx.random.uniform(shape=tuple(compactshape))
+        broadcastx = compactx.broadcast_to(shape)
+        dropouty = mx.gluon.nn.Dropout(rate=ratio, axes=axes)(broadcastx)
+        for axis in 
axes: + target = get_slice(dropouty, axis, 0).asnumpy() + for i in range(1, shape[axis]): + assert(get_slice(dropouty, axis, i).asnumpy() == target).all() + + nshape = (10, 10, 10, 10) + with mx.autograd.train_mode(): + check_dropout_axes(0.25, nshape, axes = (0,)) + check_dropout_axes(0.25, nshape, axes = (1,)) + check_dropout_axes(0.25, nshape, axes = (2,)) + check_dropout_axes(0.25, nshape, axes = (3,)) + check_dropout_axes(0.25, nshape, axes = (0, 1)) + check_dropout_axes(0.25, nshape, axes = (0, 2)) + check_dropout_axes(0.25, nshape, axes = (0, 3)) + check_dropout_axes(0.25, nshape, axes = (1, 2)) + check_dropout_axes(0.25, nshape, axes = (1, 3)) + check_dropout_axes(0.25, nshape, axes = (2, 3)) + check_dropout_axes(0.25, nshape, axes = (0, 1, 2)) + check_dropout_axes(0.25, nshape, axes = (0, 2, 3)) + check_dropout_axes(0.25, nshape, axes = (1, 2, 3)) + +@with_seed() +def test_req(): + data = mx.nd.random.uniform(shape=(1,3,224,224)) + label = mx.nd.random.uniform(shape=(1)) + label[:] = 1 + loss = gluon.loss.SoftmaxCrossEntropyLoss() + + net = nn.HybridSequential() + net1 = nn.HybridSequential() + net1.add(nn.Dense(4)) + net2 = nn.HybridSequential() + net2.add(nn.Dense(3)) + net2.add(nn.Dense(2)) + net.add(net1) + net.add(net2) + net.initialize() + + net.hybridize() + + for v in net.collect_params().values(): + v.grad_req = 'add' + + net.collect_params().zero_grad() + with mx.autograd.record(): + pred = net(data) + l = loss(pred, label) + l.backward() + grad = net[0][0].weight.grad().mean().asnumpy() + # run twice to check req = add + pred = net(data) + l = loss(pred, label) + l.backward() + + grad_double = net[0][0].weight.grad().mean().asnumpy() + assert_almost_equal(grad * 2, grad_double) + + +@with_seed() +def test_save_load(): + net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True) + net.save_params('test.params') + + net = mx.gluon.model_zoo.vision.get_resnet(1, 18) + net.output = mx.gluon.nn.Dense(1000) + + net.load_params('test.params') + +@with_seed() +def test_symbol_block_save_load(): + class Net(gluon.HybridBlock): + def __init__(self): + super(Net, self).__init__() + with self.name_scope(): + backbone = gluon.model_zoo.vision.resnet18_v1() + data = mx.sym.var('data') + featnames = ['stage1_activation0', 'stage2_activation0', 'stage3_activation0'] + out_names = ['_'.join([backbone.name, featname, 'output']) for featname in featnames] + internals = backbone(data).get_internals() + outs = [internals[out_name] for out_name in out_names] + self.backbone = gluon.SymbolBlock(outs, data, params=backbone.collect_params()) + self.body = nn.Conv2D(3, 1) + + def hybrid_forward(self, F, x): + x = self.body(x) + return self.backbone(x) + + net1 = Net() + net1.initialize(mx.init.Normal()) + net1.hybridize() + net1(mx.nd.random.normal(shape=(1, 3, 32, 32))) + net1.save_params('./test.params') + + net2 = Net() + net2.load_params('./test.params', ctx=mx.cpu()) + + +@with_seed() +def test_hybrid_multi_context(): + net = mx.gluon.model_zoo.vision.get_resnet(1, 18) + net.initialize(ctx=[mx.cpu(0), mx.cpu(1)]) + net.hybridize() + net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() + + +@with_seed() +def test_zero_grad(): + data = mx.nd.random.uniform(shape=(3,3)) + net = nn.Embedding(3, 4, sparse_grad=True, prefix='test_zero_grad_') + net.initialize() + with mx.autograd.record(): + l = net(data) + l.backward() + net.collect_params().zero_grad() + grad = net.collect_params()['test_zero_grad_weight'].grad() + assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0) + if 
__name__ == '__main__': import nose From cf006c8036b240d6e77577de6c40a00ffc6862ba Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 18 May 2018 23:48:35 +0000 Subject: [PATCH 09/21] raise error for load params --- python/mxnet/gluon/parameter.py | 20 ++---- python/mxnet/gluon/trainer.py | 1 - tests/python/unittest/test_gluon.py | 94 +++++++++++++++-------------- 3 files changed, 56 insertions(+), 59 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 224759f6c66a..6360135b439c 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -212,25 +212,14 @@ def _check_and_get(self, arr_list, ctx): def _get_row_sparse(self, arr_list, ctx, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ - results = self._check_and_get(arr_list, ctx) - # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): raise TypeError("Cannot get 'row_sparse' Parameter %s with row_id = %s. " "NDArray type is expected." % (self.name, row_id)) if not self._trainer: - # the sparse param is not yet associated with any trainer. - # and create new buffers to hold the outputs. - ret = [] - if isinstance(results, ndarray.NDArray): - rows = row_id.as_in_context(results.context) - ret = ndarray.sparse.retain(results, rows) - else: - # list of NDArrays - for result in results: - rows = row_id.as_in_context(result.context) - ret.append(ndarray.sparse.retain(result, rows)) - return ret + raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ + "Trainer is created with it."%self.name) + results = self._check_and_get(arr_list, ctx) # fetch row sparse params from the trainer self._trainer._row_sparse_pull(self, results, row_id) @@ -238,6 +227,9 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx, cast_stype=False): """(Re)initializes by loading from data.""" + if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: + raise RuntimeError("Cannot (Re)initialize Parameter '%s' when its Trainer " \ + "already initialized the parameter on KVStore."%(self.name)) if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): assert self_dim == 0 or self_dim == data_dim, \ diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 453055e09472..188ecedcecec 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -198,7 +198,6 @@ def set_learning_rate(self, lr): def _row_sparse_pull(self, parameter, out, row_id): # initialize kv and params if not already - assert self._kvstore, "Cannot perform row_sparse_pull() when KVStore is not set" if not self._kv_initialized: self._init_kvstore() if self._params_to_init: diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 59f3c0ffbe8b..0285fac8b544 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -48,8 +48,11 @@ def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) row_id = mx.nd.arange(0, 10, ctx=mx.cpu(1)) - assert len(p.list_row_sparse_data(row_id)) == 2 assert len(p.list_grad()) == 2 + # getting row_sparse data without trainer throws an exception + assertRaises(RuntimeError, p.list_row_sparse_data, row_id) + trainer = mx.gluon.Trainer([p], 'sgd') + assert len(p.list_row_sparse_data(row_id)) == 2 weight = 
p.row_sparse_data(row_id) assert weight.context == mx.cpu(1) assert weight.shape == (10, 10) @@ -83,60 +86,63 @@ def test_paramdict(): all_row_ids = mx.nd.arange(0, 10) assert list(params0.keys()) == ['net_w0', 'net_w1'] params0.initialize(ctx=mx.cpu()) + trainer0 = mx.gluon.Trainer(params0, 'sgd') prev_w0 = params0.get('w0').data(mx.cpu()) prev_w1 = params0.get('w1').row_sparse_data(all_row_ids) + params0.save('test_paramdict.params') + # Cannot load parameters if they are already initialized on trainer's kvstore + assertRaises(RuntimeError, params0.load, 'test_paramdict.params', mx.cpu()) + + params1 = gluon.ParameterDict('net_') + params1.get('w0', shape=(10, 10)) + params1.get('w1', shape=(10, 10), stype='row_sparse') + params1.load('test_paramdict.params', mx.cpu()) + trainer1 = mx.gluon.Trainer(params1, 'sgd') - params0.save('test.params') - params0.load('test.params', mx.cpu()) # compare the values before and after save/load - cur_w0 = params0.get('w0').data(mx.cpu()) - cur_w1 = params0.get('w1').row_sparse_data(all_row_ids) + cur_w0 = params1.get('w0').data(mx.cpu()) + cur_w1 = params1.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) - # create a new param dict with dense params, and load from the checkpoint # of sparse & dense params - params1 = gluon.ParameterDict('net_') - params1.get('w0', shape=(10, 10)) - params1.get('w1', shape=(10, 10)) - assertRaises(RuntimeError, params1.load, 'test.params', mx.cpu()) - params1.load('test.params', mx.cpu(), cast_stype=True) + params2 = gluon.ParameterDict('net_') + params2.get('w0', shape=(10, 10)) + params2.get('w1', shape=(10, 10)) + assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) + params2.load('test_paramdict.params', mx.cpu(), cast_stype=True) # compare the values before and after save/load - cur_w0 = params1.get('w0').data(mx.cpu()) - cur_w1 = params1.get('w1').data(mx.cpu()) + cur_w0 = params2.get('w0').data(mx.cpu()) + cur_w1 = params2.get('w1').data(mx.cpu()) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) @with_seed() def test_parameter_row_sparse_data(): - def check_parameter_row_sparse_data(with_trainer): - ctx0 = mx.cpu(1) - ctx1 = mx.cpu(2) - dim0 = 4 - x = gluon.Parameter('x', shape=(dim0, 2), stype='row_sparse') - x.initialize(init='xavier', ctx=[ctx0, ctx1]) - if with_trainer: - trainer = gluon.Trainer([x], 'sgd') - x_param = x._data[0].copy() - assert x_param.stype == 'row_sparse' - row_id_0 = mx.nd.array([0,1], ctx=ctx0) - retained_0 = x.row_sparse_data(row_id_0) - retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) - mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) - assert retained_0.context == ctx0 - row_id_1 = mx.nd.arange(0, dim0, ctx=ctx1) - retained_1 = x.row_sparse_data(row_id_1) - retained_target_1 = x_param - mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) - assert retained_1.context == ctx1 - row_id_2 = mx.nd.array([0,1,2]) - retained_2 = x.list_row_sparse_data(row_id_2) - retained_target_2 = mx.nd.sparse.retain(x_param, row_id_2.as_in_context(ctx0)) - mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) - - check_parameter_row_sparse_data(True) - check_parameter_row_sparse_data(False) + ctx0 = mx.cpu(1) + ctx1 = mx.cpu(2) + 
dim0 = 4 + x = gluon.Parameter('x', shape=(dim0, 2), stype='row_sparse') + x.initialize(init='xavier', ctx=[ctx0, ctx1]) + trainer = gluon.Trainer([x], 'sgd') + x_param = x._data[0].copy() + assert x_param.stype == 'row_sparse' + row_id_0 = mx.nd.array([0,1], ctx=ctx0) + retained_0 = x.row_sparse_data(row_id_0) + retained_target_0 = mx.nd.sparse.retain(x_param, row_id_0.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_0.asnumpy(), retained_target_0.asnumpy()) + assert retained_0.context == ctx0 + row_id_1 = mx.nd.arange(0, dim0, ctx=ctx1) + retained_1 = x.row_sparse_data(row_id_1) + retained_target_1 = x_param + mx.test_utils.assert_almost_equal(retained_1.asnumpy(), retained_target_1.asnumpy()) + assert retained_1.context == ctx1 + row_id_2 = mx.nd.array([0,1,2]) + retained_2 = x.list_row_sparse_data(row_id_2) + retained_target_2 = mx.nd.sparse.retain(x_param, row_id_2.as_in_context(ctx0)) + mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) + @with_seed() def test_constant(): @@ -1029,12 +1035,12 @@ def test_req(): @with_seed() def test_save_load(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=True) - net.save_params('test.params') + net.save_params('test_save_load.params') net = mx.gluon.model_zoo.vision.get_resnet(1, 18) net.output = mx.gluon.nn.Dense(1000) - net.load_params('test.params') + net.load_params('test_save_load.params') @with_seed() def test_symbol_block_save_load(): @@ -1059,10 +1065,10 @@ def hybrid_forward(self, F, x): net1.initialize(mx.init.Normal()) net1.hybridize() net1(mx.nd.random.normal(shape=(1, 3, 32, 32))) - net1.save_params('./test.params') + net1.save_params('./test_symbol_block_save_load.params') net2 = Net() - net2.load_params('./test.params', ctx=mx.cpu()) + net2.load_params('./test_symbol_block_save_load.params', ctx=mx.cpu()) @with_seed() From 4e9ab9c8424bcbda8c424712a293de269bc663ca Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sat, 19 May 2018 00:02:12 +0000 Subject: [PATCH 10/21] add comment --- python/mxnet/gluon/parameter.py | 4 ++-- python/mxnet/gluon/trainer.py | 3 ++- tests/python/unittest/test_gluon.py | 7 ++++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 6360135b439c..8e6cfed94d22 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -214,8 +214,8 @@ def _get_row_sparse(self, arr_list, ctx, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): - raise TypeError("Cannot get 'row_sparse' Parameter %s with row_id = %s. " - "NDArray type is expected." % (self.name, row_id)) + raise TypeError("Cannot get 'row_sparse' Parameter %s with %s type. " + "NDArray type is expected." % (self.name, type(row_id))) if not self._trainer: raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ "Trainer is created with it."%self.name) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 188ecedcecec..b2f29acf0b66 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -343,7 +343,8 @@ def save_states(self, fname): self._init_params() if self._update_on_kvstore: - assert not self._params_to_init, "Cannot save trainer states when some parameters are not yet initialized in kvstore." 
+ assert not self._params_to_init, "Cannot save trainer states when some " \ + "parameters are not yet initialized in kvstore." self._kvstore.save_optimizer_states(fname, dump_optimizer=True) else: with open(fname, 'wb') as fout: diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 0285fac8b544..fcd04298e886 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -84,15 +84,18 @@ def test_paramdict(): params0.get('w0', shape=(10, 10)) params0.get('w1', shape=(10, 10), stype='row_sparse') all_row_ids = mx.nd.arange(0, 10) + # check param names assert list(params0.keys()) == ['net_w0', 'net_w1'] params0.initialize(ctx=mx.cpu()) trainer0 = mx.gluon.Trainer(params0, 'sgd') prev_w0 = params0.get('w0').data(mx.cpu()) prev_w1 = params0.get('w1').row_sparse_data(all_row_ids) + # save params params0.save('test_paramdict.params') - # Cannot load parameters if they are already initialized on trainer's kvstore + # cannot load parameters if they are already initialized on trainer's kvstore assertRaises(RuntimeError, params0.load, 'test_paramdict.params', mx.cpu()) + # load params params1 = gluon.ParameterDict('net_') params1.get('w0', shape=(10, 10)) params1.get('w1', shape=(10, 10), stype='row_sparse') @@ -104,6 +107,7 @@ def test_paramdict(): cur_w1 = params1.get('w1').row_sparse_data(all_row_ids) mx.test_utils.assert_almost_equal(prev_w0.asnumpy(), cur_w0.asnumpy()) mx.test_utils.assert_almost_equal(prev_w1.asnumpy(), cur_w1.asnumpy()) + # create a new param dict with dense params, and load from the checkpoint # of sparse & dense params params2 = gluon.ParameterDict('net_') @@ -111,6 +115,7 @@ def test_paramdict(): params2.get('w1', shape=(10, 10)) assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) params2.load('test_paramdict.params', mx.cpu(), cast_stype=True) + # compare the values before and after save/load cur_w0 = params2.get('w0').data(mx.cpu()) cur_w1 = params2.get('w1').data(mx.cpu()) From a991e988f3701567ccab9b6e60c801c831f0758a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sat, 19 May 2018 00:10:38 +0000 Subject: [PATCH 11/21] remove print --- python/mxnet/gluon/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index b2f29acf0b66..5422295d45db 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -129,7 +129,6 @@ def _init_params(self): if not param._deferred_init] params_to_init = [param for param in self._params_to_init \ if param._deferred_init] - print('initing ', params) for param in params: param_arrays = param._check_and_get(param._data, list) idx = self._param2idx[param.name] From 468b5999f9787a4088bf48737bb68f99b41d196c Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Sat, 19 May 2018 00:19:37 +0000 Subject: [PATCH 12/21] fix doc --- python/mxnet/gluon/block.py | 1 + python/mxnet/gluon/parameter.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 52b385daa14f..206991ccae3e 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -443,6 +443,7 @@ class HybridBlock(Block): Refer `Hybrid tutorial `_ to see the end-to-end usage. 
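+
+    A rough usage sketch (the layer size and input shape here are
+    illustrative only)::
+
+        net = mx.gluon.nn.HybridSequential()
+        net.add(mx.gluon.nn.Dense(10))
+        net.initialize()
+        net.hybridize()  # the first forward call builds and caches the graph
+        out = net(mx.nd.ones((2, 5)))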
+ """ def __init__(self, prefix=None, params=None): # check if any parameter is row_sparse diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 8e6cfed94d22..3b0ecdd56530 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -458,8 +458,6 @@ def list_row_sparse_data(self, row_id): Parameters ---------- - ctx : Context - Desired context. row_id: NDArray Row ids to retain for the 'row_sparse' parameter. @@ -885,6 +883,9 @@ def load(self, filename, ctx=None, allow_missing=False, present in this ParameterDict. restore_prefix : str, default '' prepend prefix to names of stored parameters before loading. + cast_stype: bool, default False + Whether to cast the storage type of parameters from the file to current + Parameter's storage type if stypes mismatch. """ if restore_prefix: for name in self.keys(): From 0f703440f1ca9dafdecc36727aad0fbb80be38c7 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Mon, 21 May 2018 21:19:11 +0000 Subject: [PATCH 13/21] CR comments --- python/mxnet/gluon/block.py | 4 ++-- python/mxnet/gluon/parameter.py | 2 +- tests/python/unittest/test_gluon.py | 14 ++++++++++++-- tests/python/unittest/test_gluon_trainer.py | 6 ++++++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 206991ccae3e..9e5ecfc4ea31 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -452,8 +452,8 @@ def __init__(self, prefix=None, params=None): stype = param._stype if stype != 'default': raise ValueError("Cannot create a HybridBlock with Parameter '%s' " \ - "because its storage type is %s. Please consider " \ - "using a SparseBlock instead."%(param.name, stype)) + "because its storage type is %s. Please use " \ + "a SparseBlock instead."%(param.name, stype)) super(HybridBlock, self).__init__(prefix=prefix, params=params) self._cached_graph = () self._cached_op = None diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 3b0ecdd56530..c323c0c2f6b0 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -171,7 +171,7 @@ def shape(self, new_shape): def _set_trainer(self, trainer): """ Set the trainer this parameter is associated with. """ - if self._trainer and self._trainer is not trainer: + if self._trainer and trainer and self._trainer is not trainer: raise RuntimeError( "Failed to set the trainer for Parameter '%s' to %s because it was set to %s. " \ "More than one trainers for a single Parameter is not supported." 
%( diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index fcd04298e886..c7e348eef522 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -38,11 +38,21 @@ def test_parameter(): assert p.data(mx.cpu(0)).shape == (10, 10) assert p.var().name == 'weight' assert p.grad(mx.cpu(0)).stype == 'default' - assert p.data().stype == 'default' + assert p.data(mx.cpu(0)).stype == 'default' p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] +@with_seed() +@raises(AssertionError) +def test_invalid_parameter_stype(): + p = gluon.Parameter('weight', shape=(10, 10), stype='invalid') + +@with_seed() +@raises(AssertionError) +def test_invalid_parameter_grad_stype(): + p = gluon.Parameter('weight', shape=(10, 10), grad_stype='invalid') + @with_seed() def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') @@ -83,7 +93,7 @@ def test_paramdict(): params0 = gluon.ParameterDict('net_') params0.get('w0', shape=(10, 10)) params0.get('w1', shape=(10, 10), stype='row_sparse') - all_row_ids = mx.nd.arange(0, 10) + all_row_ids = mx.nd.arange(0, 10, ctx=mx.cpu()) # check param names assert list(params0.keys()) == ['net_w0', 'net_w1'] params0.initialize(ctx=mx.cpu()) diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index fca2cb442563..3af0aa8734c5 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -30,7 +30,13 @@ def test_multi_trainer(): x = gluon.Parameter('x', shape=(10,)) x.initialize() + # test set trainer trainer0 = gluon.Trainer([x], 'sgd') + assert(x._trainer is trainer0) + # test unset trainer + x._set_trainer(None) + assert(x._trainer is None) + x._set_trainer(trainer0) # multiple trainers for a single Parameter is not allowed trainer1 = gluon.Trainer([x], 'sgd') From ff9bf84c76567de58679cea80fb58234e0a569e6 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 00:56:04 +0000 Subject: [PATCH 14/21] CR comments --- python/mxnet/gluon/block.py | 8 ------ python/mxnet/gluon/parameter.py | 19 ++++++++------ python/mxnet/gluon/trainer.py | 28 +++++++++++---------- tests/python/unittest/test_gluon.py | 9 ++++--- tests/python/unittest/test_gluon_trainer.py | 4 +-- 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 9e5ecfc4ea31..483c06bccc26 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -446,14 +446,6 @@ class HybridBlock(Block): """ def __init__(self, prefix=None, params=None): - # check if any parameter is row_sparse - if isinstance(params, ParameterDict): - for param in params.values(): - stype = param._stype - if stype != 'default': - raise ValueError("Cannot create a HybridBlock with Parameter '%s' " \ - "because its storage type is %s. Please use " \ - "a SparseBlock instead."%(param.name, stype)) super(HybridBlock, self).__init__(prefix=prefix, params=params) self._cached_graph = () self._cached_op = None diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index c323c0c2f6b0..adbe0243c8ba 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -171,11 +171,12 @@ def shape(self, new_shape): def _set_trainer(self, trainer): """ Set the trainer this parameter is associated with. 
""" - if self._trainer and trainer and self._trainer is not trainer: + # trainer cannot be replaced for sparse params + if self._stype != 'default' and self._trainer and trainer and self._trainer is not trainer: raise RuntimeError( - "Failed to set the trainer for Parameter '%s' to %s because it was set to %s. " \ - "More than one trainers for a single Parameter is not supported." %( - self.name, str(trainer), str(self._trainer))) + "Failed to set the trainer for Parameter '%s' because it was already set. " \ + "More than one trainers for a %s Parameter is not supported." \ + %(self.name, self._stype)) self._trainer = trainer def _check_and_get(self, arr_list, ctx): @@ -214,8 +215,7 @@ def _get_row_sparse(self, arr_list, ctx, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): - raise TypeError("Cannot get 'row_sparse' Parameter %s with %s type. " - "NDArray type is expected." % (self.name, type(row_id))) + raise TypeError("row_id must have NDArray type, but %s is given"%(type(row_id))) if not self._trainer: raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ "Trainer is created with it."%self.name) @@ -228,8 +228,11 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx, cast_stype=False): """(Re)initializes by loading from data.""" if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: - raise RuntimeError("Cannot (Re)initialize Parameter '%s' when its Trainer " \ - "already initialized the parameter on KVStore."%(self.name)) + if self not in self._trainer._params_to_init: + raise RuntimeError("Cannot (re)initialize Parameter '%s' because it was " \ + "already initialized on the trainer's KVStore, and " \ + "trainer._update_on_kvstore is True. Please create a " \ + "new Trainer with this Parameter."%(self.name)) if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): assert self_dim == 0 or self_dim == data_dim, \ diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 5422295d45db..50a6526ef831 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -68,6 +68,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got %s."%(type(params))) self._params = [] + # parameters to initialize on the kvstore self._params_to_init = [] self._contains_sparse = False self._param2idx = {} @@ -118,28 +119,29 @@ def _init_optimizer(self, optimizer, optimizer_params): for _ in self._contexts] def _init_params(self): - """ Initialize parameters in the KVStore. Parameters whose - intiailization is incomplete are ignored. + """Initialize parameters in the KVStore. + + Parameters with incomplete initialization are ignored. + """ assert self._kv_initialized, "Cannot initialize parameters in KVStore " \ "when KVStore is not initialized." 
params_to_init = [] if self._kvstore: - params = [param for param in self._params_to_init \ - if not param._deferred_init] - params_to_init = [param for param in self._params_to_init \ - if param._deferred_init] - for param in params: - param_arrays = param._check_and_get(param._data, list) - idx = self._param2idx[param.name] - self._kvstore.init(idx, param_arrays[0]) - if param._stype == 'default': - self._kvstore.pull(idx, param_arrays, priority=-idx) + for param in self._params_to_init: + if param._deferred_init: + params_to_init.append(param) + else: + param_arrays = param._check_and_get(param._data, list) + idx = self._param2idx[param.name] + self._kvstore.init(idx, param_arrays[0]) + if param._stype == 'default': + self._kvstore.pull(idx, param_arrays, priority=-idx) self._params_to_init = params_to_init def _init_kvstore(self): - """ Create kvstore """ + """Create kvstore.""" arg_arrays = {} if self._contains_sparse: kvstore, update_on_kvstore = _create_sparse_kvstore(self._kvstore) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index c7e348eef522..6c8ff8677aba 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -351,10 +351,13 @@ def test_sparse_symbol_block(): @raises(ValueError) def test_sparse_hybrid_block(): params = gluon.ParameterDict('net_') - params.get('weight', shape=(5, 5), stype='row_sparse') - params.get('bias', shape=(5,)) - # an exception is expected when creating a HybridBlock w/ sparse param + params.get('weight', shape=(5,5), stype='row_sparse', dtype='float32') + params.get('bias', shape=(5,), dtype='float32') net = gluon.nn.Dense(5, params=params) + net.initialize() + x = mx.nd.ones((2,5)) + # an exception is expected when forwarding a HybridBlock w/ sparse param + y = net(x) @with_seed() def check_layer_forward(layer, dshape): diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 3af0aa8734c5..4923149f8cfa 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -28,7 +28,7 @@ @with_seed() @raises(RuntimeError) def test_multi_trainer(): - x = gluon.Parameter('x', shape=(10,)) + x = gluon.Parameter('x', shape=(10,), stype='row_sparse') x.initialize() # test set trainer trainer0 = gluon.Trainer([x], 'sgd') @@ -37,7 +37,7 @@ def test_multi_trainer(): x._set_trainer(None) assert(x._trainer is None) x._set_trainer(trainer0) - # multiple trainers for a single Parameter is not allowed + # multiple trainers for a sparse Parameter is not allowed trainer1 = gluon.Trainer([x], 'sgd') @with_seed() From bee6774da08f169f4f4a1823bdb390a0e99b1c6a Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 01:01:51 +0000 Subject: [PATCH 15/21] change error --- python/mxnet/gluon/parameter.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index adbe0243c8ba..e7ba794c9cda 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -449,9 +449,9 @@ def row_sparse_data(self, row_id): NDArray on row_id's context """ if self._stype != 'row_sparse': - raise ValueError("Cannot return a copy of Parameter %s via row_sparse_data() " \ - "because its storage type is %s. Please use data() instead." \ - %(self.name, self._stype)) + raise RuntimeError("Cannot return a copy of Parameter %s via row_sparse_data() " \ + "because its storage type is %s. 
Please use data() instead." \ + %(self.name, self._stype)) return self._get_row_sparse(self._data, row_id.context, row_id) def list_row_sparse_data(self, row_id): @@ -469,9 +469,9 @@ def list_row_sparse_data(self, row_id): list of NDArrays """ if self._stype != 'row_sparse': - raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ - "list_row_sparse_data() because its storage type is %s. Please " \ - "use data() instead." % (self.name, self._stype)) + raise RuntimeError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_row_sparse_data() because its storage type is %s. Please " \ + "use data() instead." % (self.name, self._stype)) return self._get_row_sparse(self._data, list, row_id) def data(self, ctx=None): @@ -489,9 +489,9 @@ def data(self, ctx=None): NDArray on ctx """ if self._stype != 'default': - raise ValueError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ - "because its storage type is %s. Please use row_sparse_data() " \ - "instead." % (self.name, str(ctx), self._stype)) + raise RuntimeError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ + "because its storage type is %s. Please use row_sparse_data() " \ + "instead." % (self.name, str(ctx), self._stype)) return self._check_and_get(self._data, ctx) def list_data(self): @@ -504,9 +504,9 @@ def list_data(self): list of NDArrays """ if self._stype != 'default': - raise ValueError("Cannot return copies of Parameter '%s' on all contexts via " \ - "list_data() because its storage type is %s. Please use " \ - "row_sparse_data() instead." % (self.name, self._stype)) + raise RuntimeError("Cannot return copies of Parameter '%s' on all contexts via " \ + "list_data() because its storage type is %s. Please use " \ + "row_sparse_data() instead." % (self.name, self._stype)) return self._check_and_get(self._data, list) def grad(self, ctx=None): From 077b7a5ea15169c00f35d5df75fb2d9e7c651548 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 03:12:18 +0000 Subject: [PATCH 16/21] remove cast stype --- python/mxnet/gluon/parameter.py | 17 ++++------------- tests/python/unittest/test_gluon.py | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index e7ba794c9cda..10d2123295d5 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -225,7 +225,7 @@ def _get_row_sparse(self, arr_list, ctx, row_id): self._trainer._row_sparse_pull(self, results, row_id) return results - def _load_init(self, data, ctx, cast_stype=False): + def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: if self not in self._trainer._params_to_init: @@ -246,13 +246,7 @@ def _load_init(self, data, ctx, cast_stype=False): "dtype incompatible expected %s vs saved %s"%( self.name, str(self.dtype), str(data.dtype)) if self._stype != data.stype: - if not cast_stype: - raise RuntimeError("Failed loading Parameter '%s' from saved params: storage " \ - "type incompatible expected %s vs saved %s. 
Set " \ - "cast_stype=True to cast saved params to the same stype " \ - "as '%s'."%(self.name, self._stype, data.stype, self.name)) - else: - data = data.tostype(self._stype) + data = data.tostype(self._stype) if isinstance(ctx, Context): ctx = [ctx] if self._data is None: @@ -872,7 +866,7 @@ def save(self, filename, strip_prefix=''): ndarray.save(filename, arg_dict) def load(self, filename, ctx=None, allow_missing=False, - ignore_extra=False, restore_prefix='', cast_stype=False): + ignore_extra=False, restore_prefix=''): """Load parameters from file. filename : str @@ -886,9 +880,6 @@ def load(self, filename, ctx=None, allow_missing=False, present in this ParameterDict. restore_prefix : str, default '' prepend prefix to names of stored parameters before loading. - cast_stype: bool, default False - Whether to cast the storage type of parameters from the file to current - Parameter's storage type if stypes mismatch. """ if restore_prefix: for name in self.keys(): @@ -913,4 +904,4 @@ def load(self, filename, ctx=None, allow_missing=False, "Please make sure source and target networks have the same prefix."%( name[lprefix:], filename, _brief_print_list(self._params.keys())) continue - self[name]._load_init(arg_dict[name], ctx, cast_stype=cast_stype) + self[name]._load_init(arg_dict[name], ctx) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 6c8ff8677aba..1d3aaf6036a5 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -124,7 +124,7 @@ def test_paramdict(): params2.get('w0', shape=(10, 10)) params2.get('w1', shape=(10, 10)) assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) - params2.load('test_paramdict.params', mx.cpu(), cast_stype=True) + params2.load('test_paramdict.params', mx.cpu()) # compare the values before and after save/load cur_w0 = params2.get('w0').data(mx.cpu()) From 6038fe9513b4bbdc80c9850fc24a3e3a0a8e5afc Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 23 May 2018 03:15:34 +0000 Subject: [PATCH 17/21] fix test --- tests/python/unittest/test_gluon.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 1d3aaf6036a5..88eddcdc1456 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -79,14 +79,14 @@ def test_parameter_invalid_access(): # cannot call data on row_sparse parameters p0 = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p0.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assertRaises(ValueError, p0.data) - assertRaises(ValueError, p0.list_data) + assertRaises(RuntimeError, p0.data) + assertRaises(RuntimeError, p0.list_data) row_id = mx.nd.arange(0, 10) # cannot call row_sparse_data on dense parameters p1 = gluon.Parameter('weight', shape=(10, 10)) p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - assertRaises(ValueError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) - assertRaises(ValueError, p1.list_row_sparse_data, row_id) + assertRaises(RuntimeError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) + assertRaises(RuntimeError, p1.list_row_sparse_data, row_id) @with_seed() def test_paramdict(): @@ -123,7 +123,6 @@ def test_paramdict(): params2 = gluon.ParameterDict('net_') params2.get('w0', shape=(10, 10)) params2.get('w1', shape=(10, 10)) - assertRaises(RuntimeError, params2.load, 'test_paramdict.params', mx.cpu()) params2.load('test_paramdict.params', 
mx.cpu()) # compare the values before and after save/load @@ -348,7 +347,7 @@ def test_sparse_symbol_block(): net = gluon.SymbolBlock(out, data) @with_seed() -@raises(ValueError) +@raises(RuntimeError) def test_sparse_hybrid_block(): params = gluon.ParameterDict('net_') params.get('weight', shape=(5,5), stype='row_sparse', dtype='float32') From 70de567d1cf496f8b359b7fc9bdd382f9907de55 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 24 May 2018 21:24:31 +0000 Subject: [PATCH 18/21] add reset kvstore to trainer --- python/mxnet/gluon/parameter.py | 11 +++--- python/mxnet/gluon/trainer.py | 38 +++++++++++++-------- tests/python/unittest/test_gluon.py | 2 -- tests/python/unittest/test_gluon_trainer.py | 24 +++++++++++++ 4 files changed, 52 insertions(+), 23 deletions(-) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 10d2123295d5..3265fef2b6c0 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -227,12 +227,6 @@ def _get_row_sparse(self, arr_list, ctx, row_id): def _load_init(self, data, ctx): """(Re)initializes by loading from data.""" - if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: - if self not in self._trainer._params_to_init: - raise RuntimeError("Cannot (re)initialize Parameter '%s' because it was " \ - "already initialized on the trainer's KVStore, and " \ - "trainer._update_on_kvstore is True. Please create a " \ - "new Trainer with this Parameter."%(self.name)) if self.shape: for self_dim, data_dim in zip(self.shape, data.shape): assert self_dim == 0 or self_dim == data_dim, \ @@ -425,6 +419,11 @@ def set_data(self, data): self._deferred_init = self._deferred_init[:3] + (data,) return + # if update_on_kvstore, we need to make sure the copy stored in kvstore is in sync + if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore: + if self not in self._trainer._params_to_init: + self._trainer._reset_kvstore() + for arr in self._check_and_get(self._data, list): arr[:] = data diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 50a6526ef831..ef20109021aa 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -69,7 +69,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "got %s."%(type(params))) self._params = [] # parameters to initialize on the kvstore - self._params_to_init = [] self._contains_sparse = False self._param2idx = {} for i, param in enumerate(params): @@ -79,7 +78,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "got list of %s."%(type(param))) self._param2idx[param.name] = i self._params.append(param) - self._params_to_init.append(param) param._set_trainer(self) if param._stype != 'default': self._contains_sparse = True @@ -88,9 +86,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', self._scale = float(optimizer_params.get('rescale_grad', 1.0)) self._contexts = self._check_contexts() self._init_optimizer(optimizer, optimizer_params) + self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore} self._kv_initialized = False - self._kvstore = kvstore - self._update_on_kvstore = update_on_kvstore + self._kvstore = None + self._update_on_kvstore = None + self._params_to_init = [] + self._reset_kvstore() def _check_contexts(self): contexts = None @@ -140,30 +141,37 @@ def _init_params(self): self._params_to_init = params_to_init + def 
From 70de567d1cf496f8b359b7fc9bdd382f9907de55 Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Thu, 24 May 2018 21:24:31 +0000
Subject: [PATCH 18/21] add reset kvstore to trainer

---
 python/mxnet/gluon/parameter.py             | 11 +++---
 python/mxnet/gluon/trainer.py               | 38 +++++++++++++--------
 tests/python/unittest/test_gluon.py         |  2 --
 tests/python/unittest/test_gluon_trainer.py | 24 +++++++++++++
 4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py
index 10d2123295d5..3265fef2b6c0 100644
--- a/python/mxnet/gluon/parameter.py
+++ b/python/mxnet/gluon/parameter.py
@@ -227,12 +227,6 @@ def _get_row_sparse(self, arr_list, ctx, row_id):
 
     def _load_init(self, data, ctx):
         """(Re)initializes by loading from data."""
-        if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore:
-            if self not in self._trainer._params_to_init:
-                raise RuntimeError("Cannot (re)initialize Parameter '%s' because it was " \
-                                   "already initialized on the trainer's KVStore, and " \
-                                   "trainer._update_on_kvstore is True. Please create a " \
-                                   "new Trainer with this Parameter."%(self.name))
         if self.shape:
             for self_dim, data_dim in zip(self.shape, data.shape):
                 assert self_dim == 0 or self_dim == data_dim, \
@@ -425,6 +419,11 @@ def set_data(self, data):
             self._deferred_init = self._deferred_init[:3] + (data,)
             return
 
+        # if update_on_kvstore, we need to make sure the copy stored in kvstore is in sync
+        if self._trainer and self._trainer._kv_initialized and self._trainer._update_on_kvstore:
+            if self not in self._trainer._params_to_init:
+                self._trainer._reset_kvstore()
+
         for arr in self._check_and_get(self._data, list):
             arr[:] = data
 
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index 50a6526ef831..ef20109021aa 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -69,7 +69,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
                             "got %s."%(type(params)))
         self._params = []
         # parameters to initialize on the kvstore
-        self._params_to_init = []
         self._contains_sparse = False
         self._param2idx = {}
         for i, param in enumerate(params):
@@ -79,7 +78,6 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
                                 "got list of %s."%(type(param)))
             self._param2idx[param.name] = i
             self._params.append(param)
-            self._params_to_init.append(param)
             param._set_trainer(self)
             if param._stype != 'default':
                 self._contains_sparse = True
@@ -88,9 +86,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
         self._scale = float(optimizer_params.get('rescale_grad', 1.0))
         self._contexts = self._check_contexts()
         self._init_optimizer(optimizer, optimizer_params)
+        self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore}
         self._kv_initialized = False
-        self._kvstore = kvstore
-        self._update_on_kvstore = update_on_kvstore
+        self._kvstore = None
+        self._update_on_kvstore = None
+        self._params_to_init = []
+        self._reset_kvstore()
 
     def _check_contexts(self):
         contexts = None
@@ -140,30 +141,37 @@ def _init_params(self):
         self._params_to_init = params_to_init
 
+    def _reset_kvstore(self):
+        """Reset kvstore."""
+        if self._kvstore and 'dist' in self._kvstore.type:
+            raise RuntimeError("Cannot reset distributed KVStore.")
+        self._kv_initialized = False
+        self._kvstore = None
+        self._update_on_kvstore = None
+        self._params_to_init = [param for param in self._params]
+
     def _init_kvstore(self):
         """Create kvstore."""
         arg_arrays = {}
+        config = self._kvstore_params
         if self._contains_sparse:
-            kvstore, update_on_kvstore = _create_sparse_kvstore(self._kvstore)
+            kvstore, update_on_kvstore = _create_sparse_kvstore(config['kvstore'])
             # update_on_kvstore is set to False by the user
-            if self._update_on_kvstore is False:
+            if config['update_on_kvstore'] is False:
                 raise RuntimeError("Cannot set update_on_kvstore to False when sparse "
                                    "gradients and/or sparse weights are present for "
-                                   "Parameter %s." % param.name)
+                                   "one or more Parameters.")
         else:
-            kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts),
+            kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._contexts),
                                                          arg_arrays)
-            update_on_kvstore = self._update_on_kvstore if self._update_on_kvstore is not None \
-                                else update_on_kvstore
+            if config['update_on_kvstore'] is not None:
+                update_on_kvstore = config['update_on_kvstore']
         if kvstore:
             if self._compression_params:
                 kvstore.set_gradient_compression(self._compression_params)
             # kv.pull(row_sparse_grad) is not supported
-            if self._contains_sparse:
-                update_on_kvstore = True
-            else:
-                if 'dist' in kvstore.type:
-                    update_on_kvstore = False
+            if 'dist' in kvstore.type and not self._contains_sparse:
+                update_on_kvstore = False
             if update_on_kvstore:
                 # optimizer preferably needs to be set before init for multiprecision
                 kvstore.set_optimizer(self._optimizer)
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 88eddcdc1456..82bad5baeb74 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -102,8 +102,6 @@ def test_paramdict():
     prev_w1 = params0.get('w1').row_sparse_data(all_row_ids)
     # save params
     params0.save('test_paramdict.params')
-    # cannot load parameters if they are already initialized on trainer's kvstore
-    assertRaises(RuntimeError, params0.load, 'test_paramdict.params', mx.cpu())
 
     # load params
     params1 = gluon.ParameterDict('net_')
diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py
index 4923149f8cfa..c2e11ebb18ee 100644
--- a/tests/python/unittest/test_gluon_trainer.py
+++ b/tests/python/unittest/test_gluon_trainer.py
@@ -174,3 +174,27 @@ def test_trainer_save_load():
     x.lr_mult = 2.0
     # check if parameter dict is correctly associated with optimizer after load_state
     assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2
+
+@with_seed()
+def test_trainer_reset_kv():
+    params = gluon.ParameterDict()
+    x = params.get('x', shape=(10,), lr_mult=1.0)
+    params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
+    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1})
+    params.save('test_trainer_reset_kv.params')
+    with mx.autograd.record():
+        for w in x.list_data():
+            y = w + 1
+            y.backward()
+    trainer.step(1)
+    # load would reset kvstore
+    params.load('test_trainer_reset_kv.params')
+    assert trainer._kvstore is None
+    assert trainer._kv_initialized is False
+    with mx.autograd.record():
+        for w in x.list_data():
+            y = w + 1
+            y.backward()
+    trainer.step(1)
+    # the updated parameter should be based on the loaded checkpoint
+    assert (x.data(mx.cpu()) == -0.2).asnumpy().all()
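Design note on this commit: instead of refusing to re-load parameters that already live on a trainer's kvstore (the RuntimeError dropped from _load_init above), set_data() now asks the trainer to reset its kvstore, and the kvstore is rebuilt lazily on the next step() from the freshly loaded values. Distilled from test_trainer_reset_kv (a sketch for a local, non-distributed kvstore; the checkpoint name is arbitrary):

    import mxnet as mx
    from mxnet import gluon

    params = gluon.ParameterDict()
    x = params.get('x', shape=(10,))
    params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
    trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1})
    params.save('ckpt.params')

    with mx.autograd.record():
        for w in x.list_data():
            (w + 1).backward()
    trainer.step(1)             # the first step creates and fills the kvstore

    params.load('ckpt.params')  # set_data() -> trainer._reset_kvstore()
    assert trainer._kvstore is None
    assert trainer._kv_initialized is False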
From 12a8b59913e9a30044d4b982bdc6ea74f45c74f9 Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Thu, 24 May 2018 23:09:02 +0000
Subject: [PATCH 19/21] lint

---
 tests/nightly/dist_sync_kvstore.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index 3bf5cbffa13a..de49b5822fa4 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -357,13 +357,13 @@ def check_init(kv, cur_keys, cur_shape, device=False):
     parser.add_argument('--no-gpu', dest='gpu', action='store_false')
     parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false')
     opt = parser.parse_args()
-    if opt.type == 'all' or opt.type == 'init': 
+    if opt.type == 'all' or opt.type == 'init':
         test_sync_init(opt.gpu)
     kv = init_kv()
-    if opt.type == 'all' or opt.type == 'default': 
+    if opt.type == 'all' or opt.type == 'default':
         kv = set_optimizer(use_multiprecision=opt.multiprecision)
         test_sync_push_pull(opt.nrepeat)
     # dont run non compressed tests after this as kvstore compression will be set here
-    if opt.type == 'all' or opt.type == 'compressed': 
+    if opt.type == 'all' or opt.type == 'compressed':
         kv, threshold = init_kv_compressed(kv)
         test_sync_2bit_compression(threshold, opt.nrepeat)

From 2a06884657ade6b69bfbceb8ab618739f5bca370 Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Thu, 24 May 2018 23:33:16 +0000
Subject: [PATCH 20/21] add test to CI

---
 ci/docker/runtime_functions.sh     |  1 +
 tests/nightly/dist_sync_kvstore.py | 26 ++++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7abe767c8691..10bca17b5ff4 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -544,6 +544,7 @@ integrationtest_ubuntu_gpu_dist_kvstore() {
     ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py
     ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision
     ../../tools/launch.py -n 7 --launcher local python dist_device_sync_kvstore.py
+    ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon
 }
 
 test_ubuntu_cpu_python2() {
diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index de49b5822fa4..32fb6a5ff077 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -24,7 +24,7 @@
 import mxnet as mx
 import numpy as np
 import numpy.random as rnd
-from mxnet.test_utils import assert_almost_equal
+from mxnet.test_utils import assert_almost_equal, assert_exception
 from test_kvstore import compute_expected_2bit_quantization
 
 def check_diff(A, x, rank=None):
@@ -350,6 +350,23 @@ def check_init(kv, cur_keys, cur_shape, device=False):
         check_init(kv, init_test_keys_device_big, big_shape, device=True)
     print('worker ' + str(kv.rank) + ' is initialized')
 
+def test_gluon_trainer_reset():
+    params = mx.gluon.ParameterDict()
+    x = params.get('x', shape=(4, 2), lr_mult=1.0, stype='row_sparse')
+    params.initialize(ctx=mx.cpu(0), init='zeros')
+    trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
+    params.save('test_gluon_trainer_reset_' + str(my_rank) + '.params')
+    row_id = mx.nd.arange(0, 4)
+    w = x.row_sparse_data(row_id)
+    with mx.autograd.record():
+        y = w + 1
+        y.backward()
+    trainer.step(1)
+    # load would fail to reset kvstore since update_on_kvstore is True
+    assert_exception(params.load, RuntimeError, 'test_gluon_trainer_reset_' + str(my_rank) + '.params')
+    print('worker ' + str(my_rank) + ' passed test_gluon_trainer_reset')
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='test distributed kvstore in dist_sync mode')
     parser.add_argument('--nrepeat', type=int, default=7)
@@ -357,13 +374,18 @@ def check_init(kv, cur_keys, cur_shape, device=False):
     parser.add_argument('--no-gpu', dest='gpu', action='store_false')
     parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false')
     opt = parser.parse_args()
+    if opt.type == 'gluon':
+        kv = init_kv()
+        my_rank = kv.rank
+        test_gluon_trainer_reset()
     if opt.type == 'all' or opt.type == 'init':
         test_sync_init(opt.gpu)
-    kv = init_kv()
     if opt.type == 'all' or opt.type == 'default':
+        kv = init_kv()
         kv = set_optimizer(use_multiprecision=opt.multiprecision)
         test_sync_push_pull(opt.nrepeat)
     # dont run non compressed tests after this as kvstore compression will be set here
     if opt.type == 'all' or opt.type == 'compressed':
+        kv = init_kv()
         kv, threshold = init_kv_compressed(kv)
         test_sync_2bit_compression(threshold, opt.nrepeat)
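The distributed variant exercises the guard in _reset_kvstore(): a kvstore whose type contains 'dist' cannot be torn down safely, so params.load() on a worker raises instead of silently desynchronizing the cluster. Roughly, on one worker of a launched dist_sync job (a sketch only; it assumes the tools/launch.py setup from the CI hunk above, and the checkpoint path is arbitrary):

    import mxnet as mx

    kv = mx.kv.create('dist_sync')
    params = mx.gluon.ParameterDict()
    x = params.get('x', shape=(4, 2), stype='row_sparse')
    params.initialize(ctx=mx.cpu(0), init='zeros')
    trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
    params.save('ckpt_%d.params' % kv.rank)

    # Pulling row_sparse data through the trainer initializes the kvstore
    # with update_on_kvstore=True.
    w = x.row_sparse_data(mx.nd.arange(0, 4))

    try:
        params.load('ckpt_%d.params' % kv.rank)  # set_data() tries to reset
    except RuntimeError:
        print('worker %d: distributed kvstore cannot be reset' % kv.rank)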
From 01b3e4d1b6d8d8157157e08adaa5436f1a03fc1a Mon Sep 17 00:00:00 2001
From: eric-haibin-lin
Date: Sat, 26 May 2018 03:32:27 +0000
Subject: [PATCH 21/21] add more checks

---
 tests/nightly/dist_sync_kvstore.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index 32fb6a5ff077..32ed2dddb6fb 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -358,10 +358,7 @@ def test_gluon_trainer_reset():
     params.save('test_gluon_trainer_reset_' + str(my_rank) + '.params')
     row_id = mx.nd.arange(0, 4)
     w = x.row_sparse_data(row_id)
-    with mx.autograd.record():
-        y = w + 1
-        y.backward()
-    trainer.step(1)
+    assert trainer._kv_initialized and trainer._update_on_kvstore
     # load would fail to reset kvstore since update_on_kvstore is True
     assert_exception(params.load, RuntimeError, 'test_gluon_trainer_reset_' + str(my_rank) + '.params')
     print('worker ' + str(my_rank) + ' passed test_gluon_trainer_reset')
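Taken together, the series leaves Trainer._init_kvstore() resolving update_on_kvstore as follows. The helper below is a paraphrase for reference, not code from the patch; framework_default stands for whatever _create_kvstore() would pick for the given device configuration:

    def resolve_update_on_kvstore(contains_sparse, user_setting, kvstore_type,
                                  framework_default):
        # Sparse weights or sparse gradients force updates onto the kvstore;
        # _create_sparse_kvstore() always returns update_on_kvstore=True.
        if contains_sparse:
            if user_setting is False:
                raise RuntimeError("update_on_kvstore=False is incompatible "
                                   "with sparse weights or sparse gradients")
            return True
        update = framework_default if user_setting is None else user_setting
        # Dense distributed training keeps updates local
        # (kv.pull of row_sparse gradients is not supported).
        if 'dist' in kvstore_type:
            update = False
        return update

For example, resolve_update_on_kvstore(True, None, 'device', True) yields True, while resolve_update_on_kvstore(False, None, 'dist_sync', True) yields False, matching the branches in the final _init_kvstore().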