From 54eeaaf76bad2b6c3a2554950961609d318c0aa7 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Tue, 10 Jul 2018 19:17:04 +0000 Subject: [PATCH 01/10] fix dist kvstore trainer --- ci/docker/runtime_functions.sh | 4 ++- python/mxnet/gluon/trainer.py | 9 ++++-- tests/nightly/dist_sync_kvstore.py | 48 ++++++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 4f0b14647423..dbaff31fd04f 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -733,7 +733,9 @@ integrationtest_ubuntu_gpu_dist_kvstore() { ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision ../../tools/launch.py -n 7 --launcher local python dist_device_sync_kvstore.py ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step } test_ubuntu_cpu_python2() { diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 09ad96314d5a..b4263410a50b 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -93,6 +93,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', self._kv_initialized = False self._kvstore = None self._update_on_kvstore = None + self._distributed = None self._params_to_init = [] self._reset_kvstore() @@ -150,6 +151,7 @@ def _reset_kvstore(self): raise RuntimeError("Cannot reset distributed KVStore.") self._kv_initialized = False self._kvstore = None + self._distributed = None self._update_on_kvstore = None self._params_to_init = [param for param in 
self._params] @@ -190,7 +192,8 @@ def _init_kvstore(self): if kvstore: if self._compression_params: kvstore.set_gradient_compression(self._compression_params) - if 'dist' in kvstore.type: + self._distributed = 'dist' in kvstore.type + if self._distributed: # kv.pull(row_sparse_grad) is not supported for dist kvstore update_on_kvstore = self._contains_sparse_weight or self._contains_sparse_grad if update_on_kvstore: @@ -291,9 +294,9 @@ def _allreduce_grads(self): if param.grad_req != 'null': self._kvstore.push(i, param.list_grad(), priority=-i) - if not self._update_on_kvstore: - self._kvstore.pull(i, param.list_grad(), priority=-i, ignore_sparse=False) + self._kvstore.pull(i, param.list_grad(), priority=-i, + ignore_sparse=self._distributed) def update(self, batch_size, ignore_stale_grad=False): """Makes one step of parameter update. diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 8ba1edab3a0d..71394f394b65 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -375,7 +375,7 @@ def check_invalid_pull(): check_invalid_gluon_trainer_reset() check_invalid_pull() -def test_gluon_trainer(): +def test_gluon_trainer_type(): def check_trainer_kv_type(stype, grad_stype, update_on_kv): params = mx.gluon.ParameterDict() x = params.get('x', shape=(10,1), lr_mult=1.0, stype=stype, grad_stype=grad_stype) @@ -388,8 +388,42 @@ def check_trainer_kv_type(stype, grad_stype, update_on_kv): check_trainer_kv_type('default', 'default', False) check_trainer_kv_type('default', 'row_sparse', True) check_trainer_kv_type('row_sparse', 'row_sparse', True) - print('worker ' + str(my_rank) + ' passed test_gluon_trainer') - + print('worker ' + str(my_rank) + ' passed test_gluon_trainer_type') + +def test_gluon_trainer_step(): + def check_trainer_step(): + ctx = mx.cpu(0) + shape = (10, 1) + x = mx.gluon.Parameter('x', shape=shape, grad_stype='row_sparse') + x.initialize(ctx=ctx, init='ones') + trainer = 
mx.gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'multi_precision': False}, kvstore=kv) + with mx.autograd.record(): + w = x.data(ctx) + y = (my_rank + 1) * w + y.backward() + trainer.step(1) + expected = 1 - (1 + nworker) * nworker / 2 + assert_almost_equal(x.data(ctx).asnumpy(), np.full(shape, expected)) + check_trainer_step() + print('worker ' + str(my_rank) + ' passed test_gluon_trainer_step') + +def test_gluon_trainer_sparse_step(): + def check_trainer_sparse_step(): + ctx = mx.cpu(0) + shape = (2, 10) + all_rows = mx.nd.arange(0, shape[0], ctx=ctx) + x = mx.gluon.Parameter('x', shape=shape, stype='row_sparse', grad_stype='row_sparse') + x.initialize(ctx=ctx, init='ones') + trainer = mx.gluon.Trainer([x], 'sgd', {'learning_rate': 1.0}, kvstore=kv) + with mx.autograd.record(): + w = x.row_sparse_data(all_rows) + y = (my_rank + 1) * w + y.backward() + trainer.step(1) + expected = 1 - (1 + nworker) * nworker / 2 + assert_almost_equal(x.row_sparse_data(all_rows).asnumpy(), np.full(shape, expected)) + check_trainer_sparse_step() + print('worker ' + str(my_rank) + ' passed test_gluon_trainer_sparse_step') if __name__ == "__main__": parser = argparse.ArgumentParser(description='test distributed kvstore in dist_sync mode') @@ -398,8 +432,12 @@ def check_trainer_kv_type(stype, grad_stype, update_on_kv): parser.add_argument('--no-gpu', dest='gpu', action='store_false') parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false') opt = parser.parse_args() - if opt.type == 'gluon': - test_gluon_trainer() + if opt.type == 'gluon_type': + test_gluon_trainer_type() + if opt.type == 'gluon_step': + test_gluon_trainer_step() + if opt.type == 'gluon_sparse_step': + test_gluon_trainer_sparse_step() if opt.type == 'invalid': test_invalid_operations() if opt.type == 'all' or opt.type == 'init': From 15be79c0c1c71be215dbbb6af7246a4b578f2c4e Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Tue, 10 Jul 2018 22:56:15 +0000 Subject: [PATCH 02/10] fix 
test setup --- src/kvstore/kvstore_dist_server.h | 2 -- tests/nightly/dist_sync_kvstore.py | 11 +++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index a150ff42f57e..451fb78a6229 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -329,8 +329,6 @@ class KVStoreDistServer { if (has_multi_precision_copy(type)) CopyFromTo(recved, updateBuf->temp_array); const NDArray& to_merge = has_multi_precision_copy(type) ? updateBuf->temp_array : recved; // accumulate row_sparse gradients - // TODO(haibin) override + operator for row_sparse NDArray - // instead of calling BinaryComputeRspRsp directly using namespace mshadow; Engine::Get()->PushAsync( [to_merge, updateBuf, out](RunContext ctx, Engine::CallbackOnComplete on_complete) { diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 71394f394b65..06419aff7dd6 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -394,7 +394,7 @@ def test_gluon_trainer_step(): def check_trainer_step(): ctx = mx.cpu(0) shape = (10, 1) - x = mx.gluon.Parameter('x', shape=shape, grad_stype='row_sparse') + x = mx.gluon.Parameter('x', shape=shape) x.initialize(ctx=ctx, init='ones') trainer = mx.gluon.Trainer([x], 'sgd', {'learning_rate': 1.0, 'multi_precision': False}, kvstore=kv) with mx.autograd.record(): @@ -434,20 +434,23 @@ def check_trainer_sparse_step(): opt = parser.parse_args() if opt.type == 'gluon_type': test_gluon_trainer_type() + exit() if opt.type == 'gluon_step': test_gluon_trainer_step() + exit() if opt.type == 'gluon_sparse_step': test_gluon_trainer_sparse_step() + exit() if opt.type == 'invalid': test_invalid_operations() + exit() if opt.type == 'all' or opt.type == 'init': test_sync_init(opt.gpu) + kv = init_kv() if opt.type == 'all' or opt.type == 'default': - kv = init_kv() kv = 
set_optimizer(use_multiprecision=opt.multiprecision) test_sync_push_pull(opt.nrepeat) - # dont run non compressed tests after this as kvstore compression will be set here + # don't run non compressed tests after this as kvstore compression will be set here if opt.type == 'all' or opt.type == 'compressed': - kv = init_kv() kv, threshold = init_kv_compressed(kv) test_sync_2bit_compression(threshold, opt.nrepeat) From 7a22ea3f8fec3b153629c0ca916e66a5ff52d906 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 11 Jul 2018 01:20:12 +0000 Subject: [PATCH 03/10] enable tests on CI --- Jenkinsfile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4ee1d982fed7..4efc39a46e57 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -979,20 +979,19 @@ try { } } } + }, + 'dist-kvstore tests GPU': { + node('mxnetlinux-gpu') { + ws('workspace/it-dist-kvstore') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + unpack_lib('gpu') + docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true) + publish_test_coverage() + } + } + } } - // Disable until fixed https://github.com/apache/incubator-mxnet/issues/11441 - // 'dist-kvstore tests GPU': { - // node('mxnetlinux-gpu') { - // ws('workspace/it-dist-kvstore') { - // timeout(time: max_time, unit: 'MINUTES') { - // init_git() - // unpack_lib('gpu') - // docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_dist_kvstore', true) - // publish_test_coverage() - // } - // } - // } - //} } stage('Deploy') { From adc933d51532240a1955f38b0af4e4624b09232e Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 11 Jul 2018 18:58:58 +0000 Subject: [PATCH 04/10] update move some test to cpu --- Jenkinsfile | 12 ++++++++++++ ci/docker/runtime_functions.sh | 15 +++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4efc39a46e57..2b051fe2e721 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -991,6 
+991,18 @@ try { } } } + }, + 'dist-kvstore tests CPU': { + node('mxnetlinux-cpu') { + ws('workspace/it-dist-kvstore') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + unpack_lib('cpu') + docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', true) + publish_test_coverage() + } + } + } } } diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index dbaff31fd04f..31b1ee9ff0bf 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -724,20 +724,27 @@ integrationtest_ubuntu_gpu_cpp_package() { cpp-package/tests/ci_test.sh } -integrationtest_ubuntu_gpu_dist_kvstore() { +integrationtest_ubuntu_cpu_dist_kvstore() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 cd tests/nightly/ - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision - ../../tools/launch.py -n 7 --launcher local python dist_device_sync_kvstore.py ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step } +integrationtest_ubuntu_gpu_dist_kvstore() { + set -ex + export PYTHONPATH=./python/ + export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + cd tests/nightly/ + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision + ../../tools/launch.py -n 7 --launcher local python dist_device_sync_kvstore.py +} + test_ubuntu_cpu_python2() { set -ex pushd . 
From 9dda024c2451907e72aec9db3474eafe94e6df16 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 11 Jul 2018 20:27:26 +0000 Subject: [PATCH 05/10] don't use nvidia-docker --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2b051fe2e721..5e184b15f8d4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -998,7 +998,7 @@ try { timeout(time: max_time, unit: 'MINUTES') { init_git() unpack_lib('cpu') - docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', true) + docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_dist_kvstore', false) publish_test_coverage() } } From 64cd90029698129ea6151ccdc07f056416900412 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Wed, 11 Jul 2018 20:30:34 +0000 Subject: [PATCH 06/10] rename option --- ci/docker/runtime_functions.sh | 8 ++++---- tests/nightly/dist_sync_kvstore.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 31b1ee9ff0bf..8740dbf4e5ec 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -729,10 +729,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() { export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 cd tests/nightly/ - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid_cpu + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type_cpu + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu + ../../tools/launch.py -n 7 --launcher
local python dist_sync_kvstore.py --type=gluon_sparse_step_cpu } integrationtest_ubuntu_gpu_dist_kvstore() { diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 06419aff7dd6..1448ed728e52 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -432,16 +432,16 @@ def check_trainer_sparse_step(): parser.add_argument('--no-gpu', dest='gpu', action='store_false') parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false') opt = parser.parse_args() - if opt.type == 'gluon_type': + if opt.type == 'gluon_type_cpu': test_gluon_trainer_type() exit() - if opt.type == 'gluon_step': + if opt.type == 'gluon_step_cpu': test_gluon_trainer_step() exit() - if opt.type == 'gluon_sparse_step': + if opt.type == 'gluon_sparse_step_cpu': test_gluon_trainer_sparse_step() exit() - if opt.type == 'invalid': + if opt.type == 'invalid_cpu': test_invalid_operations() exit() if opt.type == 'all' or opt.type == 'init': From 103e6e124898f7c3679df19a094840d67b55cc29 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 12 Jul 2018 06:00:00 +0000 Subject: [PATCH 07/10] trigger test --- ci/docker/runtime_functions.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 8740dbf4e5ec..d8e0b260b8be 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -729,10 +729,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() { export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 cd tests/nightly/ - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid_cpu - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type_cpu ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step_cpu + 
../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid_cpu + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type_cpu } integrationtest_ubuntu_gpu_dist_kvstore() { From ece12eab2014082b715d12b33c574242fedef86e Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 12 Jul 2018 17:43:04 +0000 Subject: [PATCH 08/10] reduce workload to avoid time out --- ci/docker/runtime_functions.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 19a7482ae45e..6632d49a9e98 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -722,11 +722,12 @@ integrationtest_ubuntu_cpu_dist_kvstore() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export OMP_NUM_THREADS=3 cd tests/nightly/ - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step_cpu - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid_cpu - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type_cpu + ../../tools/launch.py -n 3 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu + ../../tools/launch.py -n 3 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step_cpu + ../../tools/launch.py -n 3 --launcher local python dist_sync_kvstore.py --type=invalid_cpu + ../../tools/launch.py -n 3 --launcher local python dist_sync_kvstore.py --type=gluon_type_cpu } integrationtest_ubuntu_gpu_scala() { From ce8a3e964f31f6b6652aaa98921e6135e6732a93 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Thu, 12 Jul 2018 21:40:23 +0000 Subject: [PATCH 09/10] disable operator tuning to reduce launch overhead --- ci/docker/runtime_functions.sh | 1 + src/operator/operator_tune-inl.h | 10
++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 19a7482ae45e..681006e2e67b 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -722,6 +722,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() { set -ex export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 + export MXNET_USE_OPERATOR_TUNING=0 cd tests/nightly/ ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_step_cpu ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step_cpu diff --git a/src/operator/operator_tune-inl.h b/src/operator/operator_tune-inl.h index 2dfc103b1008..127691bccccd 100644 --- a/src/operator/operator_tune-inl.h +++ b/src/operator/operator_tune-inl.h @@ -165,8 +165,14 @@ class OperatorTune : public OperatorTuneByType { // Not especially concerned with a race condition, since this hsould // run when only one thread is active (static init), just don't cache this variable OperatorTuneBase::calculated_.store(true); - OperatorTuneBase::omp_overhead_ns_ = GetOMPLoopOverhead(); std::string config = dmlc::GetEnv("MXNET_USE_OPERATOR_TUNING", std::string()); + StringUtil::trim(&config); + // disabled + if (!config.empty() && ::isdigit(config[0]) && std::atoi(config.c_str()) == 0) { + OperatorTuneBase::omp_overhead_ns_ = INT_MAX; + } else { + OperatorTuneBase::omp_overhead_ns_ = GetOMPLoopOverhead(); + } ParseEnablerConfig(config); } @@ -435,7 +441,7 @@ class OperatorTune : public OperatorTuneByType { } /*! 
- * \brief Parse MXNET_ENABLE_OPERATOR_TUNING environment variable + * \brief Parse MXNET_USE_OPERATOR_TUNING environment variable * \param config String representation of MXNET_ENABLE_OPERATOR_TUNING environment variable * Values: * 0=disable all From 4649dcc801e2129ef4f211b170c642c061b6e9b9 Mon Sep 17 00:00:00 2001 From: eric-haibin-lin Date: Fri, 13 Jul 2018 17:36:00 +0000 Subject: [PATCH 10/10] update test types --- ci/docker/runtime_functions.sh | 7 +++++-- tests/nightly/dist_sync_kvstore.py | 24 +++++++++++------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 681006e2e67b..0b315019d738 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -728,6 +728,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() { ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_sparse_step_cpu ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=invalid_cpu ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=gluon_type_cpu + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=compressed_cpu + ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision } integrationtest_ubuntu_gpu_scala() { @@ -741,9 +745,8 @@ integrationtest_ubuntu_gpu_dist_kvstore() { export PYTHONPATH=./python/ export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 cd tests/nightly/ - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py - ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision ../../tools/launch.py -n 7 --launcher local python dist_device_sync_kvstore.py + ../../tools/launch.py -n 7 --launcher local python 
dist_sync_kvstore.py --type=init_gpu } test_ubuntu_cpu_python2() { diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 1448ed728e52..861b85913ac8 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -428,29 +428,27 @@ def check_trainer_sparse_step(): if __name__ == "__main__": parser = argparse.ArgumentParser(description='test distributed kvstore in dist_sync mode') parser.add_argument('--nrepeat', type=int, default=7) - parser.add_argument('--type', type=str, default='all') + parser.add_argument('--type', type=str, default='default_cpu') parser.add_argument('--no-gpu', dest='gpu', action='store_false') parser.add_argument('--no-multiprecision', dest='multiprecision', action='store_false') opt = parser.parse_args() if opt.type == 'gluon_type_cpu': test_gluon_trainer_type() - exit() - if opt.type == 'gluon_step_cpu': + elif opt.type == 'gluon_step_cpu': test_gluon_trainer_step() - exit() - if opt.type == 'gluon_sparse_step_cpu': + elif opt.type == 'gluon_sparse_step_cpu': test_gluon_trainer_sparse_step() - exit() - if opt.type == 'invalid_cpu': + elif opt.type == 'invalid_cpu': test_invalid_operations() - exit() - if opt.type == 'all' or opt.type == 'init': + elif opt.type == 'init_gpu': test_sync_init(opt.gpu) - kv = init_kv() - if opt.type == 'all' or opt.type == 'default': + elif opt.type == 'default_cpu': + kv = init_kv() kv = set_optimizer(use_multiprecision=opt.multiprecision) test_sync_push_pull(opt.nrepeat) - # don't run non compressed tests after this as kvstore compression will be set here - if opt.type == 'all' or opt.type == 'compressed': + elif opt.type == 'compressed_cpu': kv, threshold = init_kv_compressed(kv) + kv = set_optimizer(use_multiprecision=opt.multiprecision) test_sync_2bit_compression(threshold, opt.nrepeat) + else: + raise RuntimeError("Unknown test type")