From 124de683526b7910753bc87e8813f3b80bf32e42 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 5 Mar 2021 11:15:46 -0800 Subject: [PATCH 1/6] only set args.local_rank if the attr exists already --- deepspeed/runtime/engine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 716f73d3b469..1e84504a9e84 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -462,8 +462,9 @@ def _configure_with_arguments(self, args, mpu): # environment variable is set. We must align args.local_rank to this value for # backwards compatability with scripts relying on [args|self].local_rank containing # the correct local rank info. - args.local_rank = int(os.environ['LOCAL_RANK']) - self.local_rank = args.local_rank + self.local_rank = int(os.environ['LOCAL_RANK']) + if hasattr(args, 'local_rank'): + args.local_rank = self.local_rank config_file = args.deepspeed_config if hasattr(args, 'deepspeed_config') else None From 6c4b94981fbfb35c7868816b4d531a7a7b0c89f7 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 5 Mar 2021 11:24:30 -0800 Subject: [PATCH 2/6] add unit test --- tests/unit/test_config.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 4cabefe71a33..fb8e237a2a76 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -226,3 +226,31 @@ def _helper(): model.step() _helper() + +def test_no_args(tmpdir): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + @distributed_test(world_size=1) + def _helper(): + model = SimpleModel(hidden_dim=10) + model, _, _, _ = deepspeed.initialize(args=None, model=model, config_params=config_dict) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + + _helper() + From 775ffac203f05381a9d27dcb63b121703c6f2d14 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 5 Mar 2021 11:25:43 -0800 Subject: [PATCH 3/6] formatting --- tests/unit/test_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index fb8e237a2a76..b1d1a842ee6d 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -227,6 +227,7 @@ def _helper(): _helper() + def test_no_args(tmpdir): config_dict = { "train_batch_size": 1, @@ -240,7 +241,7 @@ def test_no_args(tmpdir): "enabled": True } } - + @distributed_test(world_size=1) def _helper(): model = SimpleModel(hidden_dim=10) @@ -251,6 +252,5 @@ def _helper(): device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) - - _helper() + _helper() From badced0c27ccd15675e052e653f33a9cf6bf2097 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 5 Mar 2021 11:36:39 -0800 Subject: [PATCH 4/6] make args optional --- deepspeed/__init__.py | 12 ++++++--- tests/unit/test_config.py | 54 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 31e901d8ec59..453c025eb3e4 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -47,8 +47,8 @@ def _parse_version(version_str): sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler -def initialize(args, - model, +def initialize(args=None, + model=None, optimizer=None, model_parameters=None, training_data=None, @@ -60,8 +60,7 @@ def initialize(args, """Initialize the DeepSpeed Engine. Arguments: - args: a dictionary containing local_rank and deepspeed_config - file location + args: an object containing local_rank and deepspeed_config fields. This is optional if `config_params` is passed. model: Required: nn.module class before apply any wrappers @@ -86,6 +85,9 @@ def initialize(args, mini-batch of Tensor(s). Used when using batched loading from a map-style dataset. + config_params: Optional: Instead of requiring args.deepspeed_config you can pass your deepspeed config + as a dictionary instead. + Returns: A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler`` @@ -106,6 +108,8 @@ def initialize(args, __git_branch__), ranks=[0]) + assert model is not None, "deepspeed.initialize requires a model" + if not isinstance(model, PipelineModule): engine = DeepSpeedEngine(args=args, model=model, diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b1d1a842ee6d..dea5e00650f6 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -228,7 +228,7 @@ def _helper(): _helper() -def test_no_args(tmpdir): +def test_none_args(tmpdir): config_dict = { "train_batch_size": 1, "optimizer": { @@ -254,3 +254,55 @@ def _helper(): loss = model(batch[0], batch[1]) _helper() + + +def test_no_args(tmpdir): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + @distributed_test(world_size=1) + def _helper(): + model = SimpleModel(hidden_dim=10) + model, _, _, _ = deepspeed.initialize(model=model, config_params=config_dict) + data_loader = random_dataloader(model=model, + total_samples=5, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + + _helper() + + +def test_no_model(tmpdir): + config_dict = { + "train_batch_size": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + } + } + + @distributed_test(world_size=1) + def _helper(): + model = SimpleModel(hidden_dim=10) + with pytest.raises(AssertionError): + model, _, _, _ = deepspeed.initialize(model=None, config_params=config_dict) + + with pytest.raises(AssertionError): + model, _, _, _ = deepspeed.initialize(model, config_params=config_dict) From 876eda8718c062a626381aeeaddacfd60a777cfa Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Fri, 5 Mar 2021 11:37:58 -0800 Subject: [PATCH 5/6] fix missing hidden --- tests/unit/test_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index dea5e00650f6..7de3a40fabeb 100755 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -248,7 +248,7 @@ def _helper(): model, _, _, _ = deepspeed.initialize(args=None, model=model, config_params=config_dict) data_loader = random_dataloader(model=model, total_samples=5, - hidden_dim=hidden_dim, + hidden_dim=10, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -276,7 +276,7 @@ def _helper(): model, _, _, _ = deepspeed.initialize(model=model, config_params=config_dict) data_loader = random_dataloader(model=model, total_samples=5, - hidden_dim=hidden_dim, + hidden_dim=10, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) From 17ce3b683350f2f0f5fd3887448caa934d78888c Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 16 Mar 2021 10:12:35 -0700 Subject: [PATCH 6/6] restructure sanity check for LOCAL_RANK --- deepspeed/runtime/engine.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 1df80c038838..e11e2c1d7afc 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -495,7 +495,7 @@ def _configure_with_arguments(self, args, mpu): # After the distributed backend is initialized we are guaranteed the LOCAL_RANK # environment variable is set. We must align args.local_rank to this value for # backwards compatability with scripts relying on [args|self].local_rank containing - # the correct local rank info. + # the correct local rank info. _do_args_sanity_check will ensure this is the case. self.local_rank = int(os.environ['LOCAL_RANK']) if hasattr(args, 'local_rank'): args.local_rank = self.local_rank @@ -514,15 +514,14 @@ def _do_args_sanity_check(self, args): assert args.deepspeed_config is None, "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" args.deepspeed_config = args.deepscale_config - local_rank_err = "DeepSpeed requires a command line parameter of --local_rank [int] and/or setting the LOCAL_RANK environment variable." - if hasattr(args, 'local_rank'): - assert type(args.local_rank) == int, local_rank_err - if "LOCAL_RANK" in os.environ and args.local_rank >= 0: - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + assert "LOCAL_RANK" in os.environ, "DeepSpeed requires the LOCAL_RANK environment variable, it is set by the deepspeed launcher, " \ + "deepspeed.init_distributed, or the torch.distributed launcher. If using a different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed." + if hasattr(args, 'local_rank') and args.local_rank != None: + assert isinstance(args.local_rank, int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}" + if args.local_rank >= 0: + env_local_rank = int(os.environ.get("LOCAL_RANK")) assert env_local_rank == args.local_rank, \ f"Mismatch in local rank setting, args.local_rank={args.local_rank} but env['LOCAL_RANK']={env_local_rank}." - else: - assert "LOCAL_RANK" in os.environ, local_rank_err if self.config_params is None: assert hasattr(args, 'deepspeed_config') and args.deepspeed_config is not None, \