From ea13962299e4bb6a1ccb15c95bc98396727dd45a Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 6 Jul 2022 17:09:10 -0600 Subject: [PATCH 01/37] add virtual pipeline size to config Signed-off-by: ericharper --- .../conf/megatron_gpt_config.yaml | 3 +- .../language_modeling/megatron_gpt_model.py | 42 ++++++++++++++----- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 837d397c0815..f71b55b031a1 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -47,7 +47,7 @@ model: global_batch_size: 8 # will use more micro batches to reach global batch size tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism - resume_from_checkpoint: null # manually set the checkpoint file to load from + virtual_pipeline_model_parallel_size: null # interleaved pipeline # model architecture encoder_seq_length: 512 @@ -89,6 +89,7 @@ model: # miscellaneous seed: 1234 + resume_from_checkpoint: null # manually set the checkpoint file to load from use_cpu_initialization: False # Init weights on the CPU (slow for large models) onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f43824526fc0..56ea6e4b2743 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -55,6 +55,9 @@ from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_without_interleaving import ( forward_backward_pipelining_without_interleaving, ) + from apex.transformer.pipeline_parallel.schedules.fwd_bwd_pipelining_with_interleaving import ( + _forward_backward_pipelining_with_interleaving, + ) from apex.transformer.pipeline_parallel.schedules.fwd_bwd_no_pipelining import forward_backward_no_pipelining from apex.transformer.pipeline_parallel.utils import get_num_microbatches @@ -81,7 +84,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # TODO: Not sure how to use lists of modules with PTL. # This means we can only use pipeline parallelism without the interleaved schedule. - self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0] + # self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0] + self.model = build_model( + model_provider_func=self.model_provider_func, + wrap_with_ddp=False, + virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), + ) # We don't need to call it explicitly? 
Since it is a pytorch lightning hook function # self.setup_optimizer_param_groups() @@ -193,16 +201,28 @@ def training_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=False, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, - sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), - ) + if self.cfg.get('virtual_pipeline_model_parallel_size', 1) > 1: + losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( + forward_step_func=self.get_forward_output_and_loss_func(), + batch=batch_for_pipeline, + model=self.model, + forward_only=False, + tensor_shape=tensor_shape, + dtype=self.autocast_dtype, + grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), + ) + else: + losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( + forward_step_func=self.get_forward_output_and_loss_func(), + batch=batch_for_pipeline, + model=self.model, + forward_only=False, + tensor_shape=tensor_shape, + dtype=self.autocast_dtype, + grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), + ) else: # no pipeline parallelism so we reduce grads asynchronously if not using sequence parallelism if self.megatron_amp_o2 and not self.cfg.get('sequence_parallel', False): From 2de384e076df057f9d622c210be57f04bd931f47 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 3 Aug 2022 17:28:14 -0600 Subject: [PATCH 02/37] convert model to list of modules Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 10 +++++++--- nemo/collections/nlp/modules/common/megatron/utils.py | 3 +-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 56ea6e4b2743..4da598c3c3b0 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -99,10 +99,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.megatron_amp_o2: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type - self.model.cuda(torch.cuda.current_device()) + for module in self.model: + module.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision - self.model = Float16Module(module=self.model, precision=cfg.precision) + converted_model = [] + for module in self.model: + converted_model.append(Float16Module(module=module, precision=cfg.precision)) + self.model = converted_model if self.trainer.precision == 32: self.autocast_dtype = torch.float @@ -172,7 +176,7 @@ def model_provider_func(self, pre_process, post_process): def setup_optimizer_param_groups(self): """ModelPT override. 
Optimizer will get self._optimizer_param_groups""" - self._optimizer_param_groups = get_params_for_weight_decay_optimization([self.model]) + self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) def forward(self, tokens, text_position_ids, attention_mask, labels): output_tensor = self.model(tokens, text_position_ids, attention_mask, labels=labels) diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index e86c6ab21bd1..13090c60fa5e 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -313,10 +313,9 @@ def get_params_for_weight_decay_optimization( Layernorms and biases will have no weight decay but the rest will. """ - modules = listify_model(model) weight_decay_params = {'params': []} no_weight_decay_params = {'params': [], 'weight_decay': 0.0} - for module in modules: + for module in model: for module_ in module.modules(): if isinstance(module_, (FusedLayerNorm, FastLayerNorm)): no_weight_decay_params['params'].extend( From 51b56395da15ca55eb725e435d5386051b850811 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 3 Aug 2022 17:41:59 -0600 Subject: [PATCH 03/37] convert model to list of modules Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4da598c3c3b0..a5f5fe2497d4 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -205,7 +205,7 @@ def training_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - if self.cfg.get('virtual_pipeline_model_parallel_size', 1) > 1: + if self.cfg.get('virtual_pipeline_model_parallel_size', None) > 1: losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, @@ -606,7 +606,8 @@ def setup(self, stage=None): # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: - self.model.sync_initial_word_embeddings() + for module in self.model: + module.sync_initial_word_embeddings() def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): From 55ee50b433e3749d1417360e3abada70ed705494 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 3 Aug 2022 17:44:18 -0600 Subject: [PATCH 04/37] convert model to list of modules Signed-off-by: ericharper --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index f71b55b031a1..29a06eb3f5e7 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -47,7 +47,7 @@ model: global_batch_size: 8 # will use more micro batches to reach global batch size tensor_model_parallel_size: 1 # intra-layer model parallelism 
pipeline_model_parallel_size: 1 # inter-layer model parallelism - virtual_pipeline_model_parallel_size: null # interleaved pipeline + virtual_pipeline_model_parallel_size: 1 # interleaved pipeline # model architecture encoder_seq_length: 512 diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a5f5fe2497d4..e390940efe22 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -205,7 +205,7 @@ def training_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - if self.cfg.get('virtual_pipeline_model_parallel_size', None) > 1: + if self.cfg.get('virtual_pipeline_model_parallel_size', 1) > 1: losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, From 31b001f47857b12d8f040bcf0cfc7be865d76853 Mon Sep 17 00:00:00 2001 From: ericharper Date: Thu, 4 Aug 2022 11:50:58 -0600 Subject: [PATCH 05/37] update for list of modules Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e390940efe22..89d192680f59 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -337,14 +337,15 @@ def allreduce_first_last_embeddings(self): if parallel_state.get_pipeline_model_parallel_world_size() > 1 and ( parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage() ): - if self.model.share_token_embeddings: - word_embeddings_weight = self.model.word_embeddings_weight() - if self.megatron_amp_o2: - # O2 recipe stores a "main" copy of weights and grads - grad = word_embeddings_weight.main_grad - else: - grad = word_embeddings_weight.grad - torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) + for module in self.model: + if module.share_token_embeddings: + word_embeddings_weight = module.word_embeddings_weight() + if self.megatron_amp_o2: + # O2 recipe stores a "main" copy of weights and grads + grad = word_embeddings_weight.main_grad + else: + grad = word_embeddings_weight.grad + torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(batch, model): From 6c9c6a9874eaf4d682aacdee244e449a349591c9 Mon Sep 17 00:00:00 2001 From: ericharper Date: Thu, 4 Aug 2022 14:11:26 -0600 Subject: [PATCH 06/37] add virtual to init Signed-off-by: ericharper --- .../language_modeling/megatron_base_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 29 ++++++--- .../modules/common/megatron/megatron_init.py | 9 ++- nemo/utils/app_state.py | 65 ++++++++++++++----- 4 files changed, 78 insertions(+), 26 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index ecc2129f1723..3dde3b57f283 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ 
b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -82,6 +82,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1), + virtual_pipeline_model_parallel_size=cfg.get('virtual_pipeline_model_parallel_size', 1), pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 89d192680f59..66cba3bcac0d 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -428,15 +428,26 @@ def validation_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=True, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), - ) + if self.cfg.get('virtual_pipeline_model_parallel_size', 1) > 1: + losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( + forward_step_func=self.get_forward_output_and_loss_func(), + batch=batch_for_pipeline, + model=self.model, + forward_only=True, + tensor_shape=tensor_shape, + dtype=self.autocast_dtype, + sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), + ) + else: + losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( + forward_step_func=self.get_forward_output_and_loss_func(), + batch=batch_for_pipeline, + model=self.model, + forward_only=True, + tensor_shape=tensor_shape, + dtype=self.autocast_dtype, + sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), + ) else: losses_reduced_per_micro_batch = forward_backward_no_pipelining( forward_step_func=self.get_forward_output_and_loss_func(), diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 206887f64a2e..70a5f17ce73a 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -25,6 +25,7 @@ from apex.transformer.parallel_state import ( get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank, + set_virtual_pipeline_model_parallel_rank, set_pipeline_model_parallel_split_rank, set_pipeline_model_parallel_world_size, set_tensor_model_parallel_rank, @@ -45,6 +46,7 @@ def initialize_model_parallel_for_nemo( local_rank, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=1, pipeline_model_parallel_split_rank=None, micro_batch_size=None, global_batch_size=None, @@ -59,17 +61,20 @@ def initialize_model_parallel_for_nemo( app_state.local_rank = local_rank app_state.tensor_model_parallel_size = tensor_model_parallel_size app_state.pipeline_model_parallel_size = pipeline_model_parallel_size + app_state.virtual_pipeline_model_parallel_size = 
virtual_pipeline_model_parallel_size ( app_state.tensor_model_parallel_rank, app_state.pipeline_model_parallel_rank, app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=world_size, rank=global_rank, tensor_model_parallel_size_=tensor_model_parallel_size, pipeline_model_parallel_size_=pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size_=virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank, ) @@ -78,6 +83,7 @@ def initialize_model_parallel_for_nemo( set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank) set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank) + set_virtual_pipeline_model_parallel_rank(app_state.virtual_pipeline_model_parallel_rank) set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size) set_pipeline_model_parallel_split_rank(app_state.pipeline_model_parallel_split_rank) @@ -184,12 +190,12 @@ def fake_initialize_model_parallel( num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size - # TODO: virtual pipeline model parallelism is not yet implemented in NeMo. This is needed for interleaved pipelining. # if virtual_pipeline_model_parallel_size_ is not None: # global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK # global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE # _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0 # _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_ + virtual_pipeline_model_parallel_rank = 0 # Build the data-parallel groups. all_data_parallel_group_ranks = [] @@ -272,4 +278,5 @@ def fake_initialize_model_parallel( model_parallel_size, data_parallel_size, pipeline_model_parallel_split_rank_, + virtual_pipeline_model_parallel_rank, ) diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index f886008cc3d4..253c6bfbcb27 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -47,6 +47,7 @@ def __init__(self): self._tensor_model_parallel_size = None self._tensor_model_parallel_group = None self._pipeline_model_parallel_size = None + self._virtual_pipeline_model_parallel_size = None self._pipeline_model_parallel_group = None self._pipeline_model_parallel_split_rank = None self._is_megatron_initialized = False @@ -153,6 +154,22 @@ def pipeline_model_parallel_size(self, size): """ self._pipeline_model_parallel_size = size + @property + def virtual_pipeline_model_parallel_size(self): + """ Property returns the number of GPUs in each model parallel group. + Returns: + Number of GPUs in each model parallel group. + """ + return self._pipeline_model_parallel_size + + @virtual_pipeline_model_parallel_size.setter + def virtual_pipeline_model_parallel_size(self, size): + """ Property sets the size of the virtual pipeline parallel model. + Args: + size (int): Number of modules in each pipeline parallel model. + """ + self._virtual_pipeline_model_parallel_size = size + @property def data_parallel_size(self): """ Property returns the number of GPUs in each data parallel group. @@ -203,52 +220,68 @@ def global_rank(self, rank): @property def tensor_model_parallel_rank(self): - """ Property returns the model parallel rank. + """ Property returns the tensor model parallel rank. Returns: - Model parallel rank. + Tensor model parallel rank. 
""" return self._tensor_model_parallel_rank @tensor_model_parallel_rank.setter def tensor_model_parallel_rank(self, rank): - """ Property sets the model parallel rank. + """ Property sets the tensor model parallel rank. Args: - rank (int): Model parallel rank. + rank (int): Tensor model parallel rank. """ self._tensor_model_parallel_rank = rank @property def tensor_model_parallel_group(self): - """ Property returns the model parallel group. + """ Property returns the tensor model parallel group. Returns: - Model parallel group. + Tensor model parallel group. """ return self._tensor_model_parallel_group @tensor_model_parallel_group.setter def tensor_model_parallel_group(self, group): - """ Property sets the model parallel group. + """ Property sets the tensor model parallel group. Args: - group: Model parallel group. + group: Tensor model parallel group. """ self._tensor_model_parallel_group = group @property def pipeline_model_parallel_rank(self): - """ Property returns the model parallel rank. + """ Property returns the pipeline model parallel rank. Returns: - Model parallel rank. + Pipeline model parallel rank. """ return self._pipeline_model_parallel_rank @pipeline_model_parallel_rank.setter def pipeline_model_parallel_rank(self, rank): - """ Property sets the model parallel rank. + """ Property sets the pipeline model parallel rank. Args: - rank (int): Model parallel rank. + rank (int): Pipeline model parallel rank. """ self._pipeline_model_parallel_rank = rank + @property + def virtual_pipeline_model_parallel_rank(self): + """ Property returns the virtual pipeline parallel rank. + Returns: + Model parallel rank. + """ + return self._virtual_pipeline_model_parallel_rank + + @virtual_pipeline_model_parallel_rank.setter + def virtual_pipeline_model_parallel_rank(self, rank): + """ Property sets the virtual pipeline parallel rank. + Args: + rank (int): Virtual pipeline parallel rank. + """ + self._virtual_pipeline_model_parallel_rank = rank + @property def pipeline_model_parallel_split_rank(self): """ Property returns the rank at which Encoder and Decoder are split into different pipelines for Megatrron Encoder-Decoder models. @@ -267,17 +300,17 @@ def pipeline_model_parallel_split_rank(self, rank): @property def pipeline_model_parallel_group(self): - """ Property returns the model parallel group. + """ Property returns the pipeline model parallel group. Returns: - Model parallel group. + Pipeline model parallel group. """ return self._pipeline_model_parallel_group @pipeline_model_parallel_group.setter def pipeline_model_parallel_group(self, group): - """ Property sets the model parallel group. + """ Property sets the pipeline model parallel group. Args: - group: Model parallel group. + group: Pipeline model parallel group. 
""" self._pipeline_model_parallel_group = group From 1ca0fa3d1eadd0490417a99e27592a06eebab1c0 Mon Sep 17 00:00:00 2001 From: ericharper Date: Thu, 11 Aug 2022 14:14:20 -0600 Subject: [PATCH 07/37] update first last stage embedding all reduce Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 66cba3bcac0d..2dabf4254b5b 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -335,17 +335,21 @@ def allreduce_first_last_embeddings(self): # This should only run for models that support pipelined model parallelism # (BERT and GPT-2). if parallel_state.get_pipeline_model_parallel_world_size() > 1 and ( - parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage() + parallel_state.is_pipeline_first_stage(ignore_virtual=True) + or parallel_state.is_pipeline_last_stage(ignore_virtual=True) ): - for module in self.model: - if module.share_token_embeddings: - word_embeddings_weight = module.word_embeddings_weight() - if self.megatron_amp_o2: - # O2 recipe stores a "main" copy of weights and grads - grad = word_embeddings_weight.main_grad - else: - grad = word_embeddings_weight.grad - torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + module = self.model[0] # only the first virtual rank has the embeddings + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + module = self.model[-1] # only the last virtual rank has the embeddings + if module.share_token_embeddings: + word_embeddings_weight = module.word_embeddings_weight() + if self.megatron_amp_o2: + # O2 recipe stores a "main" copy of weights and grads + grad = word_embeddings_weight.main_grad + else: + grad = word_embeddings_weight.grad + torch.distributed.all_reduce(grad, group=parallel_state.get_embedding_group()) def get_forward_output_and_loss_func(self): def fwd_output_and_loss_func(batch, model): From 56b0d4c9b3f8eb62a0d35ad8853939ae9336cc17 Mon Sep 17 00:00:00 2001 From: ericharper Date: Thu, 11 Aug 2022 14:31:45 -0600 Subject: [PATCH 08/37] update sequence parallel all reduce for virtual models Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 2dabf4254b5b..bedbaec95974 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -315,13 +315,14 @@ def allreduce_sequence_parallel_gradients(self): https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 """ grads = [] - for param in self.model.parameters(): - if getattr(param, 'sequence_parallel_enabled', False): - if self.megatron_amp_o2: - grad = param.main_grad - else: - grad = param.grad - grads.append(grad.data) + for module in self.model: + for param in module.parameters(): + if getattr(param, 'sequence_parallel_enabled', False): + if self.megatron_amp_o2: + grad = param.main_grad + else: + grad = 
param.grad + grads.append(grad.data) coalesced = torch._utils._flatten_dense_tensors(grads) torch.distributed.all_reduce(coalesced, group=parallel_state.get_tensor_model_parallel_group()) for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)): From c8d3acf111556663698748c72baf9e600a147ddc Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 16 Aug 2022 18:34:01 -0600 Subject: [PATCH 09/37] runs but we get an error Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 ++++- .../collections/nlp/modules/common/megatron/megatron_init.py | 2 ++ nemo/collections/nlp/modules/common/megatron/transformer.py | 2 +- nemo/collections/nlp/parts/nlp_overrides.py | 1 + nemo/utils/app_state.py | 2 +- 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index bedbaec95974..44bd4e490092 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -623,8 +623,11 @@ def setup(self, stage=None): # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: - for module in self.model: + for i, module in enumerate(self.model): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) module.sync_initial_word_embeddings() + # for module in self.model: + # module.sync_initial_word_embeddings() def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 70a5f17ce73a..9e78e7de29ff 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -26,6 +26,7 @@ get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank, + set_virtual_pipeline_model_parallel_world_size, set_pipeline_model_parallel_split_rank, set_pipeline_model_parallel_world_size, set_tensor_model_parallel_rank, @@ -83,6 +84,7 @@ def initialize_model_parallel_for_nemo( set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank) set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank) + set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size) set_virtual_pipeline_model_parallel_rank(app_state.virtual_pipeline_model_parallel_rank) set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size) set_pipeline_model_parallel_split_rank(app_state.pipeline_model_parallel_split_rank) diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 155e676ac981..e03430fdc560 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -1642,7 +1642,7 @@ def build_layer(layer_number): assert num_layers % parallel_state.get_virtual_pipeline_model_parallel_world_size() == 0, ( 'num_layers_per_stage must be divisible by ' 'virtual_pipeline_model_parallel_size' ) - assert self.model_type != ModelType.encoder_or_decoder + # assert self.model_type != ModelType.encoder_or_decoder # Number of layers in each model chunk is the number of layers in the stage, # divided 
by the number of model chunks in a stage. self.num_layers = self.num_layers // parallel_state.get_virtual_pipeline_model_parallel_world_size() diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 2738041ecf85..dbe2d7097c31 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -148,6 +148,7 @@ def init_model_parallel(self, global_rank: int, world_size: int) -> None: tensor_model_parallel_size_=app_state.tensor_model_parallel_size, pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size, pipeline_model_parallel_split_rank_=app_state.pipeline_model_parallel_split_rank, + virtual_pipeline_model_parallel_size_=app_state.virtual_pipeline_model_parallel_size, ) # assert that fake tp and pp rank match after model parallel init diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 253c6bfbcb27..07af7ec62521 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -160,7 +160,7 @@ def virtual_pipeline_model_parallel_size(self): Returns: Number of GPUs in each model parallel group. """ - return self._pipeline_model_parallel_size + return self._virtual_pipeline_model_parallel_size @virtual_pipeline_model_parallel_size.setter def virtual_pipeline_model_parallel_size(self, size): From c83b3c988db013005e7e71d4a8e55bf6a4fc7f20 Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 16 Aug 2022 18:37:33 -0600 Subject: [PATCH 10/37] set virtual rank 0 after looping Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 44bd4e490092..b099d2ef5f17 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -626,6 +626,7 @@ def setup(self, stage=None): for i, module in enumerate(self.model): parallel_state.set_virtual_pipeline_model_parallel_rank(i) module.sync_initial_word_embeddings() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) # for module in self.model: # module.sync_initial_word_embeddings() From af60b68dbb54ce40f82797a13afabccccf71966d Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 17 Aug 2022 16:54:25 -0600 Subject: [PATCH 11/37] account for virtual when determinining first and last pipeline stages Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 37 ++++++------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index b099d2ef5f17..d9f3bc000337 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -120,21 +120,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): # configuration used for inference self._inference_config = None - # At pipeline-parallel training, set the pipeline stage that the current GPU belongs to skip loading inputs - # Intemediate stage: doesn't need any inputs - # Fist pipeline stage: needs only 'tokens' and 'position_ids' - # Last pipeline stage: needs only 'labels' and 'loss_mask' - self._is_first_pipe_stage = False - self._is_intermediate_pipe_stage = False - self._is_last_pipe_stage = False - if 
parallel_state.get_pipeline_model_parallel_world_size() > 1: - if parallel_state.is_pipeline_first_stage(): - self._is_first_pipe_stage = True - elif parallel_state.is_pipeline_last_stage(): - self._is_last_pipe_stage = True - else: - self._is_intermediate_pipe_stage = True - def set_inference_config(self, inference_config): self._inference_config = inference_config @@ -195,13 +180,16 @@ def training_step(self, batch, batch_idx): # we zero grads here because we also call backward in the apex fwd/bwd functions self._optimizer.zero_grad() - if self._is_intermediate_pipe_stage: + if parallel_state.is_pipeline_first_stage(ignore_virtual=True) or parallel_state.is_pipeline_last_stage( + ignore_virtual=True + ): + # we prepare the micro batches for the apex fwd/bwd function + batch_for_pipeline = self.process_global_batch(batch) + else: # The intermediate pipeline stages do not need any inputs from data loader # GPT3 uses decoder with AttnMask:causal, thus doesn't need attention_mask batch_for_pipeline = None - else: - # we prepare the micro batches for the apex fwd/bwd function - batch_for_pipeline = self.process_global_batch(batch) + tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: @@ -360,21 +348,20 @@ def fwd_output_and_loss_func(batch, model): attention_mask = attention_mask[0:1] else: # GPT3 uses only causal mask, which doesn't need attention mask - if self._is_first_pipe_stage: + if parallel_state.is_pipeline_first_stage(): # Fist pipeline stage needs only the tokens and position_ids tokens = batch[0].cuda(non_blocking=True) position_ids = batch[4].cuda(non_blocking=True) labels, loss_mask, attention_mask = None, None, None - elif self._is_intermediate_pipe_stage: - # Intermediate pipeline stage doesn't need any inputs - tokens, labels, loss_mask, attention_mask, position_ids = None, None, None, None, None - elif self._is_last_pipe_stage: + elif parallel_state.is_pipeline_last_stage(): # Last pipeline stage needs only the labels and loss_mask labels = batch[1].cuda(non_blocking=True) loss_mask = batch[2].cuda(non_blocking=True) tokens, attention_mask, position_ids = None, None, None else: - assert False + # Intermediate pipeline stage doesn't need any inputs + tokens, labels, loss_mask, attention_mask, position_ids = None, None, None, None, None + output_tensor = model(tokens, position_ids, attention_mask, labels) def loss_func(output_tensor): From 2a76688c615327bcc1f0d140fbb4051c86f1ad63 Mon Sep 17 00:00:00 2001 From: ericharper Date: Thu, 18 Aug 2022 17:03:01 -0600 Subject: [PATCH 12/37] checkpointing for virtual models in progress Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d9f3bc000337..0d698f326ac2 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -193,7 +193,7 @@ def training_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - if self.cfg.get('virtual_pipeline_model_parallel_size', 1) > 1: + if self.cfg.get('virtual_pipeline_model_parallel_size', 1) is not None: losses_reduced_per_micro_batch 
= _forward_backward_pipelining_with_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, @@ -420,7 +420,7 @@ def validation_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - if self.cfg.get('virtual_pipeline_model_parallel_size', 1) > 1: + if self.cfg.get('virtual_pipeline_model_parallel_size', 1) is not None: losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( forward_step_func=self.get_forward_output_and_loss_func(), batch=batch_for_pipeline, @@ -737,3 +737,9 @@ def list_available_models(cls) -> Optional[PretrainedModelInfo]: ) ) return result + + def on_save_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint + """ + return super().on_save_checkpoint(checkpoint) From 1c5d879c7004a9b958aa2c0c83c442c8f2593721 Mon Sep 17 00:00:00 2001 From: ericharper Date: Fri, 19 Aug 2022 11:31:19 -0600 Subject: [PATCH 13/37] add checkpoint hooks Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 15 ++++++++++++++- .../nlp/modules/common/megatron/module.py | 5 +++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 0d698f326ac2..29fee0d749e3 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -742,4 +742,17 @@ def on_save_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint """ - return super().on_save_checkpoint(checkpoint) + if len(self.model) is not None: + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + + def on_load_checkpoint(self, checkpoint) -> None: + """LightningModule hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint + """ + if len(self.model) is not None: + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + diff --git a/nemo/collections/nlp/modules/common/megatron/module.py b/nemo/collections/nlp/modules/common/megatron/module.py index 662e861e951a..c40652860d84 100644 --- a/nemo/collections/nlp/modules/common/megatron/module.py +++ b/nemo/collections/nlp/modules/common/megatron/module.py @@ -163,6 +163,11 @@ def sync_initial_position_embeddings(self): position_embeddings = self.position_embeddings_weight() torch.distributed.all_reduce(position_embeddings.data, group=parallel_state.get_position_embedding_group()) + def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints.""" + return self.state_dict(destination, prefix, keep_vars) + def conversion_helper(val, conversion): """Apply conversion to val. 
Recursively apply conversion if `val` From 049b065779acf2554a4ddd4ec24929f30b161230 Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 22 Aug 2022 10:32:28 -0600 Subject: [PATCH 14/37] working on validation when resuming Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 29fee0d749e3..13b102600f70 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -82,8 +82,12 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self._validate_trainer() - # TODO: Not sure how to use lists of modules with PTL. - # This means we can only use pipeline parallelism without the interleaved schedule. + self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) + + if not self.megatron_amp_o2 and self.cfg.get('virtual_pipeline_model_parallel_size', None): + raise ValueError('Virtual pipeline model parallel is only supported when using megatron_amp_O2') + + # TODO: add unwrap model functionality # self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0] self.model = build_model( model_provider_func=self.model_provider_func, @@ -91,11 +95,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), ) - # We don't need to call it explicitly? Since it is a pytorch lightning hook function - # self.setup_optimizer_param_groups() - - self.megatron_amp_o2 = cfg.get('megatron_amp_O2', False) - if self.megatron_amp_o2: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type @@ -462,9 +461,9 @@ def validation_step(self, batch, batch_idx): return loss_mean def validation_epoch_end(self, outputs): - if not outputs: - return - if parallel_state.is_pipeline_last_stage(): + # if not outputs[0]: + # return + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): # only the last pipeline parallel stages return loss averaged_loss = torch.stack(outputs).mean() else: @@ -479,6 +478,7 @@ def validation_epoch_end(self, outputs): self.compute_consumed_samples(self.trainer.global_step - self.init_global_step), rank_zero_only=True, ) + self.log('global_step', self.trainer.global_step, prog_bar=True, rank_zero_only=True) def test_step(self, batch, batch_idx): return self.validation_step(batch, batch_idx) From 9c810f72e42a2379bbd5b6f56c1585d2e825b06c Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 23 Aug 2022 16:03:43 -0600 Subject: [PATCH 15/37] skip sanity val steps by default in config Signed-off-by: ericharper --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 + .../nlp/models/language_modeling/megatron_gpt_model.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index b9246d3735a8..233182937edb 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,6 +15,7 @@ trainer: val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 + num_sanity_val_steps: 0 # 0 means skip sanity val steps accumulate_grad_batches: 1 # do 
not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 benchmark: False diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 13b102600f70..3cab3739bcd9 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -461,8 +461,8 @@ def validation_step(self, batch, batch_idx): return loss_mean def validation_epoch_end(self, outputs): - # if not outputs[0]: - # return + if not outputs: + return if parallel_state.is_pipeline_last_stage(ignore_virtual=True): # only the last pipeline parallel stages return loss averaged_loss = torch.stack(outputs).mean() From c59e5fb6da2dce756e74c4c46cad41575460eb68 Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 27 Sep 2022 17:20:01 -0600 Subject: [PATCH 16/37] remove comment Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 49525f6f1087..57da51320a0f 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -94,8 +94,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): "Distributed optimizers require O2. Please set megatron_amp_O2 to True in the model config." ) - # TODO: add unwrap model functionality - # self.model = build_model(model_provider_func=self.model_provider_func, wrap_with_ddp=False)[0] self.model = build_model( model_provider_func=self.model_provider_func, wrap_with_ddp=False, From 87f995bf1bb10800b6265b4d6b8477ba2309b2fa Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 27 Sep 2022 18:14:40 -0600 Subject: [PATCH 17/37] log number of params Signed-off-by: ericharper --- .../conf/megatron_gpt_config.yaml | 1 + .../language_modeling/megatron_base_model.py | 15 +++++++++++++++ .../language_modeling/megatron_gpt_model.py | 13 +++++++++++++ 3 files changed, 29 insertions(+) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 4491ceac2222..3e0fac0987a0 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -19,6 +19,7 @@ trainer: accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually exp_manager: explicit_log_dir: null diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 657640c0e9cc..e20a707979f0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -33,6 +33,7 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.core.optim import MainParamsOptimizerWrapper, prepare_lr_scheduler from nemo.utils import logging +from nemo.utils.get_rank import is_global_rank_zero try: from apex.transformer import parallel_state @@ -368,3 +369,17 @@ def _validate_config(self): logging.info("Gradient 
accumulation fusion can only be used with megatron amp O2 mixed precision.") with open_dict(self.cfg): self.cfg.gradient_accumulation_fusion = False + + def is_data_parallel_rank_zero(self): + if is_global_rank_zero(): + return True + else: + try: + data_parallel_rank = parallel_state.get_data_parallel_rank() + except: + data_parallel_rank = None + + if data_parallel_rank is not None and data_parallel_rank == 0: + return True + else: + return False diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 57da51320a0f..e1ca506c4328 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -49,6 +49,7 @@ from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import AppState, logging +from nemo.utils.get_rank import is_global_rank_zero try: from apex.transformer import parallel_state @@ -612,6 +613,18 @@ def setup(self, stage=None): Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ + + # log number of parameters + if self.is_data_parallel_rank_zero(): + num_parameters = sum( + [sum([p.nelement() for p in model_module.parameters()]) for model_module in self.model] + ) + logging.info( + f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' + f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' + f'Number of parameters: {num_parameters:.2e}.' + ) + resume_checkpoint_path = self.trainer._checkpoint_connector.resume_from_checkpoint_fit_path if resume_checkpoint_path: try: From b0e0548b82aea10e8a93cd2dfc3f28be533cb5e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Sep 2022 00:18:15 +0000 Subject: [PATCH 18/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d38de13afdcd..f76cdd9c320d 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -810,4 +810,3 @@ def on_load_checkpoint(self, checkpoint) -> None: for i in range(len(self.model)): parallel_state.set_virtual_pipeline_model_parallel_rank(i) self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) - From 8ef0d58a1278b72ac1611e0ab094f7d34490c33a Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 27 Sep 2022 18:18:32 -0600 Subject: [PATCH 19/37] style Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d38de13afdcd..f76cdd9c320d 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -810,4 +810,3 @@ def on_load_checkpoint(self, checkpoint) -> None: for i in range(len(self.model)): parallel_state.set_virtual_pipeline_model_parallel_rank(i) 
self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) - From 93a95dd37293017bdb78536f638f465930ff220d Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 3 Oct 2022 14:03:56 -0600 Subject: [PATCH 20/37] check if self.model is a list Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 90 +++++++++++++------ 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f76cdd9c320d..ec3b4a8db77e 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -94,12 +94,17 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): "Distributed optimizers require O2. Please set megatron_amp_O2 to True in the model config." ) + # build_model returns a list of modules which are used for interleaved pipeline parallelism self.model = build_model( model_provider_func=self.model_provider_func, wrap_with_ddp=False, virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None), ) + # if we're not using interleaved, then self.model is a module. + if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None: + self.model = self.model[0] + if self.megatron_amp_o2: if not self.with_distributed_adam: @@ -108,10 +113,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): module.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision - converted_model = [] - for module in self.model: - converted_model.append(Float16Module(module=module, precision=cfg.precision)) - self.model = converted_model + if isinstance(self.model, list): + converted_model = [] + for module in self.model: + converted_model.append(Float16Module(module=module, precision=cfg.precision)) + self.model = converted_model + else: + self.model = Float16Module(module=self.model, precision=cfg.precision) if self.trainer.precision == 32: self.autocast_dtype = torch.float @@ -169,9 +177,16 @@ def model_provider_func(self, pre_process, post_process): def setup_optimizer_param_groups(self): """ModelPT override. 
Optimizer will get self._optimizer_param_groups""" if self.cfg.get('do_layer_norm_weight_decay', False): - self._optimizer_param_groups = get_all_params_for_weight_decay_optimization(self.model) + if isinstance(self.model, list): + self._optimizer_param_groups = get_all_params_for_weight_decay_optimization(self.model) + else: + self._optimizer_param_groups = get_all_params_for_weight_decay_optimization([self.model]) + else: - self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) + if isinstance(self.model, list): + self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) + else: + self._optimizer_param_groups = get_params_for_weight_decay_optimization([self.model]) def setup_optimization( self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, @@ -323,20 +338,28 @@ def optimizer_zero_grad(self, *args, **kwargs): """ return + def _append_module_grads(self, module, grads): + for param in module.parameters(): + if getattr(param, 'sequence_parallel_enabled', False): + if self.megatron_amp_o2: + grad = param.main_grad + else: + grad = param.grad + grads.append(grad.data) + def allreduce_sequence_parallel_gradients(self): """ All-reduce layernorm parameters across model parallel nodes when sequence parallelism is used. Modified from megatron-lm: https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/3f91f09bb2ab32f9904b47f46f19d2fc3f518ed8/megatron/training.py#L425 """ + grads = [] - for module in self.model: - for param in module.parameters(): - if getattr(param, 'sequence_parallel_enabled', False): - if self.megatron_amp_o2: - grad = param.main_grad - else: - grad = param.grad - grads.append(grad.data) + if isinstance(self.model, list): + for module in self.model: + self._append_module_grads(module, grads) + else: + self._append_module_grads(self.model, grads) + coalesced = torch._utils._flatten_dense_tensors(grads) torch.distributed.all_reduce(coalesced, group=parallel_state.get_tensor_model_parallel_group()) for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)): @@ -354,9 +377,15 @@ def allreduce_first_last_embeddings(self): or parallel_state.is_pipeline_last_stage(ignore_virtual=True) ): if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - module = self.model[0] # only the first virtual rank has the embeddings + if isinstance(self.model, list): + module = self.model[0] # only the first virtual rank has the embeddings + else: + module = self.model if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - module = self.model[-1] # only the last virtual rank has the embeddings + if isinstance(self.model, list): + module = self.model[-1] # only the last virtual rank has the embeddings + else: + module = self.model if module.share_token_embeddings: word_embeddings_weight = module.word_embeddings_weight() if self.megatron_amp_o2: @@ -614,9 +643,13 @@ def setup(self, stage=None): # log number of parameters if self.is_data_parallel_rank_zero(): - num_parameters = sum( - [sum([p.nelement() for p in model_module.parameters()]) for model_module in self.model] - ) + if isinstance(self.model, list): + num_parameters = sum( + [sum([p.nelement() for p in model_module.parameters()]) for model_module in self.model] + ) + else: + num_parameters = sum([p.nelement() for p in self.model.parameters()]) + logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' f'Tensor model parallel rank: 
{parallel_state.get_tensor_model_parallel_rank()}, ' @@ -648,12 +681,13 @@ def setup(self, stage=None): # when using pipeline model parallel the final stage need to initialize word embeddings if parallel_state.get_pipeline_model_parallel_world_size() > 1: - for i, module in enumerate(self.model): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) - module.sync_initial_word_embeddings() - parallel_state.set_virtual_pipeline_model_parallel_rank(0) - # for module in self.model: - # module.sync_initial_word_embeddings() + if isinstance(self.model, list): + for i, module in enumerate(self.model): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + module.sync_initial_word_embeddings() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + else: + self.model.sync_initial_word_embeddings() def setup_training_data(self, cfg): if hasattr(self, '_train_ds'): @@ -797,16 +831,18 @@ def on_save_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint """ - if len(self.model) is not None: + if isinstance(self.model, list): for i in range(len(self.model)): parallel_state.set_virtual_pipeline_model_parallel_rank(i) checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def on_load_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint """ - if len(self.model) is not None: + if isinstance(self.model, list): for i in range(len(self.model)): parallel_state.set_virtual_pipeline_model_parallel_rank(i) self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) From bf15a0d4cf06626414454f2cfdeb2397c36fe0d8 Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 3 Oct 2022 14:08:14 -0600 Subject: [PATCH 21/37] make virtual pipeline default size None on init Signed-off-by: ericharper --- nemo/collections/nlp/modules/common/megatron/megatron_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 9e78e7de29ff..3d16406302d7 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -47,7 +47,7 @@ def initialize_model_parallel_for_nemo( local_rank, tensor_model_parallel_size=1, pipeline_model_parallel_size=1, - virtual_pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, pipeline_model_parallel_split_rank=None, micro_batch_size=None, global_batch_size=None, From dba6c5f76a36cd71894b73c8be510c4e8e7594cc Mon Sep 17 00:00:00 2001 From: ericharper Date: Mon, 3 Oct 2022 14:08:52 -0600 Subject: [PATCH 22/37] make virtual pipeline default to None in config Signed-off-by: ericharper --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 0be6ff797843..fd8d27e035e8 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -49,7 +49,7 @@ model: global_batch_size: 8 # will use more micro 
batches to reach global batch size tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism - virtual_pipeline_model_parallel_size: 1 # interleaved pipeline + virtual_pipeline_model_parallel_size: null # interleaved pipeline # model architecture encoder_seq_length: 512 From b115d095683d4f718d727a383fd2d21ed5f9244b Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:26:54 -0600 Subject: [PATCH 23/37] remove ensure_divisibility call Signed-off-by: ericharper --- .../nlp/modules/common/megatron/megatron_init.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 3d16406302d7..0b66e6ce5a16 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -186,18 +186,16 @@ def fake_initialize_model_parallel( pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size) model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size - ensure_divisibility(world_size, tensor_model_parallel_size * pipeline_model_parallel_size) + assert ( + world_size % tensor_model_parallel_size * pipeline_model_parallel_size == 0 + ), f'world_size: {world_size} must be divisible by tensor_model_parallel_size: {tensor_model_parallel_size} times pipeline_model_parallel_size {pipeline_model_parallel_size}' data_parallel_size = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size) num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size - # if virtual_pipeline_model_parallel_size_ is not None: - # global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK - # global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE - # _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0 - # _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_ - virtual_pipeline_model_parallel_rank = 0 + if virtual_pipeline_model_parallel_size_ is not None: + virtual_pipeline_model_parallel_rank = 0 # Build the data-parallel groups. 
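A worked example of the size arithmetic above may help; the numbers are illustrative only and do not come from the patch. As the assertion's error message states, the requirement is that the product of the two model-parallel sizes divides the world size, i.e. world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) == 0, with the product parenthesized here to make the intended precedence explicit:

    world_size = 16
    tensor_model_parallel_size = 2
    pipeline_model_parallel_size = 4
    assert world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) == 0
    data_parallel_size = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size)  # 2
    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size                      # 8
    num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size                  # 4
    # the data-parallel groups built just below then each contain data_parallel_size = 2 ranks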
all_data_parallel_group_ranks = [] From 969bf692324ca03bb42a7d1a3fddf285653998bb Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:39:48 -0600 Subject: [PATCH 24/37] fix lgtm alerts Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8d6b71a8717d..5c4d45e96c37 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -48,7 +48,6 @@ from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import AppState, logging -from nemo.utils.get_rank import is_global_rank_zero try: from apex.transformer import parallel_state @@ -357,8 +356,8 @@ def allreduce_sequence_parallel_gradients(self): if isinstance(self.model, list): for module in self.model: self._append_module_grads(module, grads) - else: - self._append_module_grads(self.model, grads) + else: + self._append_module_grads(self.model, grads) coalesced = torch._utils._flatten_dense_tensors(grads) torch.distributed.all_reduce(coalesced, group=parallel_state.get_tensor_model_parallel_group()) From 1e956d67c1f695dc2a289f514c3da1a01bc4be60 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:40:43 -0600 Subject: [PATCH 25/37] remove num_sanity_val_steps from config Signed-off-by: ericharper --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index fd8d27e035e8..4eebe88952db 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -15,7 +15,6 @@ trainer: val_check_interval: 100 limit_val_batches: 50 limit_test_batches: 500 - num_sanity_val_steps: 0 # 0 means skip sanity val steps accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 benchmark: False From a48e80d21462ca6c5fb13d5dd6929e1fa5d7e944 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:42:14 -0600 Subject: [PATCH 26/37] default virtual pipeline size to none Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e20a707979f0..6696bb614955 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -87,7 +87,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): local_rank=trainer.local_rank, tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1), pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1), - virtual_pipeline_model_parallel_size=cfg.get('virtual_pipeline_model_parallel_size', 1), + virtual_pipeline_model_parallel_size=cfg.get('virtual_pipeline_model_parallel_size', None), pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), 
micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), From 1b7a2061549a56e3291f308de6a4173c90cb35c2 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:44:15 -0600 Subject: [PATCH 27/37] check for list Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 5c4d45e96c37..7bd1cfed931f 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -108,8 +108,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if not self.with_distributed_adam: # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type - for module in self.model: - module.cuda(torch.cuda.current_device()) + if isinstance(self.model, list): + for module in self.model: + module.cuda(torch.cuda.current_device()) + else: + self.model.cuda(torch.cuda.current_device()) # Model wrapper to convert both model and inputs to half precision if isinstance(self.model, list): From fdf3846788d6c358129ad79f44096db59ebbe11c Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:55:26 -0600 Subject: [PATCH 28/37] update assert to make sure we are only doing virtual for gpt Signed-off-by: ericharper --- nemo/collections/nlp/modules/common/megatron/transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index 6ba58bc50089..45de12e32ad9 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -1763,7 +1763,9 @@ def build_layer(layer_number): assert num_layers % parallel_state.get_virtual_pipeline_model_parallel_world_size() == 0, ( 'num_layers_per_stage must be divisible by ' 'virtual_pipeline_model_parallel_size' ) - # assert self.model_type != ModelType.encoder_or_decoder + + assert self.model_type.value != 2, f'virtual pipeline parallel currently only supported for GPT' + # Number of layers in each model chunk is the number of layers in the stage, # divided by the number of model chunks in a stage. 
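A small numerical sketch of that split (all values assumed for illustration, not taken from the patch): with 24 transformer layers in total and pipeline_model_parallel_size = 4, each pipeline stage holds 24 // 4 = 6 layers; with virtual_pipeline_model_parallel_size = 3 the division below then leaves 6 // 3 = 2 layers per interleaved model chunk, and the divisibility assert above guarantees the split is exact:

    num_layers_per_stage = 24 // 4                        # 6 layers on each pipeline stage
    layers_per_model_chunk = num_layers_per_stage // 3    # 2 layers in each interleaved chunk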
self.num_layers = self.num_layers // parallel_state.get_virtual_pipeline_model_parallel_world_size() From 5a65fada0d4fb5773c1b82af18efab62c67b9bc1 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 5 Oct 2022 13:58:28 -0600 Subject: [PATCH 29/37] revert change to get_params_for_weight_decay Signed-off-by: ericharper --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 +---- nemo/collections/nlp/modules/common/megatron/utils.py | 3 ++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7bd1cfed931f..60bfdfcc5b57 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -185,10 +185,7 @@ def setup_optimizer_param_groups(self): self._optimizer_param_groups = get_all_params_for_weight_decay_optimization([self.model]) else: - if isinstance(self.model, list): - self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) - else: - self._optimizer_param_groups = get_params_for_weight_decay_optimization([self.model]) + self._optimizer_param_groups = get_params_for_weight_decay_optimization(self.model) def setup_optimization( self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None, diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index bfc41b5bfc8f..ab801608120a 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -320,9 +320,10 @@ def get_params_for_weight_decay_optimization( Layernorms and biases will have no weight decay but the rest will. 
""" + modules = listify_model(model) weight_decay_params = {'params': []} no_weight_decay_params = {'params': [], 'weight_decay': 0.0} - for module in model: + for module in modules: for module_ in module.modules(): if isinstance(module_, (FusedLayerNorm, FastLayerNorm)): no_weight_decay_params['params'].extend( From 784060e8a4847835cc82325a1d8cf0d55acd822d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Oct 2022 16:03:35 +0000 Subject: [PATCH 30/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 6795daf68f63..e67671a4936d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -32,8 +32,8 @@ from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.core.optim import MainParamsOptimizerWrapper, prepare_lr_scheduler -from nemo.utils.get_rank import is_global_rank_zero from nemo.utils import AppState, logging +from nemo.utils.get_rank import is_global_rank_zero try: from apex.transformer import parallel_state From 08509f9fa505b7c3a3b31a936ca135c11f2a098b Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 11 Oct 2022 10:35:49 -0600 Subject: [PATCH 31/37] init var Signed-off-by: ericharper --- nemo/collections/nlp/modules/common/megatron/megatron_init.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 0b66e6ce5a16..7dba82c62733 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -194,6 +194,7 @@ def fake_initialize_model_parallel( num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size + virtual_pipeline_model_parallel_rank = None if virtual_pipeline_model_parallel_size_ is not None: virtual_pipeline_model_parallel_rank = 0 From aa21c85fb0e1e2e1ec64d5fb6585479a1f4c01b3 Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 11 Oct 2022 11:21:32 -0600 Subject: [PATCH 32/37] add import guard for set virtual model parallel world size Signed-off-by: ericharper --- .../nlp/modules/common/megatron/megatron_init.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 7dba82c62733..b0233068fe44 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -26,7 +26,6 @@ get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank, - set_virtual_pipeline_model_parallel_world_size, set_pipeline_model_parallel_split_rank, set_pipeline_model_parallel_world_size, set_tensor_model_parallel_rank, @@ -40,6 +39,16 @@ except (ImportError, ModuleNotFoundError): HAVE_APEX = False +try: + # TODO: remove when apex is 
updated + from apex.transformer.parallel_state import set_virtual_pipeline_model_parallel_world_size + + HAVE_INTERLEAVED = True + +except: + + HAVE_INTERLEAVED = False + def initialize_model_parallel_for_nemo( world_size, @@ -55,6 +64,9 @@ def initialize_model_parallel_for_nemo( apex_transformer_log_level=30, ): + if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: + raise ValueError("set_virtual_pipeline_model_parallel_world_size is needed in Apex for interleaved.") + # updating NeMo globals app_state = AppState() app_state.global_rank = global_rank From 5b41d1fb7307c0df5d62a08abb0bddcc2a71a70a Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 11 Oct 2022 12:25:26 -0600 Subject: [PATCH 33/37] use import guard Signed-off-by: ericharper --- nemo/collections/nlp/modules/common/megatron/megatron_init.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index b0233068fe44..7f14a56d8814 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -96,7 +96,8 @@ def initialize_model_parallel_for_nemo( set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank) set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank) - set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size) + if HAVE_INTERLEAVED: + set_virtual_pipeline_model_parallel_world_size(app_state.virtual_pipeline_model_parallel_size) set_virtual_pipeline_model_parallel_rank(app_state.virtual_pipeline_model_parallel_rank) set_pipeline_model_parallel_world_size(app_state.pipeline_model_parallel_size) set_pipeline_model_parallel_split_rank(app_state.pipeline_model_parallel_split_rank) From 4a6b3f5833bf60522af0896c3fa7191b12e3bd0a Mon Sep 17 00:00:00 2001 From: ericharper Date: Tue, 11 Oct 2022 16:02:05 -0600 Subject: [PATCH 34/37] update calls to fake init in eval scripts Signed-off-by: ericharper --- examples/nlp/language_modeling/megatron_gpt_eval.py | 1 + examples/nlp/language_modeling/megatron_t5_eval.py | 1 + .../nlp/language_modeling/megatron_t5_prompt_learning_eval.py | 1 + .../nlp/language_modeling/tuning/megatron_t5_adapter_eval.py | 1 + examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py | 1 + .../nlp/machine_translation/nmt_transformer_infer_megatron.py | 1 + .../convert_prompt_learning_ckpt_to_nemo.py | 1 + 7 files changed, 7 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index 7e66c3096f33..e0f73d00993e 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -171,6 +171,7 @@ def main(cfg) -> None: app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, diff --git a/examples/nlp/language_modeling/megatron_t5_eval.py b/examples/nlp/language_modeling/megatron_t5_eval.py index 0c205ab65ad0..56b46f96a895 100644 --- a/examples/nlp/language_modeling/megatron_t5_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_eval.py @@ -70,6 +70,7 @@ def main(): app_state.model_parallel_size, app_state.data_parallel_size, 
app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, diff --git a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py index 812eb51975d3..a01c9b15d195 100644 --- a/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py @@ -56,6 +56,7 @@ def main(cfg) -> None: app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py b/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py index 1430b8b6da03..6afc5a505917 100644 --- a/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_t5_adapter_eval.py @@ -57,6 +57,7 @@ def main(cfg) -> None: app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, diff --git a/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py b/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py index 21a1b926f8c6..d150353475d5 100644 --- a/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_t5_ia3_eval.py @@ -57,6 +57,7 @@ def main(cfg) -> None: app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, diff --git a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py index a8d87f71dcbe..7bee2f562dc3 100644 --- a/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py +++ b/examples/nlp/machine_translation/nmt_transformer_infer_megatron.py @@ -62,6 +62,7 @@ def main(cfg) -> None: app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, diff --git a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py index 59cd1e694b13..af9a01fa3ece 100644 --- a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py @@ -72,6 +72,7 @@ def main(cfg) -> None: app_state.model_parallel_size, app_state.data_parallel_size, app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, ) = fake_initialize_model_parallel( world_size=app_state.model_parallel_size, rank=trainer.global_rank, From f631df400bda9670352585633d4f002d2e39c090 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 12 Oct 2022 12:08:22 -0600 Subject: [PATCH 35/37] add _get_fwd_bwd_function 
Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 120 +++++++----------- 1 file changed, 49 insertions(+), 71 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index cf3729962c1a..7c654591f4d9 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -199,6 +199,17 @@ def forward(self, tokens, text_position_ids, attention_mask, labels): output_tensor = self.model(tokens, text_position_ids, attention_mask, labels=labels) return output_tensor + def _get_fwd_bwd_function(self): + fwd_bwd_function = None + if self.cfg.get('pipeline_model_parallel_size', 1) > 1: + if self.cfg.get('virtual_pipeline_model_parallel_size', None) is not None: + fwd_bwd_function = _forward_backward_pipelining_with_interleaving + else: + fwd_bwd_function = forward_backward_pipelining_without_interleaving + else: + fwd_bwd_function = forward_backward_no_pipelining + return fwd_bwd_function + def training_step(self, batch, batch_idx): """ Our dataloaders produce a micro-batch and then we fetch @@ -224,49 +235,32 @@ def training_step(self, batch, batch_idx): tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] - if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - if self.cfg.get('virtual_pipeline_model_parallel_size', 1) is not None: - losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=False, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, - sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), - ) + # determine if we can use async grad all reduce + custom_sync_context_handler = None + if self.megatron_amp_o2 and not self.cfg.get('sequence_parallel', False): + if self.with_distributed_adam: + custom_sync_context_handler = lambda: self._optimizer.no_sync(greedy_grad_copy=True) else: - losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=False, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, - sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), - ) + custom_sync_context_handler = self._optimizer.no_sync else: - # no pipeline parallelism so we reduce grads asynchronously if not using sequence parallelism - if self.megatron_amp_o2 and not self.cfg.get('sequence_parallel', False): - if self.with_distributed_adam: - custom_sync_context_handler = lambda: self._optimizer.no_sync(greedy_grad_copy=True) - else: - custom_sync_context_handler = self._optimizer.no_sync - else: - # TODO: enable async grad all reduce for O1/autocast mixed precision training - custom_sync_context_handler = None - losses_reduced_per_micro_batch = forward_backward_no_pipelining( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=False, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else 
None, - custom_sync_context_handler=custom_sync_context_handler, - ) + # TODO: enable async grad all reduce for O1/autocast mixed precision training + custom_sync_context_handler = None + + # run forward and backwards passes for an entire global batch + # we do this inside training_step to support pipeline parallelism + fwd_bwd_function = self._get_fwd_bwd_function() + + losses_reduced_per_micro_batch = fwd_bwd_function( + forward_step_func=self.get_forward_output_and_loss_func(), + batch=batch_for_pipeline, + model=self.model, + forward_only=False, + tensor_shape=tensor_shape, + dtype=self.autocast_dtype, + grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None, + custom_sync_context_handler=custom_sync_context_handler, + sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), + ) # only the last stages of the pipeline return losses if losses_reduced_per_micro_batch: @@ -466,37 +460,21 @@ def validation_step(self, batch, batch_idx): batch_for_pipeline = self.process_global_batch(batch) tensor_shape = [self.cfg.encoder_seq_length, self.cfg.micro_batch_size, self.cfg.hidden_size] - if self.cfg.get('pipeline_model_parallel_size', 1) > 1: - if self.cfg.get('virtual_pipeline_model_parallel_size', 1) is not None: - losses_reduced_per_micro_batch = _forward_backward_pipelining_with_interleaving( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=True, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), - ) - else: - losses_reduced_per_micro_batch = forward_backward_pipelining_without_interleaving( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=True, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), - ) - else: - losses_reduced_per_micro_batch = forward_backward_no_pipelining( - forward_step_func=self.get_forward_output_and_loss_func(), - batch=batch_for_pipeline, - model=self.model, - forward_only=True, - tensor_shape=tensor_shape, - dtype=self.autocast_dtype, - ) + # run forward passes for an entire global batch + # we do this inside validation_step to support pipeline parallelism + fwd_bwd_function = self._get_fwd_bwd_function() + + losses_reduced_per_micro_batch = fwd_bwd_function( + forward_step_func=self.get_forward_output_and_loss_func(), + batch=batch_for_pipeline, + model=self.model, + forward_only=True, + tensor_shape=tensor_shape, + dtype=self.autocast_dtype, + sequence_parallel_enabled=self.cfg.get('sequence_parallel', False), + ) + # only the last stage of the pipeline returns losses if losses_reduced_per_micro_batch: # average loss across micro batches loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] From 7d3e7ff3af2689ba4bdd85438ffbf8917412cafb Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 12 Oct 2022 14:40:26 -0600 Subject: [PATCH 36/37] log all total model parameters Signed-off-by: ericharper --- .../language_modeling/megatron_gpt_model.py | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7c654591f4d9..747290b9f3a5 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ 
b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -620,19 +620,38 @@ def setup(self, stage=None): """ # log number of parameters - if self.is_data_parallel_rank_zero(): - if isinstance(self.model, list): - num_parameters = sum( - [sum([p.nelement() for p in model_module.parameters()]) for model_module in self.model] - ) - else: - num_parameters = sum([p.nelement() for p in self.model.parameters()]) - - logging.info( - f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' - f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' - f'Number of parameters: {num_parameters:.2e}.' + if isinstance(self.model, list): + num_parameters_on_device = sum( + [sum([p.nelement() for p in model_module.parameters()]) for model_module in self.model] ) + if parallel_state.get_pipeline_model_parallel_world_size() > 1 and parallel_state.is_pipeline_last_stage( + ignore_virtual=True + ): + # substract the embedding weights on the last virtual stage + num_word_embedding_parameters = sum([p.nelement() for p in self.model[-1].word_embeddings_weight()]) + num_parameters_on_device -= num_word_embedding_parameters + else: + num_parameters_on_device = sum([p.nelement() for p in self.model.parameters()]) + + if parallel_state.get_pipeline_model_parallel_world_size() > 1 and parallel_state.is_pipeline_last_stage( + ignore_virtual=True + ): + # substract the embedding weights on the last stage + num_word_embedding_parameters = sum([p.nelement() for p in self.model.word_embeddings_weight()]) + + num_parameters_on_device -= num_word_embedding_parameters + + # to be summed across data parallel group + total_num_parameters = torch.tensor(num_parameters_on_device).cuda() + + torch.distributed.all_reduce(total_num_parameters, group=parallel_state.get_model_parallel_group()) + + logging.info( + f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' + f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' + f'Number of model parameters on device: {num_parameters_on_device:.2e}. ' + f'Total number of model parameters: {total_num_parameters:.2e}.' + ) resume_checkpoint_path = self.trainer._checkpoint_connector.resume_from_checkpoint_fit_path if resume_checkpoint_path: From d35c4b89ee46e77019cb7d94fda99b262e1d6197 Mon Sep 17 00:00:00 2001 From: ericharper Date: Wed, 12 Oct 2022 14:42:49 -0600 Subject: [PATCH 37/37] remove unused import Signed-off-by: ericharper --- nemo/collections/nlp/modules/common/megatron/megatron_init.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 7f14a56d8814..e64f8760c3d1 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -33,7 +33,6 @@ ) from apex.transformer.microbatches import ConstantNumMicroBatches from apex.transformer.pipeline_parallel.utils import setup_microbatch_calculator - from apex.transformer.utils import ensure_divisibility HAVE_APEX = True except (ImportError, ModuleNotFoundError):
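The dispatch among the three Apex forward-backward schedules is the core runtime change of the series. A minimal sketch of the selection logic, assuming the schedule functions imported at the top of megatron_gpt_model.py and the config keys added above; pick_schedule is a hypothetical name used only for illustration, while the model code performs the same check inside _get_fwd_bwd_function:

    # megatron_gpt_config.yaml (relevant keys only; values are examples)
    #   pipeline_model_parallel_size: 4
    #   virtual_pipeline_model_parallel_size: 2   # null keeps the non-interleaved schedule

    def pick_schedule(cfg):
        # interleaved schedule only when pipeline parallelism is on and a virtual size is set
        if cfg.get('pipeline_model_parallel_size', 1) > 1:
            if cfg.get('virtual_pipeline_model_parallel_size', None) is not None:
                return _forward_backward_pipelining_with_interleaving
            return forward_backward_pipelining_without_interleaving
        return forward_backward_no_pipelining

When a virtual size is set, build_model also returns a list of module chunks rather than a single module, which is why the isinstance(self.model, list) checks appear throughout the model code.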