From 9973d6c8fbb496a8b874112e1e878a1e0372f2a2 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 9 Nov 2021 12:54:10 -0500 Subject: [PATCH 01/15] Measure scales --- megatron/model/bert_model.py | 7 +++++++ megatron/model/language_model.py | 10 ++++++++-- megatron/model/utils.py | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 3ff5039d5fe..56fc5a75a0d 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -86,13 +86,20 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, self.gelu = erf_gelu def forward(self, hidden_states, word_embeddings_weight): + metrics={} + metrics["hidden_scale"]=hidden_states.std() hidden_states = self.dense(hidden_states) + metrics["dense_scale"]=hidden_states.std() hidden_states = self.gelu(hidden_states) + metrics["gelu_scale"]=hidden_states.std() hidden_states = self.layernorm(hidden_states) + metrics["ln_scale"]=hidden_states.std() output = parallel_lm_logits(hidden_states, word_embeddings_weight, self.parallel_output, bias=self.bias) + metrics["logits_scale"]=output.std() + print({key:value.detach().cpu().item() for key, value in metrics.items()}) return output diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 06330d81395..5264872b7b1 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -178,15 +178,21 @@ def add_tokentype_embeddings(self, num_tokentypes): def forward(self, input_ids, position_ids, tokentype_ids=None): # Embeddings. + metrics={} words_embeddings = self.word_embeddings(input_ids) + metrics["words_embeddings"]=words_embeddings.std() position_embeddings = self.position_embeddings(position_ids) + metrics["position_embeddings"]=position_embeddings.std() embeddings = words_embeddings + position_embeddings if tokentype_ids is not None: assert self.tokentype_embeddings is not None - embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + tokentype_embeddings=self.tokentype_embeddings(tokentype_ids) + metrics["tokentype_embeddings"]=tokentype_embeddings.std() + embeddings = embeddings + tokentype_embeddings else: assert self.tokentype_embeddings is None - + + print({key:value.detach().cpu().item() for key, value in metrics.items()}) # Dropout. 
embeddings = self.embedding_dropout(embeddings) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 465e8aa4ff6..924f9393acd 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -31,7 +31,7 @@ def init_(tensor): def scaled_init_method_normal(sigma, num_layers): """Init method based on N(0, sigma/sqrt(2*num_layers).""" - std = sigma / math.sqrt(2.0 * num_layers) + std = sigma / math.sqrt(2.0 * max(num_layers,1)) def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=std) From 47c0549370d832e805f2a1eef1f4f80b0c177729 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 15 Nov 2021 15:22:05 -0500 Subject: [PATCH 02/15] Add parameter names --- megatron/__init__.py | 20 ++++++++-------- megatron/model/bert_model.py | 34 ++++++++++++++++++++------ megatron/model/language_model.py | 41 +++++++++++++++++++++++++++----- megatron/model/transformer.py | 24 +++++++++++++++++-- megatron/optimizer/clip_grads.py | 8 +++++++ megatron/optimizer/optimizer.py | 24 +++++++++++++++++++ megatron/training.py | 1 + 7 files changed, 127 insertions(+), 25 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index f670e652aac..46e08ce3c26 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -39,11 +39,11 @@ def print_rank_0(message): """If distributed is initialized, print only on rank 0.""" - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - print(message, flush=True) - else: - print(message, flush=True) + #if torch.distributed.is_initialized(): + # if torch.distributed.get_rank() == 0: + # print(message, flush=True) + #else: + print(message, flush=True) def is_last_rank(): return torch.distributed.get_rank() == ( @@ -51,8 +51,8 @@ def is_last_rank(): def print_rank_last(message): """If distributed is initialized, print only on last rank.""" - if torch.distributed.is_initialized(): - if is_last_rank(): - print(message, flush=True) - else: - print(message, flush=True) + #if torch.distributed.is_initialized(): + # if is_last_rank(): + # print(message, flush=True) + #else: + print(message, flush=True) diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 56fc5a75a0d..56623c47b3f 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -74,11 +74,16 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, args = get_args() self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) + self.bias.name_="output_layer.lm_head.logits.bias" mpu.set_tensor_model_parallel_attributes(self.bias, True, 0, 1) self.parallel_output = parallel_output self.dense = get_linear_layer(hidden_size, hidden_size, init_method) + self.dense.weight.name_="output_layer.lm_head.dense.weight" + self.dense.bias.name_="output_layer.lm_head.dense.bias" self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) + self.layernorm.weight.name_="output_layer.lm_head.layernorm.weight" + self.layernorm.bias.name_="output_layer.lm_head.layernorm.bias" self.gelu = torch.nn.functional.gelu if args.openai_gelu: self.gelu = openai_gelu @@ -86,20 +91,26 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, self.gelu = erf_gelu def forward(self, hidden_states, word_embeddings_weight): - metrics={} - metrics["hidden_scale"]=hidden_states.std() + args=get_args() + if args.iteration % args.log_interval == 0: + metrics={} + metrics["hidden_scale"]=hidden_states.std() hidden_states = self.dense(hidden_states) - metrics["dense_scale"]=hidden_states.std() + if args.iteration % 
args.log_interval == 0: + metrics["dense_scale"]=hidden_states.std() hidden_states = self.gelu(hidden_states) - metrics["gelu_scale"]=hidden_states.std() + if args.iteration % args.log_interval == 0: + metrics["gelu_scale"]=hidden_states.std() hidden_states = self.layernorm(hidden_states) - metrics["ln_scale"]=hidden_states.std() + if args.iteration % args.log_interval == 0: + metrics["ln_scale"]=hidden_states.std() output = parallel_lm_logits(hidden_states, word_embeddings_weight, self.parallel_output, bias=self.bias) - metrics["logits_scale"]=output.std() - print({key:value.detach().cpu().item() for key, value in metrics.items()}) + if args.iteration % args.log_interval == 0: + metrics["logits_scale"]=output.std() + print({key:value.detach().cpu().item() for key, value in metrics.items()}) return output @@ -114,7 +125,14 @@ def post_language_model_processing(lm_output, pooled_output, binary_logits = None if binary_head is not None: + args=get_args() + if args.iteration % args.log_interval == 0: + metrics={} + metrics["pooled_output"]=pooled_output.std() binary_logits = binary_head(pooled_output) + if args.iteration % args.log_interval == 0: + metrics["binary_logits"]=binary_logits.std() + print({key:value.detach().cpu().item() for key, value in metrics.items()}) if lm_labels is None: return lm_logits, binary_logits @@ -169,6 +187,8 @@ def __init__(self, if self.add_binary_head: self.binary_head = get_linear_layer(args.hidden_size, 2, init_method) + self.binary_head.weight.name_="output_layer.sop_head.binary_head.weight" + self.binary_head.bias.name_="output_layer.sop_head.binary_head.bias" self._binary_head_key = 'binary_head' def set_input_tensor(self, input_tensor): diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 5264872b7b1..6a533f77c89 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -91,13 +91,26 @@ class Pooler(MegatronModule): def __init__(self, hidden_size, init_method): super(Pooler, self).__init__() self.dense = get_linear_layer(hidden_size, hidden_size, init_method) + self.dense.weight.name_="output_layer.sop_head.dense.weight" + self.dense.bias.name_="output_layer.sop_head.dense.bias" def forward(self, hidden_states, sequence_index=0): # hidden_states: [b, s, h] # sequence_index: index of the token to pool. + args=get_args() + if args.iteration % args.log_interval == 0: + metrics={} + metrics["pooler_hidden"]=hidden_states.std() pooled = hidden_states[:, sequence_index, :] + if args.iteration % args.log_interval == 0: + metrics["pooler_pooled"]=pooled.std() pooled = self.dense(pooled) + if args.iteration % args.log_interval == 0: + metrics["pooler_dense"]=pooled.std() pooled = torch.tanh(pooled) + if args.iteration % args.log_interval == 0: + metrics["pooler_output"]=pooled.std() + print({key:value.detach().cpu().item() for key, value in metrics.items()}) return pooled @@ -135,11 +148,13 @@ def __init__(self, vocab_size, self.hidden_size, init_method=self.init_method) self._word_embeddings_key = 'word_embeddings' + self.word_embeddings.weight.name_="input_layer.word_embeddings.weight" # Position embedding (serial). self.position_embeddings = torch.nn.Embedding( max_sequence_length, self.hidden_size) self._position_embeddings_key = 'position_embeddings' + self.position_embeddings.weight.name_="input_layer.position_embeddings.weight" # Initialize the position embeddings. 
self.init_method(self.position_embeddings.weight) @@ -152,6 +167,7 @@ def __init__(self, self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size) # Initialize the token-type embeddings. + self.tokentype_embeddings.weight.name_="input_layer.tokentype_embeddings.weight" self.init_method(self.tokentype_embeddings.weight) else: self.tokentype_embeddings = None @@ -178,21 +194,27 @@ def add_tokentype_embeddings(self, num_tokentypes): def forward(self, input_ids, position_ids, tokentype_ids=None): # Embeddings. - metrics={} + args=get_args() + if args.iteration % args.log_interval == 0: + metrics={} words_embeddings = self.word_embeddings(input_ids) - metrics["words_embeddings"]=words_embeddings.std() + if args.iteration % args.log_interval == 0: + metrics["words_embeddings"]=words_embeddings.std() position_embeddings = self.position_embeddings(position_ids) - metrics["position_embeddings"]=position_embeddings.std() + if args.iteration % args.log_interval == 0: + metrics["position_embeddings"]=position_embeddings.std() embeddings = words_embeddings + position_embeddings if tokentype_ids is not None: assert self.tokentype_embeddings is not None tokentype_embeddings=self.tokentype_embeddings(tokentype_ids) - metrics["tokentype_embeddings"]=tokentype_embeddings.std() + if args.iteration % args.log_interval == 0: + metrics["tokentype_embeddings"]=tokentype_embeddings.std() embeddings = embeddings + tokentype_embeddings else: assert self.tokentype_embeddings is None - - print({key:value.detach().cpu().item() for key, value in metrics.items()}) + + if args.iteration % args.log_interval == 0: + print({key:value.detach().cpu().item() for key, value in metrics.items()}) # Dropout. embeddings = self.embedding_dropout(embeddings) @@ -352,6 +374,10 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, else: encoder_input = None + args=get_args() + if args.iteration % args.log_interval == 0: + metrics = {} + metrics["encoder_input"] = encoder_input.std() # encoder. if enc_hidden_states is None: encoder_output = self.encoder(encoder_input, @@ -361,6 +387,9 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, else: encoder_output = enc_hidden_states.to(encoder_input.dtype) + if args.iteration % args.log_interval == 0: + metrics["encoder_output"] = encoder_output.std() + print({key:value.detach().cpu().item() for key, value in metrics.items()}) if self.post_process: if self.add_pooler: pooled_output = self.pooler(encoder_output, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ac9d2021892..90438f4b6e7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -57,7 +57,7 @@ class ParallelMLP(MegatronModule): applied. 
""" - def __init__(self, init_method, output_layer_init_method): + def __init__(self, init_method, output_layer_init_method, layer_number): super(ParallelMLP, self).__init__() args = get_args() @@ -68,6 +68,8 @@ def __init__(self, init_method, output_layer_init_method): gather_output=False, init_method=init_method, skip_bias_add=True) + self.dense_h_to_4h.weight.name_=f"layer_{layer_number}.mlp.dense_0.weight" + self.dense_h_to_4h.bias.name_=f"layer_{layer_number}.mlp.dense_0.bias" self.bias_gelu_fusion = args.bias_gelu_fusion self.activation_func = F.gelu @@ -83,6 +85,8 @@ def __init__(self, init_method, output_layer_init_method): input_is_parallel=True, init_method=output_layer_init_method, skip_bias_add=True) + self.dense_4h_to_h.weight.name_=f"layer_{layer_number}.mlp.dense_1.weight" + self.dense_4h_to_h.bias.name_=f"layer_{layer_number}.mlp.dense_1.bias" def forward(self, hidden_states): @@ -144,6 +148,8 @@ def __init__(self, init_method, 3 * projection_size, gather_output=False, init_method=init_method) + self.query_key_value.weight.name_=f"layer_{layer_number}.attention.query_key_value.weight" + self.query_key_value.bias.name_=f"layer_{layer_number}.attention.query_key_value.bias" else: assert attention_type == AttnType.cross_attn self.query = mpu.ColumnParallelLinear( @@ -151,12 +157,16 @@ def __init__(self, init_method, projection_size, gather_output=False, init_method=init_method) + self.query.weight.name_=f"layer_{layer_number}.attention.query.weight" + self.query.bias.name_=f"layer_{layer_number}.attention.query.bias" self.key_value = mpu.ColumnParallelLinear( args.hidden_size, 2 * projection_size, gather_output=False, init_method=init_method) + self.key_value.weight.name_=f"layer_{layer_number}.attention.key_value.weight" + self.key_value.bias.name_=f"layer_{layer_number}.attention.key_value.bias" coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -184,6 +194,8 @@ def __init__(self, init_method, input_is_parallel=True, init_method=output_layer_init_method, skip_bias_add=True) + self.dense.weight.name_=f"layer_{layer_number}.attention.dense.weight" + self.dense.bias.name_=f"layer_{layer_number}.attention.dense.bias" def forward(self, hidden_states, attention_mask, layer_past=None, get_key_value=False, encoder_output=None): @@ -405,6 +417,8 @@ def __init__(self, init_method, output_layer_init_method, self.input_layernorm = LayerNorm( args.hidden_size, eps=args.layernorm_epsilon) + self.input_layernorm.weight.name_=f"layer_{self.layer_number}.input_layernorm.weight" + self.input_layernorm.bias.name_=f"layer_{self.layer_number}.input_layernorm.bias" # Self attention. 
self.self_attention = ParallelAttention( @@ -420,6 +434,8 @@ def __init__(self, init_method, output_layer_init_method, self.post_attention_layernorm = LayerNorm( args.hidden_size, eps=args.layernorm_epsilon) + self.post_attention_layernorm.weight.name_=f"layer_{self.layer_number}.post_attention_layernorm.weight" + self.post_attention_layernorm.bias.name_=f"layer_{self.layer_number}.post_attention_layernorm.bias" if self.layer_type == LayerType.decoder: self.inter_attention = ParallelAttention( @@ -431,10 +447,12 @@ def __init__(self, init_method, output_layer_init_method, self.post_inter_attention_layernorm = LayerNorm( args.hidden_size, eps=args.layernorm_epsilon) + self.post_inter_attention_layernorm.weight.name_=f"layer_{self.layer_number}.post_inter_attention_layernorm.weight" + self.post_inter_attention_layernorm.bias.name_=f"layer_{self.layer_number}.post_inter_attention_layernorm.bias" # MLP self.mlp = ParallelMLP(init_method, - output_layer_init_method) + output_layer_init_method, layer_number=self.layer_number) def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, @@ -590,6 +608,8 @@ def build_layer(layer_number): self.final_layernorm = LayerNorm( args.hidden_size, eps=args.layernorm_epsilon) + self.final_layernorm.weight.name_="output_layer.final_layernorm.weight" + self.final_layernorm.bias.name_="output_layer.final_layernorm.bias" def _get_layer(self, layer_number): return self.layers[layer_number] diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index 036a1d4c4cf..b4d943df2df 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -63,6 +63,14 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Make sure the grads are in fp32 assert param.grad.type() == 'torch.cuda.FloatTensor' grads.append(grad) + from megatron import get_args + args=get_args() + if args.iteration==1: + print(grad.shape, + grad_not_none and is_not_shared and is_not_tp_duplicate, + torch.norm(grad, norm_type).detach().cpu().item(), + grad.std().detach().cpu().item() + ) if grad_not_none and is_not_shared and is_not_tp_duplicate: grads_for_norm.append(grad) diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 77baddd62ad..d7da6825327 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -406,6 +406,17 @@ def step(self): num_zeros_in_grad = self.count_zeros() if \ self.log_num_zeros_in_grad else None + from megatron import get_args + args=get_args() + if args.iteration==1: + for group in self.optimizer.param_groups: + for p in group['params']: + print( + p.detach().float().std().cpu().item(), + p.grad.detach().float().std().cpu().item(), + getattr(p, "name_", "unknown"), + p.shape + ) # Step the optimizer. self.optimizer.step() @@ -504,6 +515,19 @@ def step(self): num_zeros_in_grad = self.count_zeros() if \ self.log_num_zeros_in_grad else None + from megatron import get_args + args=get_args() + if args.iteration % args.log_interval == 0: + for group in self.optimizer.param_groups: + for p in group['params']: + g=p.grad.detach().float() + print( + p.detach().float().std().cpu().item(), + g.std().cpu().item(), + torch.norm(g, 2).cpu().item(), + getattr(p, "name_", "unknown"), + p.shape + ) # Update parameters. 
self.optimizer.step() diff --git a/megatron/training.py b/megatron/training.py index 62ed60c1238..f46ded1a3fb 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -625,6 +625,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, optimizer, lr_scheduler) iteration += 1 + args.iteration=iteration args.consumed_train_samples += mpu.get_data_parallel_world_size() * \ args.micro_batch_size * \ get_num_microbatches() From da3fd2217d8e61bb2fe5c6c7310b6850070b194a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 18 Nov 2021 13:08:26 -0500 Subject: [PATCH 03/15] stuff --- megatron/model/bert_model.py | 14 +++++++------- megatron/model/language_model.py | 16 ++++++++-------- megatron/optimizer/__init__.py | 3 +++ megatron/optimizer/clip_grads.py | 12 ++++++------ megatron/optimizer/optimizer.py | 4 ++-- 5 files changed, 26 insertions(+), 23 deletions(-) diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 56623c47b3f..30f3f29933c 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -94,22 +94,22 @@ def forward(self, hidden_states, word_embeddings_weight): args=get_args() if args.iteration % args.log_interval == 0: metrics={} - metrics["hidden_scale"]=hidden_states.std() + metrics["hidden_scale"]=hidden_states.float().pow(2).mean().pow(0.5) hidden_states = self.dense(hidden_states) if args.iteration % args.log_interval == 0: - metrics["dense_scale"]=hidden_states.std() + metrics["dense_scale"]=hidden_states.float().pow(2).mean().pow(0.5) hidden_states = self.gelu(hidden_states) if args.iteration % args.log_interval == 0: - metrics["gelu_scale"]=hidden_states.std() + metrics["gelu_scale"]=hidden_states.float().pow(2).mean().pow(0.5) hidden_states = self.layernorm(hidden_states) if args.iteration % args.log_interval == 0: - metrics["ln_scale"]=hidden_states.std() + metrics["ln_scale"]=hidden_states.float().pow(2).mean().pow(0.5) output = parallel_lm_logits(hidden_states, word_embeddings_weight, self.parallel_output, bias=self.bias) if args.iteration % args.log_interval == 0: - metrics["logits_scale"]=output.std() + metrics["logits_scale"]=output.float().pow(2).mean().pow(0.5) print({key:value.detach().cpu().item() for key, value in metrics.items()}) return output @@ -128,10 +128,10 @@ def post_language_model_processing(lm_output, pooled_output, args=get_args() if args.iteration % args.log_interval == 0: metrics={} - metrics["pooled_output"]=pooled_output.std() + metrics["pooled_output"]=pooled_output.float().pow(2).mean().pow(0.5) binary_logits = binary_head(pooled_output) if args.iteration % args.log_interval == 0: - metrics["binary_logits"]=binary_logits.std() + metrics["binary_logits"]=binary_logits.float().pow(2).mean().pow(0.5) print({key:value.detach().cpu().item() for key, value in metrics.items()}) if lm_labels is None: diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 6a533f77c89..10644bfd745 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -103,13 +103,13 @@ def forward(self, hidden_states, sequence_index=0): metrics["pooler_hidden"]=hidden_states.std() pooled = hidden_states[:, sequence_index, :] if args.iteration % args.log_interval == 0: - metrics["pooler_pooled"]=pooled.std() + metrics["pooler_pooled"]=pooled.float().pow(2).mean().pow(0.5) pooled = self.dense(pooled) if args.iteration % args.log_interval == 0: - metrics["pooler_dense"]=pooled.std() + metrics["pooler_dense"]=pooled.float().pow(2).mean().pow(0.5) pooled = 
torch.tanh(pooled) if args.iteration % args.log_interval == 0: - metrics["pooler_output"]=pooled.std() + metrics["pooler_output"]=pooled.float().pow(2).mean().pow(0.5) print({key:value.detach().cpu().item() for key, value in metrics.items()}) return pooled @@ -199,16 +199,16 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): metrics={} words_embeddings = self.word_embeddings(input_ids) if args.iteration % args.log_interval == 0: - metrics["words_embeddings"]=words_embeddings.std() + metrics["words_embeddings"]=words_embeddings.float().pow(2).mean().pow(0.5) position_embeddings = self.position_embeddings(position_ids) if args.iteration % args.log_interval == 0: - metrics["position_embeddings"]=position_embeddings.std() + metrics["position_embeddings"]=position_embeddings.float().pow(2).mean().pow(0.5) embeddings = words_embeddings + position_embeddings if tokentype_ids is not None: assert self.tokentype_embeddings is not None tokentype_embeddings=self.tokentype_embeddings(tokentype_ids) if args.iteration % args.log_interval == 0: - metrics["tokentype_embeddings"]=tokentype_embeddings.std() + metrics["tokentype_embeddings"]=tokentype_embeddings.float().pow(2).mean().pow(0.5) embeddings = embeddings + tokentype_embeddings else: assert self.tokentype_embeddings is None @@ -377,7 +377,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, args=get_args() if args.iteration % args.log_interval == 0: metrics = {} - metrics["encoder_input"] = encoder_input.std() + metrics["encoder_input"] = encoder_input.float().pow(2).mean().pow(0.5) # encoder. if enc_hidden_states is None: encoder_output = self.encoder(encoder_input, @@ -388,7 +388,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, encoder_output = enc_hidden_states.to(encoder_input.dtype) if args.iteration % args.log_interval == 0: - metrics["encoder_output"] = encoder_output.std() + metrics["encoder_output"] = encoder_output.float().pow(2).mean().pow(0.5) print({key:value.detach().cpu().item() for key, value in metrics.items()}) if self.post_process: if self.add_pooler: diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 823a51f4492..5cf66baa2e7 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -43,6 +43,8 @@ def _get_params_for_weight_decay_optimization(modules): no_weight_decay_params['params'].extend( [p for n, p in list(module_._parameters.items()) if p is not None and n == 'bias']) + print("weight_decay_params", [getattr(p, "name_", "unknown") for p in weight_decay_params['params']]) + print("no_weight_decay_params", [getattr(p, "name_", "unknown") for p in no_weight_decay_params['params']]) return weight_decay_params, no_weight_decay_params @@ -52,6 +54,7 @@ def get_megatron_optimizer(model): # Base optimizer. 
param_groups = _get_params_for_weight_decay_optimization(model) + print("weight_decay", args.weight_decay) if args.optimizer == 'adam': optimizer = Adam(param_groups, lr=args.lr, diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index b4d943df2df..66c0f3c7955 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -65,12 +65,12 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): grads.append(grad) from megatron import get_args args=get_args() - if args.iteration==1: - print(grad.shape, - grad_not_none and is_not_shared and is_not_tp_duplicate, - torch.norm(grad, norm_type).detach().cpu().item(), - grad.std().detach().cpu().item() - ) + #if args.iteration==1: + # print(grad.shape, + # grad_not_none and is_not_shared and is_not_tp_duplicate, + # torch.norm(grad, norm_type).detach().cpu().item(), + # grad.std().detach().cpu().item() + # ) if grad_not_none and is_not_shared and is_not_tp_duplicate: grads_for_norm.append(grad) diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index d7da6825327..456a1a6c350 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -522,8 +522,8 @@ def step(self): for p in group['params']: g=p.grad.detach().float() print( - p.detach().float().std().cpu().item(), - g.std().cpu().item(), + p.detach().float().pow(2).mean().pow(0.5).cpu().item(), + g.detach().float().pow(2).mean().pow(0.5).cpu().item(), torch.norm(g, 2).cpu().item(), getattr(p, "name_", "unknown"), p.shape From 5c2c6a391e0bfb581ff948dfa0fba75e4b30b986 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 24 Nov 2021 17:21:12 -0500 Subject: [PATCH 04/15] wip --- megatron/__init__.py | 50 +++++++++++++++++++------ megatron/arguments.py | 2 + megatron/model/bert_model.py | 47 +++++++---------------- megatron/model/fused_layer_norm.py | 11 +++++- megatron/model/language_model.py | 60 ++++++++++++------------------ megatron/model/transformer.py | 28 ++++++++------ megatron/model/utils.py | 17 ++++++++- megatron/mpu/layers.py | 16 ++++++-- 8 files changed, 132 insertions(+), 99 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 46e08ce3c26..52bf306a53b 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -12,9 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
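For reference: the statistic these patches settle on is a root-mean-square "scale" (patch 03 replaces the earlier .std() calls with x.float().pow(2).mean().pow(0.5)), and the megatron/__init__.py changes below wrap it in get_scale()/record_scale(), using a backward hook to capture the matching gradient scale. A minimal standalone sketch of that idea, with simplified names and signatures that are illustrative rather than the patch's exact code:

import torch

def rms_scale(x: torch.Tensor) -> torch.Tensor:
    # Root-mean-square of the tensor entries, computed in fp32 (the "scale" being logged).
    return x.detach().float().pow(2).mean().pow(0.5)

metrics = {}

def record_scale(name: str, x: torch.Tensor) -> None:
    # Record the forward activation scale, and hook the backward pass so the
    # scale of the gradient flowing into this tensor is recorded as well.
    metrics[name] = rms_scale(x)
    if x.requires_grad:
        x.register_hook(lambda g: metrics.update({f"{name}_grad": rms_scale(g)}))

x = torch.randn(4, 8, requires_grad=True)
y = torch.tanh(x)
record_scale("tanh", y)
y.sum().backward()
print({k: v.item() for k, v in metrics.items()})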
+import logging +import typing + import torch import os +logger = logging.getLogger(__name__) + from .package_info import ( __description__, __contact_names__, @@ -38,21 +43,42 @@ from .initialize import initialize_megatron def print_rank_0(message): - """If distributed is initialized, print only on rank 0.""" - #if torch.distributed.is_initialized(): - # if torch.distributed.get_rank() == 0: - # print(message, flush=True) - #else: - print(message, flush=True) + logger.info(str(message)) def is_last_rank(): return torch.distributed.get_rank() == ( torch.distributed.get_world_size() - 1) def print_rank_last(message): - """If distributed is initialized, print only on last rank.""" - #if torch.distributed.is_initialized(): - # if is_last_rank(): - # print(message, flush=True) - #else: - print(message, flush=True) + logger.info(str(message)) + +_iteration=0 +_metrics={} + +def next_iteration(iteration:int): + global _iteration, _metrics + _metrics={} + _iteration=iteration + +def record_metrics(metrics:typing.Dict[str, float]): + global _metrics + _metrics.update(metrics) + + +def record_scale(name:str,x:torch.Tensor,grad=True): + global _metrics + if get_log_scales(): + _metrics[name]=get_scale(x) + if grad and x.requires_grad: + x.register_hook(lambda g: record_scale(f"{name}_grad",g,False)) + + +def get_scale(x): + return x.float().pow(2).mean().pow(0.5) + +def get_log_scales(): + args=get_args() + return args.log_scales and args.iteration % args.log_interval == 0 + +def log_metrics(metrics): + logger.info(str({key:value.detach().cpu().item() for key, value in metrics.items()})) diff --git a/megatron/arguments.py b/megatron/arguments.py index b8c230f5793..81bab5f9cf1 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -304,6 +304,8 @@ def _add_logging_args(parser): group.add_argument('--log-params-norm', action='store_true', help='If set, calculate and log parameters norm.') + group.add_argument('--log-scales', action='store_true', + help='Log the scales of parameters, gradients and activations.') group.add_argument('--log-num-zeros-in-grad', action='store_true', help='If set, calculate and log the number of zeros in gradient.') group.add_argument('--tensorboard-log-interval', type=int, default=1, diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 30f3f29933c..7838e05c852 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -15,9 +15,10 @@ """BERT model.""" +import logging import torch -from megatron import get_args +from megatron import get_args, record_scale from megatron import mpu from megatron.model.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits @@ -67,23 +68,20 @@ class BertLMHead(MegatronModule): """ def __init__(self, mpu_vocab_size, hidden_size, init_method, - layernorm_epsilon, parallel_output): + layernorm_epsilon, parallel_output, name_=""): super(BertLMHead, self).__init__() + self.name_="output_layer.lm_head" args = get_args() self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) - self.bias.name_="output_layer.lm_head.logits.bias" + self.bias.name_=f"{self.name_}.logits.bias" mpu.set_tensor_model_parallel_attributes(self.bias, True, 0, 1) self.parallel_output = parallel_output - self.dense = get_linear_layer(hidden_size, hidden_size, init_method) - self.dense.weight.name_="output_layer.lm_head.dense.weight" - self.dense.bias.name_="output_layer.lm_head.dense.bias" - self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) - 
self.layernorm.weight.name_="output_layer.lm_head.layernorm.weight" - self.layernorm.bias.name_="output_layer.lm_head.layernorm.bias" + self.dense = get_linear_layer(hidden_size, hidden_size, init_method, name_=f"{self.name_}.dense") + self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon, name_=f"{self.name_}.layernorm") self.gelu = torch.nn.functional.gelu if args.openai_gelu: self.gelu = openai_gelu @@ -91,26 +89,16 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, self.gelu = erf_gelu def forward(self, hidden_states, word_embeddings_weight): - args=get_args() - if args.iteration % args.log_interval == 0: - metrics={} - metrics["hidden_scale"]=hidden_states.float().pow(2).mean().pow(0.5) + record_scale(f"{self.name_}.hidden",hidden_states) hidden_states = self.dense(hidden_states) - if args.iteration % args.log_interval == 0: - metrics["dense_scale"]=hidden_states.float().pow(2).mean().pow(0.5) hidden_states = self.gelu(hidden_states) - if args.iteration % args.log_interval == 0: - metrics["gelu_scale"]=hidden_states.float().pow(2).mean().pow(0.5) + record_scale(f"{self.name_}.gelu",hidden_states) hidden_states = self.layernorm(hidden_states) - if args.iteration % args.log_interval == 0: - metrics["ln_scale"]=hidden_states.float().pow(2).mean().pow(0.5) output = parallel_lm_logits(hidden_states, word_embeddings_weight, self.parallel_output, bias=self.bias) - if args.iteration % args.log_interval == 0: - metrics["logits_scale"]=output.float().pow(2).mean().pow(0.5) - print({key:value.detach().cpu().item() for key, value in metrics.items()}) + record_scale(f"{self.name_}.logits",output) return output @@ -125,14 +113,7 @@ def post_language_model_processing(lm_output, pooled_output, binary_logits = None if binary_head is not None: - args=get_args() - if args.iteration % args.log_interval == 0: - metrics={} - metrics["pooled_output"]=pooled_output.float().pow(2).mean().pow(0.5) binary_logits = binary_head(pooled_output) - if args.iteration % args.log_interval == 0: - metrics["binary_logits"]=binary_logits.float().pow(2).mean().pow(0.5) - print({key:value.detach().cpu().item() for key, value in metrics.items()}) if lm_labels is None: return lm_logits, binary_logits @@ -154,9 +135,11 @@ def __init__(self, add_binary_head=True, parallel_output=True, pre_process=True, - post_process=True): + post_process=True, + name_="bert"): super(BertModel, self).__init__() args = get_args() + self.name_=name_ self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy self.add_binary_head = add_binary_head @@ -186,9 +169,7 @@ def __init__(self, self.binary_head = None if self.add_binary_head: self.binary_head = get_linear_layer(args.hidden_size, 2, - init_method) - self.binary_head.weight.name_="output_layer.sop_head.binary_head.weight" - self.binary_head.bias.name_="output_layer.sop_head.binary_head.bias" + init_method, name_=f"{self.name_}.output_layer.sop_head.binary_head") self._binary_head_key = 'binary_head' def set_input_tensor(self, input_tensor): diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 78645c23613..6fbcb11f18d 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -23,6 +23,8 @@ from torch.nn import init import importlib +from megatron import record_scale + global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -61,8 +63,9 @@ def backward(ctx, grad_output): class MixedFusedLayerNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5): + def __init__(self, 
normalized_shape, eps=1e-5, name_=""): super(MixedFusedLayerNorm, self).__init__() + self.name_=name_ global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = importlib.import_module( @@ -73,7 +76,9 @@ def __init__(self, normalized_shape, eps=1e-5): self.normalized_shape = torch.Size(normalized_shape) self.eps = eps self.weight = Parameter(torch.Tensor(*normalized_shape)) + self.weight.name_=f"{self.name_}.weight" self.bias = Parameter(torch.Tensor(*normalized_shape)) + self.bias.name_=f"{self.name_}.bias" self.reset_parameters() @@ -85,6 +90,8 @@ def reset_parameters(self): def forward(self, input): - return FusedLayerNormAffineFunction.apply( + output = FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) + record_scale(self.name_, output) + return output diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 10644bfd745..1b8443a8510 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -18,7 +18,7 @@ import torch import torch.nn.functional as F -from megatron import get_args +from megatron import get_args,record_scale from megatron import mpu from .module import MegatronModule from megatron.model.enums import LayerType, AttnMaskType @@ -88,29 +88,22 @@ class Pooler(MegatronModule): bias is set to zero. """ - def __init__(self, hidden_size, init_method): + def __init__(self, hidden_size, init_method, name_=""): super(Pooler, self).__init__() - self.dense = get_linear_layer(hidden_size, hidden_size, init_method) - self.dense.weight.name_="output_layer.sop_head.dense.weight" - self.dense.bias.name_="output_layer.sop_head.dense.bias" + self.name_="output_layer.sop_head" + self.dense = get_linear_layer(hidden_size, hidden_size, init_method, name_=f"{self.name_}.dense") def forward(self, hidden_states, sequence_index=0): # hidden_states: [b, s, h] # sequence_index: index of the token to pool. args=get_args() - if args.iteration % args.log_interval == 0: - metrics={} - metrics["pooler_hidden"]=hidden_states.std() + record_scale(f"{self.name_}.input",hidden_states) pooled = hidden_states[:, sequence_index, :] - if args.iteration % args.log_interval == 0: - metrics["pooler_pooled"]=pooled.float().pow(2).mean().pow(0.5) + record_scale(f"{self.name_}.pooled",pooled) pooled = self.dense(pooled) - if args.iteration % args.log_interval == 0: - metrics["pooler_dense"]=pooled.float().pow(2).mean().pow(0.5) + record_scale(f"{self.dense.name_}.pooled",pooled) pooled = torch.tanh(pooled) - if args.iteration % args.log_interval == 0: - metrics["pooler_output"]=pooled.float().pow(2).mean().pow(0.5) - print({key:value.detach().cpu().item() for key, value in metrics.items()}) + record_scale(f"{self.name_}.tanh",pooled) return pooled @@ -142,19 +135,22 @@ def __init__(self, self.num_tokentypes = num_tokentypes args = get_args() + self.name_="input_layer" # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( vocab_size, self.hidden_size, init_method=self.init_method) self._word_embeddings_key = 'word_embeddings' - self.word_embeddings.weight.name_="input_layer.word_embeddings.weight" + self.word_embeddings.name_=f"{self.name_}.word_embeddings" + self.word_embeddings.weight.name_=f"{self.word_embeddings.name_}.weight" # Position embedding (serial). 
self.position_embeddings = torch.nn.Embedding( max_sequence_length, self.hidden_size) self._position_embeddings_key = 'position_embeddings' - self.position_embeddings.weight.name_="input_layer.position_embeddings.weight" + self.position_embeddings.name_=f"{self.name_}.position_embeddings" + self.position_embeddings.weight.name_=f"{self.position_embeddings.name_}.weight" # Initialize the position embeddings. self.init_method(self.position_embeddings.weight) @@ -166,8 +162,9 @@ def __init__(self, if self.num_tokentypes > 0: self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size) + self.tokentype_embeddings.name_=f"{self.name_}.tokentype_embeddings" + self.tokentype_embeddings.weight.name_=f"{self.tokentype_embeddings.name_}.weight" # Initialize the token-type embeddings. - self.tokentype_embeddings.weight.name_="input_layer.tokentype_embeddings.weight" self.init_method(self.tokentype_embeddings.weight) else: self.tokentype_embeddings = None @@ -195,28 +192,23 @@ def add_tokentype_embeddings(self, num_tokentypes): def forward(self, input_ids, position_ids, tokentype_ids=None): # Embeddings. args=get_args() - if args.iteration % args.log_interval == 0: - metrics={} words_embeddings = self.word_embeddings(input_ids) - if args.iteration % args.log_interval == 0: - metrics["words_embeddings"]=words_embeddings.float().pow(2).mean().pow(0.5) + record_scale(self.word_embeddings.name_,words_embeddings) position_embeddings = self.position_embeddings(position_ids) - if args.iteration % args.log_interval == 0: - metrics["position_embeddings"]=position_embeddings.float().pow(2).mean().pow(0.5) + record_scale(self.position_embeddings.name_,position_embeddings) embeddings = words_embeddings + position_embeddings if tokentype_ids is not None: assert self.tokentype_embeddings is not None tokentype_embeddings=self.tokentype_embeddings(tokentype_ids) - if args.iteration % args.log_interval == 0: - metrics["tokentype_embeddings"]=tokentype_embeddings.float().pow(2).mean().pow(0.5) + record_scale(self.tokentype_embeddings.name_,tokentype_embeddings) embeddings = embeddings + tokentype_embeddings else: assert self.tokentype_embeddings is None - if args.iteration % args.log_interval == 0: - print({key:value.detach().cpu().item() for key, value in metrics.items()}) + record_scale(f"{self.name_}.embeddings",embeddings) # Dropout. embeddings = self.embedding_dropout(embeddings) + record_scale(f"{self.name_}.dropout",embeddings) return embeddings @@ -305,9 +297,11 @@ def __init__(self, decoder_attn_mask_type=AttnMaskType.causal, add_pooler=False, pre_process=True, - post_process=True): + post_process=True, + name_=""): super(TransformerLanguageModel, self).__init__() args = get_args() + self.name_ = name_ self.pre_process = pre_process self.post_process = post_process @@ -353,7 +347,7 @@ def __init__(self, if self.post_process: # Pooler. if self.add_pooler: - self.pooler = Pooler(self.hidden_size, self.init_method) + self.pooler = Pooler(self.hidden_size, self.init_method, name_=f"{self.name_}.output_layer.sop_head") self._pooler_key = 'pooler' def set_input_tensor(self, input_tensor): @@ -375,9 +369,6 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, encoder_input = None args=get_args() - if args.iteration % args.log_interval == 0: - metrics = {} - metrics["encoder_input"] = encoder_input.float().pow(2).mean().pow(0.5) # encoder. 
if enc_hidden_states is None: encoder_output = self.encoder(encoder_input, @@ -387,9 +378,6 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, else: encoder_output = enc_hidden_states.to(encoder_input.dtype) - if args.iteration % args.log_interval == 0: - metrics["encoder_output"] = encoder_output.float().pow(2).mean().pow(0.5) - print({key:value.detach().cpu().item() for key, value in metrics.items()}) if self.post_process: if self.add_pooler: pooled_output = self.pooler(encoder_output, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 90438f4b6e7..a57d152d421 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -57,9 +57,10 @@ class ParallelMLP(MegatronModule): applied. """ - def __init__(self, init_method, output_layer_init_method, layer_number): + def __init__(self, init_method, output_layer_init_method, name_=""): super(ParallelMLP, self).__init__() args = get_args() + self.name_=name_ # Project to 4h. self.dense_h_to_4h = mpu.ColumnParallelLinear( @@ -67,9 +68,8 @@ def __init__(self, init_method, output_layer_init_method, layer_number): args.ffn_hidden_size, gather_output=False, init_method=init_method, - skip_bias_add=True) - self.dense_h_to_4h.weight.name_=f"layer_{layer_number}.mlp.dense_0.weight" - self.dense_h_to_4h.bias.name_=f"layer_{layer_number}.mlp.dense_0.bias" + skip_bias_add=True, + name_=f"{name_}.dense_0") self.bias_gelu_fusion = args.bias_gelu_fusion self.activation_func = F.gelu @@ -84,9 +84,8 @@ def __init__(self, init_method, output_layer_init_method, layer_number): args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, - skip_bias_add=True) - self.dense_4h_to_h.weight.name_=f"layer_{layer_number}.mlp.dense_1.weight" - self.dense_4h_to_h.bias.name_=f"layer_{layer_number}.mlp.dense_1.bias" + skip_bias_add=True, + name_=f"{name_}.dense_1") def forward(self, hidden_states): @@ -404,6 +403,7 @@ def __init__(self, init_method, output_layer_init_method, args = get_args() super(ParallelTransformerLayer, self).__init__() + self.name_=f"layer_{layer_number}" self.layer_number = layer_number self.layer_type = layer_type @@ -416,9 +416,9 @@ def __init__(self, init_method, output_layer_init_method, # Layernorm on the input data. self.input_layernorm = LayerNorm( args.hidden_size, - eps=args.layernorm_epsilon) - self.input_layernorm.weight.name_=f"layer_{self.layer_number}.input_layernorm.weight" - self.input_layernorm.bias.name_=f"layer_{self.layer_number}.input_layernorm.bias" + eps=args.layernorm_epsilon, + name_=f"{self.name_}.input_layernorm", + ) # Self attention. self.self_attention = ParallelAttention( @@ -433,7 +433,9 @@ def __init__(self, init_method, output_layer_init_method, # Layernorm on the attention output self.post_attention_layernorm = LayerNorm( args.hidden_size, - eps=args.layernorm_epsilon) + eps=args.layernorm_epsilon, + name_=f"{self.name_}.post_attention_layernorm", + ) self.post_attention_layernorm.weight.name_=f"layer_{self.layer_number}.post_attention_layernorm.weight" self.post_attention_layernorm.bias.name_=f"layer_{self.layer_number}.post_attention_layernorm.bias" @@ -446,7 +448,9 @@ def __init__(self, init_method, output_layer_init_method, # Layernorm on the attention output. 
self.post_inter_attention_layernorm = LayerNorm( args.hidden_size, - eps=args.layernorm_epsilon) + eps=args.layernorm_epsilon, + name_=f"{self.name_}.post_inter_attention_layernorm", + ) self.post_inter_attention_layernorm.weight.name_=f"layer_{self.layer_number}.post_inter_attention_layernorm.weight" self.post_inter_attention_layernorm.bias.name_=f"layer_{self.layer_number}.post_inter_attention_layernorm.bias" diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 924f9393acd..e08b52642fc 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,6 +18,7 @@ import math import torch +from megatron import record_scale from megatron import get_args @@ -44,12 +45,26 @@ def attention_mask_func(attention_scores, attention_mask): return attention_scores -def get_linear_layer(rows, columns, init_method): +def get_linear_layer(rows, columns, init_method, name_=""): """Simple linear layer with weight initialization.""" layer = torch.nn.Linear(rows, columns) init_method(layer.weight) with torch.no_grad(): layer.bias.zero_() + layer.name_=name_ + layer.weight.name_=f"{name_}.weight" + layer.bias.name_=f"{name_}.bias" + + + old_forward=layer.forward + + def forward(self,input): + output=old_forward(input) + record_scale(self.name_,output) + return output + + layer.forward=forward + return layer @torch.jit.script diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 8dd69f72cb8..8fe6f1646f7 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -35,7 +35,7 @@ from .utils import divide from .utils import split_tensor_along_last_dim from .utils import VocabUtility -from megatron import get_args +from megatron import get_args,record_scale,get_log_scales _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False, @@ -225,8 +225,9 @@ class ColumnParallelLinear(torch.nn.Module): def __init__(self, input_size, output_size, bias=True, gather_output=True, init_method=init.xavier_normal_, stride=1, keep_master_weight_for_test=False, - skip_bias_add=False): + skip_bias_add=False,name_=""): super(ColumnParallelLinear, self).__init__() + self.name_=name_ # Keep input parameters self.input_size = input_size @@ -256,6 +257,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, device=torch.cuda.current_device(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=stride) + self.weight.name_=f"{self.name_}.weight" if bias: if args.use_cpu_initialization: @@ -270,6 +272,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, # Always initialize bias to zero. 
with torch.no_grad(): self.bias.zero_() + self.bias.name_ = f"{self.name_}.bias" else: self.register_parameter('bias', None) @@ -288,6 +291,8 @@ def forward(self, input_): else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None + if get_log_scales(): + record_scale(self.name_, output if output_bias is None else output + output_bias) return output, output_bias @@ -325,8 +330,9 @@ def __init__(self, input_size, output_size, bias=True, input_is_parallel=False, init_method=init.xavier_normal_, stride=1, keep_master_weight_for_test=False, - skip_bias_add=False): + skip_bias_add=False,name_=""): super(RowParallelLinear, self).__init__() + self.name_=name_ # Keep input parameters self.input_size = input_size @@ -356,6 +362,7 @@ def __init__(self, input_size, output_size, bias=True, device=torch.cuda.current_device(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=1, stride=stride) + self.weight.name_ = f"{self.name_}.weight" if bias: if args.use_cpu_initialization: self.bias = Parameter(torch.empty(self.output_size, @@ -367,6 +374,7 @@ def __init__(self, input_size, output_size, bias=True, # Always initialize bias to zero. with torch.no_grad(): self.bias.zero_() + self.bias.name_ = f"{self.name_}.bias" else: self.register_parameter('bias', None) @@ -388,5 +396,7 @@ def forward(self, input_): else: output = output_ output_bias = self.bias + if get_log_scales(): + record_scale(self.name_, output if output_bias is None else output + output_bias) return output, output_bias From 3598b23ceea2d458d51d6697ddfb92c8b1c48a1e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 12:08:37 -0500 Subject: [PATCH 05/15] stuff --- megatron/__init__.py | 10 +++--- megatron/model/language_model.py | 16 +++++----- megatron/model/transformer.py | 53 ++++++++++++++++---------------- megatron/optimizer/__init__.py | 2 -- megatron/optimizer/clip_grads.py | 8 ----- megatron/optimizer/optimizer.py | 35 ++++++--------------- 6 files changed, 48 insertions(+), 76 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 52bf306a53b..06148c146e0 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -60,10 +60,6 @@ def next_iteration(iteration:int): _metrics={} _iteration=iteration -def record_metrics(metrics:typing.Dict[str, float]): - global _metrics - _metrics.update(metrics) - def record_scale(name:str,x:torch.Tensor,grad=True): global _metrics @@ -74,11 +70,13 @@ def record_scale(name:str,x:torch.Tensor,grad=True): def get_scale(x): - return x.float().pow(2).mean().pow(0.5) + return x.detach().float().pow(2).mean().pow(0.5) + def get_log_scales(): args=get_args() return args.log_scales and args.iteration % args.log_interval == 0 + def log_metrics(metrics): - logger.info(str({key:value.detach().cpu().item() for key, value in metrics.items()})) + logger.info(str({key:value.cpu().item() for key, value in metrics.items()})) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 1b8443a8510..e110f3977e4 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -90,18 +90,16 @@ class Pooler(MegatronModule): def __init__(self, hidden_size, init_method, name_=""): super(Pooler, self).__init__() - self.name_="output_layer.sop_head" + self.name_=name_ self.dense = get_linear_layer(hidden_size, hidden_size, init_method, name_=f"{self.name_}.dense") def forward(self, hidden_states, sequence_index=0): # hidden_states: [b, s, h] # sequence_index: 
index of the token to pool. - args=get_args() record_scale(f"{self.name_}.input",hidden_states) pooled = hidden_states[:, sequence_index, :] record_scale(f"{self.name_}.pooled",pooled) pooled = self.dense(pooled) - record_scale(f"{self.dense.name_}.pooled",pooled) pooled = torch.tanh(pooled) record_scale(f"{self.name_}.tanh",pooled) return pooled @@ -127,7 +125,8 @@ def __init__(self, max_sequence_length, embedding_dropout_prob, init_method, - num_tokentypes=0): + num_tokentypes=0, + name_=""): super(Embedding, self).__init__() self.hidden_size = hidden_size @@ -135,7 +134,7 @@ def __init__(self, self.num_tokentypes = num_tokentypes args = get_args() - self.name_="input_layer" + self.name_=name_ # Word embeddings (parallel). self.word_embeddings = mpu.VocabParallelEmbedding( @@ -329,7 +328,8 @@ def __init__(self, output_layer_init_method, self_attn_mask_type=self.encoder_attn_mask_type, pre_process=self.pre_process, - post_process=self.post_process + post_process=self.post_process, + name_=self.name_, ) self._encoder_key = 'encoder' @@ -341,7 +341,8 @@ def __init__(self, self.init_method, output_layer_init_method, layer_type=LayerType.decoder, - self_attn_mask_type=self.decoder_attn_mask_type) + self_attn_mask_type=self.decoder_attn_mask_type, + name_=f"{self.name_}.decoder") self._decoder_key = 'decoder' if self.post_process: @@ -368,7 +369,6 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, else: encoder_input = None - args=get_args() # encoder. if enc_hidden_states is None: encoder_output = self.encoder(encoder_input, diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a57d152d421..7d778606dc7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -115,9 +115,11 @@ class ParallelAttention(MegatronModule): def __init__(self, init_method, output_layer_init_method, layer_number, attention_type=AttnType.self_attn, - attn_mask_type=AttnMaskType.padding): + attn_mask_type=AttnMaskType.padding, + name_=""): super(ParallelAttention, self).__init__() args = get_args() + self.name_=name_ self.fp16 = args.fp16 self.bf16 = args.bf16 @@ -146,26 +148,23 @@ def __init__(self, init_method, args.hidden_size, 3 * projection_size, gather_output=False, - init_method=init_method) - self.query_key_value.weight.name_=f"layer_{layer_number}.attention.query_key_value.weight" - self.query_key_value.bias.name_=f"layer_{layer_number}.attention.query_key_value.bias" + init_method=init_method, + name_=f"layer_{self.name_}.query_key_value") else: assert attention_type == AttnType.cross_attn self.query = mpu.ColumnParallelLinear( args.hidden_size, projection_size, gather_output=False, - init_method=init_method) - self.query.weight.name_=f"layer_{layer_number}.attention.query.weight" - self.query.bias.name_=f"layer_{layer_number}.attention.query.bias" + init_method=init_method, + name_=f"layer_{self.name_}.query") self.key_value = mpu.ColumnParallelLinear( args.hidden_size, 2 * projection_size, gather_output=False, - init_method=init_method) - self.key_value.weight.name_=f"layer_{layer_number}.attention.key_value.weight" - self.key_value.bias.name_=f"layer_{layer_number}.attention.key_value.bias" + init_method=init_method, + name_=f"layer_{self.name_}.key_value") coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -192,9 +191,8 @@ def __init__(self, init_method, args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, - skip_bias_add=True) - 
self.dense.weight.name_=f"layer_{layer_number}.attention.dense.weight" - self.dense.bias.name_=f"layer_{layer_number}.attention.dense.bias" + skip_bias_add=True, + name_=f"layer_{self.name_}.dense") def forward(self, hidden_states, attention_mask, layer_past=None, get_key_value=False, encoder_output=None): @@ -399,11 +397,12 @@ class ParallelTransformerLayer(MegatronModule): def __init__(self, init_method, output_layer_init_method, layer_number, layer_type=LayerType.encoder, - self_attn_mask_type=AttnMaskType.padding): + self_attn_mask_type=AttnMaskType.padding, + name_=""): args = get_args() super(ParallelTransformerLayer, self).__init__() - self.name_=f"layer_{layer_number}" + self.name_=name_ self.layer_number = layer_number self.layer_type = layer_type @@ -426,7 +425,8 @@ def __init__(self, init_method, output_layer_init_method, output_layer_init_method, layer_number, attention_type=AttnType.self_attn, - attn_mask_type=self_attn_mask_type) + attn_mask_type=self_attn_mask_type, + name_=f"{self.name_}.attention") self.hidden_dropout = args.hidden_dropout self.bias_dropout_fusion = args.bias_dropout_fusion @@ -436,27 +436,24 @@ def __init__(self, init_method, output_layer_init_method, eps=args.layernorm_epsilon, name_=f"{self.name_}.post_attention_layernorm", ) - self.post_attention_layernorm.weight.name_=f"layer_{self.layer_number}.post_attention_layernorm.weight" - self.post_attention_layernorm.bias.name_=f"layer_{self.layer_number}.post_attention_layernorm.bias" if self.layer_type == LayerType.decoder: self.inter_attention = ParallelAttention( init_method, output_layer_init_method, layer_number, - attention_type=AttnType.cross_attn) + attention_type=AttnType.cross_attn, + name_=f"{self.name_}.inter_attention") # Layernorm on the attention output. self.post_inter_attention_layernorm = LayerNorm( args.hidden_size, eps=args.layernorm_epsilon, name_=f"{self.name_}.post_inter_attention_layernorm", ) - self.post_inter_attention_layernorm.weight.name_=f"layer_{self.layer_number}.post_inter_attention_layernorm.weight" - self.post_inter_attention_layernorm.bias.name_=f"layer_{self.layer_number}.post_inter_attention_layernorm.bias" # MLP self.mlp = ParallelMLP(init_method, - output_layer_init_method, layer_number=self.layer_number) + output_layer_init_method, name_=f"{self.name_}.mlp") def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, @@ -555,9 +552,11 @@ class ParallelTransformer(MegatronModule): def __init__(self, init_method, output_layer_init_method, layer_type=LayerType.encoder, self_attn_mask_type=AttnMaskType.padding, - pre_process=True, post_process=True): + pre_process=True, post_process=True, + name_=""): super(ParallelTransformer, self).__init__() args = get_args() + self.name_=name_ self.bf16 = args.bf16 self.fp32_residual_connection = args.fp32_residual_connection @@ -581,7 +580,8 @@ def build_layer(layer_number): output_layer_init_method, layer_number, layer_type=layer_type, - self_attn_mask_type=self_attn_mask_type) + self_attn_mask_type=self_attn_mask_type, + name_=f"{self.name_}.layer_{layer_number}") if args.virtual_pipeline_model_parallel_size is not None: assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \ 'num_layers_per_stage must be divisible by ' \ @@ -611,9 +611,8 @@ def build_layer(layer_number): # Final layer norm before output. 
self.final_layernorm = LayerNorm( args.hidden_size, - eps=args.layernorm_epsilon) - self.final_layernorm.weight.name_="output_layer.final_layernorm.weight" - self.final_layernorm.bias.name_="output_layer.final_layernorm.bias" + eps=args.layernorm_epsilon, + name_=f"{self.name_}.output_layer.final_layernorm") def _get_layer(self, layer_number): return self.layers[layer_number] diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 5cf66baa2e7..0e535a78a8a 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -43,8 +43,6 @@ def _get_params_for_weight_decay_optimization(modules): no_weight_decay_params['params'].extend( [p for n, p in list(module_._parameters.items()) if p is not None and n == 'bias']) - print("weight_decay_params", [getattr(p, "name_", "unknown") for p in weight_decay_params['params']]) - print("no_weight_decay_params", [getattr(p, "name_", "unknown") for p in no_weight_decay_params['params']]) return weight_decay_params, no_weight_decay_params diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index 66c0f3c7955..036a1d4c4cf 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -63,14 +63,6 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Make sure the grads are in fp32 assert param.grad.type() == 'torch.cuda.FloatTensor' grads.append(grad) - from megatron import get_args - args=get_args() - #if args.iteration==1: - # print(grad.shape, - # grad_not_none and is_not_shared and is_not_tp_duplicate, - # torch.norm(grad, norm_type).detach().cpu().item(), - # grad.std().detach().cpu().item() - # ) if grad_not_none and is_not_shared and is_not_tp_duplicate: grads_for_norm.append(grad) diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 456a1a6c350..da2d32880db 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -25,7 +25,7 @@ from megatron import get_timers from megatron import mpu -from megatron import print_rank_0 +from megatron import print_rank_0,record_scale,get_log_scales from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32 @@ -136,6 +136,13 @@ def state_dict(self): def load_state_dict(self, state_dict): pass + def _record_scales(self): + if get_log_scales(): + for group in self.optimizer.param_groups: + for p in group['params']: + name_=getattr(p, "name_", "unknown") + record_scale(name_, p, False) + record_scale(f"{name_}_grad", p.grad, False) # Promote state so it can be retrieved or set via # "optimizer_instance.state" @@ -406,17 +413,7 @@ def step(self): num_zeros_in_grad = self.count_zeros() if \ self.log_num_zeros_in_grad else None - from megatron import get_args - args=get_args() - if args.iteration==1: - for group in self.optimizer.param_groups: - for p in group['params']: - print( - p.detach().float().std().cpu().item(), - p.grad.detach().float().std().cpu().item(), - getattr(p, "name_", "unknown"), - p.shape - ) + self._record_scales() # Step the optimizer. 
self.optimizer.step() @@ -515,19 +512,7 @@ def step(self): num_zeros_in_grad = self.count_zeros() if \ self.log_num_zeros_in_grad else None - from megatron import get_args - args=get_args() - if args.iteration % args.log_interval == 0: - for group in self.optimizer.param_groups: - for p in group['params']: - g=p.grad.detach().float() - print( - p.detach().float().pow(2).mean().pow(0.5).cpu().item(), - g.detach().float().pow(2).mean().pow(0.5).cpu().item(), - torch.norm(g, 2).cpu().item(), - getattr(p, "name_", "unknown"), - p.shape - ) + self._record_scales() # Update parameters. self.optimizer.step() From 5171dfa0adb62f12fe45d10ab63be3c4ec27b103 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 12:18:34 -0500 Subject: [PATCH 06/15] logging --- megatron/__init__.py | 28 ++++++++++++++++++++++++++-- megatron/training.py | 7 +++++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index 06148c146e0..3fef4b4fc91 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -78,5 +78,29 @@ def get_log_scales(): return args.log_scales and args.iteration % args.log_interval == 0 -def log_metrics(metrics): - logger.info(str({key:value.cpu().item() for key, value in metrics.items()})) +def log_metrics(): + metrics = {} + for key, value in _metrics.items(): + metrics_ = metrics + keys = key.split(".") + for prefix in keys[:-1]: + if prefix not in metrics_: + metrics_[prefix] = {} + metrics_ = metrics_[prefix] + metrics_[keys[-1]] = value + return metrics + +def _log_dicts(self, metrics, indent=0): + for key, value in metrics.items(): + key_ = key.rjust(len(key) + indent) + # Merge keys when there is only one entry. + while isinstance(value, dict) and len(value) == 1: + for value_key, value_ in value.items(): + key_ = ".".join([key_, value_key]) + value = value_ + if isinstance(value, dict): + logger.info(key_ + ":") + self._log_dicts(value, indent + 2) + else: + sep = self._config.logging_width - len(value) - len(key_) - 2 + logger.info(f"{key_.ljust(len(key_)+sep,'.')} {value}") diff --git a/megatron/training.py b/megatron/training.py index f46ded1a3fb..228f8df3d0a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -25,7 +25,7 @@ import torch from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from megatron import get_args +from megatron import get_args, get_log_scales, next_iteration, log_metrics from megatron import get_timers from megatron import get_tensorboard_writer from megatron import get_current_global_batch_size @@ -535,6 +535,9 @@ def add_to_logging(name): timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) + if get_log_scales(): + log_metrics() + if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() elapsed_time_per_iteration = elapsed_time / total_iterations @@ -625,7 +628,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, optimizer, lr_scheduler) iteration += 1 - args.iteration=iteration + next_iteration(iteration) args.consumed_train_samples += mpu.get_data_parallel_world_size() * \ args.micro_batch_size * \ get_num_microbatches() From 839c7937b26b75051f057b4d11c60cada1c2929c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 12:39:27 -0500 Subject: [PATCH 07/15] fixes --- megatron/__init__.py | 54 --------------------------- megatron/metrics.py | 59 ++++++++++++++++++++++++++++++ megatron/model/bert_model.py | 3 +- megatron/model/fused_layer_norm.py | 2 +- 
megatron/model/language_model.py | 3 +- megatron/model/utils.py | 4 +- megatron/mpu/layers.py | 2 +- 7 files changed, 66 insertions(+), 61 deletions(-) create mode 100644 megatron/metrics.py diff --git a/megatron/__init__.py b/megatron/__init__.py index 3fef4b4fc91..b3a03290088 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import typing import torch import os @@ -51,56 +50,3 @@ def is_last_rank(): def print_rank_last(message): logger.info(str(message)) - -_iteration=0 -_metrics={} - -def next_iteration(iteration:int): - global _iteration, _metrics - _metrics={} - _iteration=iteration - - -def record_scale(name:str,x:torch.Tensor,grad=True): - global _metrics - if get_log_scales(): - _metrics[name]=get_scale(x) - if grad and x.requires_grad: - x.register_hook(lambda g: record_scale(f"{name}_grad",g,False)) - - -def get_scale(x): - return x.detach().float().pow(2).mean().pow(0.5) - - -def get_log_scales(): - args=get_args() - return args.log_scales and args.iteration % args.log_interval == 0 - - -def log_metrics(): - metrics = {} - for key, value in _metrics.items(): - metrics_ = metrics - keys = key.split(".") - for prefix in keys[:-1]: - if prefix not in metrics_: - metrics_[prefix] = {} - metrics_ = metrics_[prefix] - metrics_[keys[-1]] = value - return metrics - -def _log_dicts(self, metrics, indent=0): - for key, value in metrics.items(): - key_ = key.rjust(len(key) + indent) - # Merge keys when there is only one entry. - while isinstance(value, dict) and len(value) == 1: - for value_key, value_ in value.items(): - key_ = ".".join([key_, value_key]) - value = value_ - if isinstance(value, dict): - logger.info(key_ + ":") - self._log_dicts(value, indent + 2) - else: - sep = self._config.logging_width - len(value) - len(key_) - 2 - logger.info(f"{key_.ljust(len(key_)+sep,'.')} {value}") diff --git a/megatron/metrics.py b/megatron/metrics.py new file mode 100644 index 00000000000..668b835929a --- /dev/null +++ b/megatron/metrics.py @@ -0,0 +1,59 @@ +import logging + +import torch +from megatron.global_vars import get_args + +logger = logging.getLogger(__name__) + +_iteration=0 +_metrics={} + +def next_iteration(iteration:int): + global _iteration, _metrics + _metrics={} + _iteration=iteration + + +def record_scale(name:str,x:torch.Tensor,grad=True): + global _metrics + if get_log_scales(): + _metrics[name]=get_scale(x) + if grad and x.requires_grad: + x.register_hook(lambda g: record_scale(f"{name}_grad",g,False)) + + +def get_scale(x): + return x.detach().float().pow(2).mean().pow(0.5) + + +def get_log_scales(): + args=get_args() + return args.log_scales and args.iteration % args.log_interval == 0 + + +def log_metrics(): + metrics = {} + for key, value in _metrics.items(): + metrics_ = metrics + keys = key.split(".") + for prefix in keys[:-1]: + if prefix not in metrics_: + metrics_[prefix] = {} + metrics_ = metrics_[prefix] + metrics_[keys[-1]] = value + return metrics + +def _log_dicts(self, metrics, indent=0): + for key, value in metrics.items(): + key_ = key.rjust(len(key) + indent) + # Merge keys when there is only one entry. 
+ while isinstance(value, dict) and len(value) == 1: + for value_key, value_ in value.items(): + key_ = ".".join([key_, value_key]) + value = value_ + if isinstance(value, dict): + logger.info(key_ + ":") + self._log_dicts(value, indent + 2) + else: + sep = self._config.logging_width - len(value) - len(key_) - 2 + logger.info(f"{key_.ljust(len(key_)+sep,'.')} {value}") diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 7838e05c852..6256dca742e 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -18,8 +18,9 @@ import logging import torch -from megatron import get_args, record_scale +from megatron import get_args from megatron import mpu +from megatron.metrics import record_scale from megatron.model.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits from megatron.model.language_model import get_language_model diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 6fbcb11f18d..dda74d154b1 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -23,7 +23,7 @@ from torch.nn import init import importlib -from megatron import record_scale +from megatron.metrics import record_scale global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index e110f3977e4..1f1ffc36043 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -18,9 +18,10 @@ import torch import torch.nn.functional as F -from megatron import get_args,record_scale +from megatron import get_args from megatron import mpu from .module import MegatronModule +from megatron.metrics import record_scale from megatron.model.enums import LayerType, AttnMaskType from megatron.model.transformer import ParallelTransformer from megatron.model.utils import get_linear_layer diff --git a/megatron/model/utils.py b/megatron/model/utils.py index e08b52642fc..422db2894f3 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,9 +18,7 @@ import math import torch -from megatron import record_scale - -from megatron import get_args +from megatron.metrics import record_scale def init_method_normal(sigma): """Init method based on N(0, sigma).""" diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 8fe6f1646f7..b2ee8430c56 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -35,7 +35,7 @@ from .utils import divide from .utils import split_tensor_along_last_dim from .utils import VocabUtility -from megatron import get_args,record_scale,get_log_scales +from megatron.metrics import get_args, get_log_scales, record_scale _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False, From 96c79931f5c3d0c28065a40f558ed79f5d659608 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 12:49:43 -0500 Subject: [PATCH 08/15] fix --- megatron/optimizer/optimizer.py | 3 ++- megatron/training.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index da2d32880db..a7fa5398e3e 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -25,7 +25,8 @@ from megatron import get_timers from megatron import mpu -from megatron import print_rank_0,record_scale,get_log_scales +from megatron import print_rank_0 +from megatron.metrics import record_scale,get_log_scales from .clip_grads import clip_grad_norm_fp32, 
count_zeros_fp32 diff --git a/megatron/training.py b/megatron/training.py index 228f8df3d0a..9c70623625a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -25,7 +25,7 @@ import torch from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP -from megatron import get_args, get_log_scales, next_iteration, log_metrics +from megatron.metrics import get_args, get_log_scales, next_iteration, log_metrics from megatron import get_timers from megatron import get_tensorboard_writer from megatron import get_current_global_batch_size From cf49f87b7abaf09f26bdb5b0c3e4d5c49accc6b8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 14:06:27 -0500 Subject: [PATCH 09/15] Fix and wait for file --- megatron/data/dataset_utils.py | 5 +++++ megatron/model/utils.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 81acb6cde64..e034324a883 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -702,6 +702,11 @@ def get_samples_mapping(indexed_dataset, # Wait until rank 0 generate the index file. torch.distributed.barrier(device_ids=[int(os.environ['LOCAL_RANK'])]) + # It can take some time for the file to be visible on other nodes. + for _ in range(120): + if indexmap_filename.is_file(): + break + time.sleep(1.0) # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 422db2894f3..622d6d3c105 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -56,9 +56,9 @@ def get_linear_layer(rows, columns, init_method, name_=""): old_forward=layer.forward - def forward(self,input): + def forward(input): output=old_forward(input) - record_scale(self.name_,output) + record_scale(layer.name_,output) return output layer.forward=forward From 1b69da903cb7d5ff1b7177343538409fef4dd246 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 14:30:29 -0500 Subject: [PATCH 10/15] More loading fix and waiting log --- megatron/data/biencoder_dataset_utils.py | 7 +++++++ megatron/data/dataset_utils.py | 4 +++- megatron/data/gpt_dataset.py | 8 ++++++++ megatron/data/realm_dataset_utils.py | 7 +++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/megatron/data/biencoder_dataset_utils.py b/megatron/data/biencoder_dataset_utils.py index b1b61cd87b7..dee12e1b120 100644 --- a/megatron/data/biencoder_dataset_utils.py +++ b/megatron/data/biencoder_dataset_utils.py @@ -189,6 +189,13 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # Wait until rank 0 generate the index file. torch.distributed.barrier(device_ids=[int(os.environ['LOCAL_RANK'])]) + # It can take some time for the file to be visible on other nodes. + for i in range(120): + if indexmap_filename.is_file(): + break + if i%10==0: + print_rank_0(" Waiting for index file...") + time.sleep(1.0) # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index e034324a883..fa8cd2eb867 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -703,9 +703,11 @@ def get_samples_mapping(indexed_dataset, # Wait until rank 0 generate the index file. torch.distributed.barrier(device_ids=[int(os.environ['LOCAL_RANK'])]) # It can take some time for the file to be visible on other nodes. 
- for _ in range(120): + for i in range(120): if indexmap_filename.is_file(): break + if i%10==0: + print_rank_0(" Waiting for index file...") time.sleep(1.0) # Load indexed dataset. diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index ca16f38efbd..815cc985e2c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -302,6 +302,14 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # Wait until rank 0 generate the index file. torch.distributed.barrier(device_ids=[int(os.environ['LOCAL_RANK'])]) + # It can take some time for the file to be visible on other nodes. + for i in range(120): + if doc_idx_filename.is_file() and sample_idx_filename.is_file() and shuffle_idx_filename.is_file(): + break + if i%10==0: + print_rank_0(" Waiting for index files...") + time.sleep(1.0) + # Load mappings. start_time = time.time() print_rank_0(' > loading doc-idx mapping from {}'.format( diff --git a/megatron/data/realm_dataset_utils.py b/megatron/data/realm_dataset_utils.py index dd33fcd2886..05ed12d8cdb 100644 --- a/megatron/data/realm_dataset_utils.py +++ b/megatron/data/realm_dataset_utils.py @@ -179,6 +179,13 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # Wait until rank 0 generate the index file. torch.distributed.barrier(device_ids=[int(os.environ['LOCAL_RANK'])]) + # It can take some time for the file to be visible on other nodes. + for i in range(120): + if indexmap_filename.is_file(): + break + if i%10==0: + print_rank_0(" Waiting for index file...") + time.sleep(1.0) # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( From 8e2d5e99d1bb3c9fe8f881309b1fc6938dbafb8c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 14:40:11 -0500 Subject: [PATCH 11/15] Fixes and more logging --- megatron/arguments.py | 57 +++++++++++++++++++------------------------ megatron/metrics.py | 10 +++++--- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 81bab5f9cf1..5dc4540f3db 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -19,6 +19,7 @@ import os import torch +from megatron import print_rank_0 def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): @@ -73,13 +74,12 @@ def parse_args(extra_args_provider=None, defaults={}, 'size ({})'.format(args.world_size, args.tensor_model_parallel_size, args.pipeline_model_parallel_size) args.data_parallel_size = args.world_size // model_parallel_size - if args.rank == 0: - print('using world size: {}, data-parallel-size: {}, ' - 'tensor-model-parallel size: {}, ' - 'pipeline-model-parallel size: {} '.format( - args.world_size, args.data_parallel_size, - args.tensor_model_parallel_size, - args.pipeline_model_parallel_size), flush=True) + print_rank_0('using world size: {}, data-parallel-size: {}, ' + 'tensor-model-parallel size: {}, ' + 'pipeline-model-parallel size: {} '.format( + args.world_size, args.data_parallel_size, + args.tensor_model_parallel_size, + args.pipeline_model_parallel_size)) # Deprecated arguments assert args.batch_size is None, '--batch-size argument is no longer ' \ @@ -98,11 +98,9 @@ def parse_args(extra_args_provider=None, defaults={}, # arguments that are passed to the program. We check this by # ensuring the arg is set to None. 
if getattr(args, key) is not None: - if args.rank == 0: - print('WARNING: overriding default arguments for {key}:{v} \ - with {key}:{v2}'.format(key=key, v=defaults[key], - v2=getattr(args, key)), - flush=True) + print_rank_0('WARNING: overriding default arguments for {key}:{v} \ + with {key}:{v2}'.format(key=key, v=defaults[key], + v2=getattr(args, key))) else: setattr(args, key, defaults[key]) @@ -111,9 +109,8 @@ def parse_args(extra_args_provider=None, defaults={}, assert args.micro_batch_size > 0 if args.global_batch_size is None: args.global_batch_size = args.micro_batch_size * args.data_parallel_size - if args.rank == 0: - print('setting global batch size to {}'.format( - args.global_batch_size), flush=True) + print_rank_0('setting global batch size to {}'.format( + args.global_batch_size)) assert args.global_batch_size > 0 if args.num_layers_per_virtual_pipeline_stage is not None: assert args.pipeline_model_parallel_size > 2, \ @@ -140,13 +137,10 @@ def parse_args(extra_args_provider=None, defaults={}, # be done in fp32. if not args.accumulate_allreduce_grads_in_fp32: args.accumulate_allreduce_grads_in_fp32 = True - if args.rank == 0: - print('accumulate and all-reduce gradients in fp32 for ' - 'bfloat16 data type.', flush=True) + print_rank_0('accumulate and all-reduce gradients in fp32 for ' + 'bfloat16 data type.') - if args.rank == 0: - print('using {} for parameters ...'.format(args.params_dtype), - flush=True) + print_rank_0('using {} for parameters ...'.format(args.params_dtype)) # If we do accumulation and all-reduces in fp32, we need to have # local DDP and we should set the use-contiguous-buffers-in-ddp. @@ -239,17 +233,16 @@ def parse_args(extra_args_provider=None, defaults={}, def _print_args(args): """Print arguments.""" - if args.rank == 0: - print('------------------------ arguments ------------------------', - flush=True) - str_list = [] - for arg in vars(args): - dots = '.' * (48 - len(arg)) - str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) - for arg in sorted(str_list, key=lambda x: x.lower()): - print(arg, flush=True) - print('-------------------- end of arguments ---------------------', - flush=True) + print_rank_0('------------------------ arguments ------------------------', + flush=True) + str_list = [] + for arg in vars(args): + dots = '.' * (48 - len(arg)) + str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) + for arg in sorted(str_list, key=lambda x: x.lower()): + print_rank_0(arg, flush=True) + print_rank_0('-------------------- end of arguments ---------------------', + flush=True) def _check_arg_is_not_none(args, arg): diff --git a/megatron/metrics.py b/megatron/metrics.py index 668b835929a..d5effe95692 100644 --- a/megatron/metrics.py +++ b/megatron/metrics.py @@ -7,6 +7,7 @@ _iteration=0 _metrics={} +_LOGGING_WIDTH=50 def next_iteration(iteration:int): global _iteration, _metrics @@ -41,9 +42,10 @@ def log_metrics(): metrics_[prefix] = {} metrics_ = metrics_[prefix] metrics_[keys[-1]] = value - return metrics + _log_dicts(metrics) -def _log_dicts(self, metrics, indent=0): + +def _log_dicts(metrics, indent=0): for key, value in metrics.items(): key_ = key.rjust(len(key) + indent) # Merge keys when there is only one entry. 
@@ -53,7 +55,7 @@ def _log_dicts(self, metrics, indent=0): value = value_ if isinstance(value, dict): logger.info(key_ + ":") - self._log_dicts(value, indent + 2) + _log_dicts(value, indent + 2) else: - sep = self._config.logging_width - len(value) - len(key_) - 2 + sep = _LOGGING_WIDTH - len(value) - len(key_) - 2 logger.info(f"{key_.ljust(len(key_)+sep,'.')} {value}") From da433858c88463c354c16ab6e1d4fcfd487e33a3 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 25 Nov 2021 16:36:26 -0500 Subject: [PATCH 12/15] fixes --- megatron/arguments.py | 22 +++++++++++----------- megatron/metrics.py | 16 ++++++++++++++-- megatron/model/bert_model.py | 10 +++++++--- megatron/model/language_model.py | 8 +++++--- megatron/model/transformer.py | 10 +++++----- megatron/optimizer/__init__.py | 4 ++++ megatron/optimizer/optimizer.py | 6 ++++-- megatron/training.py | 2 +- 8 files changed, 51 insertions(+), 27 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 5dc4540f3db..fed8440fbd1 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -16,10 +16,12 @@ """Megatron arguments.""" import argparse +import logging import os import torch -from megatron import print_rank_0 + +logger = logging.getLogger(__name__) def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): @@ -74,7 +76,7 @@ def parse_args(extra_args_provider=None, defaults={}, 'size ({})'.format(args.world_size, args.tensor_model_parallel_size, args.pipeline_model_parallel_size) args.data_parallel_size = args.world_size // model_parallel_size - print_rank_0('using world size: {}, data-parallel-size: {}, ' + logger.info('using world size: {}, data-parallel-size: {}, ' 'tensor-model-parallel size: {}, ' 'pipeline-model-parallel size: {} '.format( args.world_size, args.data_parallel_size, @@ -98,7 +100,7 @@ def parse_args(extra_args_provider=None, defaults={}, # arguments that are passed to the program. We check this by # ensuring the arg is set to None. if getattr(args, key) is not None: - print_rank_0('WARNING: overriding default arguments for {key}:{v} \ + logger.warning('Overriding default arguments for {key}:{v} \ with {key}:{v2}'.format(key=key, v=defaults[key], v2=getattr(args, key))) else: @@ -109,7 +111,7 @@ def parse_args(extra_args_provider=None, defaults={}, assert args.micro_batch_size > 0 if args.global_batch_size is None: args.global_batch_size = args.micro_batch_size * args.data_parallel_size - print_rank_0('setting global batch size to {}'.format( + logger.info('setting global batch size to {}'.format( args.global_batch_size)) assert args.global_batch_size > 0 if args.num_layers_per_virtual_pipeline_stage is not None: @@ -137,10 +139,10 @@ def parse_args(extra_args_provider=None, defaults={}, # be done in fp32. if not args.accumulate_allreduce_grads_in_fp32: args.accumulate_allreduce_grads_in_fp32 = True - print_rank_0('accumulate and all-reduce gradients in fp32 for ' + logger.info('accumulate and all-reduce gradients in fp32 for ' 'bfloat16 data type.') - print_rank_0('using {} for parameters ...'.format(args.params_dtype)) + logger.info('using {} for parameters ...'.format(args.params_dtype)) # If we do accumulation and all-reduces in fp32, we need to have # local DDP and we should set the use-contiguous-buffers-in-ddp. 
@@ -233,16 +235,14 @@ def parse_args(extra_args_provider=None, defaults={}, def _print_args(args): """Print arguments.""" - print_rank_0('------------------------ arguments ------------------------', - flush=True) + logger.info('------------------------ arguments ------------------------') str_list = [] for arg in vars(args): dots = '.' * (48 - len(arg)) str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg))) for arg in sorted(str_list, key=lambda x: x.lower()): - print_rank_0(arg, flush=True) - print_rank_0('-------------------- end of arguments ---------------------', - flush=True) + logger.info(arg) + logger.info('-------------------- end of arguments ---------------------') def _check_arg_is_not_none(args, arg): diff --git a/megatron/metrics.py b/megatron/metrics.py index d5effe95692..c8844b1f38f 100644 --- a/megatron/metrics.py +++ b/megatron/metrics.py @@ -1,4 +1,5 @@ import logging +import math import torch from megatron.global_vars import get_args @@ -29,7 +30,7 @@ def get_scale(x): def get_log_scales(): args=get_args() - return args.log_scales and args.iteration % args.log_interval == 0 + return args.log_scales and (_iteration+1) % args.log_interval == 0 def log_metrics(): @@ -41,13 +42,14 @@ def log_metrics(): if prefix not in metrics_: metrics_[prefix] = {} metrics_ = metrics_[prefix] - metrics_[keys[-1]] = value + metrics_[keys[-1]] = _format_value(value) _log_dicts(metrics) def _log_dicts(metrics, indent=0): for key, value in metrics.items(): key_ = key.rjust(len(key) + indent) + # Merge keys when there is only one entry. while isinstance(value, dict) and len(value) == 1: for value_key, value_ in value.items(): @@ -59,3 +61,13 @@ def _log_dicts(metrics, indent=0): else: sep = _LOGGING_WIDTH - len(value) - len(key_) - 2 logger.info(f"{key_.ljust(len(key_)+sep,'.')} {value}") + + +def _format_value(value, precision=5,max_leading_zeros=3): + decimals = 0 if value == 0 or not math.isfinite(value) else precision - math.floor(math.log10(abs(value))) + + if 0 <= decimals <= precision + max_leading_zeros: + value = f"{value:.{decimals}f}" + else: + value = f"{value:.{precision}e}" + return value \ No newline at end of file diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 6256dca742e..60673528ae0 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -72,7 +72,7 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, layernorm_epsilon, parallel_output, name_=""): super(BertLMHead, self).__init__() - self.name_="output_layer.lm_head" + self.name_=name_ args = get_args() @@ -159,13 +159,15 @@ def __init__(self, init_method=init_method, scaled_init_method=scaled_init_method, pre_process=self.pre_process, - post_process=self.post_process) + post_process=self.post_process, + name_=self.name_) self.initialize_word_embeddings(init_method_normal) if self.post_process: self.lm_head = BertLMHead( self.word_embeddings_weight().size(0), - args.hidden_size, init_method, args.layernorm_epsilon, parallel_output) + args.hidden_size, init_method, args.layernorm_epsilon, parallel_output, + name_=f"{self.name_}.output_layer.lm_head") self._lm_head_key = 'lm_head' self.binary_head = None if self.add_binary_head: @@ -173,6 +175,8 @@ def __init__(self, init_method, name_=f"{self.name_}.output_layer.sop_head.binary_head") self._binary_head_key = 'binary_head' + for p in self.parameters(): + print(getattr(p, "name_", "unknown"), p.shape) def set_input_tensor(self, input_tensor): """See megatron.model.transformer.set_input_tensor()""" 
self.language_model.set_input_tensor(input_tensor) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 1f1ffc36043..61e005f12d6 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -48,7 +48,7 @@ def get_language_model(num_tokentypes, add_pooler, encoder_attn_mask_type, init_method=None, scaled_init_method=None, add_decoder=False, decoder_attn_mask_type=AttnMaskType.causal, - pre_process=True, post_process=True): + pre_process=True, post_process=True, name_=""): """Build language model and return along with the key to save.""" args = get_args() @@ -69,7 +69,8 @@ def get_language_model(num_tokentypes, add_pooler, decoder_attn_mask_type=decoder_attn_mask_type, add_pooler=add_pooler, pre_process=pre_process, - post_process=post_process + post_process=post_process, + name_=name_ ) # key used for checkpoints. language_model_key = 'language_model' @@ -320,7 +321,8 @@ def __init__(self, args.max_position_embeddings, args.hidden_dropout, self.init_method, - self.num_tokentypes) + self.num_tokentypes, + name_=f"{self.name_}.input_layer") self._embedding_key = 'embedding' # Transformer. diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 7d778606dc7..b5be1567cd6 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -149,7 +149,7 @@ def __init__(self, init_method, 3 * projection_size, gather_output=False, init_method=init_method, - name_=f"layer_{self.name_}.query_key_value") + name_=f"{self.name_}.query_key_value") else: assert attention_type == AttnType.cross_attn self.query = mpu.ColumnParallelLinear( @@ -157,14 +157,14 @@ def __init__(self, init_method, projection_size, gather_output=False, init_method=init_method, - name_=f"layer_{self.name_}.query") + name_=f"{self.name_}.query") self.key_value = mpu.ColumnParallelLinear( args.hidden_size, 2 * projection_size, gather_output=False, init_method=init_method, - name_=f"layer_{self.name_}.key_value") + name_=f"{self.name_}.key_value") coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -192,7 +192,7 @@ def __init__(self, init_method, input_is_parallel=True, init_method=output_layer_init_method, skip_bias_add=True, - name_=f"layer_{self.name_}.dense") + name_=f"{self.name_}.dense") def forward(self, hidden_states, attention_mask, layer_past=None, get_key_value=False, encoder_output=None): @@ -581,7 +581,7 @@ def build_layer(layer_number): layer_number, layer_type=layer_type, self_attn_mask_type=self_attn_mask_type, - name_=f"{self.name_}.layer_{layer_number}") + name_=f"{self.name_}.layer_{layer_number-1}") if args.virtual_pipeline_model_parallel_size is not None: assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \ 'num_layers_per_stage must be divisible by ' \ diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 0e535a78a8a..8178a076320 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -43,7 +43,11 @@ def _get_params_for_weight_decay_optimization(modules): no_weight_decay_params['params'].extend( [p for n, p in list(module_._parameters.items()) if p is not None and n == 'bias']) + for p in weight_decay_params['params']: + print("weight_decay_params",getattr(p, "name_", "unknown"), p.shape) + for p in no_weight_decay_params['params']: + print("no_weight_decay_params",getattr(p, "name_", "unknown"), p.shape) return weight_decay_params, no_weight_decay_params diff --git a/megatron/optimizer/optimizer.py 
b/megatron/optimizer/optimizer.py index a7fa5398e3e..15ed05a92ba 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -142,8 +142,8 @@ def _record_scales(self): for group in self.optimizer.param_groups: for p in group['params']: name_=getattr(p, "name_", "unknown") - record_scale(name_, p, False) - record_scale(f"{name_}_grad", p.grad, False) + record_scale(f"optimizer.{name_}", p, False) + record_scale(f"optimizer.{name_}_grad", p.grad, False) # Promote state so it can be retrieved or set via # "optimizer_instance.state" @@ -253,6 +253,8 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, float16_params_this_group.append(param) # Create a copy main_param = param.detach().clone().float() + if hasattr(param, "name_"): + main_param.name_=param.name_ # Copy tensor model parallel attributes. mpu.copy_tensor_model_parallel_attributes(main_param, param) diff --git a/megatron/training.py b/megatron/training.py index 9c70623625a..61031e29bea 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -620,6 +620,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, print_datetime('before the start of training step') report_memory_flag = True while iteration < args.train_iters: + next_iteration(iteration) update_num_microbatches(args.consumed_train_samples) loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ train_step(forward_step_func, @@ -628,7 +629,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler, optimizer, lr_scheduler) iteration += 1 - next_iteration(iteration) args.consumed_train_samples += mpu.get_data_parallel_world_size() * \ args.micro_batch_size * \ get_num_microbatches() From 23935dd39f0bbbb70e06f54e09415667ae56edce Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 30 Nov 2021 13:02:29 -0500 Subject: [PATCH 13/15] More fixes and tweaks --- megatron/metrics.py | 6 +++--- megatron/model/bert_model.py | 6 ++---- megatron/model/fused_layer_norm.py | 4 ++-- megatron/model/language_model.py | 8 ++++---- megatron/model/transformer.py | 26 ++++++++++++++++++++++++-- megatron/model/utils.py | 4 ++-- megatron/mpu/layers.py | 14 ++++++-------- megatron/optimizer/__init__.py | 13 +++++++------ megatron/optimizer/clip_grads.py | 8 ++++++-- megatron/optimizer/optimizer.py | 12 ++++++++---- 10 files changed, 64 insertions(+), 37 deletions(-) diff --git a/megatron/metrics.py b/megatron/metrics.py index c8844b1f38f..8883a9dbb58 100644 --- a/megatron/metrics.py +++ b/megatron/metrics.py @@ -16,12 +16,12 @@ def next_iteration(iteration:int): _iteration=iteration -def record_scale(name:str,x:torch.Tensor,grad=True): +def record_scale(name:str,x:torch.Tensor,grad=True, bias=None): global _metrics if get_log_scales(): - _metrics[name]=get_scale(x) + _metrics[f"{name}.scale" if grad else name]=get_scale(x if bias is None else x+bias) if grad and x.requires_grad: - x.register_hook(lambda g: record_scale(f"{name}_grad",g,False)) + x.register_hook(lambda g: record_scale(f"{name}.grad",g,False)) def get_scale(x): diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 60673528ae0..a649885760b 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -77,12 +77,12 @@ def __init__(self, mpu_vocab_size, hidden_size, init_method, args = get_args() self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size)) - self.bias.name_=f"{self.name_}.logits.bias" + self.bias.name_=f"{self.name_}.logits.linear_bias" mpu.set_tensor_model_parallel_attributes(self.bias, True, 0, 1) 
self.parallel_output = parallel_output self.dense = get_linear_layer(hidden_size, hidden_size, init_method, name_=f"{self.name_}.dense") - self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon, name_=f"{self.name_}.layernorm") + self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon, name_=f"{self.name_}.layer_norm") self.gelu = torch.nn.functional.gelu if args.openai_gelu: self.gelu = openai_gelu @@ -175,8 +175,6 @@ def __init__(self, init_method, name_=f"{self.name_}.output_layer.sop_head.binary_head") self._binary_head_key = 'binary_head' - for p in self.parameters(): - print(getattr(p, "name_", "unknown"), p.shape) def set_input_tensor(self, input_tensor): """See megatron.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index dda74d154b1..8218c65a5e5 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -76,9 +76,9 @@ def __init__(self, normalized_shape, eps=1e-5, name_=""): self.normalized_shape = torch.Size(normalized_shape) self.eps = eps self.weight = Parameter(torch.Tensor(*normalized_shape)) - self.weight.name_=f"{self.name_}.weight" + self.weight.name_=f"{self.name_}.layer_norm_weight" self.bias = Parameter(torch.Tensor(*normalized_shape)) - self.bias.name_=f"{self.name_}.bias" + self.bias.name_=f"{self.name_}.layer_norm_bias" self.reset_parameters() diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 61e005f12d6..3bf9a9712cf 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -144,14 +144,14 @@ def __init__(self, init_method=self.init_method) self._word_embeddings_key = 'word_embeddings' self.word_embeddings.name_=f"{self.name_}.word_embeddings" - self.word_embeddings.weight.name_=f"{self.word_embeddings.name_}.weight" + self.word_embeddings.weight.name_=f"{self.word_embeddings.name_}.embedding_weight" # Position embedding (serial). self.position_embeddings = torch.nn.Embedding( max_sequence_length, self.hidden_size) self._position_embeddings_key = 'position_embeddings' self.position_embeddings.name_=f"{self.name_}.position_embeddings" - self.position_embeddings.weight.name_=f"{self.position_embeddings.name_}.weight" + self.position_embeddings.weight.name_=f"{self.position_embeddings.name_}.embedding_weight" # Initialize the position embeddings. self.init_method(self.position_embeddings.weight) @@ -164,7 +164,7 @@ def __init__(self, self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, self.hidden_size) self.tokentype_embeddings.name_=f"{self.name_}.tokentype_embeddings" - self.tokentype_embeddings.weight.name_=f"{self.tokentype_embeddings.name_}.weight" + self.tokentype_embeddings.weight.name_=f"{self.tokentype_embeddings.name_}.embedding_weight" # Initialize the token-type embeddings. self.init_method(self.tokentype_embeddings.weight) else: @@ -322,7 +322,7 @@ def __init__(self, args.hidden_dropout, self.init_method, self.num_tokentypes, - name_=f"{self.name_}.input_layer") + name_=f"{self.name_}.input_layer.embedding") self._embedding_key = 'embedding' # Transformer. 
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b5be1567cd6..a333b96cf96 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -20,6 +20,7 @@ from megatron import get_args from megatron import mpu +from megatron.metrics import record_scale from .module import MegatronModule from megatron.model.enums import AttnMaskType, LayerType, AttnType from megatron.model import LayerNorm @@ -100,6 +101,7 @@ def forward(self, hidden_states): intermediate_parallel = \ self.activation_func(intermediate_parallel + bias_parallel) + record_scale(f"{self.name_}.gelu", intermediate_parallel) # [s, b, h] output, output_bias = self.dense_4h_to_h(intermediate_parallel) return output, output_bias @@ -238,6 +240,10 @@ def forward(self, hidden_states, attention_mask, layer_past=None, self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) + record_scale(f"{self.name_}.query_layer", query_layer) + record_scale(f"{self.name_}.key_layer", key_layer) + record_scale(f"{self.name_}.value_layer", value_layer) + # ================================== # Adjust key and value for inference # ================================== @@ -286,6 +292,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) + record_scale(f"{self.name_}.attention_scores", attention_scores) # ================================================== # Update attention mask for inference. [b, np, sq, sk] # ================================================== @@ -310,6 +317,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, # attention scores and attention mask [b, np, sq, sk] attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) + record_scale(f"{self.name_}.attention_probs", attention_probs) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -351,6 +359,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None, (self.hidden_size_per_partition,) context_layer = context_layer.view(*new_context_layer_shape) + record_scale(f"{self.name_}.context_layer", context_layer) + # ================= # Output. [sq, b, h] # ================= @@ -426,7 +436,7 @@ def __init__(self, init_method, output_layer_init_method, layer_number, attention_type=AttnType.self_attn, attn_mask_type=self_attn_mask_type, - name_=f"{self.name_}.attention") + name_=f"{self.name_}.self_attention") self.hidden_dropout = args.hidden_dropout self.bias_dropout_fusion = args.bias_dropout_fusion @@ -469,6 +479,8 @@ def forward(self, hidden_states, attention_mask, layer_past=layer_past, get_key_value=get_key_value) + record_scale(f"{self.name_}.attention", attention_output, bias=attention_bias) + if get_key_value: attention_output, presents = attention_output @@ -477,6 +489,7 @@ def forward(self, hidden_states, attention_mask, residual = layernorm_output else: residual = hidden_states + record_scale(f"{self.name_}.attention_residual_input", residual) # jit scripting for a nn.module (with dropout) is not # trigerring the fusion kernel. For now, we use two @@ -490,6 +503,7 @@ def forward(self, hidden_states, attention_mask, else: bias_dropout_add_func = get_bias_dropout_add(self.training) + # re-enable torch grad to enable fused optimization. 
with torch.enable_grad(): layernorm_input = bias_dropout_add_func( @@ -498,6 +512,8 @@ def forward(self, hidden_states, attention_mask, residual, self.hidden_dropout) + record_scale(f"{self.name_}.attention_residual", layernorm_input) + # Layer norm post the self attention. layernorm_output = self.post_attention_layernorm(layernorm_input) @@ -506,11 +522,13 @@ def forward(self, hidden_states, attention_mask, self.inter_attention(layernorm_output, enc_dec_attn_mask, encoder_output=encoder_output) + record_scale(f"{self.name_}.inter_attention", attention_output, bias=attention_bias) # residual connection if self.apply_residual_connection_post_layernorm: residual = layernorm_output else: residual = layernorm_input + record_scale(f"{self.name_}.inter_attention_residual_input", residual) # re-enable torch grad to enable fused optimization. with torch.enable_grad(): @@ -519,6 +537,7 @@ def forward(self, hidden_states, attention_mask, attention_bias.expand_as(residual), residual, self.hidden_dropout) + record_scale(f"{self.name_}.inter_attention_residual", layernorm_input) # Layer norm post the decoder attention layernorm_output = self.post_inter_attention_layernorm(layernorm_input) @@ -531,6 +550,7 @@ def forward(self, hidden_states, attention_mask, residual = layernorm_output else: residual = layernorm_input + record_scale(f"{self.name_}.mlp_residual_input", residual) # re-enable torch grad to enable fused optimization. with torch.enable_grad(): @@ -540,6 +560,8 @@ def forward(self, hidden_states, attention_mask, residual, self.hidden_dropout) + record_scale(f"{self.name_}.mlp_residual", layernorm_input) + if get_key_value: output = [output, presents] @@ -581,7 +603,7 @@ def build_layer(layer_number): layer_number, layer_type=layer_type, self_attn_mask_type=self_attn_mask_type, - name_=f"{self.name_}.layer_{layer_number-1}") + name_=f"{self.name_}.layer_{layer_number-1}.transformer_layer") if args.virtual_pipeline_model_parallel_size is not None: assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \ 'num_layers_per_stage must be divisible by ' \ diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 622d6d3c105..d87616c6d98 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -50,8 +50,8 @@ def get_linear_layer(rows, columns, init_method, name_=""): with torch.no_grad(): layer.bias.zero_() layer.name_=name_ - layer.weight.name_=f"{name_}.weight" - layer.bias.name_=f"{name_}.bias" + layer.weight.name_=f"{name_}.linear_weight" + layer.bias.name_=f"{name_}.linear_bias" old_forward=layer.forward diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index b2ee8430c56..9bf58d2b8fa 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -257,7 +257,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, device=torch.cuda.current_device(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=stride) - self.weight.name_=f"{self.name_}.weight" + self.weight.name_=f"{self.name_}.linear_weight" if bias: if args.use_cpu_initialization: @@ -272,7 +272,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, # Always initialize bias to zero. 
with torch.no_grad(): self.bias.zero_() - self.bias.name_ = f"{self.name_}.bias" + self.bias.name_ = f"{self.name_}.linear_bias" else: self.register_parameter('bias', None) @@ -291,8 +291,7 @@ def forward(self, input_): else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None - if get_log_scales(): - record_scale(self.name_, output if output_bias is None else output + output_bias) + record_scale(self.name_, output, bias=output_bias) return output, output_bias @@ -362,7 +361,7 @@ def __init__(self, input_size, output_size, bias=True, device=torch.cuda.current_device(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=1, stride=stride) - self.weight.name_ = f"{self.name_}.weight" + self.weight.name_ = f"{self.name_}.linear_weight" if bias: if args.use_cpu_initialization: self.bias = Parameter(torch.empty(self.output_size, @@ -374,7 +373,7 @@ def __init__(self, input_size, output_size, bias=True, # Always initialize bias to zero. with torch.no_grad(): self.bias.zero_() - self.bias.name_ = f"{self.name_}.bias" + self.bias.name_ = f"{self.name_}.linear_bias" else: self.register_parameter('bias', None) @@ -396,7 +395,6 @@ def forward(self, input_): else: output = output_ output_bias = self.bias - if get_log_scales(): - record_scale(self.name_, output if output_bias is None else output + output_bias) + record_scale(self.name_, output, bias=output_bias) return output, output_bias diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 8178a076320..7298930daae 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -13,8 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD +import warnings + +try: + from apex.optimizers import FusedAdam as Adam + from apex.optimizers import FusedSGD as SGD +except ImportError: + warnings.warn("Apex not found") from megatron import get_args from megatron.model import LayerNorm @@ -43,11 +48,7 @@ def _get_params_for_weight_decay_optimization(modules): no_weight_decay_params['params'].extend( [p for n, p in list(module_._parameters.items()) if p is not None and n == 'bias']) - for p in weight_decay_params['params']: - print("weight_decay_params",getattr(p, "name_", "unknown"), p.shape) - for p in no_weight_decay_params['params']: - print("no_weight_decay_params",getattr(p, "name_", "unknown"), p.shape) return weight_decay_params, no_weight_decay_params diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index 036a1d4c4cf..30e1b820ea0 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -17,9 +17,13 @@ import torch from torch._six import inf +import warnings -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C +try: + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C +except ImportError: + warnings.warn("Apex not found") from megatron import mpu from megatron.model.module import param_is_not_shared diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 15ed05a92ba..175a44b4c8d 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -17,11 +17,15 @@ from abc import ABC from abc import abstractmethod +import warnings import torch -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C +try: + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C +except ImportError: + warnings.warn("Apex not found") from megatron import get_timers from megatron import mpu @@ -142,8 +146,8 @@ def _record_scales(self): for group in self.optimizer.param_groups: for p in group['params']: name_=getattr(p, "name_", "unknown") - record_scale(f"optimizer.{name_}", p, False) - record_scale(f"optimizer.{name_}_grad", p.grad, False) + record_scale(f"optimizer.{name_}.scale", p, False) + record_scale(f"optimizer.{name_}.grad", p.grad, False) # Promote state so it can be retrieved or set via # "optimizer_instance.state" From 1527f75fd70e419c1f03fd2efbb16e9f52e4400c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 1 Dec 2021 12:32:25 -0500 Subject: [PATCH 14/15] Update names --- megatron/model/transformer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a333b96cf96..5a2c91306f6 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -426,7 +426,7 @@ def __init__(self, init_method, output_layer_init_method, self.input_layernorm = LayerNorm( args.hidden_size, eps=args.layernorm_epsilon, - name_=f"{self.name_}.input_layernorm", + name_=f"{self.name_}.input_layer_norm", ) # Self attention. 
@@ -444,7 +444,7 @@ def __init__(self, init_method, output_layer_init_method,
         self.post_attention_layernorm = LayerNorm(
             args.hidden_size,
             eps=args.layernorm_epsilon,
-            name_=f"{self.name_}.post_attention_layernorm",
+            name_=f"{self.name_}.post_attention_layer_norm",
         )
 
         if self.layer_type == LayerType.decoder:
@@ -458,7 +458,7 @@ def __init__(self, init_method, output_layer_init_method,
             self.post_inter_attention_layernorm = LayerNorm(
                 args.hidden_size,
                 eps=args.layernorm_epsilon,
-                name_=f"{self.name_}.post_inter_attention_layernorm",
+                name_=f"{self.name_}.post_inter_attention_layer_norm",
             )
 
         # MLP
@@ -634,7 +634,7 @@ def build_layer(layer_number):
             self.final_layernorm = LayerNorm(
                 args.hidden_size,
                 eps=args.layernorm_epsilon,
-                name_=f"{self.name_}.output_layer.final_layernorm")
+                name_=f"{self.name_}.output_layer.final_layer_norm")
 
     def _get_layer(self, layer_number):
         return self.layers[layer_number]

From d347b2f8dd6a9f31b4746a4742ce6587e40f32b9 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Thu, 9 Dec 2021 17:51:54 -0500
Subject: [PATCH 15/15] fix

---
 megatron/fused_kernels/layer_norm_cuda_kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu
index ce42584aa33..a892c069f53 100644
--- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu
+++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu
@@ -21,7 +21,7 @@
 #include "ATen/ATen.h"
 #include "ATen/AccumulateType.h"
 #include "ATen/cuda/CUDAContext.h"
-#include <THC/THCDeviceUtils.cuh>
+#include "ATen/cuda/DeviceUtils.cuh"
 
 #include <cuda.h>
 #include <cuda_runtime.h>
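
For reference, the scale-logging mechanism that the patches above build up (record_scale / get_scale / log_metrics in megatron/metrics.py) can be exercised outside Megatron. The sketch below is a minimal standalone illustration, not part of the patch series: the `enabled` flag and the toy linear layer are assumptions of the sketch, standing in for the args.log_scales / log_interval gating and the real model.

# Standalone sketch of the RMS "scale" logging used in the patches above.
# Assumes only torch; gating by args.log_scales / log_interval is replaced
# by the explicit `enabled` flag (an assumption of this sketch).
import torch

_metrics = {}
enabled = True

def get_scale(x):
    # Root-mean-square of the tensor, as in megatron/metrics.py.
    return x.detach().float().pow(2).mean().pow(0.5)

def record_scale(name, x, grad=True):
    if enabled:
        _metrics[f"{name}.scale" if grad else name] = get_scale(x)
        if grad and x.requires_grad:
            # Also record the scale of the gradient flowing back through x.
            x.register_hook(lambda g: record_scale(f"{name}.grad", g, False))

def log_metrics():
    for key, value in sorted(_metrics.items()):
        print(f"{key.ljust(50, '.')} {value.item():.5f}")

# Tiny forward/backward pass to exercise the activation and gradient hooks.
layer = torch.nn.Linear(16, 16)
x = torch.randn(4, 16, requires_grad=True)
record_scale("input", x)
y = layer(x)
record_scale("output", y)
y.pow(2).mean().backward()
log_metrics()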