2 changes: 1 addition & 1 deletion tests/components_to_test/albert.py
@@ -27,7 +27,7 @@ def bert_model_builder(checkpoint: bool = False):
         attention_probs_dropout_prob=0.)
     print('building AlbertForSequenceClassification model')
 
-    # adapting huggingface BertForSequenceClassification for single unitest calling interface
+    # adapting huggingface BertForSequenceClassification for single unittest calling interface
     class ModelAdaptor(AlbertForSequenceClassification):
 
         def forward(self, input_ids, labels):
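The hunk above touches the adaptor pattern these test components share: wrap a Hugging Face model so every test model exposes one forward(input_ids, labels) interface. A minimal sketch of that idea, assuming the standard transformers return convention (loss first when labels are given) and illustrative config values:

from transformers import AlbertConfig, AlbertForSequenceClassification

class ModelAdaptor(AlbertForSequenceClassification):

    def forward(self, input_ids, labels):
        # return only the loss so every test model shares one calling interface
        return super().forward(input_ids=input_ids, labels=labels)[0]

config = AlbertConfig(hidden_size=128, num_attention_heads=4,
                      num_hidden_layers=2, intermediate_size=256)
model = ModelAdaptor(config)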
6 changes: 3 additions & 3 deletions tests/test_booster/test_accelerator.py
@@ -7,8 +7,8 @@
 @clear_cache_before_run()
 @parameterize('device', ['cpu', 'cuda'])
 def test_accelerator(device):
-    acceleartor = Accelerator(device)
+    accelerator = Accelerator(device)
     model = nn.Linear(8, 8)
-    model = acceleartor.configure_model(model)
+    model = accelerator.configure_model(model)
     assert next(model.parameters()).device.type == device
-    del model, acceleartor
+    del model, accelerator
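For reference, the fixed test exercises a small device-placement API. A runnable sketch, under the assumption that Accelerator is importable from colossalai.booster.accelerator:

import torch.nn as nn
from colossalai.booster.accelerator import Accelerator  # import path is an assumption

accelerator = Accelerator('cpu')
model = accelerator.configure_model(nn.Linear(8, 8))  # moves the module onto the chosen device
assert next(model.parameters()).device.type == 'cpu'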
2 changes: 1 addition & 1 deletion tests/test_booster/test_plugin/test_dp_plugin_base.py
@@ -56,7 +56,7 @@ def no_sync(self, model: nn.Module) -> Iterator[None]:
 def check_dataloader_sharding():
     plugin = DPPluginWrapper()
 
-    # create a custom dasetset with 0 to 10
+    # create a custom dataset with 0 to 10
     dataset = TensorDataset(torch.arange(0, 10))
     train_dataloader = plugin.prepare_dataloader(dataset, batch_size=2)

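The dataset comment fixed above belongs to a sharding check: each rank must see a disjoint slice of the 0..9 dataset. A single-process sketch of the same check in plain PyTorch, with DistributedSampler standing in for whatever prepare_dataloader uses internally:

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# create a custom dataset with 0 to 10
dataset = TensorDataset(torch.arange(0, 10))

for rank in range(2):  # emulate a 2-rank data-parallel group
    sampler = DistributedSampler(dataset, num_replicas=2, rank=rank, shuffle=False)
    loader = DataLoader(dataset, batch_size=2, sampler=sampler)
    seen = torch.cat([batch[0] for batch in loader])
    print(f'rank {rank}: {seen.tolist()}')  # disjoint, interleaved index sets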
@@ -48,7 +48,7 @@ def run_trainer(rank, world_size, port):
     pipelinable.policy = "uniform"
     model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
 
-    # craete dataloaders
+    # create dataloaders
     root = Path(os.environ['DATA'])
     transform_train = transforms.Compose([
         transforms.RandomCrop(32, padding=4, pad_if_needed=True),
@@ -68,7 +68,7 @@ def run_trainer(rank, world_size, port):
     # create lr scheduler
     lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
 
-    # intiailize
+    # initialize
     engine, train_dataloader, *_ = colossalai.initialize(model=model,
                                                          optimizer=optimizer,
                                                          criterion=criterion,
@@ -50,7 +50,7 @@ def run_trainer(rank, world_size, port):
     pipelinable.policy = "uniform"
     model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
 
-    # craete dataloaders
+    # create dataloaders
     root = Path(os.environ['DATA'])
     transform_train = transforms.Compose([
         transforms.RandomCrop(32, padding=4, pad_if_needed=True),
@@ -70,7 +70,7 @@ def run_trainer(rank, world_size, port):
     # create lr scheduler
     lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
 
-    # intiailize
+    # initialize
     engine, train_dataloader, *_ = colossalai.initialize(model=model,
                                                          optimizer=optimizer,
                                                          criterion=criterion,
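Both trainer hunks fix comments around the same dataloader recipe. A self-contained sketch of that recipe; CIFAR10 is an assumption inferred from the 32x32 random crop, and the root path is illustrative:

import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import CIFAR10  # dataset class is an assumption

# create dataloaders
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4, pad_if_needed=True),
    transforms.ToTensor(),
])
train_set = CIFAR10(root='./data', train=True, download=True, transform=transform_train)
train_dataloader = DataLoader(train_set, batch_size=64, shuffle=True)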
@@ -64,7 +64,7 @@ def forward(self, x, y):


 def _run_act_ckpt_codegen(rank, world_size, port):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model and run forward
@@ -122,7 +122,7 @@ def test_act_ckpt_codegen():


 def _run_act_ckpt_python_code_torch11(rank, world_size, port):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model and run forward
@@ -32,7 +32,7 @@ def forward(self, x):


 def _run_act_ckpt_codegen(rank, world_size, port):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model and run forward
@@ -89,7 +89,7 @@ def test_act_ckpt_codegen():


 def _run_act_ckpt_python_code_torch11(rank, world_size, port):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model and run forward
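All four codegen helpers share the (rank, world_size, port) signature because each is spawned once per rank. A hedged sketch of that harness using torch.multiprocessing; only the colossalai.launch call shown in the hunks is assumed, and the nccl backend requires a GPU:

import torch.multiprocessing as mp
import colossalai

def _worker(rank, world_size, port):
    # launch colossalai so colossalai.utils.checkpoint executes correctly
    colossalai.launch(config={}, rank=rank, world_size=world_size,
                      host='localhost', port=port, backend='nccl')
    # ... build the model, generate checkpointed code, run forward/backward ...

if __name__ == '__main__':
    world_size = 1
    mp.spawn(_worker, args=(world_size, 29500), nprocs=world_size)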
6 changes: 3 additions & 3 deletions tests/test_fx/test_codegen/test_offload_codegen.py
@@ -56,7 +56,7 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.Tensor):
     fx_out = gm(data)
     assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output"
 
-    # test barckward
+    # test backward
     loss0 = non_fx_out.sum()
     loss0.backward()
     loss1 = fx_out.sum()
@@ -65,7 +65,7 @@ def _test_fwd_and_bwd(model: torch.nn.Module, gm: ColoGraphModule, data: torch.Tensor):


 def _run_offload_codegen(rank, world_size, port):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model and input
@@ -120,7 +120,7 @@ def test_act_ckpt_codegen():


 def _run_offload_codegen_torch11(rank, world_size, port):
-    # launch colossalai to make sure we could execute colossalai.utils.checkpoint currectly
+    # launch colossalai to make sure we could execute colossalai.utils.checkpoint correctly
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # build model and input
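The helper whose comment is fixed at the top of this file compares a traced GraphModule against the original module in both the forward and backward direction. A plain torch.fx sketch of the same pattern (ColoGraphModule specifics are not assumed; here the two modules share parameters, whereas the real test compares gradients on separate copies):

import torch
import torch.fx

def check_fwd_and_bwd(model: torch.nn.Module, gm: torch.fx.GraphModule, data: torch.Tensor):
    non_fx_out = model(data)
    fx_out = gm(data)
    assert torch.equal(non_fx_out, fx_out), "fx_out doesn't comply with original output"

    # test backward: gradients must flow through both graphs
    non_fx_out.sum().backward()
    fx_out.sum().backward()

model = torch.nn.Linear(4, 4)
gm = torch.fx.symbolic_trace(model)
check_fwd_and_bwd(model, gm, torch.randn(2, 4))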
2 changes: 1 addition & 1 deletion tests/test_layers/test_sequence/test_sequence.py
@@ -45,7 +45,7 @@ def check_ring_qk(rank, world_size):
     ring_qk = colossalai.nn.layer.parallel_sequence.RingQK.apply
     sub_a = ring_qk(sub_q, sub_k, batch_size, num_heads, sub_seq_length)
 
-    # check master and distributed attetion scores
+    # check master and distributed attention scores
     sub_master_a = a[:, rank * sub_seq_length:(rank + 1) * sub_seq_length]
     assert torch.allclose(sub_a, sub_master_a, rtol=1e-5, atol=1e-2)

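The assertion above slices the full ('master') attention score matrix down to the rows owned by one rank. A single-process sketch of that slicing logic with illustrative sizes (RingQK itself is not reimplemented here):

import torch

batch_size, num_heads, seq_length, dim = 2, 4, 8, 16
world_size = 2
sub_seq_length = seq_length // world_size

q = torch.randn(batch_size * num_heads, seq_length, dim)
k = torch.randn(batch_size * num_heads, seq_length, dim)
a = torch.matmul(q, k.transpose(1, 2))  # full (master) attention scores

for rank in range(world_size):
    sub_q = q[:, rank * sub_seq_length:(rank + 1) * sub_seq_length]
    sub_a = torch.matmul(sub_q, k.transpose(1, 2))  # scores one rank would assemble
    # check master and distributed attention scores
    sub_master_a = a[:, rank * sub_seq_length:(rank + 1) * sub_seq_length]
    assert torch.allclose(sub_a, sub_master_a, rtol=1e-5, atol=1e-2)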
4 changes: 2 additions & 2 deletions tests/test_moe/test_kernel.py
@@ -41,7 +41,7 @@ def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.float32):
     if data_type == torch.float16:
         layer = layer.half()
 
-    # use matrix multiplication instead of COL_MOE_KERNL in MOE dispatch and combine
+    # use matrix multiplication instead of COL_MOE_KERNEL in MOE dispatch and combine
     layer.use_kernel = False
     old_out, _ = layer(tokens)
     ech = old_out.shape
@@ -57,7 +57,7 @@ def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.float32):
         layer.gate_weight.grad.zero_()
 
     layer.use_kernel = True
-    new_out, _ = layer(tokens)    # get ouputs through colossal kernel
+    new_out, _ = layer(tokens)    # get outputs through colossal kernel
 
     if data_type == torch.float32:
         check_equal(old_out, new_out)
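This test compares kernel outputs exactly in fp32 but only within tolerance in fp16, since half-precision kernels cannot match a matmul reference bit-for-bit. A sketch of that comparison policy; the tolerances are illustrative, not the test's own:

import torch

def compare_outputs(old_out, new_out):
    if old_out.dtype == torch.float32:
        assert torch.equal(old_out, new_out)  # exact match expected in full precision
    else:
        assert torch.allclose(old_out, new_out, rtol=1e-3, atol=1e-3)  # fp16 rounding slack

ref = torch.randn(4, 8)
compare_outputs(ref, ref.clone())                 # fp32 path
compare_outputs(ref.half(), (ref + 1e-4).half())  # fp16 path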
2 changes: 1 addition & 1 deletion tests/test_tensor/model/test_model.py
@@ -329,6 +329,6 @@ def test_pretrain_load(world_size):

 if __name__ == '__main__':
     # test_model_parameters()
-    # test_colo_optgimizer()
+    # test_colo_optimizer()
     test_model(4)
     # test_pretrain_load(4)
2 changes: 1 addition & 1 deletion tests/test_trainer/test_pipeline/test_p2p.py
@@ -90,7 +90,7 @@ def run_check(rank, world_size, port):
     prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
     next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
     logger.info('Rank {0}: prev rank {1}, next rank {2}'.format(rank, prev_rank, next_rank))
-    logger.info('Distributed environment is initialzied.')
+    logger.info('Distributed environment is initialized.')
 
     check_comm(world_size, rank, prev_rank, next_rank, logger)
     gpc.destroy()
6 changes: 3 additions & 3 deletions tests/test_zero/test_gemini/test_chunkv2.py
@@ -23,7 +23,7 @@ def add_param(param_list, param_cp_list, *args, **kwargs):
     param_cp_list.append(param.clone())
 
 
-def check_euqal(param, param_cp):
+def check_equal(param, param_cp):
     if param.device != param_cp.device:
         temp = param.data.to(param_cp.device)
     else:
@@ -57,7 +57,7 @@ def exam_chunk_basic(init_device, keep_gathered, pin_memory):
         my_chunk.append_tensor(param)
     assert my_chunk.utilized_size == 597
     for param, param_cp in zip(param_list, param_cp_list):
-        check_euqal(param, param_cp)
+        check_equal(param, param_cp)
     my_chunk.close_chunk()
 
     if keep_gathered is False:
@@ -77,7 +77,7 @@ def exam_chunk_basic(init_device, keep_gathered, pin_memory):
     my_chunk.access_chunk()
     assert my_chunk.device_type == 'cuda'
     for param, param_cp in zip(param_list, param_cp_list):
-        check_euqal(param, param_cp)
+        check_equal(param, param_cp)
 
     assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4
     my_chunk.tensor_trans_state(param_list[0], TensorState.COMPUTE)
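The renamed check_equal helper must reconcile devices before comparing, because the chunk may have migrated a parameter between CPU and GPU. A minimal standalone sketch, assuming only what the first hunk of this file shows:

import torch

def check_equal(param, param_cp):
    # move one side over if the chunk migrated the parameter to another device
    temp = param.data.to(param_cp.device) if param.device != param_cp.device else param.data
    return torch.equal(temp, param_cp)

p = torch.randn(3, 3)
assert check_equal(p, p.clone())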