From 07f3e980f6d906cb8a1864dce40a1557fb62e134 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Fri, 3 Dec 2021 16:53:25 +0100 Subject: [PATCH 1/2] fixed 1D ViT convergence problem --- colossalai/context/parallel_context.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index 5ced84021447..6f4ea48327d6 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -435,15 +435,18 @@ def set_seed(self): if torch.cuda.is_available(): # create random seed for different parallel modes # data parallel seed are kept the same - tp_rank = self._local_ranks.get(ParallelMode.TENSOR, 0) - pp_rank = self._local_ranks.get(ParallelMode.PIPELINE, 0) - parallel_seed = seed + tp_rank + pp_rank * 1024 + parallel_seed = seed add_seed(ParallelMode.DATA, parallel_seed) + # model parallel seeds are different across ranks + pipeline_offset = self._local_ranks.get(ParallelMode.PIPELINE, 0) + # add seed for data parallel and tensor parallel only if self.is_initialized(ParallelMode.TENSOR): - dp_rank = self._local_ranks.get(ParallelMode.DATA, 0) + 1 - tp_seed = parallel_seed + dp_rank * 128 + tp_rank = self.get_local_rank(ParallelMode.TENSOR) + # 100 is only to increase the diff in seeds between pipeline stages + tp_rank_with_offset = tp_rank + pipeline_offset * 1024 + tp_seed = seed + tp_rank_with_offset add_seed(ParallelMode.TENSOR, tp_seed) set_mode(ParallelMode.DATA) From 073b2fbc1fdffb2352da90306d5ca79b1a2c58d0 Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Wed, 8 Dec 2021 03:52:42 +0100 Subject: [PATCH 2/2] update api for better usability --- colossalai/__init__.py | 4 +- colossalai/amp/__init__.py | 32 + colossalai/{engine => }/amp/amp_type.py | 2 +- colossalai/amp/apex_amp/__init__.py | 15 + colossalai/amp/apex_amp/apex_amp.py | 23 + colossalai/amp/naive_amp/__init__.py | 20 + .../naive_amp/_fp16_optimizer.py} | 20 +- 
colossalai/amp/naive_amp/naive_amp.py | 65 ++ colossalai/amp/torch_amp/__init__.py | 18 + .../torch_amp/_grad_scaler.py} | 6 +- colossalai/amp/torch_amp/torch_amp.py | 54 ++ colossalai/builder/__init__.py | 10 +- colossalai/builder/builder.py | 61 +- colossalai/builder/pipeline.py | 30 +- colossalai/context/__init__.py | 4 +- colossalai/context/_utils.py | 70 -- colossalai/context/config.py | 4 + colossalai/context/parallel_context.py | 163 ++-- .../initializer_1d.py | 1 - .../initializer_2p5d.py | 22 +- colossalai/core.py | 12 +- colossalai/engine/__init__.py | 2 - colossalai/engine/_base_engine.py | 165 +--- colossalai/engine/amp/__init__.py | 2 - colossalai/engine/schedule/__init__.py | 6 +- colossalai/engine/schedule/_base_schedule.py | 68 +- colossalai/engine/schedule/_no_pipeline.py | 197 ----- .../engine/schedule/_non_pipeline_schedule.py | 61 ++ .../{_pipeline.py => _pipeline_schedule.py} | 71 +- colossalai/engine/schedule/_utils.py | 27 - colossalai/initialize.py | 595 +++++++-------- colossalai/logging/__init__.py | 28 +- colossalai/logging/logging.py | 104 ++- colossalai/nn/__init__.py | 1 - colossalai/nn/data/__init__.py | 3 - colossalai/nn/data/_utils.py | 14 - colossalai/nn/data/base_dataset.py | 17 - colossalai/nn/data/caltech101_dataset.py | 43 -- colossalai/nn/data/cifar10_dataset.py | 44 -- colossalai/nn/data/sampler/__init__.py | 4 - colossalai/nn/layer/__init__.py | 4 +- colossalai/nn/layer/_common_utils.py | 69 +- .../nn/layer/non_parallel_layers/__init__.py | 8 + .../layers.py => non_parallel_layers/_vit.py} | 46 +- colossalai/nn/layer/parallel_1d/__init__.py | 2 +- colossalai/nn/layer/parallel_1d/_vit.py | 36 +- colossalai/nn/layer/parallel_1d/layers.py | 42 +- colossalai/nn/layer/parallel_2d/_vit.py | 32 +- colossalai/nn/layer/parallel_2d/layers.py | 12 +- .../nn/layer/parallel_2p5d/_transformer.py | 11 +- colossalai/nn/layer/parallel_2p5d/_vit.py | 48 +- colossalai/nn/layer/parallel_2p5d/layers.py | 13 +- 
colossalai/nn/layer/parallel_3d/_vit.py | 35 +- colossalai/nn/layer/parallel_3d/layers.py | 16 +- .../parallel_vision_transformer/__init__.py | 3 - .../parallel_vision_transformer/layers.py | 59 -- .../nn/layer/vanilla_resnet/__init__.py | 5 - .../nn/layer/vanilla_resnet/basic_block.py | 64 -- .../nn/layer/vanilla_resnet/bottleneck.py | 69 -- colossalai/nn/layer/vanilla_resnet/conv.py | 15 - .../nn/layer/vanilla_resnet/reslayer.py | 63 -- .../vanilla_vision_transformer/__init__.py | 7 - colossalai/nn/loss/__init__.py | 1 - colossalai/nn/loss/base_loss.py | 13 - colossalai/nn/lr_scheduler/delayed.py | 7 + colossalai/nn/lr_scheduler/multistep.py | 2 - colossalai/nn/lr_scheduler/onecycle.py | 8 - colossalai/nn/lr_scheduler/poly.py | 2 - colossalai/nn/lr_scheduler/torch.py | 4 - colossalai/nn/model/__init__.py | 6 +- .../{base_model.py => model_from_config.py} | 5 +- .../nn/model/vanilla_resnet/__init__.py | 3 - colossalai/nn/model/vanilla_resnet/resnet.py | 163 ---- .../nn/model/vision_transformer/__init__.py | 3 - colossalai/nn/optimizer/__init__.py | 8 +- colossalai/nn/optimizer/_utils.py | 194 ----- .../nn/optimizer/colossalai_optimizer.py | 47 ++ colossalai/nn/optimizer/fused_adam.py | 2 +- colossalai/nn/optimizer/fused_lamb.py | 2 +- colossalai/nn/optimizer/fused_sgd.py | 2 +- .../zero_redundancy_optimizer_level_1.py | 707 ------------------ colossalai/registry/__init__.py | 7 +- colossalai/trainer/__init__.py | 4 +- colossalai/trainer/_trainer.py | 164 ++-- colossalai/trainer/hooks/_base_hook.py | 43 +- colossalai/trainer/hooks/_checkpoint_hook.py | 48 +- colossalai/trainer/hooks/_log_hook.py | 186 ++--- .../trainer/hooks/_lr_scheduler_hook.py | 49 +- colossalai/trainer/hooks/_metric_hook.py | 92 +-- colossalai/utils/__init__.py | 37 +- colossalai/utils/common.py | 193 ++++- colossalai/utils/data_sampler/__init__.py | 4 + .../data_sampler}/base_sampler.py | 0 .../data_sampler}/data_parallel_sampler.py | 53 +- .../utils/gradient_accumulation/__init__.py | 29 + 
.../_gradient_accumulation.py | 154 ++++ colossalai/utils/memory.py | 30 +- .../multi_tensor_apply/__init__.py | 0 .../multi_tensor_apply/multi_tensor_apply.py | 0 colossalai/zero/__init__.py | 28 + .../{nn/optimizer => zero}/loss_scaler.py | 0 .../zero_redundancy_optimizer_level_2.py | 23 +- .../zero_redundancy_optimizer_level_3.py | 39 +- docs/run_demo.md | 4 +- docs/run_demo_zh.md | 4 +- docs/trainer_engine.md | 2 +- docs/trainer_engine_zh.md | 2 +- examples/colossal_cifar_demo.ipynb | 178 ++--- examples/run_trainer.py | 4 +- examples/vit-b16/train_dali.py | 4 +- model_zoo/__init__.py | 2 - model_zoo/bert/__init__.py | 0 model_zoo/mlp_mixer/__init__.py | 1 - model_zoo/vit/__init__.py | 2 - .../vit/vision_transformer_from_config.py | 4 +- tests/test_config/sample_config.py | 4 +- tests/test_context/test_2d_init.py | 20 +- tests/test_context/test_2p5d_init.py | 19 +- tests/test_context/test_3d_init.py | 20 +- tests/test_data/test_cifar10_dataset.py | 47 +- tests/test_data/test_data_parallel_sampler.py | 58 +- .../test_deterministic_dataloader.py | 68 +- .../configs/vit_2d.py | 150 ---- .../configs/vit_2p5d.py | 144 ---- .../run_cifar10_vit2d_with_pipeline.py | 139 ++++ .../test.sh | 3 +- .../test_vit_2d/test_vit_2d.py | 87 --- .../test_vit_2p5d/test_vit_2p5d.py | 89 --- .../vit_t_2d.py | 74 ++ .../configs/non_pipeline_resnet_apex_amp.py | 26 - tests/test_engine/test.sh | 2 +- .../test_engine/test_engine_apex_amp.py | 114 +++ .../test_engine/test_engine_naive_amp.py | 113 +++ .../test_engine/test_engine_no_amp.py | 110 +++ .../test_engine/test_engine_torch_amp.py | 111 +++ .../test_engine_apex_amp.py | 46 -- .../test_engine_no_amp.py | 49 -- .../test_engine_torch_amp.py | 48 -- .../test_pipeline_engine/test_engine.py | 45 -- tests/test_fp16_optimizer/configs/vit_2d.py | 143 ---- tests/test_fp16_optimizer/test.sh | 4 - .../test_vit_2d/test_vit_2d.py | 85 --- tests/test_layers/test.sh | 2 +- tests/test_layers/test_1d/test_1d.py | 15 +- 
tests/test_layers/test_1d/test_layer.py | 28 +- tests/test_layers/test_2d/test_2d.py | 12 +- tests/test_layers/test_2p5d/test_2p5d.py | 12 +- tests/test_layers/test_2p5d/test_operation.py | 15 +- tests/test_layers/test_3d/test_3d.py | 19 +- tests/test_layers/test_3d/test_layer.py | 26 +- tests/test_layers/test_3d/test_operation.py | 16 +- .../test_sequence/test_sequence.py | 16 +- tests/test_lr_scheduler/test_lr_scheduler.py | 69 -- .../test_vanilla_resnet.py | 98 --- .../test_vision_transformer/configs/vit_1d.py | 137 ---- .../test_vision_transformer/configs/vit_2d.py | 107 --- .../configs/vit_2d_imagenet.py | 105 --- .../configs/vit_2p5d.py | 130 ---- .../test_vision_transformer/configs/vit_3d.py | 155 ---- .../configs/vit_3d_imagenet.py | 119 --- .../configs/vit_vanilla.py | 56 -- .../test_vision_transformer/test.sh | 4 - .../test_vit_1d/test_vit_1d.py | 104 --- .../2d-nproc4-lr1e-3/acc-2D-lr1e-3.jpg | Bin 29576 -> 0 bytes .../2d-nproc4-lr1e-3/alignment.o3475503 | 177 ----- .../2d-nproc4-lr1e-3/loss-2D-lr1e-3.jpg | Bin 37591 -> 0 bytes .../2d-nproc4-lr1e-4/acc-2D-lr1e-4.jpg | Bin 29143 -> 0 bytes .../2d-nproc4-lr1e-4/alignment.o3472937 | 177 ----- .../2d-nproc4-lr1e-4/loss-2D-lr1e-4.jpg | Bin 36154 -> 0 bytes .../acc-vanilla-lr1e-3.jpg | Bin 30033 -> 0 bytes .../vanilla-nproc1-lr1e-3/alignment.o3476018 | 165 ---- .../loss-vanilla-lr1e-3.jpg | Bin 37624 -> 0 bytes .../test_vit_2d/test_vit_2d.py | 84 --- .../test_vit_2p5d/log/111log1e-3.txt | 103 --- .../test_vit_2p5d/log/111log1e-3hxmodel.txt | 196 ----- .../test_vit_2p5d/log/111log1e-4.txt | 103 --- .../test_vit_2p5d/log/111log1e-4hxmodel.txt | 195 ----- .../test_vit_2p5d/log/421log1e-3.txt | 115 --- .../test_vit_2p5d/log/421log1e-4.txt | 115 --- .../test_vit_2p5d/log/822log1e-3.txt | 131 ---- .../test_vit_2p5d/log/822log1e-4.txt | 131 ---- .../test_vit_2p5d/test_vit_2p5d.py | 86 --- .../test_vit_3d/profiling_3d.py | 360 --------- .../test_vit_3d/test_vit_3d.py | 205 ----- .../test_vit_vanilla.py | 28 - 
.../configs/test_trainer_resnet.py | 72 -- tests/test_trainer/test.sh | 3 +- .../test_pipeline/debug_schedule.py | 0 .../test_pipeline/test_p2p.py | 6 +- .../test_pipeline/test_partition.py | 4 +- .../test_pipeline/test_schedule.py | 4 +- tests/test_trainer/test_trainer.py | 29 - .../test_trainer_with_non_pipe_schedule.py | 113 +++ .../test_trainer_with_pipe_schedule.py | 146 ++++ .../test_utils/test_gradient_accumluation.py | 117 +++ tests/test_zero_data_parallel/config.py | 87 --- tests/test_zero_data_parallel/test_zero.py | 226 +++--- tests/test_zero_data_parallel/test_zero.sh | 2 +- tests/test_zero_tensor_parallel/components.py | 76 ++ .../configs/vit_2d_zero1.py | 159 ---- .../configs/vit_2d_zero2.py | 139 +--- .../configs/vit_2d_zero3.py | 139 +--- tests/test_zero_tensor_parallel/test.sh | 2 +- .../test_zero_tensor_parallel/test_vit_2d.py | 102 +++ .../test_vit_2d/test_vit_2d.py | 84 --- 205 files changed, 3635 insertions(+), 8625 deletions(-) create mode 100644 colossalai/amp/__init__.py rename colossalai/{engine => }/amp/amp_type.py (83%) create mode 100644 colossalai/amp/apex_amp/__init__.py create mode 100644 colossalai/amp/apex_amp/apex_amp.py create mode 100644 colossalai/amp/naive_amp/__init__.py rename colossalai/{nn/optimizer/fp16_optimizer.py => amp/naive_amp/_fp16_optimizer.py} (97%) create mode 100644 colossalai/amp/naive_amp/naive_amp.py create mode 100644 colossalai/amp/torch_amp/__init__.py rename colossalai/{engine/amp/grad_scaler.py => amp/torch_amp/_grad_scaler.py} (99%) create mode 100644 colossalai/amp/torch_amp/torch_amp.py delete mode 100644 colossalai/context/_utils.py delete mode 100644 colossalai/engine/amp/__init__.py delete mode 100644 colossalai/engine/schedule/_no_pipeline.py create mode 100644 colossalai/engine/schedule/_non_pipeline_schedule.py rename colossalai/engine/schedule/{_pipeline.py => _pipeline_schedule.py} (82%) delete mode 100644 colossalai/engine/schedule/_utils.py delete mode 100644 colossalai/nn/data/__init__.py 
delete mode 100644 colossalai/nn/data/_utils.py delete mode 100644 colossalai/nn/data/base_dataset.py delete mode 100644 colossalai/nn/data/caltech101_dataset.py delete mode 100644 colossalai/nn/data/cifar10_dataset.py delete mode 100644 colossalai/nn/data/sampler/__init__.py create mode 100644 colossalai/nn/layer/non_parallel_layers/__init__.py rename colossalai/nn/layer/{vanilla_vision_transformer/layers.py => non_parallel_layers/_vit.py} (88%) delete mode 100644 colossalai/nn/layer/parallel_vision_transformer/__init__.py delete mode 100644 colossalai/nn/layer/parallel_vision_transformer/layers.py delete mode 100644 colossalai/nn/layer/vanilla_resnet/__init__.py delete mode 100644 colossalai/nn/layer/vanilla_resnet/basic_block.py delete mode 100644 colossalai/nn/layer/vanilla_resnet/bottleneck.py delete mode 100644 colossalai/nn/layer/vanilla_resnet/conv.py delete mode 100644 colossalai/nn/layer/vanilla_resnet/reslayer.py delete mode 100644 colossalai/nn/layer/vanilla_vision_transformer/__init__.py delete mode 100644 colossalai/nn/loss/base_loss.py rename colossalai/nn/model/{base_model.py => model_from_config.py} (92%) delete mode 100644 colossalai/nn/model/vanilla_resnet/__init__.py delete mode 100644 colossalai/nn/model/vanilla_resnet/resnet.py delete mode 100644 colossalai/nn/model/vision_transformer/__init__.py delete mode 100644 colossalai/nn/optimizer/_utils.py create mode 100644 colossalai/nn/optimizer/colossalai_optimizer.py delete mode 100644 colossalai/nn/optimizer/zero_redundancy_optimizer_level_1.py create mode 100644 colossalai/utils/data_sampler/__init__.py rename colossalai/{nn/data/sampler => utils/data_sampler}/base_sampler.py (100%) rename colossalai/{nn/data/sampler => utils/data_sampler}/data_parallel_sampler.py (68%) create mode 100644 colossalai/utils/gradient_accumulation/__init__.py create mode 100644 colossalai/utils/gradient_accumulation/_gradient_accumulation.py rename colossalai/{nn => utils}/multi_tensor_apply/__init__.py (100%) 
rename colossalai/{nn => utils}/multi_tensor_apply/multi_tensor_apply.py (100%) create mode 100644 colossalai/zero/__init__.py rename colossalai/{nn/optimizer => zero}/loss_scaler.py (100%) rename colossalai/{nn/optimizer => zero}/zero_redundancy_optimizer_level_2.py (99%) rename colossalai/{nn/optimizer => zero}/zero_redundancy_optimizer_level_3.py (99%) create mode 100644 model_zoo/bert/__init__.py rename colossalai/nn/model/vision_transformer/vision_transformer.py => model_zoo/vit/vision_transformer_from_config.py (95%) delete mode 100644 tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py delete mode 100644 tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py create mode 100644 tests/test_data_pipeline_tensor_parallel/run_cifar10_vit2d_with_pipeline.py delete mode 100644 tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py delete mode 100644 tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py create mode 100644 tests/test_data_pipeline_tensor_parallel/vit_t_2d.py create mode 100644 tests/test_engine/test_engine/test_engine_apex_amp.py create mode 100644 tests/test_engine/test_engine/test_engine_naive_amp.py create mode 100644 tests/test_engine/test_engine/test_engine_no_amp.py create mode 100644 tests/test_engine/test_engine/test_engine_torch_amp.py delete mode 100644 tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py delete mode 100644 tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py delete mode 100644 tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py delete mode 100644 tests/test_engine/test_pipeline_engine/test_engine.py delete mode 100644 tests/test_fp16_optimizer/configs/vit_2d.py delete mode 100644 tests/test_fp16_optimizer/test.sh delete mode 100644 tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py delete mode 100644 tests/test_lr_scheduler/test_lr_scheduler.py delete mode 100644 tests/test_models/test_vanilla_resnet/test_vanilla_resnet.py delete mode 
100644 tests/test_models/test_vision_transformer/configs/vit_1d.py delete mode 100644 tests/test_models/test_vision_transformer/configs/vit_2d.py delete mode 100644 tests/test_models/test_vision_transformer/configs/vit_2d_imagenet.py delete mode 100644 tests/test_models/test_vision_transformer/configs/vit_2p5d.py delete mode 100644 tests/test_models/test_vision_transformer/configs/vit_3d.py delete mode 100644 tests/test_models/test_vision_transformer/configs/vit_3d_imagenet.py delete mode 100644 tests/test_models/test_vision_transformer/configs/vit_vanilla.py delete mode 100644 tests/test_models/test_vision_transformer/test.sh delete mode 100644 tests/test_models/test_vision_transformer/test_vit_1d/test_vit_1d.py delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-3/acc-2D-lr1e-3.jpg delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-3/alignment.o3475503 delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-3/loss-2D-lr1e-3.jpg delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-4/acc-2D-lr1e-4.jpg delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-4/alignment.o3472937 delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-4/loss-2D-lr1e-4.jpg delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/acc-vanilla-lr1e-3.jpg delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/alignment.o3476018 delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/loss-vanilla-lr1e-3.jpg delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3.txt delete 
mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3hxmodel.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4hxmodel.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-3.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-4.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-3.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-4.txt delete mode 100644 tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py delete mode 100644 tests/test_models/test_vision_transformer/test_vit_3d/profiling_3d.py delete mode 100644 tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py delete mode 100644 tests/test_models/test_vision_transformer/test_vit_vanilla.py rename tests/{test_engine => test_trainer}/test_pipeline/debug_schedule.py (100%) rename tests/{test_engine => test_trainer}/test_pipeline/test_p2p.py (97%) rename tests/{test_engine => test_trainer}/test_pipeline/test_partition.py (91%) rename tests/{test_engine => test_trainer}/test_pipeline/test_schedule.py (92%) delete mode 100644 tests/test_trainer/test_trainer.py create mode 100644 tests/test_trainer/test_trainer_with_non_pipe_schedule.py create mode 100644 tests/test_trainer/test_trainer_with_pipe_schedule.py create mode 100644 tests/test_utils/test_gradient_accumluation.py create mode 100644 tests/test_zero_tensor_parallel/components.py delete mode 100644 tests/test_zero_tensor_parallel/configs/vit_2d_zero1.py create mode 100644 tests/test_zero_tensor_parallel/test_vit_2d.py delete mode 100644 tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py diff --git a/colossalai/__init__.py b/colossalai/__init__.py index 854d941bc33e..e7ea7d65a431 100644 --- 
a/colossalai/__init__.py +++ b/colossalai/__init__.py @@ -1,4 +1,4 @@ -from .initialize import init_dist, initialize -from .nn import * +from .initialize import (initialize, launch, launch_from_openmpi, + launch_from_slurm, launch_from_torch, get_default_parser) __version__ = '0.0.1' diff --git a/colossalai/amp/__init__.py b/colossalai/amp/__init__.py new file mode 100644 index 000000000000..268eced66fd7 --- /dev/null +++ b/colossalai/amp/__init__.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from .amp_type import AMP_TYPE +from colossalai.context import Config +import torch.nn as nn +from torch.optim import Optimizer +from torch.nn.modules.loss import _Loss +from .torch_amp import convert_to_torch_amp +from .apex_amp import convert_to_apex_amp +from .naive_amp import convert_to_naive_amp + + +def convert_to_amp(model: nn.Module, + optimizer: Optimizer, + criterion: _Loss, + mode: AMP_TYPE, + amp_config: Config = None): + assert isinstance(mode, AMP_TYPE), \ + f'expected the argument mode be AMP_TYPE, but got {type(mode)}' + + if amp_config is None: + amp_config = Config() + + if mode == AMP_TYPE.TORCH: + model, optimizer, criterion = convert_to_torch_amp(model, optimizer, criterion, amp_config) + elif mode == AMP_TYPE.APEX: + model, optimizer = convert_to_apex_amp(model, optimizer, amp_config) + elif mode == AMP_TYPE.NAIVE: + model, optimizer = convert_to_naive_amp(model, optimizer, amp_config) + + return model, optimizer, criterion diff --git a/colossalai/engine/amp/amp_type.py b/colossalai/amp/amp_type.py similarity index 83% rename from colossalai/engine/amp/amp_type.py rename to colossalai/amp/amp_type.py index 7f7c5a659df0..6f322f866cfc 100644 --- a/colossalai/engine/amp/amp_type.py +++ b/colossalai/amp/amp_type.py @@ -7,4 +7,4 @@ class AMP_TYPE(Enum): APEX = 'apex' TORCH = 'torch' - PARALLEL = 'parallel' + NAIVE = 'naive' diff --git a/colossalai/amp/apex_amp/__init__.py b/colossalai/amp/apex_amp/__init__.py new file mode 100644 
index 000000000000..2d0ff9771360 --- /dev/null +++ b/colossalai/amp/apex_amp/__init__.py @@ -0,0 +1,15 @@ +from .apex_amp import ApexAMPOptimizer +import torch.nn as nn +from torch.optim import Optimizer +import apex.amp as apex_amp + + +def convert_to_apex_amp(model: nn.Module, + optimizer: Optimizer, + amp_config): + model, optimizer = apex_amp.initialize(model, optimizer, **amp_config) + optimizer = ApexAMPOptimizer(optimizer) + return model, optimizer + + +__all__ = ['convert_to_apex_amp', 'ApexAMPOptimizer'] diff --git a/colossalai/amp/apex_amp/apex_amp.py b/colossalai/amp/apex_amp/apex_amp.py new file mode 100644 index 000000000000..d44478364ab7 --- /dev/null +++ b/colossalai/amp/apex_amp/apex_amp.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch.nn as nn +try: + import apex.amp as apex_amp +except: + pass +from torch import Tensor + +from colossalai.nn.optimizer import ColossalaiOptimizer +from colossalai.utils import clip_grad_norm_fp32 + + +class ApexAMPOptimizer(ColossalaiOptimizer): + + def backward(self, loss: Tensor): + with apex_amp.scale_loss(loss, self.optim) as scaled_loss: + scaled_loss.backward() + + def clip_grad_norm(self, model: nn.Module, max_norm: float): + if max_norm > 0: + clip_grad_norm_fp32(apex_amp.master_params(self.optim), max_norm) diff --git a/colossalai/amp/naive_amp/__init__.py b/colossalai/amp/naive_amp/__init__.py new file mode 100644 index 000000000000..e3a49c7e8780 --- /dev/null +++ b/colossalai/amp/naive_amp/__init__.py @@ -0,0 +1,20 @@ +import torch.nn as nn +from torch.optim import Optimizer +from colossalai.utils import is_no_pp_or_last_stage + +from .naive_amp import NaiveAMPOptimizer, NaiveAMPModel + + +def convert_to_naive_amp(model: nn.Module, + optimizer: Optimizer, + amp_config): + if is_no_pp_or_last_stage(): + model = NaiveAMPModel(model, output_to_fp32=True) + else: + model = NaiveAMPModel(model, output_to_fp32=False) + + optimizer = NaiveAMPOptimizer(optimizer, **amp_config) + 
return model, optimizer + + +__all__ = ['convert_to_naive_amp', 'NaiveAMPOptimizer'] diff --git a/colossalai/nn/optimizer/fp16_optimizer.py b/colossalai/amp/naive_amp/_fp16_optimizer.py similarity index 97% rename from colossalai/nn/optimizer/fp16_optimizer.py rename to colossalai/amp/naive_amp/_fp16_optimizer.py index 4ae970910c5c..d917a97bcfeb 100644 --- a/colossalai/nn/optimizer/fp16_optimizer.py +++ b/colossalai/amp/naive_amp/_fp16_optimizer.py @@ -12,11 +12,9 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.registry import OPTIMIZER_WRAPPERS -from colossalai.utils import print_rank_0 -from ._utils import copy_tensor_parallel_attributes, clip_grad_norm_fp32, count_zeros_fp32 -from ..multi_tensor_apply import multi_tensor_applier +from colossalai.logging import get_dist_logger +from colossalai.utils import (print_rank_0, copy_tensor_parallel_attributes, + clip_grad_norm_fp32, count_zeros_fp32, multi_tensor_applier) def _zero_grad_group_helper(group, set_to_none): @@ -92,7 +90,7 @@ def __init__(self, self._growth_tracker = 0 self._hysteresis_tracker = self.hysteresis - self._logger = get_global_dist_logger() + self._logger = get_dist_logger() @property def scale(self): @@ -145,7 +143,6 @@ def load_state_dict(self, state_dict): self._max_scale = state_dict['max_scale'] -@OPTIMIZER_WRAPPERS.register_module class FP16Optimizer(Optimizer): """Float16 optimizer for fp16 and bf16 data types. 
@@ -184,13 +181,13 @@ def __init__(self, max_scale: int = 2 ** 32): # default args for compatibility bf16 = False - params_have_main_grad = False + params_have_main_grad = True # have a defaults for compatibility with pytorch optim self.defaults = optimizer.defaults # log config - self._logger = get_global_dist_logger() + self._logger = get_dist_logger() self._logger.info(f"\n========= FP16 Optimizer Config =========\n" f"Optimizer: {optimizer.__class__.__name__}\n" f"clip_grad = {clip_grad}\n" @@ -328,6 +325,7 @@ def _copy_model_grads_to_main_grads(self): else: if model_param.grad is not None: main_param.grad = model_param.grad.float() + # For fp32 grads, we need to reset the grads to main grad. if self.params_have_main_grad: for model_group in self.fp32_from_fp32_groups: @@ -387,10 +385,6 @@ def reload_model_params(self): @torch.no_grad() def step(self): - # for param_group in self.float16_groups: - # for param in param_group: - # print(param.grad is None) - # Copy gradients from model params to main params. 
self._copy_model_grads_to_main_grads() diff --git a/colossalai/amp/naive_amp/naive_amp.py b/colossalai/amp/naive_amp/naive_amp.py new file mode 100644 index 000000000000..dd0b88b44a51 --- /dev/null +++ b/colossalai/amp/naive_amp/naive_amp.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch +import torch.nn as nn +from torch import Tensor +from typing import Union, List, Any, Dict +from torch.optim import Optimizer +import torch.cuda.amp as torch_amp + +from colossalai.nn.optimizer import ColossalaiOptimizer +from ._fp16_optimizer import FP16Optimizer + + +class NaiveAMPOptimizer(ColossalaiOptimizer): + + def __init__(self, optim: Optimizer, *args, **kwargs): + optim = FP16Optimizer(optimizer=optim, *args, **kwargs) + super().__init__(optim) + + def backward(self, loss: Tensor): + loss = self.optim.scale_loss(loss) + loss.backward() + + def step(self): + self.optim.step() + + def clip_grad_norm(self, model: nn.Module, max_norm: float): + pass + + +class NaiveAMPModel(nn.Module): + + def __init__(self, + model: nn.Module, + output_to_fp32: bool = True): + super().__init__() + self.model = model.half() + self._output_to_fp32 = output_to_fp32 + + def _convert_to_fp16(self, input_: Any): + if isinstance(input_, Tensor) and input_.dtype == torch.float32: + input_ = input_.half() + return input_ + + def _convert_to_fp32(self, input_: Any): + if isinstance(input_, Tensor) and input_.dtype == torch.float16: + input_ = input_.float() + return input_ + + def forward(self, *args, **kwargs): + if args: + args = [self._convert_to_fp16(arg) for arg in args] + if kwargs: + for k, v in kwargs.items(): + kwargs[k] = self._convert_to_fp16(v) + + out = self.model(*args, **kwargs) + + if self._output_to_fp32: + if isinstance(out, Tensor): + out = self._convert_to_fp32(out) + elif isinstance(out, (tuple, list)): + out = [self._convert_to_fp32(val) for val in out] + return out diff --git a/colossalai/amp/torch_amp/__init__.py 
b/colossalai/amp/torch_amp/__init__.py new file mode 100644 index 000000000000..b3c5b0c5b6a3 --- /dev/null +++ b/colossalai/amp/torch_amp/__init__.py @@ -0,0 +1,18 @@ +import torch.nn as nn +from torch.optim import Optimizer +from torch.nn.modules.loss import _Loss +from colossalai.context import Config +from .torch_amp import TorchAMPOptimizer, TorchAMPModel, TorchAMPLoss + + +def convert_to_torch_amp(model: nn.Module, + optimizer: Optimizer, + criterion: _Loss, + amp_config: Config): + model = TorchAMPModel(model) + optimizer = TorchAMPOptimizer(optimizer, **amp_config) + criterion = TorchAMPLoss(criterion) + return model, optimizer, criterion + + +__all__ = ['convert_to_torch_amp', 'TorchAMPModel', 'TorchAMPLoss', 'TorchAMPOptimizer'] diff --git a/colossalai/engine/amp/grad_scaler.py b/colossalai/amp/torch_amp/_grad_scaler.py similarity index 99% rename from colossalai/engine/amp/grad_scaler.py rename to colossalai/amp/torch_amp/_grad_scaler.py index 7859d132db17..7e79ecab838a 100644 --- a/colossalai/engine/amp/grad_scaler.py +++ b/colossalai/amp/torch_amp/_grad_scaler.py @@ -1,4 +1,8 @@ -# modified from https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.p +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# modified from https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py +# to support tensor parallel + import torch from collections import defaultdict, abc import warnings diff --git a/colossalai/amp/torch_amp/torch_amp.py b/colossalai/amp/torch_amp/torch_amp.py new file mode 100644 index 000000000000..3963601843b2 --- /dev/null +++ b/colossalai/amp/torch_amp/torch_amp.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch.nn as nn +import torch.cuda.amp as torch_amp + +from torch import Tensor +from torch.nn.modules.loss import _Loss +from torch.optim import Optimizer +from ._grad_scaler import GradScaler + +from colossalai.nn.optimizer import ColossalaiOptimizer +from 
colossalai.utils import clip_grad_norm_fp32 + + +class TorchAMPOptimizer(ColossalaiOptimizer): + + def __init__(self, optim: Optimizer, *args, **kwargs): + super().__init__(optim) + self.scaler = GradScaler(*args, **kwargs) + + def backward(self, loss: Tensor): + self.scaler.scale(loss).backward() + + def step(self): + self.scaler.step(self.optim) + self.scaler.update() + + def clip_grad_norm(self, model: nn.Module, max_norm: float): + if max_norm > 0.0: + self.scaler.unscale_(self.optim) + clip_grad_norm_fp32(model.parameters(), max_norm) + + +class TorchAMPModel(nn.Module): + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @torch_amp.autocast() + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + +class TorchAMPLoss(nn.Module): + + def __init__(self, loss: _Loss): + super().__init__() + self.loss = loss + + @torch_amp.autocast() + def forward(self, *args, **kwargs): + return self.loss(*args, **kwargs) diff --git a/colossalai/builder/__init__.py b/colossalai/builder/__init__.py index 2ae19413269a..6c1105a2d39c 100644 --- a/colossalai/builder/__init__.py +++ b/colossalai/builder/__init__.py @@ -1,10 +1,10 @@ -from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_optimizer_wrapper, - build_layer, build_loss, build_hooks, build_dataset, build_transform, build_data_sampler, +from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_layer, + build_loss, build_hooks, build_dataset, build_transform, build_data_sampler, build_gradient_handler) -from .pipeline import ModelInitializer +from .pipeline import PipelineModelInitializer __all__ = [ - 'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer', 'build_optimizer_wrapper', + 'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer', 'build_layer', 'build_loss', 'build_hooks', 'build_dataset', 'build_transform', 'build_data_sampler', - 
'build_gradient_handler', 'ModelInitializer' + 'build_gradient_handler', 'PipelineModelInitializer' ] diff --git a/colossalai/builder/builder.py b/colossalai/builder/builder.py index c32ad3b39927..6e8e245516f0 100644 --- a/colossalai/builder/builder.py +++ b/colossalai/builder/builder.py @@ -106,7 +106,7 @@ def build_dataset(config): return build_from_registry(config, DATASETS) -def build_optimizer(config, model, params: Iterable = None, need_module=False): +def build_optimizer(config, model): """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`, 'model' and 'params'. @@ -115,23 +115,12 @@ def build_optimizer(config, model, params: Iterable = None, need_module=False): :type config: dict or :class:`colossalai.context.Config` :param model: A model containing parameters for the optimizer :type model: :class:`nn.Module` - :param params: A dict containing parameters for the optimizer - :type params: dict, optional - :param need_module: Indicates whether the optimizer needs a module - :type params: bool, optional - :raises AssertionError: Raises an AssertionError if both `model` and `params` are None :return: An object of :class:`torch.optim.Optimizer` :rtype: :class:`torch.optim.Optimizer` """ - assert model is not None or params is not None, 'arguments model and params can not both be None' - if need_module: - config['module'] = model - elif model is not None: - config['params'] = model.parameters() - elif params is not None: - config['params'] = params - - return build_from_registry(config, OPTIMIZERS) + config_ = config.copy() + config_['params'] = model.parameters() + return build_from_registry(config_, OPTIMIZERS) def build_gradient_handler(config, model, optimizer): @@ -149,8 +138,9 @@ def build_gradient_handler(config, model, optimizer): :rtype: :class:`BaseGradientHandler` """ config_ = config.copy() - mod_type = config_.pop('type') - return GRADIENT_HANDLER.get_module(mod_type)(model, optimizer, **config_) + 
config_['model'] = model + config_['optimizer'] = optimizer + return build_from_registry(config_, GRADIENT_HANDLER) def build_hooks(config, trainer): @@ -164,8 +154,9 @@ def build_hooks(config, trainer): :return: An object of :class:`BaseHook` :rtype: :class:`BaseHook` """ - config['trainer'] = trainer - return build_from_registry(config, HOOKS) + config_ = config.copy() + config_['trainer'] = trainer + return build_from_registry(config_, HOOKS) def build_transform(config): @@ -195,32 +186,8 @@ def build_data_sampler(config, dataset): :rtype: :class:`colossalai.nn.data.sampler.BaseSampler` """ config_ = config.copy() - mod_type = config_.pop('type') - return SAMPLERS.get_module(mod_type)(dataset, **config_) - - -def build_optimizer_wrapper(config, optimizer, model=None): - """Returns an optimizer wrapper object of :class:`torch.optim.Optimizer` constructed - from `config`, `model` and `optimizer`. - - :param config: A python dict or a :class:`colossalai.context.Config` object - containing information used in the construction of the return object - :type config: dict or :class:`colossalai.context.Config` - :param optimizer: An optimizer object containing parameters for the gradient handler - :type optimizer: :class:`torch.optim.Optimizer` - :param model: A model containing parameters for the gradient handler - :type model: :class:`nn.Module`, optional - :return: An object of :class:`torch.optim.Optimizer` - :rtype: :class:`torch.optim.Optimizer` - """ - config_ = config.copy() - mod_type = config_.pop('type') - - # LSG: special treatment for zeor level 3 - if mod_type == 'ZeroRedundancyOptimizer_Level_3': - return OPTIMIZER_WRAPPERS.get_module(mod_type)(model, optimizer, **config_) - else: - return OPTIMIZER_WRAPPERS.get_module(mod_type)(optimizer, **config_) + config_['dataset'] = dataset + return build_from_registry(config_, DATA_SAMPLERS) def build_lr_scheduler(config, optimizer): @@ -241,8 +208,8 @@ def build_lr_scheduler(config, optimizer): :rtype: 
:class:`torch.optim.lr_scheduler` """ config_ = config.copy() - mod_type = config_.pop('type') - return LR_SCHEDULERS.get_module(mod_type)(optimizer, **config_) + config_['optimizer'] = optimizer + return build_from_registry(config_, LR_SCHEDULERS) def build_schedule(config): diff --git a/colossalai/builder/pipeline.py b/colossalai/builder/pipeline.py index 4de5c96cbbea..5a568a909445 100644 --- a/colossalai/builder/pipeline.py +++ b/colossalai/builder/pipeline.py @@ -4,7 +4,7 @@ from colossalai.builder import build_model, build_layer from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger from colossalai.utils import set_to_cuda @@ -111,21 +111,21 @@ def _binary_search(weights, num): return intervals -def _partition_uniform(num_items, num_parts, num_chunks): +def _partition_uniform(num_items, pipeline_parallel_size, num_chunks): assert num_items % num_chunks == 0, \ "Layer length should be divided by the number of chunks, otherwise parameter method is recomended" - logger = get_global_dist_logger() - parts = [[] for _ in range(num_parts)] + logger = get_dist_logger() + parts = [[] for _ in range(pipeline_parallel_size)] partition_items = num_items // num_chunks for idx in range(num_chunks): base_idx = idx * partition_items - chunk_size = partition_items // num_parts - left = num_parts - partition_items % num_parts + chunk_size = partition_items // pipeline_parallel_size + left = pipeline_parallel_size - partition_items % pipeline_parallel_size if chunk_size == 0: logger.warning("Some nodes in Pipeline have no requests") - for p in range(num_parts): + for p in range(pipeline_parallel_size): st = base_idx base_idx += chunk_size + (p >= left) parts[p].append((st, base_idx)) @@ -133,34 +133,34 @@ def _partition_uniform(num_items, num_parts, num_chunks): return parts -def _partition_balanced(weights, num_parts, 
num_chunks): - num_total = num_parts * num_chunks +def _partition_balanced(weights, pipeline_parallel_size, num_chunks): + num_total = pipeline_parallel_size * num_chunks num_items = len(weights) if num_items <= num_total: - return _partition_uniform(num_items, num_parts, num_chunks) + return _partition_uniform(num_items, pipeline_parallel_size, num_chunks) intervals = _binary_search(weights, num_total) current = 0 - parts = [[] for _ in range(num_parts)] + parts = [[] for _ in range(pipeline_parallel_size)] for inter in intervals: parts[current].append(inter) - current = (current + 1) % num_parts + current = (current + 1) % pipeline_parallel_size return parts -class ModelInitializer(): +class PipelineModelInitializer(): def __init__(self, config, num_chunks, verbose=False): self.num_chunks = num_chunks self.ori_model = build_model(config) self.layers = self.ori_model.layers_cfg layer_length = len(self.layers) self.verbose = verbose - self._logger = get_global_dist_logger() + self._logger = get_dist_logger() self._logger.info(f"The total length of layers is {layer_length}", ranks=[0]) - def model_initialize(self, partition_method='parameter'): + def initialize(self, partition_method='parameter'): # Some space for initializing comunication groups self._interval = None self._partition_layers(method=partition_method) diff --git a/colossalai/context/__init__.py b/colossalai/context/__init__.py index 3009779c80a1..ac14087739a7 100644 --- a/colossalai/context/__init__.py +++ b/colossalai/context/__init__.py @@ -1,5 +1,5 @@ -from .config import Config +from .config import Config, ConfigException from .parallel_context import ParallelContext -from .parallel_context import ParallelMode +from .parallel_mode import ParallelMode from .process_group_initializer import * from .random import * diff --git a/colossalai/context/_utils.py b/colossalai/context/_utils.py deleted file mode 100644 index a770ea7b468d..000000000000 --- a/colossalai/context/_utils.py +++ /dev/null @@ -1,70 
+0,0 @@ -import math - - -def set_parallel_size(obj, config: dict, key: str, attr_name: str): - if key in config: - ele = config[key] - if isinstance(ele, int): - setattr(obj, attr_name, ele) - elif isinstance(ele, dict): - setattr(obj, attr_name, ele['size']) - else: - raise NotImplementedError( - f"Parallel configuration does not support this kind of argument, please use int or dict" - ) - - -def add_tensor_pg(pg_init, mode, size, depth=None): - if mode == '1d': - pg_init.append(dict( - type='Initializer1D', - parallel_size=size - )) - elif mode == '2d': - dim = math.floor(math.sqrt(size)) - pg_init.append(dict( - type='Initializer2D_Col', - summa_dim=dim - )) - pg_init.append(dict( - type='Initializer2D_Row', - summa_dim=dim - )) - elif mode == '2.5d': - dim = math.floor(math.sqrt(size // depth)) - pg_init.append(dict( - type='Initializer_Tesseract_ROW', - tesseract_dim=dim, - tesseract_dep=depth - )) - pg_init.append(dict( - type='Initializer_Tesseract_COL', - tesseract_dim=dim, - tesseract_dep=depth - )) - pg_init.append(dict( - type='Initializer_Tesseract_DEP', - tesseract_dim=dim, - tesseract_dep=depth - )) - pg_init.append(dict( - type='Initializer_Tesseract_XZ', - tesseract_dim=dim, - tesseract_dep=depth - )) - elif mode == '3d': - dim = math.floor(math.pow(size, 1.0 / 3.0) + 0.5) - pg_init.append(dict( - type='ParallelInitializer3D_Input', - depth=dim - )) - pg_init.append(dict( - type='ParallelInitializer3D_Weight', - depth=dim - )) - pg_init.append(dict( - type='ParallelInitializer3D_Output', - depth=dim - )) - else: - raise NotImplementedError("This kind of tensor splitting has not been implemented yet") diff --git a/colossalai/context/config.py b/colossalai/context/config.py index 52a375aa1a44..5943aa7ed798 100644 --- a/colossalai/context/config.py +++ b/colossalai/context/config.py @@ -97,3 +97,7 @@ def from_file(filename: str): sys.path.pop(0) return config + + +class ConfigException(Exception): + pass diff --git 
a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index 6f4ea48327d6..4f8e9f80780f 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -import os import random from typing import Union @@ -11,8 +10,8 @@ from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING from colossalai.context.config import Config +from colossalai.logging import get_dist_logger from colossalai.registry import DIST_GROUP_INITIALIZER -from ._utils import set_parallel_size from .parallel_mode import ParallelMode from .random import add_seed, get_seeds, set_mode @@ -21,11 +20,24 @@ class ParallelContext: """This class provides interface functions for users to get the parallel context, such as the global rank, the local rank, the world size, etc. of each device. - :param args: The distributed arguments in the system - :type args: dict """ - def __init__(self, args=None): + __instance = None + + @staticmethod + def get_instance(): + if ParallelContext.__instance is None: + ParallelContext() + return ParallelContext.__instance + + def __init__(self): + # create a singleton instance + if ParallelContext.__instance is not None: + raise Exception( + 'ParallelContext is a singleton class, you should get the instance by colossalai.core.global_context') + else: + ParallelContext.__instance = self + # distributed settings self._global_ranks = dict() self._local_ranks = dict() @@ -34,7 +46,6 @@ def __init__(self, args=None): self._ranks_in_group = dict() # load config from file - self._dist_args = args self._config = None # default 3D parallel args, will be overwritten during process group intialization @@ -43,10 +54,22 @@ def __init__(self, args=None): self.pipeline_parallel_size = 1 self.tensor_parallel_size = 1 + # logging + self._verbose = False + self._logger = get_dist_logger() + @property def config(self): return self._config + @property + def 
verbose(self): + return self._verbose + + @verbose.setter + def verbose(self, verbose_: bool): + self._verbose = verbose_ + def load_config(self, config: Union[dict, str]): """Loads the configuration from either a dict or a file. @@ -62,14 +85,6 @@ def load_config(self, config: Union[dict, str]): else: raise TypeError("Invalid type for config, only dictionary or string is supported") - def set_dist_args(self, args): - """Sets the distributed arguments. - - :param args: The distributed arguments in the system - :type args: dict - """ - self._dist_args = args - @staticmethod def _check_parallel_mode(parallel_mode: ParallelMode): assert isinstance(parallel_mode, ParallelMode) @@ -268,41 +283,36 @@ def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list): self._check_parallel_mode(parallel_mode) self._ranks_in_group[parallel_mode] = ranks - def init_global_dist(self, addr=None, port=None): - """Initializes the global distributed environment. - - :param addr: The IP address of the current device - :type addr: str, optional - :param port: The port to be used in the system of the current device - :type port: int, optional + def init_global_dist(self, + rank: int, + world_size: int, + backend: str, + host: str, + port: int + ): + """Initializes the global distributed environment + :param rank: rank for the default process group + :type rank: int + :param world_size: world size of the default process group + :type world_size: int + :param host: the master address for distributed training + :type host: str + :param port: the master port for distributed training + :type port: str + :param backend: backend for torch.distributed + :type backend: str """ - # get config - local_rank = self._dist_args.local_rank - rank = self._dist_args.rank - world_size = self._dist_args.world_size - if local_rank is None: - local_rank = os.getenv('LOCAL_RANK') - if rank is None: - rank = os.getenv('RANK') - if world_size is None: - world_size = os.getenv('WORLD_SIZE') - # default 
env config, overwrite by exporting - # them in your bash script - - addr = os.getenv('MASTER_ADDR', 'localhost') if addr is None else addr - port = os.getenv('MASTER_PORT', '8008') if port is None else port - init_method = f'tcp://{addr}:{port}' - - dist.init_process_group(backend=self._dist_args.backend, - rank=rank, + # initialize the default process group + init_method = f'tcp://{host}:{port}' + dist.init_process_group(rank=rank, world_size=world_size, + backend=backend, init_method=init_method) # None will give the default global process group for pytorch dist operations self._register_dist(rank, world_size, None, list(range(world_size)), ParallelMode.GLOBAL) self.add_global_rank(ParallelMode.GLOBAL, rank) - # self._global_ranks[ParallelMode.GLOBAL] = rank def _register_dist(self, local_rank, world_size, process_group, ranks_in_group, mode): @@ -321,7 +331,20 @@ def check_sanity(self): pps = self.pipeline_parallel_size tps = self.tensor_parallel_size ws = self.world_size - assert ws == dps * pps * tps, f"Expected the world size {ws} to be equal to data parallel size ({dps}) * pipeline parallel size ({pps}) * tensor parallel size ({tps})" + assert ws == dps * pps * \ + tps, f"Expected the world size {ws} to be equal to data parallel size ({dps}) * pipeline parallel size ({pps}) * tensor parallel size ({tps})" + + def _set_parallel_size_from_config(self, config: dict, key: str, attr_name: str): + if key in config: + ele = config[key] + if isinstance(ele, int): + setattr(self, attr_name, ele) + elif isinstance(ele, dict): + setattr(self, attr_name, ele['size']) + else: + raise NotImplementedError( + f"Parallel configuration does not support this kind of argument, please use int or dict" + ) def init_parallel_groups(self): """Initializes the parallel groups. 
@@ -334,21 +357,20 @@ def init_parallel_groups(self): world_size = self.get_world_size(ParallelMode.GLOBAL) self.world_size = world_size - assert hasattr(self.config, 'parallel'), 'Expected the field parallel to be present in the config file' - # set parallel size as attributes for global context - parallel_config = self.config.parallel - set_parallel_size(self, parallel_config, 'pipeline', - 'pipeline_parallel_size') - set_parallel_size(self, parallel_config, 'tensor', - 'tensor_parallel_size') + parallel_config = self.config.get('parallel', None) + if parallel_config is not None: + self._set_parallel_size_from_config(parallel_config, 'pipeline', 'pipeline_parallel_size') + self._set_parallel_size_from_config(parallel_config, 'tensor', 'tensor_parallel_size') # the user should not set the data parallel size manually # instead, it should be calculated based on other parallel config self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size) # get the tensor parallel mode and check - tensor_parallel_mode = parallel_config['tensor'].get('mode', None) + tensor_parallel_mode = None + if parallel_config is not None and 'tensor' in parallel_config and 'mode' in parallel_config['tensor']: + tensor_parallel_mode = parallel_config['tensor']['mode'] assert tensor_parallel_mode in ALLOWED_MODES, f"mode in the parallel config must be set to one of {ALLOWED_MODES}" self.check_sanity() @@ -409,23 +431,21 @@ def destroy(self): # destroy global process group dist.destroy_process_group() - def set_device(self): + def set_device(self, device_ordinal: int = None): """Sets distributed processes to be bound to devices. 
""" - devices_per_node = torch.cuda.device_count() global_rank = self.get_global_rank() - device = global_rank % devices_per_node - torch.cuda.set_device(device) - print(f'process rank {global_rank} is bound to device {device}') + if device_ordinal is None: + devices_per_node = torch.cuda.device_count() + device_ordinal = global_rank % devices_per_node + + torch.cuda.set_device(device_ordinal) + if self._verbose: + self._logger.info(f'process rank {global_rank} is bound to device {device_ordinal}') - def set_seed(self): + def set_seed(self, seed: int): """Sets seeds for all random libraries. """ - if hasattr(self.config, 'seed'): - seed = getattr(self.config, 'seed') - else: - seed = 1024 # default seed - random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) @@ -453,11 +473,18 @@ def set_seed(self): seeds = get_seeds() seed_str = ', '.join([f'{k}: {v}' for k, v in seeds.items()]) - print(f"initialized seed on rank {global_rank}, " - f"numpy: {seed}, python random: {seed}, {seed_str}," - f"the default parallel seed is {ParallelMode.DATA}.", flush=True) + if self._verbose: + self._logger.info( + f"initialized seed on rank {global_rank}, " + f"numpy: {seed}, python random: {seed}, {seed_str}," + f"the default parallel seed is {ParallelMode.DATA}.", + ranks=[0]) else: - print(f"initialized seed on rank {global_rank}, " - f"numpy: {seed}, python random: {seed}, pytorch: {seed}", flush=True) - print('WARNING: CUDA is not available, thus CUDA RNG cannot be used to track CUDA random number states', - flush=True) + if self._verbose: + self._logger.info( + f"initialized seed on rank {global_rank}, " + f"numpy: {seed}, python random: {seed}, pytorch: {seed}", + ranks=[0]) + self._logger.info( + 'WARNING: CUDA is not available, thus CUDA RNG cannot be used to track CUDA random number states', + ranks=[0]) diff --git a/colossalai/context/process_group_initializer/initializer_1d.py b/colossalai/context/process_group_initializer/initializer_1d.py index 
784480a72425..1b487aba1db2 100644 --- a/colossalai/context/process_group_initializer/initializer_1d.py +++ b/colossalai/context/process_group_initializer/initializer_1d.py @@ -4,7 +4,6 @@ import torch.distributed as dist from colossalai.context import Config -from colossalai.core import global_context as gpc from colossalai.registry import DIST_GROUP_INITIALIZER from .process_group_initializer import ProcessGroupInitializer from ..parallel_mode import ParallelMode diff --git a/colossalai/context/process_group_initializer/initializer_2p5d.py b/colossalai/context/process_group_initializer/initializer_2p5d.py index cacfdc590f0f..ab8fe3573a49 100644 --- a/colossalai/context/process_group_initializer/initializer_2p5d.py +++ b/colossalai/context/process_group_initializer/initializer_2p5d.py @@ -8,7 +8,6 @@ from colossalai.constants import TESSERACT_DIM, TESSERACT_DEP from colossalai.context import Config -from colossalai.core import global_context as gpc from colossalai.registry import DIST_GROUP_INITIALIZER from .process_group_initializer import ProcessGroupInitializer from ..parallel_mode import ParallelMode @@ -42,8 +41,6 @@ def __init__(self, tesseract_dep: int, *args): super(Initializer_2p5D_ROW, self).__init__(*args) - - self.tensor_parallel_size = gpc.tensor_parallel_size self.num_group = self.world_size // self.tensor_parallel_size self.tesseract_dep = tesseract_dep self.tesseract_dim = tesseract_dim @@ -66,7 +63,7 @@ def init_dist_group(self): for j in range(self.tesseract_dim): for k in range(self.tesseract_dep): ranks = [h * self.tensor_parallel_size + i + self.tesseract_dim * ( - j + self.tesseract_dim * k) for i in range(self.tesseract_dim)] + j + self.tesseract_dim * k) for i in range(self.tesseract_dim)] group = dist.new_group(ranks) if self.rank in ranks: @@ -81,13 +78,12 @@ def init_dist_group(self): class Initializer_2p5D_Col(ProcessGroupInitializer): '''2p5d tensor parallel initialization among cols. 
''' + def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): super(Initializer_2p5D_Col, self).__init__(*args) - - self.tensor_parallel_size = gpc.tensor_parallel_size self.num_group = self.world_size // self.tensor_parallel_size self.tesseract_dep = tesseract_dep self.tesseract_dim = tesseract_dim @@ -110,7 +106,7 @@ def init_dist_group(self): for i in range(self.tesseract_dim): for k in range(self.tesseract_dep): ranks = [h * self.tensor_parallel_size + i + self.tesseract_dim * ( - j + self.tesseract_dim * k) for j in range(self.tesseract_dim)] + j + self.tesseract_dim * k) for j in range(self.tesseract_dim)] group = dist.new_group(ranks) if self.rank in ranks: @@ -125,13 +121,12 @@ def init_dist_group(self): class Initializer_2p5D_Dep(ProcessGroupInitializer): '''2p5D tensor parallel initialization among depths. ''' + def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): super(Initializer_2p5D_Dep, self).__init__(*args) - - self.tensor_parallel_size = gpc.tensor_parallel_size self.num_group = self.world_size // self.tensor_parallel_size self.tesseract_dep = tesseract_dep self.tesseract_dim = tesseract_dim @@ -154,7 +149,7 @@ def init_dist_group(self): for i in range(self.tesseract_dim): for j in range(self.tesseract_dim): ranks = [h * self.tensor_parallel_size + i + self.tesseract_dim * ( - j + self.tesseract_dim * k) for k in range(self.tesseract_dep)] + j + self.tesseract_dim * k) for k in range(self.tesseract_dep)] group = dist.new_group(ranks) if self.rank in ranks: @@ -170,13 +165,12 @@ def init_dist_group(self): class Initializer_2p5D_XZ(ProcessGroupInitializer): '''2p5d tensor parallel initialization among cols times dep. 
''' + def __init__(self, tesseract_dim: int, tesseract_dep: int, *args): super(Initializer_2p5D_XZ, self).__init__(*args) - - self.tensor_parallel_size = gpc.tensor_parallel_size self.num_group = self.world_size // self.tensor_parallel_size self.tesseract_dep = tesseract_dep self.tesseract_dim = tesseract_dim @@ -198,8 +192,8 @@ def init_dist_group(self): for h in range(self.num_group): for i in range(self.tesseract_dim): ranks = [h * self.tensor_parallel_size + i + self.tesseract_dim * ( - j + self.tesseract_dim * k) for k in range(self.tesseract_dep) for j in - range(self.tesseract_dim)] + j + self.tesseract_dim * k) for k in range(self.tesseract_dep) for j in + range(self.tesseract_dim)] group = dist.new_group(ranks) if self.rank in ranks: diff --git a/colossalai/core.py b/colossalai/core.py index 39453e4a01ab..ff30347913a3 100644 --- a/colossalai/core.py +++ b/colossalai/core.py @@ -3,14 +3,4 @@ from colossalai.context import ParallelContext -global_context = ParallelContext() - - -def set_global_context(context: ParallelContext): - '''Reset global context to be identical to a given :class:ParallelContext. - - :param context: Parallel context to generate our global parallel context. 
- :type context: ParallelContext - ''' - global global_context - global_context = context +global_context = ParallelContext.get_instance() diff --git a/colossalai/engine/__init__.py b/colossalai/engine/__init__.py index 7e55922363d8..73ccb094e756 100644 --- a/colossalai/engine/__init__.py +++ b/colossalai/engine/__init__.py @@ -1,7 +1,5 @@ from ._base_engine import Engine from .gradient_handler import * -from .schedule import * -from .amp import * __all__ = ['Engine'] diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py index 2b9f6fd41002..8a3f6eac34df 100644 --- a/colossalai/engine/_base_engine.py +++ b/colossalai/engine/_base_engine.py @@ -3,17 +3,15 @@ import torch +from typing import List from torch.nn import Module from torch.nn.modules.loss import _Loss from torch.optim import Optimizer from colossalai.builder import build_gradient_handler -from colossalai.logging import get_global_dist_logger -from colossalai.nn import (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3) -from colossalai.utils import is_using_ddp, ConditionalContext, is_using_pp -from colossalai.utils.cuda import get_current_device -from .schedule import BaseSchedule +from colossalai.logging import get_dist_logger +from colossalai.utils import is_using_ddp, is_using_pp +from torch import Tensor class Engine: @@ -22,73 +20,40 @@ class Engine: It controls a iteration in training. 
:param model: The neural network model + :type model: ``torch.nn.Module`` :param optimizer: Optimizer for updating the parameters - :param step_schedule: Running schedule in :meth:`step` - :param gradient_accumulation: Steps of gradient accumulation + :type optimizer: ``torch.optim.Optimizer`` + :param criterion: Loss function for calculating loss + :type criterion: ``torch.nn.modules.loss._Loss`` :param gradient_clipping: The norm of gradient clipping - :type model: Module - :type optimizer: Optimizer - :type step_schedule: BaseSchedule, optional - :type gradient_accumulation: int, optional :type gradient_clipping: float, optional + :param verbose: whether to display log info + :type verbose: bool """ def __init__(self, model: Module, optimizer: Optimizer, criterion: _Loss, - step_schedule: BaseSchedule, - gradient_handlers: list = None, - gradient_accumulation: int = 1, - gradient_clipping: float = 0.0, + gradient_handlers: List = None, + clip_grad_norm: float = 0.0, + verbose: bool = True ): self._model = model self._optimizer = optimizer self._criterion = criterion - self._schedule = step_schedule - - # schedule initialize - self._schedule.initialize(model, optimizer) + self._clip_grad_norm = clip_grad_norm + self._verbose = verbose + self._logger = get_dist_logger() # state self.training = True # default - # gradient accumulation - assert gradient_accumulation > 0, 'gradient accumulation size must be larger than 0' - self._grad_accum_size = gradient_accumulation - self._grad_clip = gradient_clipping - self._logger = get_global_dist_logger() - # build gradient handler - self._gradient_handlers = [] - - if gradient_handlers is not None: - assert isinstance(gradient_handlers, list), \ - f'argument gradient_handler_cfg expected type list, ' \ - f'but got type {type(gradient_handlers)}' - elif isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3)): - gradient_handlers = [dict(type='ZeROGradientHandler')] - self._logger.info( - 
"Training with zero is detected, ZeROGradientHandler is automatically " - "added even though not specified in the configuration", - ranks=[0]) - elif is_using_ddp() and is_using_pp(): - gradient_handlers = [dict(type='DataParallelGradientHandler')] - self._logger.info( - "Data parallel training is detected when using pipeline parallel, DataParallelGradientHandler is automatically " - "added even though not specified in the configuration", - ranks=[0]) - - if gradient_handlers is None: - self._logger.warning( - "No gradient handler is set up, please make sure you do not need " - "to all-reduce the gradients after a training step.", - ranks=[0]) + if gradient_handlers: + self._gradient_handlers = gradient_handlers else: - for cfg in gradient_handlers: - handler = build_gradient_handler(cfg, model, optimizer) - self._gradient_handlers.append(handler) + self._gradient_handlers = [] @property def model(self): @@ -106,11 +71,27 @@ def criterion(self): def schedule(self): return self._schedule - @property - def gradient_accumulation(self): - return self._grad_accum_size + def zero_grad(self): + self.optimizer.zero_grad() + + def step(self): + self._all_reduce_gradients() + self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm) + self.optimizer.step() + + def backward(self, loss: Tensor): + return self.optimizer.backward(loss) + + def backward_by_grad(self, tensor, grad): + return self.optimizer.backward_by_grad(tensor, grad) - def handle_gradient(self): + def calc_loss(self, *args, **kwargs): + return self.criterion(*args, **kwargs) + + def __call__(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def _all_reduce_gradients(self): """Handles all-reduce operations of gradients across different parallel groups. 
""" for handler in self._gradient_handlers: @@ -127,71 +108,3 @@ def eval(self): """ self.training = False self._model.eval() - - def step(self, - data_iter, - is_last_iteration: bool = False, - return_loss=True): - """A running step based on the schedule. Usually, it runs a training or - evaluation over a batch of dataset. - - :param data_iter: Data iterator of the dataset - :param is_last_iteration: If True, this iteration is the last iteration in the epoch - :param return_loss: loss will be returned if True - :type data_iter: Iterator - :type is_last_iteration: bool, optional - :type return_loss: bool, optional - :return: (output, lablel, loss) - """ - if self.training: - self._optimizer.zero_grad() - - # differentiate training and eval with grad accum - if self.training: - outputs = [] - labels = [] - loss = torch.zeros(1, device=get_current_device()) - with ConditionalContext(self._model.no_sync(), enable=is_using_ddp() and not is_using_pp()): - for i in range(self._grad_accum_size - 1): - output, label, loss_ = self._schedule.forward_backward_step( - data_iter, self._model, self._criterion, self._optimizer, - forward_only=False, - grad_accum_size=self._grad_accum_size, - return_loss=return_loss) - outputs.append(output) - labels.append(label) - loss.add_(loss_) - output, label, loss_ = self._schedule.forward_backward_step( - data_iter, self._model, self._criterion, self._optimizer, - forward_only=False, - grad_accum_size=self._grad_accum_size, - return_loss=return_loss) - outputs.append(output) - labels.append(label) - loss.add_(loss_) - output = self._accum_outputs(outputs) - label = self._accum_outputs(labels) - # all reduce gradients - self.handle_gradient() - self._schedule.optimizer_step( - self._model, self._optimizer, self._grad_clip) - else: - output, label, loss = self._schedule.forward_backward_step( - data_iter, self._model, self._criterion, self._optimizer, - forward_only=True, - grad_accum_size=1, - return_loss=return_loss) - - # consume the 
remaining dataset left out due to gradient accumulation - if is_last_iteration: - while True: - try: - _ = next(data_iter) - except StopIteration: - break - - return output, label, loss - - @staticmethod - def _accum_outputs(tensor_tuples): - return tuple([torch.cat(x) for x in zip(*tensor_tuples)]) diff --git a/colossalai/engine/amp/__init__.py b/colossalai/engine/amp/__init__.py deleted file mode 100644 index 927d5cf09d1a..000000000000 --- a/colossalai/engine/amp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .grad_scaler import GradScaler -from .amp_type import AMP_TYPE diff --git a/colossalai/engine/schedule/__init__.py b/colossalai/engine/schedule/__init__.py index dba95469ba47..a885a672e619 100644 --- a/colossalai/engine/schedule/__init__.py +++ b/colossalai/engine/schedule/__init__.py @@ -1,5 +1,5 @@ from ._base_schedule import BaseSchedule -from ._no_pipeline import NoPipelineSchedule -from ._pipeline import PipelineSchedule +from ._pipeline_schedule import PipelineSchedule +from ._non_pipeline_schedule import NonPipelineSchedule -__all__ = ['BaseSchedule', 'NoPipelineSchedule', 'PipelineSchedule'] +__all__ = ['BaseSchedule', 'PipelineSchedule', 'NonPipelineSchedule'] diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/engine/schedule/_base_schedule.py index 0583ccbf3d14..e28690cb0ebb 100644 --- a/colossalai/engine/schedule/_base_schedule.py +++ b/colossalai/engine/schedule/_base_schedule.py @@ -5,8 +5,10 @@ import torch -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger +from torch import Tensor +from typing import Iterable, Union, List, Callable +from .._base_engine import Engine +from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device @@ -18,8 +20,9 @@ class BaseSchedule(ABC): control of FP16 in class schedule. 
""" - def __init__(self): - self.logger = get_global_dist_logger() + def __init__(self, batch_data_process_func: Callable = None): + self.logger = get_dist_logger() + self.batch_data_process_func = batch_data_process_func @staticmethod def _move_tensor(element): @@ -35,6 +38,11 @@ def _move_to_device(self, data): data = data.to(get_current_device()).detach() return data + def _to_list(self, data): + if torch.is_tensor(data): + return [data] + return data + def load_batch(self, data_iter): """Loads a batch from data iterator. It returns the data and labels which are already in the same GPU as where the model's. @@ -44,46 +52,34 @@ def load_batch(self, data_iter): """ if data_iter is None: raise RuntimeError('Dataloader is not defined.') - data, label = next(data_iter) - return self._move_to_device(data), self._move_to_device(label) + batch_data = next(data_iter) - def initialize(self, model, optimizer): - """Initializes the model and the optimizer before training. - This is often used in FP16 training. + if self.batch_data_process_func: + data, label = self.batch_data_process_func(batch_data) + else: + data, label = batch_data - :param model: The neural network model - :param optimizer: Optimizer for updating the parameters + data, label = self._to_list(data), self._to_list(label) + return self._move_to_device(data), self._move_to_device(label) + + def pre_processing(self, engine: Engine): + """To perform actions before running the schedule. """ - return model, optimizer + pass @abstractmethod def forward_backward_step(self, - data_iter, - model, - criterion, - optimizer=None, - forward_only=False, - grad_accum_size: int = 1, - return_loss=True): + engine: Engine, + data_iter: Iterable, + forward_only: bool, + return_loss: bool = True + ): """The process function over a batch of dataset for training or evaluation. 
- :param data_iter: Data iterator of the dataset - :param model: Model used in training or evaluation - :param optimizer: Optimizer used in training or evaluation - :param criterion: Loss function + :param engine: Colossalai training engine + :param inputs: input data + :param labels: ground truth :param forward_only: If True, the process won't include backward - :param grad_accum_size: Steps of gradient accumulation :param return_loss: If False, the loss won't be returned """ - pass - - @abstractmethod - def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0): - """Updates the parameters with the optimizer. - - :param model: The neural network model - :param optimizer: Optimizer for updating the parameters - :param grad_clipping: The norm of gradient clipping - :type grad_clipping: float, optional - """ - pass + pass \ No newline at end of file diff --git a/colossalai/engine/schedule/_no_pipeline.py b/colossalai/engine/schedule/_no_pipeline.py deleted file mode 100644 index a495a52d434b..000000000000 --- a/colossalai/engine/schedule/_no_pipeline.py +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -try: - import apex.amp as apex_amp -except: - pass - -try: - import torch.cuda.amp as torch_amp -except: - pass - -from typing import Iterable - -import torch - -import torch.nn as nn -from torch.optim import Optimizer - -from colossalai.nn import (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3) -from colossalai.nn.optimizer._utils import clip_grad_norm_fp32 -from ._base_schedule import BaseSchedule -from ._utils import convert_to_fp16, convert_to_fp32 -from ..amp import AMP_TYPE, GradScaler - - -class NoPipelineSchedule(BaseSchedule): - """A helper schedule class for no pipeline parallelism running environment. - During one process, it loads a batch of dataset and feeds it to the model. 
- After getting the output and calculating the loss, it will use :meth:`step` - to update the parameters if it is in training mode. - - :param amp_type: The type of automatic mixed precision - :param amp_config: The configuration of automatic mixed procision - :type amp_type: AMP_TYPE - :type amp_config: dict - """ - - def __init__( - self, - amp_type: AMP_TYPE = None, - amp_config: dict = None, - ): - super().__init__() - - # mixed precision training - assert amp_type is None or isinstance(amp_type, AMP_TYPE), \ - 'unrecognised value for argument fp16, it can only be None, torch or apex' - - self.use_zero_level_2_3 = False - - if amp_type is not None: - self.fp16 = True - self.amp_type = amp_type - - if amp_config is not None: - assert isinstance(amp_config, dict), \ - f'expected argument fp16_config to be type dictionary, but got {type(amp_config)}' - - if self.amp_type == AMP_TYPE.TORCH: - # torch apex - if amp_config is None: - amp_config = dict() - self.amp_cfg = amp_config - elif self.amp_type == AMP_TYPE.APEX: - # apex amp - if amp_config is None: - amp_config = dict(opt_level='O2') - self.logger.warning( - 'apex is deprecated, please consider using torch.cuda.amp instead.' 
- ) - self.amp_cfg = amp_config - elif self.amp_type == AMP_TYPE.PARALLEL: - # use fp16 optimizer for tensor parallel training - if amp_config is None: - amp_config = dict() - self.amp_cfg = amp_config - else: - self.fp16 = False - self.amp_type = None - - def initialize(self, model: nn.Module, optimizer: Optimizer): - if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3)): - self.use_zero_level_2_3 = True - assert self.amp_type != AMP_TYPE.PARALLEL, \ - 'ZeRO Level 2 and 3 are mutually exclusive with AMP_TYPE.PARALLEL' - - if self.fp16: - if self.amp_type == AMP_TYPE.TORCH: - self._torch_amp_scaler = GradScaler(**self.amp_cfg) - elif self.amp_type == AMP_TYPE.APEX: - model, optimizer = apex_amp.initialize(model, optimizer, **self.amp_cfg) - - return model, optimizer - - def forward_backward_step(self, - data_iter: Iterable, - model: nn.Module, - criterion: nn.modules.loss._Loss, - optimizer: Optimizer = None, - forward_only: bool = False, - grad_accum_size: int = 1, - return_loss: bool = True): - """The process function that loads loads a batch of dataset and feeds it to the model. - The returned labels and loss will None if :attr:`return_loss` is False. - - :param data_iter: Data iterator of the dataloader, e.g. 
iter(dataloader) - :param model: Model for training and inference - :param criterion: Loss function for training - :param optimizer: Optimizer used for training - :param forward_only: If True, the model is run for the forward pass, else back propagation will be executed - :param grad_accum_size: The number of iterations for gradient accumulation - :param return_loss: Loss will be returned if True - :type data_iter: Iterator - :type model: torch.nn.Module - :type criterion: torch.nn.modules.loss._Loss - :type optimizer: torch.optim.Optimizer - :type forward_only: bool, optional - :type grad_accum_size: int - :type return_loss: bool, optional - :return: (output, label, loss) - """ - assert forward_only or return_loss, \ - 'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.' - - data, label = self.load_batch(data_iter) - loss = None - - # forward - if forward_only: - with torch.no_grad(): - output = model(*data) - if not isinstance(output, (tuple, list)): - output = (output,) - if return_loss: - loss = criterion(*output, *label) - elif self.fp16 and self.amp_type == AMP_TYPE.TORCH: - with torch_amp.autocast(): - output = model(*data) - if not isinstance(output, (tuple, list)): - output = (output,) - if return_loss: - loss = criterion(*output, *label) - else: - if self.use_zero_level_2_3 or self.amp_type == AMP_TYPE.PARALLEL: - data = convert_to_fp16(data) - - output = model(*data) - - if self.use_zero_level_2_3 or self.amp_type == AMP_TYPE.PARALLEL: - output = convert_to_fp32(output) - - if not isinstance(output, (tuple, list)): - output = (output,) - if return_loss: - loss = criterion(*output, *label) - - loss /= grad_accum_size - - if not forward_only: - # backward - if self.use_zero_level_2_3: - optimizer.backward(loss) - elif self.fp16: - if self.amp_type == AMP_TYPE.APEX: - with apex_amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - elif self.amp_type == AMP_TYPE.TORCH: - 
self._torch_amp_scaler.scale(loss).backward() - elif self.amp_type == AMP_TYPE.PARALLEL: - loss = optimizer.scale_loss(loss) - loss.backward() - # scale back to display the original value in logs - loss.div_(optimizer.grad_scaler.scale) - else: - loss.backward() - - if return_loss: - return output, label, loss * grad_accum_size - else: - return output, None, None - - def optimizer_step(self, model: nn.Module, optimizer: Optimizer, grad_clipping: float = 0.0): - # step optimizer - if self.fp16 and self.amp_type == AMP_TYPE.TORCH: - if grad_clipping > 0.0: - self._torch_amp_scaler.unscale_(optimizer) - clip_grad_norm_fp32(model.parameters(), grad_clipping) - self._torch_amp_scaler.step(optimizer) - self._torch_amp_scaler.update() - else: - if not self.fp16 and not self.use_zero_level_2_3 and grad_clipping > 0.0: - clip_grad_norm_fp32(model.parameters(), grad_clipping) - optimizer.step() diff --git a/colossalai/engine/schedule/_non_pipeline_schedule.py b/colossalai/engine/schedule/_non_pipeline_schedule.py new file mode 100644 index 000000000000..01e681941fee --- /dev/null +++ b/colossalai/engine/schedule/_non_pipeline_schedule.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from typing import Iterable + +import torch + +import torch.nn as nn +from colossalai.engine import Engine +from torch.optim import Optimizer +from ._base_schedule import BaseSchedule +from colossalai.utils import conditional_context + + +class NonPipelineSchedule(BaseSchedule): + """A helper schedule class for no pipeline parallelism running environment. + During one process, it loads a batch of dataset and feeds it to the model. + After getting the output and calculating the loss, it will use :meth:`step` + to update the parameters if it is in training mode. 
+    :param amp_type: The type of automatic mixed precision
+    :param amp_config: The configuration of automatic mixed precision
+    :type amp_type: AMP_TYPE
+    :type amp_config: dict
+    """
+
+    def forward_backward_step(self,
+                              engine: Engine,
+                              data_iter: Iterable,
+                              forward_only: bool = False,
+                              return_loss: bool = True):
+        """The process function that loads a batch of dataset and feeds it to the model.
+        The returned labels and loss will be None if :attr:`return_loss` is False.
+        :param engine: Colossalai engine for training and inference
+        :param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
+        :param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
+        :param return_loss: Loss will be returned if True
+        :type engine: Engine
+        :type data_iter: Iterator
+        :type forward_only: bool, optional
+        :type return_loss: bool, optional
+        :return: (output, label, loss)
+        """
+        assert forward_only or return_loss, \
+            "The argument 'return_loss' has to be True when 'forward_only' is False, but got False."
+ data, label = self.load_batch(data_iter) + + # forward + with conditional_context(torch.no_grad(), enable=forward_only): + output = engine(*data) + if not isinstance(output, (tuple, list)): + output = (output,) + if return_loss: + loss = engine.criterion(*output, *label) + + if not forward_only: + engine.backward(loss) + + if return_loss: + return output, label, loss + else: + return output, None, None diff --git a/colossalai/engine/schedule/_pipeline.py b/colossalai/engine/schedule/_pipeline_schedule.py similarity index 82% rename from colossalai/engine/schedule/_pipeline.py rename to colossalai/engine/schedule/_pipeline_schedule.py index 7a1b5fdadef3..f0bc04427607 100644 --- a/colossalai/engine/schedule/_pipeline.py +++ b/colossalai/engine/schedule/_pipeline_schedule.py @@ -10,12 +10,12 @@ from colossalai.communication import * from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn import (ZeroRedundancyOptimizer_Level_2, - ZeroRedundancyOptimizer_Level_3) +from colossalai.amp.naive_amp import NaiveAMPModel +from colossalai.zero import (ZeroRedundancyOptimizer_Level_2, + ZeroRedundancyOptimizer_Level_3) from colossalai.utils import get_current_device from ._base_schedule import BaseSchedule -from ._utils import convert_to_fp16 -from ..amp import AMP_TYPE +from colossalai.amp import AMP_TYPE def squeeze(x: Union[Tensor, tuple, list]): @@ -28,7 +28,7 @@ def squeeze(x: Union[Tensor, tuple, list]): class PipelineSchedule(BaseSchedule): """A helper schedule class for pipeline parallelism running environment. It uses non-interleaved 1F1B strategy. Other properties are similar as - :class:`NoPipelineSchedule`. + :class:`NonPipelineSchedule`. 
:param num_microbatches: The number of microbatches :param amp_type: The type of automatic mixed precision @@ -42,20 +42,11 @@ class PipelineSchedule(BaseSchedule): def __init__(self, num_microbatches, - amp_type: AMP_TYPE = None, - amp_config: dict = None, sync_data: bool = True): super().__init__() self.num_microbatches = num_microbatches self.sync_data = sync_data - # amp - # LSGL: amp_config is not used, but leave here for future extension - self.amp_type = amp_type - self.amp_config = amp_config - - if self.amp_type is not None: - assert self.amp_type == AMP_TYPE.PARALLEL, 'We only support AMP_TYPE.PARALLEL for pipeline training for now' def _move_to_device(self, data): if isinstance(data, ( @@ -125,21 +116,20 @@ def load_micro_batch(self): self.batch_pos += self.microbatch_size return (data,), (label,) - def initialize(self, model, optimizer): - if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)): + def pre_processing(self, engine): + if isinstance(engine.optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)): raise TypeError( "Pipeline schedule is currently not compatible with ZeRO Level 2 and Level 3" ) # LSG: set default dtype to fp16 for communication - if self.amp_type == AMP_TYPE.PARALLEL: + if isinstance(engine.model, NaiveAMPModel): torch.set_default_dtype(torch.half) - self.logger.info( + self.logger.warning( 'default tensor dtype is set to torch.half for fp16 training', ranks=[0]) - def forward_step(self, model, criterion, input_tensor, return_tensors, - grad_accum_size, return_loss=True): + def forward_step(self, engine, input_tensor, return_tensors, return_loss=True): """Forward step for passed-in model. If it is the first stage, the input tensor is obtained from data_iterator, otherwise the passed-in input_tensor is used. Returns output tensor. This is a helper function and can be ignored by users. 
@@ -147,17 +137,15 @@ def forward_step(self, model, criterion, input_tensor, return_tensors, if input_tensor is None: input_tensor, label = self.load_micro_batch() - if self.amp_type == AMP_TYPE.PARALLEL: - input_tensor = convert_to_fp16(input_tensor) input_tensor = squeeze(input_tensor) - output_tensor = model(input_tensor) + output_tensor = engine(input_tensor) output_tensor = squeeze(output_tensor) if gpc.is_last_rank(ParallelMode.PIPELINE): if return_loss: input_tensor, label = self.load_micro_batch() - loss_reduced = criterion(output_tensor, *label) \ - / (self.num_microbatches * grad_accum_size) + loss_reduced = engine.criterion(output_tensor, *label) \ + / self.num_microbatches return_tensors.append( tuple((output_tensor, label[0], loss_reduced))) @@ -169,7 +157,7 @@ def forward_step(self, model, criterion, input_tensor, return_tensors, else: return output_tensor - def backward_step(self, optimizer, input_tensor, output_tensor, output_tensor_grad): + def backward_step(self, engine, input_tensor, output_tensor, output_tensor_grad): """Backward step through the passed-in output tensor. If it is the last stage, the output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor. Returns the gradients with respect to the input tensor (None if first stage). @@ -181,9 +169,10 @@ def backward_step(self, optimizer, input_tensor, output_tensor, output_tensor_gr input_tensor.retain_grad() # Backward pass. - if output_tensor_grad is None and self.amp_type == AMP_TYPE.PARALLEL: - output_tensor = optimizer.scale_loss(output_tensor) - torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad) + if output_tensor_grad is None: + engine.backward(output_tensor) + else: + engine.backward_by_grad(output_tensor, output_tensor_grad) # Collect the grad of the input_tensor. 
input_tensor_grad = None @@ -193,12 +182,9 @@ def backward_step(self, optimizer, input_tensor, output_tensor, output_tensor_gr return input_tensor_grad def forward_backward_step(self, + engine, data_iter, - model, - criterion, - optimizer=None, forward_only=False, - grad_accum_size: int = 1, return_loss=True): """Runs non-interleaved 1F1B schedule, with communication between pipeline stages. Returns a tuple with losses if the last stage, an empty tuple otherwise. @@ -236,9 +222,8 @@ def forward_backward_step(self, ft_shape = recv_tensor_meta(ft_shape) input_tensor = recv_forward(ft_shape) output_tensor = self.forward_step( - model, criterion, - input_tensor, return_tensors, - grad_accum_size, return_loss=return_loss + engine, input_tensor, return_tensors, + return_loss=return_loss ) if not gpc.is_last_rank(ParallelMode.PIPELINE): bt_shape = output_tensor.shape @@ -262,9 +247,8 @@ def forward_backward_step(self, last_iteration = (i == (num_microbatches_remaining - 1)) output_tensor = self.forward_step( - model, criterion, - input_tensor, return_tensors, - grad_accum_size, return_loss=return_loss + engine, input_tensor, return_tensors, + return_loss=return_loss ) if forward_only: send_forward(output_tensor) @@ -286,7 +270,7 @@ def forward_backward_step(self, output_tensor = output_tensors.pop(0) input_tensor_grad = self.backward_step( - optimizer, + engine, input_tensor, output_tensor, output_tensor_grad ) @@ -307,7 +291,7 @@ def forward_backward_step(self, output_tensor_grad = recv_backward(bt_shape) input_tensor_grad = self.backward_step( - optimizer, + engine, input_tensor, output_tensor, output_tensor_grad ) @@ -319,11 +303,8 @@ def forward_backward_step(self, output, label, loss = tuple(map(list, zip(*return_tensors))) return (torch.cat(output, dim=0), torch.cat(label, dim=0), - sum(loss) * grad_accum_size) + sum(loss)) else: return tuple((torch.cat(return_tensors, dim=0), None, None)) else: return tuple((None, None, None)) - - def optimizer_step(self, model, 
optimizer, grad_clipping: float = 0.0): - optimizer.step() diff --git a/colossalai/engine/schedule/_utils.py b/colossalai/engine/schedule/_utils.py deleted file mode 100644 index cdfd0246c12d..000000000000 --- a/colossalai/engine/schedule/_utils.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from typing import Union, List - -from torch import Tensor - - -def convert_to_fp16(data: Union[Tensor, List[Tensor]]): - if isinstance(data, Tensor): - ret = data.half() - elif isinstance(data, (list, tuple)): - ret = [val.half() for val in data] - else: - raise TypeError(f"Expected argument 'data' to be a Tensor or a list/tuple of Tensor, but got {type(data)}") - return ret - - -def convert_to_fp32(data: Union[Tensor, List[Tensor]]): - if isinstance(data, Tensor): - ret = data.float() - elif isinstance(data, (list, tuple)): - ret = [val.float() for val in data] - else: - raise TypeError(f"Expected argument 'data' to be a Tensor or a list/tuple of Tensor, but got {type(data)}") - return ret - diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 351c67947097..5d7087841674 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -3,44 +3,45 @@ import argparse import pprint -import random -from pathlib import Path -from typing import Callable, Iterable, Optional, Union -from typing import Tuple - +import os +from colossalai.nn.optimizer.colossalai_optimizer import ColossalaiOptimizer import numpy as np import torch +import torch.nn as nn + +from pathlib import Path +from typing import Iterable, Union, Optional, Tuple, List, Dict + +from colossalai.amp import convert_to_amp, AMP_TYPE +from colossalai.context import Config, ParallelMode, ConfigException +from colossalai.core import global_context as gpc +from colossalai.engine import Engine +from colossalai.logging import get_dist_logger +from colossalai.utils import (accumulate_gradient, get_current_device, + sync_model_param_in_dp, is_using_ddp, is_using_pp) 
+from colossalai.zero import convert_to_zero, ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3 +from colossalai.builder.builder import build_gradient_handler +from torch.optim.optimizer import Optimizer +from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import DataLoader +from torch.nn.modules.loss import _Loss from torch.nn.parallel import DistributedDataParallel as DDP -from colossalai.engine import AMP_TYPE, NoPipelineSchedule, PipelineSchedule -from colossalai.engine import Engine -from colossalai.logging import get_global_dist_logger, init_global_dist_logger -from colossalai.nn import DataParallelSampler -from colossalai.nn.model.base_model import BaseModel -from .builder import (ModelInitializer, build_dataset, build_loss, - build_model, build_optimizer, - build_optimizer_wrapper, build_schedule) -from .context import Config, ParallelMode -from .core import global_context as gpc -from .utils import get_current_device, sync_model_param_in_dp, is_using_ddp, is_using_pp - - -def parse_args(): + + +def get_default_parser(): '''Reads user command line and uses an argument parser to parse the input arguments. Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed. 
- :return: call the parse arguments function + :return: returns the parser with the default arguments, the user may add customized arguments into this parser :rtype: Namespace ''' parser = argparse.ArgumentParser() parser.add_argument('--config', type=str, help='path to the config file') parser.add_argument('--host', type=str, - default=None, help='the master address for distributed training') parser.add_argument('--port', - type=str, - default=None, + type=int, help='the master port for distributed training') parser.add_argument('--world_size', type=int, help='world size for distributed training') parser.add_argument('--rank', type=int, help='rank for the default process group') @@ -51,341 +52,277 @@ def parse_args(): type=str, default='nccl', help='backend for distributed communication') - return parser.parse_args() - - -def init_dist(config: Union[str, dict] = None, - local_rank: int = None, - rank: int = None, - world_size: int = None, - host: str = None, - port: str = None, - backend: str = None): + return parser + + +def launch(config: Union[str, Path, Config, Dict], + rank: int, + world_size: int, + host: str, + port: int, + backend: str = 'nccl', + local_rank: int = None, + seed: int = 1024, + verbose: bool = True): '''This function first parses the configuration arguments, using :func:parse_args() in case one of the input arguments are not given. - Then initialize and set distributed environment by calling global_context's functions. + Then initialize and set distributed environment by calling global_context's functions. 
:param config: config file or config file path are both acceptable - :type config: Union[str, dict], optional - :param local_rank: rank for the default process group, defaults to None + :type config: Union[str, dict, Config] + :param rank: rank for the default process group + :type rank: int + :param world_size: world size of the default process group + :type world_size: int + :param host: the master address for distributed training + :type host: str + :param port: the master port for distributed training + :type port: str + :param backend: backend for torch.distributed + :type backend: str + :param local_rank: rank for the process on the node and is used to set the default CUDA device, + defaults to None. If local_rank = None, the default device ordinal will be calculated automatically :type local_rank: int, optional - :param world_size: world size of GPUs, defaults to None - :type world_size: int, optional - :param host: the master address for distributed training, defaults to None - :type host: str, optional - :param port: the master port for distributed training, defaults to None - :type port: str, optional - :param backend: backend for torch.distributed, defaults to None - :type backend: str, optional :raises Exception: raise exception when config type is wrong ''' - args = [config, local_rank, world_size, host, port, backend] - arg_given = [arg is not None for arg in args] - - if not all(arg_given): - args = parse_args() - - if config is None: - config = args.config - if local_rank is None: - local_rank = args.local_rank - if rank is None: - rank = args.rank - if world_size is None: - world_size = args.world_size - if host is None: - host = args.host - if port is None: - port = args.port - if backend is None: - backend = args.backend - args = Config( - dict(config=config, - host=host, - port=port, - world_size=world_size, - rank=rank, - local_rank=local_rank, - backend=backend)) - - # set distributed settings - dist_args = Config( - 
dict(local_rank=args.local_rank, - rank=rank, - world_size=args.world_size, - backend=args.backend)) - - gpc.set_dist_args(dist_args) + gpc.verbose = verbose # set config - if isinstance(args.config, dict): - cfg = args.config - elif isinstance(args.config, (str, Path)): - cfg = Config.from_file(args.config) - else: - raise Exception('Config type error: {}'.format(type(args.config))) - gpc.load_config(cfg) - - # init dist groups - gpc.init_global_dist(args.host, args.port) + assert isinstance(config, (Config, str, Path, dict)), \ + f'expected argument config to be Config, str or Path, but got {type(config)}' + if not isinstance(config, Config) and isinstance(config, dict): + config = Config(config) + if isinstance(config, (str, Path)): + config = Config.from_file(config) + gpc.load_config(config) + + # init default process group + gpc.init_global_dist(rank, world_size, backend, host, port) + + # init process groups for different parallel modes from config gpc.init_parallel_groups() - # init dist logger - init_global_dist_logger() - # set cuda device if torch.cuda.is_available(): - gpc.set_device() - - -def get_dataloader(dataset, seed=1024, add_sampler_if_possible=False, **kwargs): - '''Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not) - - .. 
note: when pipeline parallel is enabled, shuffle cannot be True - as it will result in mismatch between input data on the 1st - stage and label on the last stage - - :param dataset: a :class:utils.data.dataset dataset - :param seed: random worker seed, defaults to 1024 - :type seed: int, optional - :param add_sampler_if_possible: [description], defaults to False - :type add_sampler_if_possible: bool, optional - :return: a :class:utils.data.dataset dataloader - :rtype: torch.utils.data.dataset - ''' - _kwargs = kwargs.copy() - if 'shuffle' in _kwargs: - shuffle = _kwargs.pop('shuffle') - else: - shuffle = False - - if add_sampler_if_possible and gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: - sampler = DataParallelSampler(dataset, shuffle=shuffle) - else: - sampler = None - - # Deterministic dataloader - def seed_worker(worker_id): - worker_seed = seed - np.random.seed(worker_seed) - torch.manual_seed(worker_seed) - random.seed(worker_seed) - - if sampler is None: - return DataLoader(dataset, - worker_init_fn=seed_worker, - shuffle=shuffle, - **_kwargs) - else: - return DataLoader(dataset, - sampler=sampler, - worker_init_fn=seed_worker, - **_kwargs) - - -def initialize(config: Union[str, dict] = None, - local_rank: int = None, - rank: int = None, - world_size: int = None, - host: str = None, - port: str = None, - backend: str = None, - train_dataloader: Optional[Union[Iterable, Callable]] = None, - test_dataloader: Optional[Union[Iterable, Callable]] = None, + # if local rank is not given, calculate automatically + gpc.set_device(local_rank) + + gpc.set_seed(seed) + + if verbose: + logger = get_dist_logger() + logger.info(f'Distributed environment is initialized, ' + f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, ' + f'tensor parallel size: {gpc.tensor_parallel_size}', ranks=[0]) + + +def launch_from_slurm(config: Union[str, Path, Config, Dict], + host: str, + port: int, + 
backend: str = 'nccl', + seed: int = 1024, + verbose: bool = True): + rank = int(os.environ['SLURM_PROCID']) + world_size = int(os.environ['SLURM_NPROCS']) + launch(config=config, + rank=rank, + world_size=world_size, + host=host, + port=port, + backend=backend, + seed=seed, + verbose=verbose) + + +def launch_from_openmpi(config: Union[str, Path, Config, Dict], + host: str, + port: int, + backend: str = 'nccl', + seed: int = 1024, + verbose: bool = True): + rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) + launch(config=config, + local_rank=local_rank, + rank=rank, + world_size=world_size, + host=host, + port=port, + backend=backend, + seed=seed, + verbose=verbose) + + +def launch_from_torch(config: Union[str, Path, Config, Dict], + host: str, + port: int, + backend: str = 'nccl', + seed: int = 1024, + verbose: bool = True): + rank = int(os.environ['RANK']) + local_rank = int(os.environ['LOCAL_RANK']) + world_size = int(os.environ['WORLD_SIZE']) + launch(config=config, + local_rank=local_rank, + rank=rank, + world_size=world_size, + host=host, + port=port, + backend=backend, + seed=seed, + verbose=verbose) + + +def initialize(model: Union[nn.Module, List[nn.Module]], + optimizer: Union[Optimizer, List[Optimizer]], + criterion: Union[_Loss, List[_Loss]], + train_dataloader: Optional[Union[Iterable, List[Iterable]]] = None, + test_dataloader: Optional[Union[Iterable, List[Iterable]]] = None, + lr_scheduler: _LRScheduler = None, + verbose: bool = True ) -> Tuple[Engine, DataLoader, DataLoader]: - '''Core function that initializes distributed environment, logger, cudnn, data, model, loss function, optimizer, and lr_scheduler(their configs are in gpc.config). 
- - :param config: config file or config file path are both acceptable - :type config: Union[str, dict], optional - :param local_rank: rank for the default process group, defaults to None - :type local_rank: int, optional - :param world_size: world size of GPUs, defaults to None - :type world_size: int, optional - :param host: the master address for distributed training, defaults to None - :type host: str, optional - :param port: the master port for distributed training, defaults to None - :type port: str, optional - :param backend: backend for torch.distributed, defaults to None - :type backend: str, optional - :param train_dataloader: If None, the config is used to build a dataloder; Else, it should be a dataloader object or a function with no arguments which can build a dataloader, defaults to None - :type train_dataloader: Optional[Union[Iterable, Callable]], optional - :param test_dataloader: If None, the config is used to build a dataloder; Else, it should be a dataloader object or a function with no arguments which can build a dataloader, defaults to None - :type test_dataloader: Optional[Union[Iterable, Callable]], optional - :return: (engine, train_dataloader, test_dataloader, criterion) + ''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config. 
+ + :param model: your model instance + :type model: a single or a list of ``torch.nn.Module`` objects + :param optimizer: your optimizer instance + :type optimizer: a single or a list of ``torch.optim.optimizer.Optimizer`` objects + :param criterion: your criterion instance + :type criterion: a single or a list of ``torch.nn.modules.loss._Loss`` objects + :param train_dataloader: dataloaders for training data + :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None + :param train_dataloader: dataloaders for testing data + :type train_dataloader: a single or a list of ``torch.utils.data.DataLoader`` objects, defaults to None + :return: (engine, criterion, train_dataloader, test_dataloader) :rtype: tuple ''' - # initialize distributed environment - init_dist(config=config, - local_rank=local_rank, - rank=rank, - world_size=world_size, - host=host, - port=port, - backend=backend) - - # init logger - logger = get_global_dist_logger() - logger.info(f'Distributed environment is initialized, ' - f'data parallel size: {gpc.data_parallel_size}, pipeline parallel size: {gpc.pipeline_parallel_size}, ' - f'tensor parallel size: {gpc.tensor_parallel_size}', ranks=[0]) + # get logger + logger = get_dist_logger() + gpc.verbose = verbose + + # get config from gpc + config = gpc.config # print config - logger.info(f"\n========== Your Config ========\n" - f"{pprint.pformat(gpc.config)}\n" - f"================================", ranks=[0]) + if verbose: + logger.info(f"\n========== Your Config ========\n" + f"{pprint.pformat(gpc.config)}\n" + f"================================\n", ranks=[0]) # cudnn - cudnn_benchmark = gpc.config.get('cudnn_benchmark', True) - cudnn_deterministic = gpc.config.get('cudnn_deterministic', False) + cudnn_benchmark = config.get('cudnn_benchmark', True) + cudnn_deterministic = config.get('cudnn_deterministic', False) torch.backends.cudnn.benchmark = cudnn_benchmark torch.backends.cudnn.deterministic = 
cudnn_deterministic - logger.info( - f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0]) - - # set seed, cuda seed is only set when cuda is avail - gpc.set_seed() - - # return_items = list() - - # check fp16 and zero - should_convert_model_to_half = False - should_wrap_fp16_optimizer = False - should_wrap_zero_optimizer_level_2_3 = False - - if hasattr(gpc.config, 'fp16'): - fp16_mode = gpc.config.fp16.mode - if fp16_mode == AMP_TYPE.PARALLEL: - should_convert_model_to_half = True - should_wrap_fp16_optimizer = True - - if hasattr(gpc.config, 'zero'): - should_wrap_zero_optimizer_level_2_3 = True - zero_type = gpc.config.zero.type - if zero_type in ['ZeroRedundancyOptimizer_Level_2', 'ZeroRedundancyOptimizer_Level_3']: - should_convert_model_to_half = True - assert not should_wrap_fp16_optimizer, \ - 'AMP_TYPE.PARALLEL is mutually exclusive with zero level 2 and 3' - - # build model - logger.info('Building model ...', ranks=[0]) - assert hasattr( - gpc.config, 'model'), "Build error: configuration 'model' is missing" - if gpc.pipeline_parallel_size > 1: - model = ModelInitializer(gpc.config.model, 1, verbose=True) - model = model.model_initialize() - else: - model = build_model(gpc.config.model) - if isinstance(model, BaseModel): - model.build_from_cfg() - model = model.to(get_current_device()) - sync_model_param_in_dp(model) - logger.info('Model is created', ranks=[0]) - - if should_convert_model_to_half: - model = model.half() - logger.info("Model is cast to fp16", ranks=[0]) - - if is_using_ddp() and not is_using_pp(): - model = DDP(model, process_group=gpc.get_group(ParallelMode.DATA)) - logger.info( - 'Model is using torch.nn.parallel.DistributedDataParallel', ranks=[0]) - # training data - if callable(train_dataloader): - logger.info( - f'Build train data loader from {train_dataloader}', ranks=[0]) - train_dataloader = train_dataloader() - if train_dataloader is None and hasattr(gpc.config, 'train_data'): - 
logger.info('Preparing data ...', ranks=[0]) - # assert hasattr(gpc.config, 'train_data'), "Build error: configuration 'train_data' is missing." - train_dataset = build_dataset(gpc.config.train_data.dataset) - logger.info('Train dataset is ready.', ranks=[0]) - - train_dataloader = get_dataloader(train_dataset, - gpc.config.get('seed', 42), - True, - **gpc.config.train_data.dataloader, - ) + if verbose: logger.info( - f'Loaded {len(train_dataset)} samples in {len(train_dataloader)} batches for training', ranks=[0]) + f"cuDNN benchmark = {cudnn_benchmark}, deterministic = {cudnn_deterministic}", ranks=[0]) - if callable(test_dataloader): - logger.info( - f'Build test data loader from {test_dataloader}', ranks=[0]) - test_dataloader = test_dataloader() - # testing data, allowed to be None - if test_dataloader is None and hasattr(gpc.config, 'test_data'): - test_dataset = build_dataset(gpc.config.test_data.dataset) - test_dataloader = get_dataloader( - test_dataset, add_sampler_if_possible=True, **gpc.config.test_data.dataloader) - logger.info( - f'Loaded {len(test_dataset)} samples in {len(test_dataloader)} batches for testing', ranks=[0]) - - # build loss function - assert hasattr(gpc.config, 'loss'), \ - 'Build error: configuration \'loss\' is missing.' - criterion = build_loss(gpc.config.loss) - logger.info('Loss function is created', ranks=[0]) - - # build optimizer - assert hasattr(gpc.config, 'optimizer'), \ - "Build error: configuration 'optimizer' is missing." 
- optim_type = gpc.config.optimizer.type - is_pytorch_native_zero_level_1 = optim_type == 'ZeroRedundancyOptimizer' - if is_pytorch_native_zero_level_1: - original_cfg_copy = gpc.config.optimizer.copy() - original_cfg_copy.pop('type') - cfg = dict(type=optim_type, process_group=gpc.get_group( - ParallelMode.DATA), **original_cfg_copy) - optimizer = build_optimizer(cfg, model) - else: - optimizer = build_optimizer(gpc.config.optimizer, model) - - if should_wrap_zero_optimizer_level_2_3: - optimizer = build_optimizer_wrapper(gpc.config.zero, optimizer, model) - - if should_wrap_fp16_optimizer: - # replace the field mode with type - fp16_cfg = gpc.config.fp16.copy() - amp_type = fp16_cfg.pop('mode') - assert amp_type == AMP_TYPE.PARALLEL, 'FP Optimizer should only be used for AMP_TYPE.PARALLEL' - fp16_cfg['type'] = 'FP16Optimizer' - optimizer = build_optimizer_wrapper(fp16_cfg, optimizer) - logger.info('Optimizer is created', ranks=[0]) - - # build schedule and engine - if hasattr(gpc.config, 'fp16'): - amp_type = gpc.config.fp16.mode - amp_cfg = gpc.config.fp16.copy() - amp_cfg.pop('mode') + # first sync model across dp ranks + model.to(get_current_device()) + sync_model_param_in_dp(model) + + # check amp and zero + fp16_cfg = gpc.config.get('fp16', None) + zero_cfg = gpc.config.get('zero', None) + + if fp16_cfg is not None and fp16_cfg.mode is not None and zero_cfg is not None: + raise ConfigException( + "It is not allowed to set fp16 and zero configuration in your config file at the same time") + + # initialize amp + amp_mode = None + if fp16_cfg is not None and fp16_cfg.mode is not None: + cfg_ = fp16_cfg.copy() + amp_mode = cfg_.pop('mode') + model, optimizer, criterion = convert_to_amp(model=model, + optimizer=optimizer, + criterion=criterion, + mode=amp_mode, + amp_config=cfg_) + + if zero_cfg is not None: + cfg_ = zero_cfg.copy() + level = cfg_.pop('level') + model, optimizer = convert_to_zero(model=model, + optimizer=optimizer, + level=level, + 
zero_config=cfg_ + ) + + # gradient handler + gradient_handler_cfg = gpc.config.get('gradient_handler', None) + if gradient_handler_cfg is None: + # if gradient handler is not specified in the configuration file, + # check in the following order + # 1. if optimizer is ZERO, then use zero grad handler + # 2. if dp size is larger than 1 and pipeline is not used, use pytorch ddp + # 3. if using pipeline and dp size larger than 1, use data parallel grad handler + if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, + ZeroRedundancyOptimizer_Level_3)): + gradient_handler_cfg = [dict(type='ZeROGradientHandler')] + if verbose: + logger.info( + "Training with zero is detected, ZeROGradientHandler is automatically " + "added even though not specified in the configuration", + ranks=[0]) + elif is_using_ddp() and not is_using_pp() and amp_mode != AMP_TYPE.NAIVE: + model = DDP(model, process_group=gpc.get_group(ParallelMode.DATA)) + if verbose: + logger.info( + 'Model is using torch.nn.parallel.DistributedDataParallel', ranks=[0]) + elif is_using_ddp(): + gradient_handler_cfg = [dict(type='DataParallelGradientHandler')] + if verbose: + logger.info( + "Data parallel training is detected when using pipeline parallel, DataParallelGradientHandler is automatically " + "added even though not specified in the configuration", + ranks=[0]) else: - amp_type = None - amp_cfg = None - - engine_cfg = gpc.config.get('engine', dict()) - schedule_cfg = engine_cfg.pop('schedule', None) - - schedule_type = None - if schedule_cfg is not None: - schedule_type = schedule_cfg.get('type', None) - - if schedule_type is not None: - # run customized schedule - schedule_cfg['amp_type'] = amp_type - schedule_cfg['amp_config'] = amp_cfg - schedule = build_schedule(schedule_cfg) - elif gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - assert schedule_cfg is not None, \ - "Config 'engine.schedule' not found in your configuration file for pipeline parallel 
training" - schedule = PipelineSchedule( - amp_type=amp_type, amp_config=amp_cfg, **schedule_cfg.copy()) + if not isinstance(gradient_handler_cfg, list): + raise ConfigException( + f"expected gradient_handler in the configuration file to be a list but got {type(gradient_handler_cfg)}") + + if gradient_handler_cfg is None: + gradient_handlers = None + if verbose and not isinstance(model, DDP): + logger.warning( + "No PyTorch DDP or gradient handler is set up, please make sure you do not need " + "to all-reduce the gradients after a training step.", + ranks=[0]) else: - schedule = NoPipelineSchedule(amp_type=amp_type, amp_config=amp_cfg) + gradient_handlers = [build_gradient_handler(cfg, model, optimizer) for cfg in gradient_handler_cfg] + + # check if optimizer is ColossalaiOptimizer + if not isinstance(optimizer, (ColossalaiOptimizer, ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)): + optimizer = ColossalaiOptimizer(optim=optimizer) + + # gradient accumulation + grad_accum_size = gpc.config.get('gradient_accumulation', None) + if grad_accum_size is not None: + optimizer, train_dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(model=model, + optimizer=optimizer, + dataloader=train_dataloader, + accumulate_size=grad_accum_size, + gradient_handlers=gradient_handlers, + lr_scheduler=lr_scheduler) + + # clip grad norm + clip_grad_norm = gpc.config.get('clip_grad_norm', 0.0) + if clip_grad_norm > 0: + if zero_cfg is not None: + raise ConfigException( + "clip_grad_norm should be specified with zero, you should specify clip_grad in zero configuration") + elif fp16_cfg is not None and fp16_cfg.mode == AMP_TYPE.NAIVE: + raise ConfigException( + "clip_grad_norm should be specified with AMP_TYPE.NAIVE, you should specify clip_grad in fp16 configuration") engine = Engine( model=model, optimizer=optimizer, criterion=criterion, - step_schedule=schedule, - **gpc.config.get('engine', dict()) + gradient_handlers=gradient_handlers, + 
clip_grad_norm=clip_grad_norm ) - return engine, train_dataloader, test_dataloader + return engine, train_dataloader, test_dataloader, lr_scheduler diff --git a/colossalai/logging/__init__.py b/colossalai/logging/__init__.py index 71657557fb97..5ee86c45f6ee 100644 --- a/colossalai/logging/__init__.py +++ b/colossalai/logging/__init__.py @@ -1,26 +1,10 @@ -from colossalai.core import global_context as gpc from .logging import DistributedLogger -__all__ = ['get_global_dist_logger', 'get_dist_logger', 'DistributedLogger', 'init_global_dist_logger'] +__all__ = ['get_dist_logger', 'DistributedLogger'] -_GLOBAL_LOGGER: DistributedLogger = None - -def get_dist_logger(name, level='INFO', root_path: str = None, mode='a'): - return DistributedLogger(name=name, level=level, root_path=root_path, mode=mode) - - -def get_global_dist_logger(): - assert _GLOBAL_LOGGER is not None, 'Global distributed logger is not initialized' - return _GLOBAL_LOGGER - - -def init_global_dist_logger(): - rank = gpc.get_global_rank() - if hasattr(gpc.config, 'logging'): - logger = get_dist_logger(name=f'rank_{rank}', **gpc.config.logging) - else: - logger = get_dist_logger(name=f'rank_{rank}', level='INFO') - global _GLOBAL_LOGGER - assert _GLOBAL_LOGGER is None, 'Global distributed logger has already been initialized' - _GLOBAL_LOGGER = logger +def get_dist_logger(name='root'): + """Get logger instance based on name. The DistributedLogger will create singleton instances, + which means that only one logger instance is created per name. 
+ """ + return DistributedLogger.get_instance(name=name) diff --git a/colossalai/logging/logging.py b/colossalai/logging/logging.py index b8a79c4914b0..69f799f8b773 100644 --- a/colossalai/logging/logging.py +++ b/colossalai/logging/logging.py @@ -1,11 +1,13 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import colossalai import logging from pathlib import Path +from typing import Union from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc + _FORMAT = 'colossalai - %(name)s - %(asctime)s %(levelname)s: %(message)s' logging.basicConfig(level=logging.INFO, format=_FORMAT) @@ -16,40 +18,92 @@ class DistributedLogger: :param name: The name of the logger :type name: str - :param level: The threshold for the logger. Logging messages which are less severe than `level` - will be ignored - :type level: str - :param root_path: The root path where logs are stored - :type root_path: str, optional - :param mode: The mode that the file is opened in. Defaults to 'a' - :type mode: str, optional """ - def __init__(self, name, level='INFO', root_path: str = None, mode='a'): - self._logger = logging.getLogger(name) + __instances = dict() + + @staticmethod + def get_instance(name: str): + """Get the unique single logger instance based on name. 
+ :param name: The name of the logger + :type name: str + :return: a DistributedLogger object + :rtype: DistributedLogger + """ + if name in DistributedLogger.__instances: + return DistributedLogger.__instances[name] + else: + logger = DistributedLogger(name=name) + return logger + + def __init__(self, name): + if name in DistributedLogger.__instances: + raise Exception('Logger with the same name has been created, you should use colossalai.logging.get_dist_logger') + else: + self._name = name + self._logger = logging.getLogger(name) + DistributedLogger.__instances[name] = self + + @staticmethod + def _check_valid_logging_level(level: str): + assert level in ['INFO', 'DEBUG', 'WARNING', 'ERROR'], 'found invalid logging level' + + def set_level(self, level: str): + """Set the logging level + :param level: can only be INFO, DEBUG, WARNING and ERROR + :type level: str + """ + self._check_valid_logging_level(level) self._logger.setLevel(getattr(logging, level)) - if root_path is not None: - log_root_path = Path(root_path) - # create path if not exists - log_root_path.mkdir(parents=True, exist_ok=True) - log_path = log_root_path.joinpath(f'{name}.log') - file_handler = logging.FileHandler(log_path, mode) - file_handler.setLevel(getattr(logging, level)) - formatter = logging.Formatter(_FORMAT) - file_handler.setFormatter(formatter) - self._logger.addHandler(file_handler) + def log_to_file(self, + path: Union[str, Path], + mode: str = 'a', + level: str = 'INFO', + suffix: str = None): + """Save the logs to file + :param path: the file to save the log + :type path: a string or pathlib.Path object + :param mode: the mode to write log into the file + :type mode: str + :param level: can only be INFO, DEBUG, WARNING and ERROR + :type level: str + """ + assert isinstance(path, (str, Path)), \ + f'expected argument path to be type str or Path, but got {type(path)}' + self._check_valid_logging_level(level) + if isinstance(path, str): + path = Path(path) + + # set the default file 
name if path is a directory + if not colossalai.core.global_context.is_initialized(ParallelMode.GLOBAL): + rank = 0 + else: + rank = colossalai.core.global_context.get_global_rank() + + if suffix is not None: + log_file_name = f'rank_{rank}_{suffix}.log' + else: + log_file_name = f'rank_{rank}.log' + path = path.joinpath(log_file_name) + + # add file handler + file_handler = logging.FileHandler(path, mode) + file_handler.setLevel(getattr(logging, level)) + formatter = logging.Formatter(_FORMAT) + file_handler.setFormatter(formatter) + self._logger.addHandler(file_handler) def _log(self, level, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): if ranks is None: getattr(self._logger, level)(message) else: - local_rank = gpc.get_local_rank(parallel_mode) + local_rank = colossalai.core.global_context.get_local_rank(parallel_mode) if local_rank in ranks: getattr(self._logger, level)(message) def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): - """Stores an info log message. + """Log an info message. :param message: :type message: @@ -61,7 +115,7 @@ def info(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, self._log('info', message, parallel_mode, ranks) def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): - """Stores a warning log message. + """Log a warning message. :param message: The message to be logged :type message: str @@ -73,7 +127,7 @@ def warning(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBA self._log('warning', message, parallel_mode, ranks) def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): - """Stores a debug log message. + """Log a debug message. 
:param message: The message to be logged :type message: str @@ -85,7 +139,7 @@ def debug(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, self._log('debug', message, parallel_mode, ranks) def error(self, message: str, parallel_mode: ParallelMode = ParallelMode.GLOBAL, ranks: list = None): - """Stores an error log message. + """Log an error message. :param message: The message to be logged :type message: str diff --git a/colossalai/nn/__init__.py b/colossalai/nn/__init__.py index 69fd61594543..c612b631ac30 100644 --- a/colossalai/nn/__init__.py +++ b/colossalai/nn/__init__.py @@ -1,4 +1,3 @@ -from .data import * from .layer import * from .loss import * from .lr_scheduler import * diff --git a/colossalai/nn/data/__init__.py b/colossalai/nn/data/__init__.py deleted file mode 100644 index d94afe2da48f..000000000000 --- a/colossalai/nn/data/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .caltech101_dataset import Caltech101Dataset -from .cifar10_dataset import CIFAR10Dataset -from .sampler import * diff --git a/colossalai/nn/data/_utils.py b/colossalai/nn/data/_utils.py deleted file mode 100644 index 08d77e0dab6a..000000000000 --- a/colossalai/nn/data/_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -import numpy as np - - -def pil_img_to_numpy(pil_img): - """convert a PIL image to numpy nd-array - - :param pil_img: a PIL image - :type pil_img: PIL.Image - :return: a nd-array - :rtype: numpy.ndarray - """ - np_img = np.array(pil_img) - np_img = np.rollaxis(np_img, 2) # HWC to CHW - return np_img diff --git a/colossalai/nn/data/base_dataset.py b/colossalai/nn/data/base_dataset.py deleted file mode 100644 index 730b3764992a..000000000000 --- a/colossalai/nn/data/base_dataset.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from abc import ABC - -from torch.utils.data import Dataset -from torchvision.transforms import transforms - -from colossalai.builder import build_transform - - -class BaseDataset(Dataset, ABC): - - 
def __init__(self, transform_pipeline: list): - transform_list = [build_transform(cfg) for cfg in transform_pipeline] - transform = transforms.Compose(transform_list) - self._transform_pipeline = transform diff --git a/colossalai/nn/data/caltech101_dataset.py b/colossalai/nn/data/caltech101_dataset.py deleted file mode 100644 index b1dc89b68390..000000000000 --- a/colossalai/nn/data/caltech101_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch.distributed as dist -from torchvision.datasets import Caltech101 - -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.registry import DATASETS -from .base_dataset import BaseDataset - - -@DATASETS.register_module -class Caltech101Dataset(BaseDataset): - """`Caltech 101 `_ Dataset. - - :param transform_pipeline: A list of functions' config, which takes in an PIL image - and returns a transformed version - :type transform_pipeline: list - """ - - def __init__(self, transform_pipeline: list, *args, **kwargs): - super().__init__(transform_pipeline) - if gpc.is_initialized(ParallelMode.GLOBAL) and gpc.get_global_rank() != 0: - dist.barrier() - self._dataset = Caltech101( - transform=self._transform_pipeline, *args, **kwargs) - if gpc.is_initialized(ParallelMode.GLOBAL) and gpc.get_global_rank() == 0: - dist.barrier() - - def __len__(self): - return len(self._dataset) - - def __getitem__(self, item): - """ - - :param item: Index - :type item: int - :return: ((image,), (target,)) where the type of target specified by target_type. 
- :rtype: tuple - """ - img, label = self._dataset.__getitem__(item) - return (img,), (label,) diff --git a/colossalai/nn/data/cifar10_dataset.py b/colossalai/nn/data/cifar10_dataset.py deleted file mode 100644 index a0ce139a2e1f..000000000000 --- a/colossalai/nn/data/cifar10_dataset.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch.distributed as dist -from torchvision.datasets import CIFAR10 - -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.registry import DATASETS -from .base_dataset import BaseDataset - - -@DATASETS.register_module -class CIFAR10Dataset(BaseDataset): - """`CIFAR10 `_ Dataset. - - :param transform_pipeline: A list of functions' config, which takes in an PIL image - and returns a transformed version - :type transform_pipeline: list - """ - - def __init__(self, transform_pipeline: list, *args, **kwargs): - super().__init__(transform_pipeline) - if gpc.is_initialized(ParallelMode.GLOBAL) and gpc.get_global_rank() != 0: - dist.barrier() - self._dataset = CIFAR10(transform=self._transform_pipeline, - *args, - **kwargs) - if gpc.is_initialized(ParallelMode.GLOBAL) and gpc.get_global_rank() == 0: - dist.barrier() - - def __len__(self): - return len(self._dataset) - - def __getitem__(self, item): - """ - - :param item: Index - :type item: int - :return: ((image,), (target,)) where the type of target specified by target_type. 
- :rtype: tuple - """ - img, label = self._dataset.__getitem__(item) - return (img,), (label,) diff --git a/colossalai/nn/data/sampler/__init__.py b/colossalai/nn/data/sampler/__init__.py deleted file mode 100644 index 471add313963..000000000000 --- a/colossalai/nn/data/sampler/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .base_sampler import BaseSampler -from .data_parallel_sampler import DataParallelSampler - -__all__ = ['BaseSampler', 'DataParallelSampler'] diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index 9d4bb70efa42..e56d8bffe7cd 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -4,7 +4,5 @@ from .parallel_2p5d import * from .parallel_3d import * from .parallel_sequence import * -from .parallel_vision_transformer import * -from .vanilla_resnet import * -from .vanilla_vision_transformer import * +from .non_parallel_layers import * from .wrapper import * diff --git a/colossalai/nn/layer/_common_utils.py b/colossalai/nn/layer/_common_utils.py index db0f362b270b..759b09003e08 100644 --- a/colossalai/nn/layer/_common_utils.py +++ b/colossalai/nn/layer/_common_utils.py @@ -2,7 +2,8 @@ # -*- encoding: utf-8 -*- import math - +import collections.abc +from itertools import repeat import numpy as np from colossalai.utils.common import print_rank_0 import torch @@ -11,6 +12,30 @@ from torch import Tensor, nn +class CheckpointModule(nn.Module): + def __init__(self, checkpoint: bool = True): + super().__init__() + self.checkpoint = checkpoint + self._use_checkpoint = checkpoint + + def _forward(self, *args, **kwargs): + raise NotImplementedError( + 'CheckpointModule should implement _forward method instead of origin forward') + + def forward(self, *args, **kwargs): + if self._use_checkpoint: + return checkpoint(self._forward, *args, **kwargs) + else: + return self._forward(*args, **kwargs) + + def train(self, mode: bool = True): + self._use_checkpoint = self.checkpoint + return 
super().train(mode=mode) + + def eval(self): + self._use_checkpoint = False + return super().eval() + def divide(numerator, denominator): """ only allow exact division """ assert numerator % denominator == 0, \ @@ -18,14 +43,6 @@ def divide(numerator, denominator): return numerator // denominator -def gelu(x: Tensor) -> Tensor: - """Implementation of the gelu activation function. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): - 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - """ - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - - def swish(x: Tensor) -> Tensor: return x * torch.sigmoid(x) @@ -33,33 +50,23 @@ def swish(x: Tensor) -> Tensor: ACT2FN = {"gelu": torch.nn.functional.gelu, "relu": torch.nn.functional.relu, "swish": swish} -def set_tensor_parallel_attribute(param, size): - # if not hasattr(param, IS_TENSOR_PARALLEL): +def set_tensor_parallel_attribute_by_size(param, size): setattr(param, IS_TENSOR_PARALLEL, True) - # if not hasattr(param, NUM_PARTITIONS): setattr(param, NUM_PARTITIONS, size // np.prod(param.shape)) -class CheckpointModule(nn.Module): - def __init__(self, checkpoint: bool = True): - super().__init__() - self.checkpoint = checkpoint - self._use_checkpoint = checkpoint +def set_tensor_parallel_attribute_by_partition(param, num_partitions): + setattr(param, IS_TENSOR_PARALLEL, True) + setattr(param, NUM_PARTITIONS, num_partitions) - def _forward(self, *args): - raise NotImplementedError( - 'CheckpointModule should implement _forward method instead of origin forward') +# From PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) - def forward(self, *args): - if self._use_checkpoint: - return checkpoint(self._forward, *args) - else: - return self._forward(*args) + return parse - def train(self, mode: bool = True): - self._use_checkpoint = self.checkpoint - return 
super().train(mode=mode) - def eval(self): - self._use_checkpoint = False - return super().eval() +to_2tuple = _ntuple(2) diff --git a/colossalai/nn/layer/non_parallel_layers/__init__.py b/colossalai/nn/layer/non_parallel_layers/__init__.py new file mode 100644 index 000000000000..6a9883141a51 --- /dev/null +++ b/colossalai/nn/layer/non_parallel_layers/__init__.py @@ -0,0 +1,8 @@ +from ._vit import (ViTBlock, VanillaViTAttention, VanillaViTBlock, VanillaViTDropPath, + VanillaViTHead, VanillaViTMLP, VanillaViTPatchEmbedding) + + +__all__ = [ + 'ViTBlock', 'VanillaViTAttention', 'VanillaViTBlock', 'VanillaViTDropPath', + 'VanillaViTHead', 'VanillaViTMLP', 'VanillaViTPatchEmbedding' +] diff --git a/colossalai/nn/layer/vanilla_vision_transformer/layers.py b/colossalai/nn/layer/non_parallel_layers/_vit.py similarity index 88% rename from colossalai/nn/layer/vanilla_vision_transformer/layers.py rename to colossalai/nn/layer/non_parallel_layers/_vit.py index 6f7ec4c7cb4c..59a12fee2df4 100644 --- a/colossalai/nn/layer/vanilla_vision_transformer/layers.py +++ b/colossalai/nn/layer/non_parallel_layers/_vit.py @@ -1,23 +1,47 @@ -import collections.abc -from itertools import repeat +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + import torch from torch import nn as nn +from colossalai.builder import build_layer from colossalai.registry import LAYERS +from .._common_utils import to_2tuple -# From PyTorch internals -def _ntuple(n): - def parse(x): - if isinstance(x, collections.abc.Iterable): - return x - return tuple(repeat(x, n)) - - return parse +@LAYERS.register_module +class ViTBlock(nn.Module): + """Vision Transformer block + + :param attention_cfg: config of attention layer + :type attention_cfg: dict + :param droppath_cfg: config of drop path + :type droppath_cfg: dict + :param mlp_cfg: config of MLP layer + :type mlp_cfg: dict + :param norm_cfg: config of normlization layer + :type norm_cfg: dict + """ + def __init__(self, + attention_cfg: dict, + droppath_cfg: 
dict, + mlp_cfg: dict, + norm_cfg: dict, + ): + super().__init__() + self.norm1 = build_layer(norm_cfg) + self.attn = build_layer(attention_cfg) + self.drop_path = build_layer( + droppath_cfg) if droppath_cfg['drop_path'] > 0. else nn.Identity() + self.norm2 = build_layer(norm_cfg) + self.mlp = build_layer(mlp_cfg) -to_2tuple = _ntuple(2) + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x @LAYERS.register_module diff --git a/colossalai/nn/layer/parallel_1d/__init__.py b/colossalai/nn/layer/parallel_1d/__init__.py index cf262053a60b..85272d7c01bd 100644 --- a/colossalai/nn/layer/parallel_1d/__init__.py +++ b/colossalai/nn/layer/parallel_1d/__init__.py @@ -1,7 +1,7 @@ from .layers import Linear1D_Col, Linear1D_Row from .layers import MixedFusedLayerNorm1D as LayerNorm1D from ._transformer import TransformerMLP1D, TransformerSelfAttention1D, TransformerLayer1D -from ._vit import ViTMLP1D, ViTSelfAttention1D, ViTHead1D, ViTPatchEmbedding1D, ViTTokenFuser1D, ViTHeadNormal +from ._vit import ViTMLP1D, ViTSelfAttention1D, ViTHead1D, ViTPatchEmbedding1D, ViTTokenFuser1D, ViTHead diff --git a/colossalai/nn/layer/parallel_1d/_vit.py b/colossalai/nn/layer/parallel_1d/_vit.py index e45a0e3dfa0d..dca3d176867f 100644 --- a/colossalai/nn/layer/parallel_1d/_vit.py +++ b/colossalai/nn/layer/parallel_1d/_vit.py @@ -11,12 +11,12 @@ from colossalai.context import seed, ParallelMode from colossalai.core import global_context as gpc from colossalai.nn.layer._common_utils import divide, ACT2FN -from colossalai.nn.layer.vanilla_vision_transformer.layers import to_2tuple from colossalai.registry import LAYERS from colossalai.utils import checkpoint from colossalai.utils import get_current_device from .layers import Linear1D_Col, Linear1D_Row from ..base_layer import ParallelLayer +from .._common_utils import to_2tuple from ..fused_bias_gelu import bias_gelu_impl @@ -70,11 +70,10 @@ def __init__(self, 
dtype=dtype, gather_output=False, skip_bias_add=skip_dense_1_add_bias, - init_weight=weight_init, + init_weight=weight_init, init_bias=weight_init ) - # Project back to h. self.dense_2 = Linear1D_Row( int(self.mlp_ratio * self.in_features), @@ -155,7 +154,7 @@ def __init__(self, hidden_size, 3 * hidden_size, dtype=dtype, - init_weight=weight_init, + init_weight=weight_init, init_bias=init_bias ) self.attention_dropout = nn.Dropout(attention_dropout_prob) @@ -172,7 +171,7 @@ def __init__(self, def _forward(self, hidden_states: Tensor) -> Tensor: query_key_value = self.query_key_value(hidden_states) new_qkv_shape = query_key_value.shape[:-1] + \ - (self.num_attention_heads_per_partition, 3 * self.attention_head_size) + (self.num_attention_heads_per_partition, 3 * self.attention_head_size) query_key_value = query_key_value.view(new_qkv_shape) query_key_value = query_key_value.permute((0, 2, 1, 3)) query_layer, key_layer, value_layer = torch.chunk( @@ -181,7 +180,7 @@ def _forward(self, hidden_states: Tensor) -> Tensor: attention_scores = torch.matmul( query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / \ - math.sqrt(self.attention_head_size) + math.sqrt(self.attention_head_size) attention_probs = self.softmax(attention_scores) @@ -191,7 +190,7 @@ def _forward(self, hidden_states: Tensor) -> Tensor: context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.transpose(1, 2) new_context_layer_shape = context_layer.size()[ - :-2] + (self.hidden_size_per_partition,) + :-2] + (self.hidden_size_per_partition,) context_layer = context_layer.reshape(new_context_layer_shape) output = self.dense(context_layer) output = self.dropout(output) @@ -250,6 +249,7 @@ def forward(self, x: Tensor) -> Tensor: x = self.linear(x) return x + @LAYERS.register_module class ViTHead(ParallelLayer): """Output layer for 1D parallel Vision Transformer @@ -271,10 +271,10 @@ def __init__(self, self.linear = nn.Linear( hidden_size, num_classes, 
- dtype = dtype + dtype=dtype ) self._broadcast_linear_params() - + def _broadcast_linear_params(self) -> None: self.to(get_current_device()) ranks = gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D) @@ -289,6 +289,7 @@ def forward(self, x: Tensor) -> Tensor: x = self.linear(x) return x + @LAYERS.register_module class ViTPatchEmbedding1D(ParallelLayer): """ 2D Image to Patch Embedding @@ -325,10 +326,10 @@ def __init__(self, self.embed_dim = embed_dim self.proj = nn.Conv2d(in_chans, - self.embed_dim, - kernel_size=patch_size, - stride=patch_size - ) + self.embed_dim, + kernel_size=patch_size, + stride=patch_size + ) if weight_init == 'jax': fan_in, _ = _calculate_fan_in_and_fan_out(self.proj.weight) @@ -398,14 +399,13 @@ def __init__(self, # move to cuda before broadcast self.to(get_current_device()) - dist.broadcast(self.pos_embed, - src=gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], - group=gpc.get_group(ParallelMode.TENSOR)) + dist.broadcast(self.pos_embed, + src=gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], + group=gpc.get_group(ParallelMode.TENSOR)) self.pos_drop = nn.Dropout(p=drop_rate) def forward(self, x: Tensor) -> Tensor: cls_token = self.cls_token.expand(x.shape[0], -1, -1) - x = torch.cat((cls_token, x), dim=1) + x = torch.cat((cls_token, x), dim=1) x = self.pos_drop(x + self.pos_embed) return x.contiguous() - diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index 6158da07a634..796e043869e9 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -18,7 +18,7 @@ from colossalai.registry import LAYERS from colossalai.utils import get_current_device from ._operation import FusedLayerNormAffineFunction1D -from .._common_utils import divide, set_tensor_parallel_attribute +from .._common_utils import divide, set_tensor_parallel_attribute_by_partition from .._parallel_utilities import reduce_grad, reduce_input, gather_forward_split_backward, \ 
split_forward_gather_backward from ..base_layer import ParallelLayer @@ -87,7 +87,7 @@ def __init__(self, with seed(ParallelMode.TENSOR): self.reset_parameters(init_weight, init_bias) self._set_tensor_parallel_attributes() - + def reset_parameters(self, init_weight, init_bias) -> None: assert init_weight in ('torch', 'jax', 'zero') assert init_bias in ('torch', 'jax', 'zero') @@ -119,9 +119,10 @@ def reset_parameters(self, init_weight, init_bias) -> None: init.zeros_(self.bias) def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.weight) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.weight, num_partition) if self.bias is not None: - set_tensor_parallel_attribute(self.bias) + set_tensor_parallel_attribute_by_partition(self.bias, num_partition) def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]: # Set up backprop all-reduce. @@ -186,8 +187,8 @@ def __init__(self, # Initialize weight. factory_kwargs = {'device': get_current_device(), 'dtype': dtype} self.weight = Parameter(torch.empty( - self.out_features, - self.input_size_per_partition, + self.out_features, + self.input_size_per_partition, **factory_kwargs)) if bias: @@ -204,7 +205,7 @@ def __init__(self, with seed(ParallelMode.TENSOR): self.reset_parameters(init_weight, init_bias) self._set_tensor_parallel_attributes() - + def reset_parameters(self, init_weight, init_bias) -> None: assert init_weight in ('torch', 'jax', 'zero') assert init_bias in ('torch', 'jax', 'zero') @@ -234,13 +235,13 @@ def reset_parameters(self, init_weight, init_bias) -> None: init.normal_(self.bias, std=1e-6) elif init_bias == 'zero': init.zeros_(self.bias) - dist.broadcast(self.bias, - src=gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0], - group=gpc.get_group(ParallelMode.PARALLEL_1D)) - + dist.broadcast(self.bias, + src=gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0], + group=gpc.get_group(ParallelMode.PARALLEL_1D)) def 
_set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.weight) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.weight, num_partition) def forward(self, input_: Tensor) -> Tensor: # Set up backprop all-reduce. @@ -258,13 +259,12 @@ def forward(self, input_: Tensor) -> Tensor: return output else: return output, self.bias - @LAYERS.register_module class MixedFusedLayerNorm1D(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5): + def __init__(self, normalized_shape, eps=1e-5): super(MixedFusedLayerNorm1D, self).__init__() if isinstance(normalized_shape, numbers.Integral): @@ -275,12 +275,10 @@ def __init__(self, normalized_shape, eps=1e-5): self.bias = Parameter(torch.Tensor(*normalized_shape)) self.reset_parameters() + def reset_parameters(self): + init.ones_(self.weight) + init.zeros_(self.bias) - def reset_parameters(self): - init.ones_(self.weight) - init.zeros_(self.bias) - - - def forward(self, input): - return FusedLayerNormAffineFunction1D.apply( - input, self.weight, self.bias, self.normalized_shape,self.eps) + def forward(self, input): + return FusedLayerNormAffineFunction1D.apply( + input, self.weight, self.bias, self.normalized_shape, self.eps) diff --git a/colossalai/nn/layer/parallel_2d/_vit.py b/colossalai/nn/layer/parallel_2d/_vit.py index 3f8ed2a437aa..70734b345c31 100644 --- a/colossalai/nn/layer/parallel_2d/_vit.py +++ b/colossalai/nn/layer/parallel_2d/_vit.py @@ -10,13 +10,14 @@ from colossalai.context import seed, ParallelMode from colossalai.nn.layer._common_utils import divide, ACT2FN from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization, get_summa_dim_from_env -from colossalai.nn.layer.vanilla_vision_transformer.layers import to_2tuple + from colossalai.registry import LAYERS from colossalai.utils import checkpoint from colossalai.utils import get_current_device +from colossalai.core import global_context as gpc from 
._operation import AllGatherLast, SplitFirst from .layers import Linear2D -from .._common_utils import set_tensor_parallel_attribute +from .._common_utils import set_tensor_parallel_attribute_by_partition, to_2tuple from ..base_layer import ParallelLayer from ..fused_bias_gelu import bias_gelu_impl @@ -72,7 +73,6 @@ def __init__(self, skip_bias_add=skip_dense_1_add_bias ) - # Project back to h. self.dense_2 = Linear2D( self.mlp_ratio * self.in_features, @@ -168,7 +168,7 @@ def __init__(self, def _forward(self, hidden_states: Tensor) -> Tensor: query_key_value = self.query_key_value(hidden_states) new_qkv_shape = query_key_value.shape[:-1] + \ - (self.num_attention_heads, 3 * self.attention_head_size) + (self.num_attention_heads, 3 * self.attention_head_size) query_key_value = query_key_value.view(new_qkv_shape) query_key_value = query_key_value.permute((0, 2, 1, 3)) query_layer, key_layer, value_layer = torch.chunk( @@ -177,7 +177,7 @@ def _forward(self, hidden_states: Tensor) -> Tensor: attention_scores = torch.matmul( query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / \ - math.sqrt(self.attention_head_size) + math.sqrt(self.attention_head_size) attention_probs = self.softmax(attention_scores) @@ -187,7 +187,7 @@ def _forward(self, hidden_states: Tensor) -> Tensor: context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.transpose(1, 2) new_context_layer_shape = context_layer.size()[ - :-2] + (self.all_head_size,) + :-2] + (self.all_head_size,) context_layer = context_layer.reshape(new_context_layer_shape) output = self.dense(context_layer) @@ -284,11 +284,11 @@ def __init__(self, with seed(ParallelMode.TENSOR): self.proj = nn.Conv2d(in_chans, - self.embed_dim, - kernel_size=patch_size, - stride=patch_size, - device=get_current_device() - ) + self.embed_dim, + kernel_size=patch_size, + stride=patch_size, + device=get_current_device() + ) self._set_tensor_parallel_attribute() if weight_init == 'jax': @@ 
-299,8 +299,9 @@ def __init__(self, nn.init.zeros_(self.proj.bias) def _set_tensor_parallel_attribute(self): - set_tensor_parallel_attribute(self.proj.weight) - set_tensor_parallel_attribute(self.proj.bias) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.proj.weight, num_partition) + set_tensor_parallel_attribute_by_partition(self.proj.bias, num_partition) def forward(self, x: Tensor) -> Tensor: B, C, H, W = x.shape @@ -377,8 +378,9 @@ def __init__(self, self._set_tensor_parallel_attribute() def _set_tensor_parallel_attribute(self): - set_tensor_parallel_attribute(self.cls_token) - set_tensor_parallel_attribute(self.pos_embed) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.cls_token, num_partition) + set_tensor_parallel_attribute_by_partition(self.pos_embed, num_partition) def forward(self, x: Tensor) -> Tensor: # stole cls_tokens impl from Phil Wang, thanks diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/nn/layer/parallel_2d/layers.py index 396d55e239aa..f2935435633a 100644 --- a/colossalai/nn/layer/parallel_2d/layers.py +++ b/colossalai/nn/layer/parallel_2d/layers.py @@ -11,7 +11,7 @@ from colossalai.utils import get_current_device from ._operation import Matmul_AB_2D, Add_Bias_2D, _LayerNorm_2D from ._utils import get_summa_dim_from_env, assert_summa_initialization -from .._common_utils import divide, set_tensor_parallel_attribute +from .._common_utils import divide, set_tensor_parallel_attribute_by_partition from ..base_layer import ParallelLayer @@ -78,9 +78,10 @@ def __init__(self, self._set_tensor_parallel_attributes() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.weight) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.weight, num_partition) if self.bias is not None: - set_tensor_parallel_attribute(self.bias) + 
set_tensor_parallel_attribute_by_partition(self.bias, num_partition) def reset_parameters(self, init_weight, init_bias) -> None: assert init_weight in ('torch', 'jax', 'zero') @@ -216,8 +217,9 @@ def __init__(self, self._set_tensor_parallel_attributes() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.gamma) - set_tensor_parallel_attribute(self.beta) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.gamma, num_partition) + set_tensor_parallel_attribute_by_partition(self.beta, num_partition) def forward(self, x: Tensor) -> Tensor: with torch.no_grad(): diff --git a/colossalai/nn/layer/parallel_2p5d/_transformer.py b/colossalai/nn/layer/parallel_2p5d/_transformer.py index 55cdddef6f94..ed469ba7ddad 100644 --- a/colossalai/nn/layer/parallel_2p5d/_transformer.py +++ b/colossalai/nn/layer/parallel_2p5d/_transformer.py @@ -47,13 +47,14 @@ def __init__(self, assert_tesseract_initialization() self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env() self.in_features = in_features + self.skip_bias_add = skip_bias_add # Project to h * mlp_ratio. 
self.dense_1 = Linear2p5D( in_features, int(mlp_ratio * in_features), dtype=dtype, - skip_bias_add=self.skip_bias_add + skip_bias_add=skip_bias_add ) assert act_func in ACT2FN.keys(), f'Invalid value for argument act_func, ' \ @@ -65,7 +66,7 @@ def __init__(self, int(mlp_ratio * in_features), in_features, dtype=dtype, - skip_bias_add=self.skip_bias_add + skip_bias_add=skip_bias_add ) self.dropout = nn.Dropout(dropout_prob) self.layernorm = LayerNorm2p5D(in_features, dtype=dtype) @@ -140,7 +141,7 @@ def __init__(self, def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor: query_key_value = self.query_key_value(hidden_states) new_qkv_shape = query_key_value.shape[:-1] + \ - (self.num_attention_heads, 3 * self.attention_head_size) + (self.num_attention_heads, 3 * self.attention_head_size) query_key_value = query_key_value.view(new_qkv_shape) query_key_value = query_key_value.permute((0, 2, 1, 3)) query_layer, key_layer, value_layer = torch.chunk( @@ -149,7 +150,7 @@ def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor: attention_scores = torch.matmul( query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / \ - math.sqrt(self.attention_head_size) + math.sqrt(self.attention_head_size) attention_scores = attention_scores + attention_mask attention_probs = nn.Softmax(dim=-1)(attention_scores) attention_probs = self.attention_dropout(attention_probs) @@ -157,7 +158,7 @@ def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor: context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute((0, 2, 1, 3)).contiguous() new_context_layer_shape = context_layer.size()[ - :-2] + (self.all_head_size,) + :-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) output = self.dense(context_layer) diff --git a/colossalai/nn/layer/parallel_2p5d/_vit.py b/colossalai/nn/layer/parallel_2p5d/_vit.py index 40f670db52bc..180e27b3e13f 100644 
--- a/colossalai/nn/layer/parallel_2p5d/_vit.py +++ b/colossalai/nn/layer/parallel_2p5d/_vit.py @@ -9,7 +9,6 @@ from colossalai.context import seed, ParallelMode from colossalai.core import global_context as gpc -from colossalai.nn.layer.vanilla_vision_transformer.layers import to_2tuple from colossalai.registry import LAYERS from colossalai.utils import checkpoint from colossalai.utils import get_current_device @@ -17,10 +16,10 @@ from ._utils import assert_tesseract_initialization, \ get_tesseract_dim_dep_from_env from .layers import Linear2p5D -from .._common_utils import ACT2FN, divide, CheckpointModule -from .._common_utils import set_tensor_parallel_attribute from ..base_layer import ParallelLayer from ..fused_bias_gelu import bias_gelu_impl +from .._common_utils import (ACT2FN, divide, to_2tuple, + set_tensor_parallel_attribute_by_partition) @LAYERS.register_module @@ -70,7 +69,7 @@ def __init__(self, self.in_features, self.mlp_ratio * self.in_features, dtype=dtype, - init_weight=weight_init, + init_weight=weight_init, init_bias=weight_init, skip_bias_add=skip_dense_1_add_bias ) @@ -82,7 +81,7 @@ def __init__(self, self.mlp_ratio * self.in_features, self.in_features, dtype=dtype, - init_weight=weight_init, + init_weight=weight_init, init_bias=weight_init ) self.dropout = nn.Dropout(dropout_prob) @@ -160,7 +159,7 @@ def __init__(self, hidden_size, 3 * hidden_size, dtype=dtype, - init_weight=weight_init, + init_weight=weight_init, init_bias=self.init_bias ) self.attention_dropout = nn.Dropout(attention_dropout_prob) @@ -168,7 +167,7 @@ def __init__(self, hidden_size, hidden_size, dtype=dtype, - init_weight=weight_init, + init_weight=weight_init, init_bias=self.init_bias ) self.dropout = nn.Dropout(hidden_dropout_prob) @@ -177,7 +176,7 @@ def __init__(self, def _forward(self, hidden_states: Tensor) -> Tensor: query_key_value = self.query_key_value(hidden_states) new_qkv_shape = query_key_value.shape[:-1] + \ - (self.num_attention_heads, 3 * 
self.attention_head_size) + (self.num_attention_heads, 3 * self.attention_head_size) query_key_value = query_key_value.view(new_qkv_shape) query_key_value = query_key_value.permute((0, 2, 1, 3)) query_layer, key_layer, value_layer = torch.chunk( @@ -186,7 +185,7 @@ def _forward(self, hidden_states: Tensor) -> Tensor: attention_scores = torch.matmul( query_layer, key_layer.transpose(-1, -2)) attention_scores = attention_scores / \ - math.sqrt(self.attention_head_size) + math.sqrt(self.attention_head_size) attention_probs = self.softmax(attention_scores) @@ -196,7 +195,7 @@ def _forward(self, hidden_states: Tensor) -> Tensor: context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.transpose(1, 2) new_context_layer_shape = context_layer.size()[ - :-2] + (self.all_head_size,) + :-2] + (self.all_head_size,) context_layer = context_layer.reshape(new_context_layer_shape) output = self.dense(context_layer) @@ -246,7 +245,7 @@ def __init__(self, hidden_size, num_classes, dtype=dtype, - init_weight=self.init_weight, + init_weight=self.init_weight, init_bias=self.init_bias ) @@ -291,15 +290,15 @@ def __init__(self, img_size[1] // patch_size[1]) self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = flatten - self.embed_dim = embed_dim // (self.tesseract_dep * self.tesseract_dim ** 2) # * + self.embed_dim = embed_dim // (self.tesseract_dep * self.tesseract_dim ** 2) # * with seed(ParallelMode.TENSOR): self.proj = nn.Conv2d(in_chans, - self.embed_dim, - kernel_size=patch_size, - stride=patch_size, - device=get_current_device() - ) + self.embed_dim, + kernel_size=patch_size, + stride=patch_size, + device=get_current_device() + ) self._set_tensor_parallel_attribute() if weight_init == 'jax': @@ -310,8 +309,9 @@ def __init__(self, nn.init.zeros_(self.proj.bias) def _set_tensor_parallel_attribute(self): - set_tensor_parallel_attribute(self.proj.weight) - set_tensor_parallel_attribute(self.proj.bias) + num_partition = 
gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.proj.weight, num_partition) + set_tensor_parallel_attribute_by_partition(self.proj.bias, num_partition) def forward(self, x: Tensor) -> Tensor: B, C, H, W = x.shape @@ -388,8 +388,9 @@ def __init__(self, self._set_tensor_parallel_attribute() def _set_tensor_parallel_attribute(self): - set_tensor_parallel_attribute(self.cls_token) - set_tensor_parallel_attribute(self.pos_embed) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.cls_token, num_partition) + set_tensor_parallel_attribute_by_partition(self.pos_embed, num_partition) def _broadcast_params(self, param) -> None: " broadcast to all column ranks for data consistency " @@ -397,12 +398,12 @@ def _broadcast_params(self, param) -> None: xz_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_2P5D_XZ) xz_group = gpc.get_group(ParallelMode.PARALLEL_2P5D_XZ) dist.broadcast(param, src=xz_rank[0], - group=xz_group) + group=xz_group) def _sync_grad_hook(self, grad) -> None: dist.all_reduce(grad, group=gpc.get_group( ParallelMode.PARALLEL_2P5D_XZ)) - grad = grad / self.tesseract_dim #/ self.tesseract_dep # * + grad = grad / self.tesseract_dim # / self.tesseract_dep # * return grad def forward(self, x: Tensor) -> Tensor: @@ -418,4 +419,3 @@ def forward(self, x: Tensor) -> Tensor: with seed(ParallelMode.TENSOR): x = self.pos_drop(x) return x - diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index 429b4e8373cd..224fa615fdc5 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -10,7 +10,7 @@ from colossalai.utils import get_current_device from ._operation import Matmul_AB_2p5D, Add_Bias_2p5D, _LayerNorm_2p5D from ._utils import get_tesseract_dim_dep_from_env, assert_tesseract_initialization -from .._common_utils import divide, set_tensor_parallel_attribute +from 
.._common_utils import divide, set_tensor_parallel_attribute_by_partition from ..base_layer import ParallelLayer @@ -76,9 +76,10 @@ def __init__(self, self._set_tensor_parallel_attributes() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.weight) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.weight, num_partition) if self.bias is not None: - set_tensor_parallel_attribute(self.bias) + set_tensor_parallel_attribute_by_partition(self.bias, num_partition) def reset_parameters(self, init_weight, init_bias) -> None: assert init_weight in ('torch', 'jax', 'zero') @@ -178,6 +179,7 @@ class LayerNorm2p5D(ParallelLayer): :param dtype: The dtype of parameters, defaults to None :type dtype: torch.dtype, optional """ + def __init__(self, normalized_shape: int, eps: float = 1e-05, @@ -213,8 +215,9 @@ def __init__(self, self._set_tensor_parallel_attribute() def _set_tensor_parallel_attribute(self): - set_tensor_parallel_attribute(self.gamma) - set_tensor_parallel_attribute(self.beta) + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.gamma, num_partition) + set_tensor_parallel_attribute_by_partition(self.beta, num_partition) def forward(self, x: Tensor) -> Tensor: with torch.no_grad(): diff --git a/colossalai/nn/layer/parallel_3d/_vit.py b/colossalai/nn/layer/parallel_3d/_vit.py index 09d9370433f1..46fb83b927b0 100644 --- a/colossalai/nn/layer/parallel_3d/_vit.py +++ b/colossalai/nn/layer/parallel_3d/_vit.py @@ -13,8 +13,7 @@ from colossalai.utils import checkpoint, get_current_device from torch import Tensor, dtype, nn -from .._common_utils import ACT2FN, divide, set_tensor_parallel_attribute -from ..vanilla_vision_transformer.layers import to_2tuple +from .._common_utils import ACT2FN, divide, set_tensor_parallel_attribute_by_size, to_2tuple from ._utils import get_depth_from_env, get_parallel_mode_from_env, get_last_group from 
.layers import Linear3D @@ -36,6 +35,7 @@ class ViTPatchEmbedding3D(nn.Module): :param flatten: whether to flatten output tensor, defaults to True :type flatten: bool, optional """ + def __init__(self, img_size: int, patch_size: int, @@ -43,7 +43,7 @@ def __init__(self, embed_size: int, drop_prob: float, flatten: bool = True, - init_method: str ='torch'): + init_method: str = 'torch'): super().__init__() self.depth = get_depth_from_env() self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) @@ -83,10 +83,10 @@ def __init__(self, self._set_tensor_parallel_attributes() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.proj.weight, self.in_chans * self.embed_size * self.num_patches) - set_tensor_parallel_attribute(self.proj.bias, self.embed_size) - set_tensor_parallel_attribute(self.cls_token, 1 * 1 * self.embed_size) - set_tensor_parallel_attribute(self.pos_embed, 1 * (self.num_patches + 1) * self.embed_size) + set_tensor_parallel_attribute_by_size(self.proj.weight, self.in_chans * self.embed_size * self.num_patches) + set_tensor_parallel_attribute_by_size(self.proj.bias, self.embed_size) + set_tensor_parallel_attribute_by_size(self.cls_token, 1 * 1 * self.embed_size) + set_tensor_parallel_attribute_by_size(self.pos_embed, 1 * (self.num_patches + 1) * self.embed_size) def reset_parameters(self, init_weight, init_bias): fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.proj.weight) @@ -98,7 +98,7 @@ def reset_parameters(self, init_weight, init_bias): init_bias_(self.pos_embed, fan_in, init_method=init_weight) if init_bias != 'torch': init_bias_(self.proj.bias, fan_in, init_method=init_bias) - + self.to(get_current_device()) weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0] dist.broadcast(self.proj.weight, @@ -114,7 +114,7 @@ def reset_parameters(self, init_weight, init_bias): dist.broadcast(self.proj.bias, src=input_src_rank, group=gpc.get_group(self.input_parallel_mode)) - + 
self.proj.weight.register_hook(self._sync_grad_hook) self.proj.bias.register_hook(self._sync_grad_hook) self.cls_token.register_hook(self._sync_grad_hook) @@ -173,6 +173,7 @@ class ViTSelfAttention3D(nn.Module): :param bias: whether to add bias, defaults to True :type bias: bool, optional """ + def __init__(self, hidden_size: int, num_attention_heads: int, @@ -181,7 +182,7 @@ def __init__(self, dtype: dtype = None, bias: bool = True, checkpoint: bool = False, - init_method: str ='torch'): + init_method: str = 'torch'): super().__init__() self.depth = get_depth_from_env() # self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) @@ -210,8 +211,8 @@ def __init__(self, self.attention_dropout = nn.Dropout(attention_probs_dropout_prob) self.dense = Linear3D(self.hidden_size, self.hidden_size, - # self.output_parallel_mode, - # self.weight_parallel_mode, + # self.output_parallel_mode, + # self.weight_parallel_mode, dtype=dtype, bias=bias, init_weight=self.init_weight, @@ -225,7 +226,7 @@ def __init__(self, def _forward(self, hidden_states: Tensor) -> Tensor: query_key_value = self.query_key_value(hidden_states) new_qkv_shape = query_key_value.shape[:-1] + \ - (self.num_attention_heads, 3 * self.attention_head_size) + (self.num_attention_heads, 3 * self.attention_head_size) query_key_value = query_key_value.view(new_qkv_shape) query_key_value = query_key_value.permute((0, 2, 1, 3)) query_layer, key_layer, value_layer = torch.chunk(query_key_value, @@ -285,6 +286,7 @@ class ViTMLP3D(nn.Module): :param bias: whether to add bias, defaults to True :type bias: bool, optional """ + def __init__(self, hidden_size: int, mlp_ratio: int, @@ -365,6 +367,7 @@ class ViTHead3D(nn.Module): :param bias: whether to add bias, defaults to True :type bias: bool, optional """ + def __init__(self, in_features: int, num_classes: int, @@ -387,11 +390,11 @@ def __init__(self, if init_method == 'jax': self.init_weight = 'zero' self.init_bias = 'zero' - + self.linear = 
Linear3D(self.in_features, self.num_classes, - # self.input_parallel_mode, - # self.weight_parallel_mode, + # self.input_parallel_mode, + # self.weight_parallel_mode, dtype=dtype, bias=bias, init_weight=self.init_weight, diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/nn/layer/parallel_3d/layers.py index 775fc207a2a1..60e4a2c8a64f 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/nn/layer/parallel_3d/layers.py @@ -19,7 +19,7 @@ from torch.nn import Parameter from torch.nn import init as init -from .._common_utils import divide, set_tensor_parallel_attribute +from .._common_utils import divide, set_tensor_parallel_attribute_by_size from ._operation import (Add_3D, Matmul_AB_3D, Mul_3D, Sum_3D, layer_norm_3d, linear_3d) from ._utils import (get_depth_from_env, get_last_group, @@ -57,8 +57,8 @@ def __init__( self._set_tensor_parallel_attributes() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.weight, self.normalized_shape) - set_tensor_parallel_attribute(self.bias, self.normalized_shape) + set_tensor_parallel_attribute_by_size(self.weight, self.normalized_shape) + set_tensor_parallel_attribute_by_size(self.bias, self.normalized_shape) def reset_parameters(self): init.zeros_(self.bias) @@ -107,8 +107,8 @@ def __init__( # weight_parallel_mode: ParallelMode, bias: bool = True, dtype: dtype = None, - init_weight: str ='torch', - init_bias: str ='torch'): + init_weight: str = 'torch', + init_bias: str = 'torch'): super().__init__() self.in_features = in_features self.out_features = out_features @@ -142,16 +142,16 @@ def __init__( swap_in_out_group() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute(self.weight, self.in_features * self.out_features) + set_tensor_parallel_attribute_by_size(self.weight, self.in_features * self.out_features) if self.bias is not None: - set_tensor_parallel_attribute(self.bias, self.out_features) + set_tensor_parallel_attribute_by_size(self.bias, 
self.out_features) def reset_parameters(self, init_weight, init_bias) -> None: # setting fan_in, fan_out = self.in_features, self.out_features weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0] output_src_rank = gpc.get_ranks_in_group(self.output_parallel_mode)[0] - + # init weight init_weight_(self.weight, fan_in, fan_out, init_method=init_weight) dist.broadcast(self.weight, diff --git a/colossalai/nn/layer/parallel_vision_transformer/__init__.py b/colossalai/nn/layer/parallel_vision_transformer/__init__.py deleted file mode 100644 index 8adf9eb308d2..000000000000 --- a/colossalai/nn/layer/parallel_vision_transformer/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .layers import ViTBlock - -__all__ = ['ViTBlock'] diff --git a/colossalai/nn/layer/parallel_vision_transformer/layers.py b/colossalai/nn/layer/parallel_vision_transformer/layers.py deleted file mode 100644 index 8624f7f66e93..000000000000 --- a/colossalai/nn/layer/parallel_vision_transformer/layers.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from torch import nn as nn - -from colossalai.builder import build_layer -from colossalai.registry import LAYERS - - -@LAYERS.register_module -class ViTBlock(nn.Module): - """Vision Transformer block - - :param attention_cfg: config of attention layer - :type attention_cfg: dict - :param droppath_cfg: config of drop path - :type droppath_cfg: dict - :param mlp_cfg: config of MLP layer - :type mlp_cfg: dict - :param norm_cfg: config of normlization layer - :type norm_cfg: dict - """ - - def __init__(self, - attention_cfg: dict, - droppath_cfg: dict, - mlp_cfg: dict, - norm_cfg: dict, - ): - super().__init__() - self.norm1 = build_layer(norm_cfg) - self.attn = build_layer(attention_cfg) - self.drop_path = build_layer( - droppath_cfg) if droppath_cfg['drop_path'] > 0. 
else nn.Identity() - self.norm2 = build_layer(norm_cfg) - self.mlp = build_layer(mlp_cfg) - - def forward(self, x): - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - # x_ = x - # x_ = self.norm1(x_) - # if self.checkpoint: - # x_ = checkpoint(self.attn, x_) - # else: - # x_ = self.attn(x_) - # x_ = self.drop_path(x_) - # x = x + x_ - # - # x_ = x - # x_ = self.norm2(x_) - # if self.checkpoint: - # x_ = checkpoint(self.mlp, x_) - # else: - # x_ = self.mlp(x_) - # x_ = self.drop_path(x_) - # x = x + x_ - return x diff --git a/colossalai/nn/layer/vanilla_resnet/__init__.py b/colossalai/nn/layer/vanilla_resnet/__init__.py deleted file mode 100644 index 289b8749ee39..000000000000 --- a/colossalai/nn/layer/vanilla_resnet/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .basic_block import ResNetBasicBlock -from .bottleneck import ResNetBottleneck -from .reslayer import ResLayer - -__all__ = ['ResLayer', 'ResNetBottleneck', 'ResNetBasicBlock'] diff --git a/colossalai/nn/layer/vanilla_resnet/basic_block.py b/colossalai/nn/layer/vanilla_resnet/basic_block.py deleted file mode 100644 index 320dac2fde59..000000000000 --- a/colossalai/nn/layer/vanilla_resnet/basic_block.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from typing import Optional, Callable - -import torch.nn as nn -from torch import Tensor - -from colossalai.registry import LAYERS -from .conv import conv3x3 - - -@LAYERS.register_module -class ResNetBasicBlock(nn.Module): - """Basic ResNet block - """ - expansion: int = 1 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super().__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError( - 'BasicBlock only 
supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError( - "Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out diff --git a/colossalai/nn/layer/vanilla_resnet/bottleneck.py b/colossalai/nn/layer/vanilla_resnet/bottleneck.py deleted file mode 100644 index d75f9534b0f7..000000000000 --- a/colossalai/nn/layer/vanilla_resnet/bottleneck.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from typing import Optional, Callable - -import torch.nn as nn -from torch import Tensor - -from colossalai.registry import LAYERS -from .conv import conv3x3, conv1x1 - - -@LAYERS.register_module -class ResNetBottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. - # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
- - expansion: int = 4 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super().__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out diff --git a/colossalai/nn/layer/vanilla_resnet/conv.py b/colossalai/nn/layer/vanilla_resnet/conv.py deleted file mode 100644 index c918d94c4e1a..000000000000 --- a/colossalai/nn/layer/vanilla_resnet/conv.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch.nn as nn - - -def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: - """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=False, dilation=dilation) - - -def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, 
stride=stride, bias=False) diff --git a/colossalai/nn/layer/vanilla_resnet/reslayer.py b/colossalai/nn/layer/vanilla_resnet/reslayer.py deleted file mode 100644 index 4e1b48c5e8b5..000000000000 --- a/colossalai/nn/layer/vanilla_resnet/reslayer.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import torch.nn as nn - -from colossalai.registry import LAYERS -from .conv import conv1x1 - - -@LAYERS.register_module -class ResLayer(nn.Module): - - def __init__(self, - block_type: str, - norm_layer_type: str, - inplanes: int, - planes: int, - blocks: int, - groups: int, - base_width: int, - stride: int = 1, - dilation: int = 1, - dilate: bool = False, - ): - super().__init__() - self.block = LAYERS.get_module(block_type) - self.norm_layer = LAYERS.get_module(norm_layer_type) - self.inplanes = inplanes - self.planes = planes - self.blocks = blocks - self.groups = groups - self.dilation = dilation - self.base_width = base_width - self.dilate = dilate - self.stride = stride - self.layer = self._make_layer() - - def _make_layer(self): - norm_layer = self.norm_layer - downsample = None - previous_dilation = self.dilation - if self.dilate: - self.dilation *= self.stride - self.stride = 1 - if self.stride != 1 or self.inplanes != self.planes * self.block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, self.planes * self.block.expansion, self.stride), - norm_layer(self.planes * self.block.expansion), - ) - - layers = [] - layers.append(self.block(self.inplanes, self.planes, self.stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = self.planes * self.block.expansion - for _ in range(1, self.blocks): - layers.append(self.block(self.inplanes, self.planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - def forward(self, x): - return self.layer(x) diff --git 
a/colossalai/nn/layer/vanilla_vision_transformer/__init__.py b/colossalai/nn/layer/vanilla_vision_transformer/__init__.py deleted file mode 100644 index 90d614e0a8ea..000000000000 --- a/colossalai/nn/layer/vanilla_vision_transformer/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .layers import (VanillaViTBlock, VanillaViTMLP, VanillaViTPatchEmbedding, - VanillaViTAttention, VanillaViTDropPath, VanillaViTHead) - -__all__ = [ - 'VanillaViTBlock', 'VanillaViTMLP', 'VanillaViTPatchEmbedding', - 'VanillaViTAttention', 'VanillaViTDropPath', 'VanillaViTHead' -] diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py index 6015c55c6dea..19c83b747407 100644 --- a/colossalai/nn/loss/__init__.py +++ b/colossalai/nn/loss/__init__.py @@ -1,4 +1,3 @@ -from .base_loss import BaseLoss from .cross_entropy_2d import CrossEntropyLoss2D from .cross_entropy_2p5d import CrossEntropyLoss2p5D from .cross_entropy_3d import CrossEntropyLoss3D diff --git a/colossalai/nn/loss/base_loss.py b/colossalai/nn/loss/base_loss.py deleted file mode 100644 index bf5bbe6b28e5..000000000000 --- a/colossalai/nn/loss/base_loss.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from abc import ABC, abstractmethod - - -class BaseLoss(ABC): - """Absctract loss class - """ - - @abstractmethod - def calc_loss(self, *args, **kwargs): - pass diff --git a/colossalai/nn/lr_scheduler/delayed.py b/colossalai/nn/lr_scheduler/delayed.py index 173d2f52c7a7..0f7bc1df6edc 100644 --- a/colossalai/nn/lr_scheduler/delayed.py +++ b/colossalai/nn/lr_scheduler/delayed.py @@ -48,8 +48,10 @@ def step(self, epoch=None): if self.finished: if epoch is None: self.after_scheduler.step(None) + self._last_lr = self.after_scheduler.get_last_lr() else: self.after_scheduler.step(epoch - self.delay_epochs) + self._last_lr = self.after_scheduler.get_last_lr() else: return super(DelayerScheduler, self).step(epoch) @@ -66,6 +68,7 @@ class WarmupScheduler(_LRScheduler): :param 
last_epoch: The index of last epoch, defaults to -1 :type last_epoch: int, optional """ + def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1): self.warmup_epochs = int(warmup_epochs) self.after_scheduler = after_scheduler @@ -85,8 +88,10 @@ def step(self, epoch=None): if self.finished: if epoch is None: self.after_scheduler.step(None) + self._last_lr = self.after_scheduler.get_last_lr() else: self.after_scheduler.step(epoch - self.warmup_epochs) + self._last_lr = self.after_scheduler.get_last_lr() else: return super().step(epoch) @@ -136,7 +141,9 @@ def step(self, epoch=None): if self.finished: if epoch is None: self.after_scheduler.step(None) + self._last_lr = self.after_scheduler.get_last_lr() else: self.after_scheduler.step(epoch - self.warmup_epochs) + self._last_lr = self.after_scheduler.get_last_lr() else: return super().step(epoch) diff --git a/colossalai/nn/lr_scheduler/multistep.py b/colossalai/nn/lr_scheduler/multistep.py index 5def4a1fac92..cdb89b53fbe2 100644 --- a/colossalai/nn/lr_scheduler/multistep.py +++ b/colossalai/nn/lr_scheduler/multistep.py @@ -12,7 +12,6 @@ class MultiStepLR(_MultiStepLR): number of epoch reaches one of the milestones. Notice that such decay can happen simultaneously with other changes to the learning rate from outside this scheduler. When last_epoch=-1, sets initial lr as lr. - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps @@ -34,7 +33,6 @@ def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, ga @LR_SCHEDULERS.register_module class MultiStepWarmupLR(WarmupScheduler): """Multi-step laerning rate scheduler with warmup. 
- :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps diff --git a/colossalai/nn/lr_scheduler/onecycle.py b/colossalai/nn/lr_scheduler/onecycle.py index 743855470cc0..4384e61e2c45 100644 --- a/colossalai/nn/lr_scheduler/onecycle.py +++ b/colossalai/nn/lr_scheduler/onecycle.py @@ -12,28 +12,21 @@ class OneCycleLR(_OneCycleLR): than the initial learning rate. This policy was initially described in the paper `Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates`_. - The 1cycle learning rate policy changes the learning rate after every batch. `step` should be called after a batch has been used for training. - This scheduler is not chainable. - Note also that the total number of steps in the cycle can be determined in one of two ways (listed in order of precedence): - #. A value for total_steps is explicitly provided. #. A number of epochs (epochs) and a number of steps per epoch (steps_per_epoch) are provided. In this case, the number of total steps is inferred by total_steps = epochs * steps_per_epoch - You must either provide a value for total_steps or provide a value for both epochs and steps_per_epoch. - The default behaviour of this scheduler follows the fastai implementation of 1cycle, which claims that "unpublished work has shown even better results by using only two phases". To mimic the behaviour of the original paper instead, set ``three_phase=True``. - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps @@ -71,7 +64,6 @@ class OneCycleLR(_OneCycleLR): number of *batches* computed, not the total number of epochs computed. When last_epoch=-1, the schedule is started from the beginning, defaults to -1 :type last_epoch: int, optional - .. 
_Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates: https://arxiv.org/abs/1708.07120 """ diff --git a/colossalai/nn/lr_scheduler/poly.py b/colossalai/nn/lr_scheduler/poly.py index ee77b2f9b2f9..ae9c1d2d245d 100644 --- a/colossalai/nn/lr_scheduler/poly.py +++ b/colossalai/nn/lr_scheduler/poly.py @@ -7,7 +7,6 @@ @LR_SCHEDULERS.register_module class PolynomialLR(_LRScheduler): """Polynomial learning rate scheduler. - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps @@ -43,7 +42,6 @@ def _get_closed_form_lr(self): @LR_SCHEDULERS.register_module class PolynomialWarmupLR(WarmupScheduler): """Polynomial learning rate scheduler with warmup. - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps diff --git a/colossalai/nn/lr_scheduler/torch.py b/colossalai/nn/lr_scheduler/torch.py index e739084b6fbb..abd0f4f39106 100644 --- a/colossalai/nn/lr_scheduler/torch.py +++ b/colossalai/nn/lr_scheduler/torch.py @@ -10,7 +10,6 @@ class LambdaLR(_LambdaLR): """Sets the learning rate of each parameter group to the initial lr times a given function. When last_epoch=-1, sets initial lr as lr. - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps @@ -33,7 +32,6 @@ def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) class MultiplicativeLR(_MultiplicativeLR): """Multiply the learning rate of each parameter group by the factor given in the specified function. When last_epoch=-1, sets initial lr as lr - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps @@ -58,7 +56,6 @@ class StepLR(_StepLR): step_size epochs. Notice that such decay can happen simultaneously with other changes to the learning rate from outside this scheduler. 
When last_epoch=-1, sets initial lr as lr - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps @@ -82,7 +79,6 @@ def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0. class ExponentialLR(_ExponentialLR): """Decays the learning rate of each parameter group by gamma every epoch. When last_epoch=-1, sets initial lr as lr - :param optimizer: Wrapped optimizer :type optimizer: torch.optim.Optimizer :param total_steps: number of total training steps diff --git a/colossalai/nn/model/__init__.py b/colossalai/nn/model/__init__.py index 5d5ccd96ec80..6ced1705408e 100644 --- a/colossalai/nn/model/__init__.py +++ b/colossalai/nn/model/__init__.py @@ -1,3 +1,3 @@ -from .base_model import BaseModel -from .vanilla_resnet import VanillaResNet -from .vision_transformer import * +from .model_from_config import ModelFromConfig + +__all__ = ['ModelFromConfig'] diff --git a/colossalai/nn/model/base_model.py b/colossalai/nn/model/model_from_config.py similarity index 92% rename from colossalai/nn/model/base_model.py rename to colossalai/nn/model/model_from_config.py index cbe38fefad70..24903ca3607d 100644 --- a/colossalai/nn/model/base_model.py +++ b/colossalai/nn/model/model_from_config.py @@ -8,10 +8,10 @@ from colossalai.builder import build_layer -class BaseModel(nn.Module, ABC): +class ModelFromConfig(nn.Module, ABC): def __init__(self): - super(BaseModel, self).__init__() + super(ModelFromConfig, self).__init__() self.layers = nn.ModuleList() self.layers_cfg = [] @@ -32,7 +32,6 @@ def init_weights(self): def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False): - """Use this function to override the state dict for saving checkpoints.""" return self.state_dict(destination, prefix, keep_vars) diff --git a/colossalai/nn/model/vanilla_resnet/__init__.py b/colossalai/nn/model/vanilla_resnet/__init__.py deleted file mode 100644 index 
1740de7dc2b7..000000000000 --- a/colossalai/nn/model/vanilla_resnet/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .resnet import VanillaResNet - -__all__ = ['VanillaResNet'] diff --git a/colossalai/nn/model/vanilla_resnet/resnet.py b/colossalai/nn/model/vanilla_resnet/resnet.py deleted file mode 100644 index 905889649615..000000000000 --- a/colossalai/nn/model/vanilla_resnet/resnet.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from typing import List, Optional - -import torch -import torch.nn as nn -from torch import Tensor - -from colossalai.registry import LAYERS -from colossalai.registry import MODELS -from ..base_model import BaseModel - - -@MODELS.register_module -class VanillaResNet(BaseModel): - """ResNet from - `"Deep Residual Learning for Image Recognition" `_. - """ - - def __init__( - self, - num_cls: int, - block_type: str, - layers: List[int], - norm_layer_type: str = 'BatchNorm2d', - in_channels: int = 3, - groups: int = 1, - width_per_group: int = 64, - zero_init_residual: bool = False, - replace_stride_with_dilation: Optional[List[bool]] = None, - dilations=(1, 1, 1, 1) - ) -> None: - super().__init__() - - self.inplanes = 64 - self.zero_init_residual = zero_init_residual - self.blocks = layers - self.block_expansion = LAYERS.get_module(block_type).expansion - self.dilations = dilations - self.reslayer_common_cfg = dict( - type='ResLayer', - block_type=block_type, - norm_layer_type=norm_layer_type, - groups=groups, - base_width=width_per_group - ) - - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - - self.layers_cfg = [ - # conv1 - dict(type='Conv2d', - 
in_channels=in_channels, - out_channels=self.inplanes, - kernel_size=7, - stride=2, - padding=3, - bias=False), - # bn1 - dict( - type=norm_layer_type, - num_features=self.inplanes - ), - # relu - dict( - type='ReLU', - inplace=True - ), - # maxpool - dict( - type='MaxPool2d', - kernel_size=3, - stride=2, - padding=1 - ), - # layer 1 - dict( - inplanes=self.inplanes, - planes=64, - blocks=self.blocks[0], - dilation=self.dilations[0], - **self.reslayer_common_cfg - ), - # layer 2 - dict( - inplanes=64 * self.block_expansion, - planes=128, - blocks=self.blocks[1], - stride=2, - dilate=replace_stride_with_dilation[0], - dilation=self.dilations[1], - **self.reslayer_common_cfg - ), - # layer 3 - dict( - inplanes=128 * self.block_expansion, - planes=256, - blocks=layers[2], - stride=2, - dilate=replace_stride_with_dilation[1], - dilation=self.dilations[2], - **self.reslayer_common_cfg - ), - # layer 4 - dict( - inplanes=256 * self.block_expansion, - planes=512, - blocks=layers[3], stride=2, - dilate=replace_stride_with_dilation[2], - dilation=self.dilations[3], - **self.reslayer_common_cfg - ), - # avg pool - dict( - type='AdaptiveAvgPool2d', - output_size=(1, 1) - ), - # flatten - dict( - type='LambdaWrapper', - func=lambda mod, x: torch.flatten(x, 1) - ), - # linear - dict( - type='Linear', - in_features=512 * self.block_expansion, - out_features=num_cls - ) - ] - - def forward(self, x: Tensor): - for layer in self.layers: - x = layer(x) - return x, - - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_( - m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
- # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, LAYERS.get_module('ResNetBottleneck')): - # type: ignore[arg-type] - nn.init.constant_(m.bn3.weight, 0) - elif isinstance(m, LAYERS.get_module('ResNetBasicBlock')): - # type: ignore[arg-type] - nn.init.constant_(m.bn2.weight, 0) diff --git a/colossalai/nn/model/vision_transformer/__init__.py b/colossalai/nn/model/vision_transformer/__init__.py deleted file mode 100644 index ab9d7e640b98..000000000000 --- a/colossalai/nn/model/vision_transformer/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .vision_transformer import VisionTransformerFromConfig - -__all__ = ['VisionTransformerFromConfig'] diff --git a/colossalai/nn/optimizer/__init__.py b/colossalai/nn/optimizer/__init__.py index f9993c4709d5..c084c5c8671d 100644 --- a/colossalai/nn/optimizer/__init__.py +++ b/colossalai/nn/optimizer/__init__.py @@ -1,14 +1,10 @@ -from .fp16_optimizer import FP16Optimizer +from .colossalai_optimizer import ColossalaiOptimizer from .fused_adam import FusedAdam from .fused_lamb import FusedLAMB from .fused_sgd import FusedSGD from .lamb import Lamb from .lars import Lars -from .zero_redundancy_optimizer_level_1 import ZeroRedundancyOptimizer_Level_1 -from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2 -from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3 __all__ = [ - 'ZeroRedundancyOptimizer_Level_1', 'ZeroRedundancyOptimizer_Level_2', 'ZeroRedundancyOptimizer_Level_3', - 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'FP16Optimizer', 'Lars' + 'ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars' ] diff --git a/colossalai/nn/optimizer/_utils.py b/colossalai/nn/optimizer/_utils.py deleted file mode 100644 index 31fc62213437..000000000000 --- a/colossalai/nn/optimizer/_utils.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: 
utf-8 -*- - -from colossalai.utils.common import print_rank_0 -import torch -from torch._six import inf - -try: - import colossal_C -except: - print('Colossalai should be built with cuda extension to use the FP16 optimizer') - -from ..multi_tensor_apply import multi_tensor_applier - -from colossalai.constants import IS_TENSOR_PARALLEL, TENSOR_PARALLEL_ATTRIBUTES, NUM_PARTITIONS -import torch.distributed as dist -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc - - -def is_model_parallel_parameter(p): - return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) - - -def _calc_l2_norm(grads): - norm = 0.0 - if len(grads) > 0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - norm, _ = multi_tensor_applier( - colossal_C.multi_tensor_l2norm, - dummy_overflow_buf, - [grads], - False # no per-parameter norm - ) - return norm - - -def _calc_lp(grads, norm_type): - norm = 0.0 - for grad in grads: - grad_norm = torch.norm(grad, norm_type) - norm += grad_norm ** norm_type - return norm - -# ======== Gradient Clipping ========= - - -def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): - """Clips gradient norm of an iterable of parameters whose gradients - are in fp32. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). 
- """ - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - - # Filter parameters based on: - # - grad should not be none - # - parameter should not be shared - # - should not be a replica due to tensor model parallelism - params = [] - for param in parameters: - if param.grad is not None: - # Make sure the grads are in fp32 - assert param.grad.type() == 'torch.cuda.FloatTensor' - params.append(param) - # Norm parameters. - max_norm = float(max_norm) - norm_type = float(norm_type) - - # Calculate norm. - if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in params) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - ops = [] - # Take max across all model-parallel GPUs. - if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1: - ops.append(dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=gpc.get_group( - ParallelMode.TENSOR), - async_op=True)) - if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - ops.append(dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=gpc.get_group( - ParallelMode.PIPELINE), - async_op=True)) - for req in ops: - req.wait() - total_norm = total_norm_cuda[0].item() - else: - tensor_parallel_grads = [] - no_tensor_parallel_grads = [] - for p in params: - if is_model_parallel_parameter(p): - reductor = (gpc.get_world_size(ParallelMode.TENSOR) / getattr(p, NUM_PARTITIONS)) ** (1 / norm_type) - tensor_parallel_grads.append(p.grad.data / reductor) - else: - no_tensor_parallel_grads.append(p.grad.data) - if norm_type == 2.0: - tensor_parallel_norm = _calc_l2_norm( - tensor_parallel_grads) ** norm_type - no_tensor_parallel_norm = _calc_l2_norm( - no_tensor_parallel_grads) ** norm_type - else: - tensor_parallel_norm = _calc_lp(tensor_parallel_grads, norm_type) - no_tensor_parallel_grads = _calc_lp( - no_tensor_parallel_grads, norm_type) - # Sum across all model-parallel GPUs. 
- if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0: - dist.all_reduce(tensor_parallel_norm, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.TENSOR)) - total_norm = tensor_parallel_norm + no_tensor_parallel_norm - if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: - dist.all_reduce(total_norm, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.PIPELINE)) - total_norm = total_norm ** (1.0 / norm_type) - if type(total_norm) == 'torch.cuda.FloatTensor': - total_norm = total_norm.item() - - # Scale. - clip_coeff = max_norm / (total_norm + 1.0e-6) - if clip_coeff < 1.0: - grads = [p.grad.detach() for p in params] - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(colossal_C.multi_tensor_scale, - dummy_overflow_buf, - [grads, grads], - clip_coeff) - - return total_norm - - -def count_zeros_fp32(parameters): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - - # Filter parameters based on: - # - grad should not be none - # - parameter should not be shared - # - should not be a replica due to tensor model parallelism - total_num_zeros = 0.0 - for param in parameters: - grad_not_none = param.grad is not None - is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) - if grad_not_none and is_not_tp_duplicate: - grad = param.grad.detach() - num_zeros = grad.numel() - torch.count_nonzero(grad) - total_num_zeros = num_zeros + total_num_zeros - - # Sum across all model-parallel GPUs. 
- ops = [] - ops.append(dist.all_reduce(total_num_zeros, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.TENSOR), - async_op=True)) - ops.append(dist.all_reduce(total_num_zeros, - op=dist.ReduceOp.SUM, - group=gpc.get_group(ParallelMode.PIPELINE), - async_op=True)) - for req in ops: - req.wait() - total_num_zeros = total_num_zeros.item() - - return total_num_zeros - - -def copy_tensor_parallel_attributes(src_tensor, dst_tensor): - for attr in TENSOR_PARALLEL_ATTRIBUTES: - if hasattr(src_tensor, attr): - val = getattr(src_tensor, attr) - setattr(dst_tensor, attr, val) - - -def param_is_not_tensor_parallel_duplicate(param): - return (hasattr(param, IS_TENSOR_PARALLEL) and - getattr(param, IS_TENSOR_PARALLEL)) or ( - gpc.get_local_rank(ParallelMode.TENSOR) == 0) diff --git a/colossalai/nn/optimizer/colossalai_optimizer.py b/colossalai/nn/optimizer/colossalai_optimizer.py new file mode 100644 index 000000000000..fb0c43903509 --- /dev/null +++ b/colossalai/nn/optimizer/colossalai_optimizer.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch +import torch.nn as nn +from torch import Tensor +from torch.optim import Optimizer +from colossalai.utils import clip_grad_norm_fp32 + + +class ColossalaiOptimizer(Optimizer): + + def __init__(self, optim: Optimizer): + self.optim = optim + + @property + def param_groups(self): + return self.optim.param_groups + + @property + def defaults(self): + return self.optim.defaults + + def add_param_group(self, *args, **kwargs): + return self.optim.add_param_group(*args, **kwargs) + + def step(self, *args, **kwargs): + return self.optim.step(*args, **kwargs) + + def zero_grad(self, *args, **kwargs): + self.optim.zero_grad(*args, **kwargs) + + def load_state_dict(self, *args, **kwargs): + self.optim.load_state_dict(*args, **kwargs) + + def state_dict(self): + return self.optim.state_dict() + + def backward(self, loss: Tensor): + loss.backward() + + def backward_by_grad(self, tensor: Tensor, grad: 
Tensor): + torch.autograd.backward(tensors=tensor, grad_tensors=grad) + + def clip_grad_norm(self, model: nn.Module, max_norm: float): + if max_norm > 0.0: + clip_grad_norm_fp32(model.parameters(), max_norm) diff --git a/colossalai/nn/optimizer/fused_adam.py b/colossalai/nn/optimizer/fused_adam.py index 5ab31b3635c0..8bcd3841adae 100644 --- a/colossalai/nn/optimizer/fused_adam.py +++ b/colossalai/nn/optimizer/fused_adam.py @@ -2,7 +2,7 @@ import torch from colossalai.registry import OPTIMIZERS -from ..multi_tensor_apply import multi_tensor_applier +from colossalai.utils import multi_tensor_applier @OPTIMIZERS.register_module diff --git a/colossalai/nn/optimizer/fused_lamb.py b/colossalai/nn/optimizer/fused_lamb.py index 14b1167a9d27..8a340a9f35eb 100644 --- a/colossalai/nn/optimizer/fused_lamb.py +++ b/colossalai/nn/optimizer/fused_lamb.py @@ -2,7 +2,7 @@ import torch from colossalai.registry import OPTIMIZERS -from ..multi_tensor_apply import multi_tensor_applier +from colossalai.utils import multi_tensor_applier @OPTIMIZERS.register_module diff --git a/colossalai/nn/optimizer/fused_sgd.py b/colossalai/nn/optimizer/fused_sgd.py index 3950c40be284..4986aa5f5bea 100644 --- a/colossalai/nn/optimizer/fused_sgd.py +++ b/colossalai/nn/optimizer/fused_sgd.py @@ -3,7 +3,7 @@ from torch.optim.optimizer import Optimizer, required from colossalai.registry import OPTIMIZERS -from ..multi_tensor_apply import multi_tensor_applier +from colossalai.utils import multi_tensor_applier @OPTIMIZERS.register_module diff --git a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_1.py b/colossalai/nn/optimizer/zero_redundancy_optimizer_level_1.py deleted file mode 100644 index 05848f1dd5d2..000000000000 --- a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_1.py +++ /dev/null @@ -1,707 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import math -from collections import defaultdict - -import torch -import torch.distributed as dist -from torch._utils import 
_flatten_dense_tensors, _unflatten_dense_tensors -from torch.optim import Optimizer - -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.registry import OPTIMIZER_WRAPPERS -from colossalai.utils import get_current_device, print_rank_0 - - -def get_alignment_padding(flattened_lean_size, sub_partition_id, sub_partition_size): - sub_partition_high_limit = (sub_partition_id + 1) * sub_partition_size - if sub_partition_high_limit <= flattened_lean_size: - return 0 - else: - return min(sub_partition_size, sub_partition_high_limit - flattened_lean_size) - - -def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_count): - group_paddings = [] - flattened_size = sum([tensor.numel() for tensor in tensor_list]) - for i in range(sub_partition_count): - padding = get_alignment_padding(flattened_size, i, sub_partition_size) - group_paddings.append(padding) - - return group_paddings - - -def _single_range_check(current_index, start_index, end_index, tensor_size): - offset = 0 - if (current_index >= start_index) and (current_index < end_index): - # Fully inside bounds - return True, offset - elif (start_index > current_index) and (start_index < (current_index + tensor_size)): - # Partially contained, compute offset - offset = start_index - current_index - return True, offset - else: - return False, offset - - -def _range_check(current_index, element_intervals, tensor_size): - results = [] - for comm_idx, interval in enumerate(element_intervals): - start_index, end_index = interval - contained, offset = _single_range_check( - current_index, start_index, end_index, tensor_size) - if contained: - results.append((contained, offset, comm_idx)) - if len(results) == 0: - return [(False, 0, -1)] - return results - - -@OPTIMIZER_WRAPPERS.register_module -class ZeroRedundancyOptimizer_Level_1(Optimizer): - """ - ZeroRedundancyOptimizer_Level_1 designed to reduce the memory footprint - 
required for training large deep learning models. - - For more details please see ZeRO: Memory Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - This version aligns with stage-1 in the paper above. - """ - - def __init__(self, - init_optimizer: Optimizer, - dp_parallel_mode: ParallelMode = ParallelMode.DATA, - max_elements_per_comm=5e8, - verbose=False - ): - # TODO: this class does not work with fp16 AMP_TYPE.PARALLEL, fix it - assert get_current_device() != 'cpu', 'ZeRO optimizer cannot be used on CPU only' - - self.flatten = _flatten_dense_tensors - self.unflatten = _unflatten_dense_tensors - self.optimizer = init_optimizer - self.dp_parallel_mode = dp_parallel_mode - self.verbose = verbose - - # for compatibility with pytorch optim - self.defaults = init_optimizer.defaults - - # param flattened by groups - self._param_groups = [] - self._param_groups_flat = [] - - # parallel_sub_partitioned_fp16_groups[group-idx] -> [comm-ids] -> [rank-ids] - self.parallel_sub_partitioned_groups = [] - # same underlying data as above but viewed as: [groups] -> [rank-ids] -> [comm-ids] - self.parallel_comm_sub_partitioned_groups = [] - - # param partition info - # parameters in each group that will not be updated by this process directly - self.params_not_local = [] - - # parameters that will be updated by this process directly - self.params_in_rank_sub_partitions = [] - - # parameter offsets for parameters in sub-partitions. 
Parameter - # boundaries may not align with sub-partition boundaries - # so we need to keep track of the offsets - self.params_in_rank_sub_partitions_offsets = [] - - # number of elements per sub-partition in each group - self.sub_partition_sizes = [] - - # number of communication intervals for each group - self.num_comm_intervals_per_group = [] - - self.local_rank = gpc.get_local_rank(self.dp_parallel_mode) - self.partition_count = self.world_size = gpc.get_world_size( - self.dp_parallel_mode) - - self.group_paddings = [] - self.default_device = self.optimizer.param_groups[0]['params'][0].device - - # max elems per param group - self.max_elems_per_comm = [] - - # loop to deal with groups - for i, param_group in enumerate(self.optimizer.param_groups): - # push this group to list before modify - self._param_groups.append(param_group['params']) - - # calculate best max elements per comm based to minimize padding - self.max_elems_per_comm.append( - self.best_max_elems_per_comm( - num_elements=sum(t.numel() for t in self._param_groups[i]), - max_elements_per_comm=max_elements_per_comm - ) - ) - - # flattens all tensors into single 1d tensor aligned with sub-partition size for later dividing - # RS: create aligned sub-partitions - flat_aligned_params = self.flatten_dense_tensors_sub_partition_aligned( - tensor_list=self._param_groups[i], - max_elements_per_comm=self.max_elems_per_comm[i], - ) - self._param_groups_flat.append(flat_aligned_params) - - updated_params = self.unflatten(self._param_groups_flat[i], - self._param_groups[i]) - for p, q in zip(self._param_groups[i], updated_params): - p.data = q.data - - # divide the flat weights into near equal partition equal to the data parallel degree - # each process will compute on a different part of the partition - # RS: split into two layer list -> [comm-id] -> [sub-partitions per rank] - comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ - 
self.get_data_parallel_sub_partitions( - tensor=self._param_groups_flat[i], - max_elements_per_comm=self.max_elems_per_comm[i], - ) - self.parallel_comm_sub_partitioned_groups.append( - comm_partitions) # comm -> rank - self.parallel_sub_partitioned_groups.append( - dp_sub_partitions) # rank -> comm - self.sub_partition_sizes.append(sub_partition_size) - self.num_comm_intervals_per_group.append(num_comm_intervals) - - # Compute sub_partition paddings - sub_partition_paddings = get_group_alignment_padding( - tensor_list=self._param_groups[i], - sub_partition_size=sub_partition_size, - sub_partition_count=num_comm_intervals * self.partition_count) - self.group_paddings.append(sub_partition_paddings) - - # modify optimizer of have flat master weight - param_group['params'] = self.parallel_sub_partitioned_groups[i][self.local_rank] - - # RS: divide up the sub-partitions and keep track of offsets for each param - # partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size(group=self.dp_process_group) - params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local = self.get_all_sub_partition_info( - tensor_list=self._param_groups[i], - all_element_intervals=element_intervals, - ) - - self.params_in_rank_sub_partitions.append( - params_in_rank_sub_partition) - self.params_not_local.append(params_not_local) - self.params_in_rank_sub_partitions_offsets.append( - params_in_rank_sub_partitions_offsets) - - self.local_sub_partitions_of_groups = [ - group[self.local_rank] for group in self.parallel_sub_partitioned_groups] - self._initialize_optimizer_states() - - @property - def state(self): - return self.optimizer.state - - @state.setter - def state(self, value): - self.optimizer.state = value - - @property - def param_groups(self): - # LSG: return the full param groups instead of local partitions - # of the param groups for compatibility with torch.cuda.amp - param_groups = [] - - for group_id, group in 
enumerate(self.optimizer.param_groups): - group_containing_all_param = { - 'params': self._param_groups[group_id], - **{k: v for k, v in group.items() if k != 'params'} - } - # LSG: for compatibility with unknown bug with lr scheduler - # TODO: fix this - group_containing_all_param.setdefault('initial_lr', group['lr']) - param_groups.append(group_containing_all_param) - return param_groups - - @param_groups.setter - def param_groups(self, value): - self.optimizer.param_groups = value - - def _initialize_optimizer_states(self): - for group_idx, group in enumerate(self.local_sub_partitions_of_groups): - for idx, sub_partition_param in enumerate(group): - sub_partition_grad = torch.zeros(int( - self.sub_partition_sizes[group_idx]), - dtype=sub_partition_param.dtype).cuda() - sub_partition_param.grad = sub_partition_grad - - self.optimizer.step() - - # LSG: comment out for compatibility with torch.cuda.amp - # for group in self.local_sub_partitions_of_groups: - # for idx, sub_partition_param in enumerate(group): - # sub_partition_param.grad = None - - def best_max_elems_per_comm(self, num_elements, max_elements_per_comm): - # if we use max-elems-per-comm as is, how many comm intervals will there be - max_comm_intervals = math.ceil(num_elements / max_elements_per_comm) - padding_for_max_comm = (max_elements_per_comm * - max_comm_intervals) - num_elements - - # if we use 1 less comm interval how much extra comm padding would be required - min_comm_intervals = num_elements // max_elements_per_comm - if min_comm_intervals == 0: - if self.verbose: - print_rank_0( - f'Using default max_elements_per_comm {max_elements_per_comm}') - return max_elements_per_comm - - padding_for_min_comm = math.ceil( - num_elements / (self.world_size * min_comm_intervals)) - - # choose padding that uses least amount of overhead - if padding_for_max_comm > padding_for_min_comm: - new_max_elements_per_comm = padding_for_min_comm + max_elements_per_comm - if self.verbose: - print_rank_0( - 
f'Updating max_elements_per_comm from {max_elements_per_comm} -> {new_max_elements_per_comm}') - return new_max_elements_per_comm - else: - if self.verbose: - print_rank_0( - f'Using default max_elements_per_comm {max_elements_per_comm}') - return max_elements_per_comm - - def get_data_parallel_sub_partitions(self, - tensor, - max_elements_per_comm, - ): - total_num_elements = tensor.numel() - - # if total elements is less than our max, revert to splitting into dp partitions - max_elements_per_comm = min(total_num_elements, max_elements_per_comm) - sub_partition_size = int(max_elements_per_comm // self.world_size) - - # Ensure partition alignment was done correctly - num_sub_partitions = int(total_num_elements // sub_partition_size) - assert total_num_elements % sub_partition_size == 0, "{} % {} != 0".format(total_num_elements, - sub_partition_size) - - # Ensure comm interval alignment was done correctly. - num_comm_intervals = int(num_sub_partitions // self.world_size) - assert num_sub_partitions % self.world_size == 0, "{} % {} != 0".format( - num_sub_partitions, self.world_size) - - if self.verbose: - print_rank_0("**** partition info:") - print_rank_0(f"\t total_num_elements={total_num_elements}") - print_rank_0(f"\t world_size={self.world_size}") - print_rank_0(f"\t max_elements_per_comm={max_elements_per_comm}") - print_rank_0(f"\t sub_partition_size={sub_partition_size}") - print_rank_0(f"\t num_sub_partitions={num_sub_partitions}") - print_rank_0(f"\t num_comm_intervals={num_comm_intervals}") - print_rank_0("****") - - # [comm_id] -> [rank] - comm_partitions = [] - for _ in range(num_comm_intervals): - comm_partitions.append([]) - - start = 0 - comm_id = 0 - element_intervals = defaultdict( - list) # [rank] -> [(start,end), (start,end), ...] 
- for idx in range(num_sub_partitions): - rank_id = idx % self.world_size - sub_partition = tensor.narrow( - 0, start, sub_partition_size).detach() - element_intervals[rank_id].append( - (start, start + sub_partition_size)) - comm_partitions[comm_id].append(sub_partition) - start = start + sub_partition_size - if rank_id == (self.world_size - 1): - comm_id += 1 - - # [rank] -> [comm_id] - sub_partitions = [] - for _ in range(self.world_size): - sub_partitions.append([]) - for comm_id, partitions in enumerate(comm_partitions): - for rank_id, partition in enumerate(partitions): - sub_partitions[rank_id].append(partition) - - return comm_partitions, sub_partitions, element_intervals, sub_partition_size, num_comm_intervals - - def get_all_sub_partition_info(self, - tensor_list, - all_element_intervals, - ): - params_not_local = [] - - # [rank] -> [comm-id] -> [param/offset] - params_in_rank_sub_partition = [] - params_in_rank_sub_partitions_offsets = [] - - for rank in range(self.world_size): - params_in_local_sub_partition = [] - local_sub_partition_offsets = [] - comm_tensor_list = [] - comm_offset_list = [] - current_index = 0 - prev_comm_idx = 0 - for iii, tensor in enumerate(tensor_list): - tensor_size = tensor.numel() - results_list = _range_check(current_index, - all_element_intervals[rank], - tensor_size) - for contained, offset, comm_idx in results_list: - if contained: - if prev_comm_idx != comm_idx: - params_in_local_sub_partition.append( - comm_tensor_list) - comm_tensor_list = [] - local_sub_partition_offsets.append( - comm_offset_list) - comm_offset_list = [] - comm_tensor_list.append(tensor) - comm_offset_list.append(offset) - prev_comm_idx = comm_idx - elif rank == self.local_rank: - params_not_local.append(tensor) - - current_index = current_index + tensor_size - - # assert len(comm_tensor_list) > 0 - # assert len(comm_offset_list) > 0 - params_in_local_sub_partition.append(comm_tensor_list) - local_sub_partition_offsets.append(comm_offset_list) - - 
params_in_rank_sub_partition.append(params_in_local_sub_partition) - params_in_rank_sub_partitions_offsets.append( - local_sub_partition_offsets) - - return params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local - - def get_flat_sub_partitions(self, - comm_tensor_list, - comm_param_offsets, - sub_partition_size, - dtype, - default_device, - num_comm_intervals=None, - return_partition_params=False): - partition_params = [] - final_param_offsets = [] - flat_sub_partitions = [] - for tensor_list, param_offsets in zip(comm_tensor_list, comm_param_offsets): - flat_tensor_list = [] - current_size = 0 - my_offsets = [] - my_params = [] - - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - tensor.grad = torch.zeros(tensor.size(), - dtype=tensor.dtype, - device=tensor.device) - param = tensor - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - # we need to offset to get to the right element - if i == 0 and param_offsets[i] > 0: - tensor_offset = param_offsets[i] - num_elements = num_elements - tensor_offset - - # We don't need all elements of the tensor if this tensor is - # larger than we have space for in our curr sub-partition - if num_elements > (sub_partition_size - current_size): - num_elements = sub_partition_size - current_size - - # we need a narrow view of the tensor based on the tensor offset and number of elements that - # we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements)).to(dtype)) - else: - flat_tensor_list.append(tensor.to(dtype)) - my_params.append(param) - - # remember offset into partition and #elems for this tensor - my_offsets.append((current_size, num_elements)) - - current_size = current_size + num_elements - - # this means its the last partition and does not align with the dp boundary. 
We need to pad before flattening - if current_size < sub_partition_size: - my_offsets.append((None, None)) - my_params.append(None) - if len(tensor_list) == 0: - assert default_device != None - flat_tensor_list.append( - torch.zeros(int(sub_partition_size - current_size), - dtype=dtype, - device=default_device)) - else: - flat_tensor_list.append( - torch.zeros(int(sub_partition_size - current_size), - dtype=dtype, - device=tensor_list[0].device)) - partition_params.append(my_params) # flat_tensor_list) - final_param_offsets.append(my_offsets) - assert len(flat_tensor_list) == len(my_offsets), "{} {}".format( - len(flat_tensor_list), len(my_offsets)) - flat_sub_partitions.append(self.flatten(flat_tensor_list)) - if num_comm_intervals is not None and len( - flat_sub_partitions) < num_comm_intervals: - # print("padding w. sub partitions to ensure uniform communication") - device = flat_sub_partitions[0].device - for _ in range(num_comm_intervals - len(flat_sub_partitions)): - flat_sub_partitions.append( - torch.zeros(int(sub_partition_size), - dtype=dtype, - device=device)) - partition_params.append([None]) - final_param_offsets.append([(None, None)]) - - if return_partition_params: - assert len(flat_sub_partitions) == len(partition_params) - assert len(partition_params) == len(final_param_offsets), "{} {}".format(len(partition_params), - len(final_param_offsets)) - return flat_sub_partitions, partition_params, final_param_offsets - return flat_sub_partitions - - def zero_grad(self, set_grads_to_None=False): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. 
- # For speed, set model fp16 grad to None by default - for group in self._param_groups: - for p in group: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def free_grad_in_param_list(self, param_list): - for p in param_list: - if isinstance(p, list): - for _p in p: - _p.grad = None - else: - p.grad = None - - def flatten_dense_tensors_sub_partition_aligned(self, - tensor_list, - max_elements_per_comm - ): - assert max_elements_per_comm >= self.world_size, f"max_elements_per_comm {max_elements_per_comm} < dp {self.world_size}" - - num_elements = sum(t.numel() for t in tensor_list) - - # Compute aligned partition size based on parameter count - aligned_param_partition_size = math.ceil( - num_elements / self.world_size) - - # Compute aligned partition size based on communication size - aligned_comm_partition_size = int( - max_elements_per_comm // self.world_size) - - if aligned_param_partition_size <= aligned_comm_partition_size: - sub_partition_count = 1 - sub_partition_size = aligned_param_partition_size - else: - sub_partition_count = math.ceil(aligned_param_partition_size / - aligned_comm_partition_size) - sub_partition_size = aligned_comm_partition_size - - # Compute required padding for alignment to dp and max_elements_per_comm - padding = (sub_partition_count * sub_partition_size * - self.world_size) - num_elements - - if self.verbose: - print_rank_0( - f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}") - print_rank_0( - f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}") - - if padding == 0: - aligned_tensor_list = tensor_list - else: - pad_tensor = torch.zeros(padding, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - aligned_tensor_list = tensor_list + [pad_tensor] - - flat_tensors = self.flatten(aligned_tensor_list) - return flat_tensors - - # def reduce_gradients(self): - # # LSG: 
this reduce gradients method no longer works - # # after code change, please use DataParallelGradientHandler instead - # - # world_size = gpc.get_world_size(self.parallel_mode) - # local_rank = gpc.get_local_rank(self.parallel_mode) - # - # for i, group in enumerate(self._param_groups): - # num_comm_intervals = self.num_comm_intervals_per_group[i] - # all_sub_partitions = [] - # for rank in range(world_size): - # # gsp is list of partitions indexed by comm_idx - # grad_sub_partitions = self.get_flat_sub_partitions( - # comm_tensor_list=self.params_in_rank_sub_partitions[i][rank], - # comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i][rank], - # dtype=self.local_sub_partitions_of_groups[i][0].dtype, - # default_device=self.default_device, - # sub_partition_size=self.sub_partition_sizes[i], - # num_comm_intervals=self.num_comm_intervals_per_group[i]) - # all_sub_partitions.append(grad_sub_partitions) - # - # assert len(grad_sub_partitions) == num_comm_intervals - # - # local_comm_partitions = [] - # for comm_idx in range(num_comm_intervals): - # single_comm_all_partitions = [] - # for rank in range(world_size): - # single_comm_all_partitions.append(all_sub_partitions[rank][comm_idx]) - # - # for partition in single_comm_all_partitions: - # partition.div_(world_size) - # - # dist.reduce_scatter(output=single_comm_all_partitions[local_rank], - # input_list=single_comm_all_partitions, - # group=gpc.get_group(self.parallel_mode)) - - def step(self, closure=None): - local_sub_partitions_grad_groups = [] - - for i, group in enumerate(self._param_groups): - # RS: update free grads w.r.t. 
sub partitions - # free gradients for all the parameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_local[i]) - - # create flat gradient partitions for parameters updated by this process - local_grad_sub_partitions = self.get_flat_sub_partitions( - comm_tensor_list=self.params_in_rank_sub_partitions[i][self.local_rank], - comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i][self.local_rank], - sub_partition_size=self.sub_partition_sizes[i], - dtype=self.local_sub_partitions_of_groups[i][0].dtype, - num_comm_intervals=self.num_comm_intervals_per_group[i], - default_device=self.default_device) - - # RS: update all our local params with sub-partition grads - for idx, sub_partition_param in enumerate(self.local_sub_partitions_of_groups[i]): - sub_partition_param.grad = local_grad_sub_partitions[idx] - - # RS: update free grads for sub-partitions - # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list( - self.params_in_rank_sub_partitions[i][self.local_rank]) - - local_sub_partitions_grad_groups.append(local_grad_sub_partitions) - - if closure is None: - loss = self.optimizer.step() - else: - loss = self.optimizer.step(closure=closure) - - # RS: clear our sub partition grads - # LSG: not needed as amp is used instead - # get rid of the fp32 gradients. Not needed anymore - # for group in self.local_sub_partitions_of_groups: - # for idx, sub_partition_param in enumerate(group): - # sub_partition_param.grad = None - - # RS: all_gather/broadcast sub-partitions in separate comm calls - # gather the updated weights from everyone - for all_sub_partitions in self.parallel_comm_sub_partitioned_groups: - for comm_id, sub_partitions in enumerate(all_sub_partitions): - dist.all_gather(sub_partitions, - sub_partitions[self.local_rank], - group=gpc.get_group(self.dp_parallel_mode)) - - # TODO: we probably don't need this? 
just to be safe - for i in range(len(self._param_groups)): - updated_params = self.unflatten(self._param_groups_flat[i], - self._param_groups[i]) - for p, q in zip(self._param_groups[i], updated_params): - p.data = q.data - - return loss - - def _rigid_state_dict(self): - """Returns a dict that can be loaded for continued training with same DP degree - - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. - - Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - for k, v in self.optimizer.state_dict().items(): - state_dict[k] = v - state_dict[ - 'local_sub_partitions_of_groups'] = self.local_sub_partitions_of_groups - return state_dict - - def state_dict(self): - """ - Returns a dict containing the current state of this Optimizer instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. - - Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - return self._rigid_state_dict() - - def load_state_dict(self, - state_dict, - load_optimizer_states=True, - ): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... 
- checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - self._rigid_load_state_dict( - state_dict, - load_optimizer_states) - - def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): - # I think it should actually be ok to reload the optimizer before the model. - state_dict_ = state_dict.copy() - local_sub_partitions_of_groups = state_dict_.pop( - 'local_sub_partitions_of_groups') - - if load_optimizer_states: - self.optimizer.load_state_dict(state_dict_) - - for curr_group, saved_group in zip(self.local_sub_partitions_of_groups, - local_sub_partitions_of_groups): - for curr_param, saved_param in zip(curr_group, saved_group): - curr_param.data.copy_(saved_param.data) diff --git a/colossalai/registry/__init__.py b/colossalai/registry/__init__.py index 9f270f049611..492b278a40f2 100644 --- a/colossalai/registry/__init__.py +++ b/colossalai/registry/__init__.py @@ -2,6 +2,7 @@ import torch.nn as nn import torch.optim as optim import torchvision.models as tv_models +import torchvision.datasets as tv_datasets from torchvision import transforms from .registry import Registry @@ -10,14 +11,12 @@ LOSSES = Registry('losses') MODELS = Registry('models', third_party_library=[tv_models]) OPTIMIZERS = Registry('optimizers', third_party_library=[optim, dist_optim]) -OPTIMIZER_WRAPPERS = Registry('optimizer_wrappers') -DATASETS = Registry('datasets') +DATASETS = Registry('datasets', third_party_library=[tv_datasets]) DIST_GROUP_INITIALIZER = Registry('dist_group_initializer') GRADIENT_HANDLER = Registry('gradient_handler') LOSSES = Registry('losses', third_party_library=[nn]) HOOKS = Registry('hooks') TRANSFORMS = Registry('transforms', third_party_library=[transforms]) -PIPE_ALLOC_POLICY = Registry('pipeline_allocation_policy') -SAMPLERS = Registry('samplers') +DATA_SAMPLERS = Registry('data_samplers') LR_SCHEDULERS = Registry('lr_schedulers') SCHEDULE = 
Registry('schedules') diff --git a/colossalai/trainer/__init__.py b/colossalai/trainer/__init__.py index 57f7b7495325..84e53dc4e87a 100644 --- a/colossalai/trainer/__init__.py +++ b/colossalai/trainer/__init__.py @@ -1,5 +1,3 @@ from ._trainer import Trainer -from .hooks import * -from .metric import Loss, Accuracy2D, Accuracy3D, Accuracy2p5D, LearningRate -__all__ = ['Trainer', 'Loss', 'Accuracy3D', 'Accuracy2D', 'Accuracy2p5D', 'LearningRate'] +__all__ = ['Trainer'] diff --git a/colossalai/trainer/_trainer.py b/colossalai/trainer/_trainer.py index 92ef64393305..6cce0a3e4f93 100644 --- a/colossalai/trainer/_trainer.py +++ b/colossalai/trainer/_trainer.py @@ -2,19 +2,21 @@ # -*- encoding: utf-8 -*- from typing import Union, List +from colossalai import engine +from colossalai.context.parallel_mode import ParallelMode import torch from torch import Tensor from torch.utils.data import DataLoader from tqdm import tqdm -from colossalai.builder import build_hooks from colossalai.core import global_context as gpc from colossalai.engine import Engine -from colossalai.logging import get_global_dist_logger -from colossalai.nn.data import DataParallelSampler +from colossalai.engine.schedule import NonPipelineSchedule, BaseSchedule +from colossalai.logging import DistributedLogger from colossalai.utils import MultiTimer from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage +from .hooks import BaseHook class Trainer: @@ -32,8 +34,9 @@ class Trainer: def __init__(self, engine: Engine, - verbose: bool = False, - timer: MultiTimer = None): + schedule: BaseSchedule = None, + timer: MultiTimer = None, + logger: DistributedLogger = None): # training-ralated params self._engine = engine self._max_epochs = 0 @@ -43,8 +46,8 @@ def __init__(self, self._steps_per_epoch = 0 # misc params - self._logger = get_global_dist_logger() - self._verbose = verbose + self._logger = logger + self._verbose = logger is not None # hooks can store states in this dict, and could 
be consumed by other hooks self.states = dict() @@ -55,6 +58,15 @@ def __init__(self, # multi-timer for time benchmarking self._timer = timer + # set schedule which specifies the training iteration for the engine + if schedule is None: + schedule = NonPipelineSchedule() + if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: + assert not isinstance(schedule, NonPipelineSchedule), \ + 'NonPipelineSchedule cannot be used for pipeline parallel training, please use PipelineSchedule instead.' + self._schedule = schedule + self._schedule.pre_processing(engine) + @property def cur_epoch(self): """Returns the index of the current epoch. @@ -90,9 +102,9 @@ def steps_per_epoch(self): def engine(self): return self._engine - @engine.setter - def engine(self, engine_: Engine): - self._engine = engine_ + @property + def schedule(self): + return self._schedule def _set_current_step(self, epoch: int): """Sets current step number. @@ -130,9 +142,9 @@ def _call_hooks(self, func, output=None): # Only after iter hook will receive output for hook in self.hooks: if output is None: - getattr(hook, func)() + getattr(hook, func)(self) else: - getattr(hook, func)(*output) + getattr(hook, func)(self, *output) @staticmethod def _should_display_progress(display_progress: bool): @@ -144,12 +156,6 @@ def _train_epoch(self, train_dataloader: DataLoader, epoch: int = None, display_progress: bool = False): - # set sampler epoch - if epoch is not None and \ - hasattr(train_dataloader, 'sampler') and \ - isinstance(train_dataloader.sampler, DataParallelSampler): - train_dataloader.sampler.set_epoch(epoch) - # set training state self._engine.train() data_iter = iter(train_dataloader) @@ -160,52 +166,22 @@ def _train_epoch(self, else: progress = tqdm(progress, desc=f'[Epoch {epoch} train]') - # train 1 epoch - # ### metric measured by zbian - # train_loss = 0 - # batch_cnt = 0 - # num_samples = 0 - # ###### self._call_hooks('before_train_epoch') 
self._call_timer(action='start', item='train-epoch') for i in progress: self._call_hooks('before_train_iter') self._call_timer(action='start', item='train-step') - # ### metric measured by zbian - # cur_lr = self._engine.optimizer.param_groups[0]['lr'] - # ###### - - if i == self._steps_per_epoch - 1: - is_last_iteration = True - else: - is_last_iteration = False - # run 1 training step - logits, label, loss = self._engine.step(data_iter, is_last_iteration) + self.engine.zero_grad() + logits, label, loss = self.schedule.forward_backward_step( + self.engine, data_iter, forward_only=False, return_loss=True) + self.engine.step() self._call_timer(action='stop', item='train-step', keep_in_history=True) self._call_hooks('after_train_iter', output=(logits, label, loss)) self._cur_step += 1 - # ### metric measured by zbian - # if display_progress: - # if isinstance(label, (tuple, list)): - # batch_size = label[0].size(0) - # else: - # batch_size = label.size(0) - # batch_size *= self._engine._grad_accum_size * gpc.data_parallel_size - # train_loss += loss.item() - # num_samples += batch_size - # batch_cnt += 1 - # batch_time = self._timer.get_timer('train-step').get_elapsed_time() - # print_features = dict(lr='%g' % cur_lr, - # loss='%.3f' % (train_loss / (i + 1)), - # throughput='%.3f (samples/sec)' % - # (batch_size / (batch_time + 1e-12))) - # progress.set_postfix(**print_features) - # ###### - # stop when max iter is reached if self._exceed_max_step(): break @@ -213,16 +189,6 @@ def _train_epoch(self, self._call_timer(action='stop', item='train-epoch', keep_in_history=True) self._call_hooks('after_train_epoch') self._call_timer(action='reset', item='train-step') - # ### metric measured by zbian - # if display_progress: - # epoch_time = self._timer.get_timer('train-epoch').get_elapsed_time() - # epoch_loss = train_loss / batch_cnt - # epoch_throughput = num_samples / (epoch_time + 1e-12) - # if display_progress: - # self._logger.info( - # '[Epoch %d] Loss: %.3f | 
Throughput: %.3f (samples/sec)' % - # (epoch, epoch_loss, epoch_throughput)) - # ###### def _eval(self, test_dataloader: DataLoader, @@ -235,32 +201,33 @@ def _eval(self, num_steps = len(test_dataloader) self._call_hooks('before_test') + # prepare progress bar + progress = range(num_steps) + if display_progress: + desc = 'Evaluation' + if epoch is not None: + desc = '[Epoch %d val]' % epoch + progress = tqdm(progress, desc=desc) + + self._call_hooks('before_test_epoch') + self._call_timer(action='start', item='test-epoch') with torch.no_grad(): - # prepare progress bar - progress = range(num_steps) - if display_progress: - desc = 'Evaluation' - if epoch is not None: - desc = '[Epoch %d val]' % epoch - progress = tqdm(progress, desc=desc) - - self._call_hooks('before_test_epoch') - self._call_timer(action='start', item='test-epoch') for _ in progress: self._call_hooks('before_test_iter') self._call_timer(action='start', item='test-step') - logits, label, loss = self._engine.step(data_iter, return_loss=True) + logits, label, loss = self.schedule.forward_backward_step( + self.engine, data_iter, forward_only=True, return_loss=True) self._call_timer(action='stop', item='test-step', keep_in_history=True) self._call_hooks('after_test_iter', output=(logits, label, loss)) - self._call_timer(action='stop', item='test-epoch', keep_in_history=True) - self._call_hooks('after_test_epoch') + self._call_timer(action='stop', item='test-epoch', keep_in_history=True) + self._call_hooks('after_test_epoch') self._call_hooks('after_test') self._call_timer(action='reset', item='test-step') self._call_timer(action='reset', item='test-epoch') def _exceed_max_step(self): - return self._max_steps is not None and self._cur_step > self._max_steps + return self._max_steps is not None and self._cur_step >= self._max_steps def fit(self, train_dataloader: DataLoader, @@ -268,7 +235,7 @@ def fit(self, max_steps: int = None, test_dataloader: DataLoader = None, test_interval: int = 1, - hooks_cfg: 
dict = None, + hooks: List[BaseHook] = None, display_progress: bool = False, ): """Trains the model to fit training data. @@ -291,7 +258,7 @@ def fit(self, """ # set epochs and steps, consider gradient accumulation - self._steps_per_epoch = len(train_dataloader) // self._engine.gradient_accumulation + self._steps_per_epoch = len(train_dataloader) self._max_steps = max_steps self._max_epochs = epochs @@ -304,19 +271,18 @@ def fit(self, # reset hooks self._reset_states() - self.hooks = list() - - # build hooks - if hooks_cfg is not None: - for cfg in hooks_cfg: - hook = build_hooks(cfg, self) - self.hooks.append(hook) + if hooks is not None: + assert isinstance(hooks, list), f'expected argument hooks be to list, but got {type(hooks)}' + else: + hooks = [] + self.hooks = hooks self.hooks.sort(key=lambda hook: hook.priority) if self._verbose: for hook in self.hooks: self._logger.info( - f'build {hook.__class__.__name__} for training, priority = {hook.priority}', ranks=[0]) - self._logger.info("Lower value means higher priority for calling hook function") + f'Using {hook.__class__.__name__} for training, priority = {hook.priority}', ranks=[0]) + self._logger.info("Lower value means higher priority for calling hook function", ranks=[0]) + self._call_hooks('after_hook_is_attached') # start train self._engine.train() @@ -347,13 +313,15 @@ def fit(self, # check for termination if self._exceed_max_step(): self._logger.info( - f"Max number of steps {max_steps} has been reached, training is stopped automatically") + f"Max number of steps {max_steps} has been reached, training is stopped automatically", + ranks=[0]) break self._call_hooks('after_train') self._call_timer('reset', 'train-epoch') def evaluate(self, test_dataloader: DataLoader, + hooks: List[BaseHook] = None, display_progress: bool = False): """Evaluates the model with testing data. 
@@ -365,6 +333,21 @@ def evaluate(self, # set display display_progress = self._should_display_progress(display_progress) + # reset hooks + self._reset_states() + if hooks is not None: + assert isinstance(hooks, list), f'expected argument hooks to be list, but got {type(hooks)}' + else: + hooks = [] + self.hooks = hooks + self.hooks.sort(key=lambda hook: hook.priority) + if self._verbose: + for hook in self.hooks: + self._logger.info( + f'Using {hook.__class__.__name__} for evaluation, priority = {hook.priority}', ranks=[0]) + self._logger.info("Lower value means higher priority for calling hook function", ranks=[0]) + self._call_hooks('after_hook_is_attached') + # eval self._eval(test_dataloader=test_dataloader, display_progress=display_progress, @@ -389,5 +372,6 @@ def predict(self, data: Union[Tensor, List[Tensor]]): # for compatibility with schedule simple_dataloader = [(data, None)] data_iter = iter(simple_dataloader) - output, _, _ = self._engine.step(data_iter, return_loss=False) - return output \ No newline at end of file + output, _, _ = self.schedule.forward_backward_step( + self.engine, data_iter, forward_only=True, return_loss=False) + return output diff --git a/colossalai/trainer/hooks/_base_hook.py b/colossalai/trainer/hooks/_base_hook.py index 4d510ab0f67a..e4b5edfbfa72 100644 --- a/colossalai/trainer/hooks/_base_hook.py +++ b/colossalai/trainer/hooks/_base_hook.py @@ -5,9 +5,6 @@ from torch import Tensor -from colossalai.logging import get_global_dist_logger -from .._trainer import Trainer - class BaseHook(ABC): """This class allows users to add desired actions in specific time points @@ -18,27 +15,31 @@ class BaseHook(ABC): :type trainer: Trainer :type priority: int """ - def __init__(self, trainer: Trainer, priority: int) -> None: - self.trainer = trainer + + def __init__(self, priority: int) -> None: self.priority = priority - self.logger = get_global_dist_logger() - def before_train(self): + def after_hook_is_attached(self, trainer): """Actions
after hooks are attached to trainer. + """ + pass + + def before_train(self, trainer): """Actions before training. """ pass - def after_train(self): + def after_train(self, trainer): """Actions after training. """ pass - def before_train_iter(self): + def before_train_iter(self, trainer): """Actions before running a training iteration. """ pass - def after_train_iter(self, output: Tensor, label: Tensor, loss: Tensor): + def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): """Actions after running a training iteration. :param output: Output of the model @@ -50,42 +51,42 @@ def after_train_iter(self, output: Tensor, label: Tensor, loss: Tensor): """ pass - def before_train_epoch(self): + def before_train_epoch(self, trainer): """Actions before starting a training epoch. """ pass - def after_train_epoch(self): + def after_train_epoch(self, trainer): """Actions after finishing a training epoch. """ pass - def before_test(self): + def before_test(self, trainer): """Actions before evaluation. """ pass - def after_test(self): + def after_test(self, trainer): """Actions after evaluation. """ pass - def before_test_epoch(self): + def before_test_epoch(self, trainer): """Actions before starting a testing epoch. """ pass - def after_test_epoch(self): + def after_test_epoch(self, trainer): """Actions after finishing a testing epoch. """ pass - def before_test_iter(self): + def before_test_iter(self, trainer): """Actions before running a testing iteration. """ pass - def after_test_iter(self, output: Tensor, label: Tensor, loss: Tensor): + def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): """Actions after running a testing iteration. :param output: Output of the model @@ -97,11 +98,11 @@ def after_test_iter(self, output: Tensor, label: Tensor, loss: Tensor): """ pass - def init_runner_states(self, key, val): + def init_runner_states(self, trainer, key, val): """Initializes trainer's state. 
:param key: Key of reseting state :param val: Value of reseting state """ - if key not in self.trainer.states: - self.trainer.states[key] = val + if key not in trainer.states: + trainer.states[key] = val diff --git a/colossalai/trainer/hooks/_checkpoint_hook.py b/colossalai/trainer/hooks/_checkpoint_hook.py index 0f53f79c9e5e..939e957bdb49 100644 --- a/colossalai/trainer/hooks/_checkpoint_hook.py +++ b/colossalai/trainer/hooks/_checkpoint_hook.py @@ -2,9 +2,9 @@ # -*- encoding: utf-8 -*- import os.path as osp +from colossalai.logging import get_dist_logger from colossalai.registry import HOOKS -from colossalai.trainer import Trainer from colossalai.trainer.hooks import BaseHook from colossalai.utils import is_dp_rank_0 from colossalai.utils.checkpointing import get_latest_checkpoint_path, get_checkpoint_path @@ -16,12 +16,10 @@ class SaveCheckpointHook(BaseHook): """Saves the model by interval in training process. - :param trainer: Trainer attached with current hook :param interval: Saving interval :param checkpoint_dir: Directory of saving checkpoint :param suffix: Saving suffix of the file :param priority: Priority in the printing, hooks with small priority will be printed in front - :type trainer: Trainer :type interval: int, optional :type checkpoint_dir: int, optional :type suffix: str, optional @@ -29,59 +27,55 @@ class SaveCheckpointHook(BaseHook): """ def __init__(self, - trainer: Trainer, interval: int = 1, checkpoint_dir: str = None, suffix: str = '', priority: int = 10): - super().__init__(trainer=trainer, priority=priority) - assert isinstance(trainer, Trainer), \ - f'SaveCheckpointHook expects a Trainer, got {type(trainer)}' + super().__init__(priority=priority) self.interval = interval self.checkpoint_dir = checkpoint_dir self.suffix = suffix + self.logger = get_dist_logger() # get lr scheduler from the LRSchedulerHook before train self._lr_scheduler = None - def before_train(self): + def after_hook_is_attached(self, trainer): # check if lr scheduler 
is present in LRSchedulerHook - for hook in self.trainer.hooks: + for hook in trainer.hooks: if isinstance(hook, LRSchedulerHook): self._lr_scheduler = hook.lr_scheduler break - def after_train_epoch(self): + def after_train_epoch(self, trainer): """Saves the model after a training epoch. """ # save by interval - if self.trainer.cur_epoch % self.interval == 0: + if trainer.cur_epoch % self.interval == 0: # only gpus with data parallel rank equals to 0 write to the disk if is_dp_rank_0(): save_path = get_checkpoint_path(self.checkpoint_dir, - self.trainer.cur_epoch, + trainer.cur_epoch, suffix=self.suffix) save_checkpoint(save_path, - self.trainer.cur_epoch, - self.trainer.engine.model, - self.trainer.engine.optimizer, + trainer.cur_epoch, + trainer.engine.model, + trainer.engine.optimizer, self._lr_scheduler) self.logger.info( - f'checkpoint for epoch {self.trainer.cur_epoch} is saved to {self.checkpoint_dir}', ranks=[0]) + f'checkpoint for epoch {trainer.cur_epoch} is saved to {self.checkpoint_dir}', ranks=[0]) @HOOKS.register_module class LoadCheckpointHook(BaseHook): """Loads the model before training process. 
- :param trainer: Trainer attached with current hook :param checkpoint_dir: Directory of saving checkpoint :param epoch: Epoch number to be set :param finetune: Whether allows to load a part of the model :param strict: Whether loads a model that has the same shape of parameters :param priority: Priority in the printing, hooks with small priority will be printed in front - :type trainer: Trainer :type checkpoint_dir: str, optional :type epoch: str, optional :type finetune: bool, optional @@ -90,28 +84,26 @@ class LoadCheckpointHook(BaseHook): """ def __init__(self, - trainer: Trainer = None, checkpoint_dir: str = None, epoch: int = -1, finetune: bool = False, strict: bool = False, suffix: str = '', priority: int = 0) -> None: - super().__init__(trainer=trainer, priority=priority) - assert isinstance(trainer, Trainer), \ - f'LoadLatestCheckpointHook excepts a Trainer, got {type(trainer)}' + super().__init__(priority=priority) self.epoch = epoch self.checkpoint_dir = checkpoint_dir self.finetune = finetune self.suffix = suffix self.strict = strict + self.logger = get_dist_logger() - def before_train(self): + def before_train(self, trainer): """Loads parameters to the model before training. 
""" # check if lr scheduler is present in LRSchedulerHook lr_scheduler = None - for hook in self.trainer.hooks: + for hook in trainer.hooks: if isinstance(hook, LRSchedulerHook): lr_scheduler = hook.lr_scheduler break @@ -124,15 +116,15 @@ def before_train(self): if osp.exists(path): last_epoch, _ = load_checkpoint(path, - self.trainer.engine.model, - self.trainer.engine.optimizer, + trainer.engine.model, + trainer.engine.optimizer, lr_scheduler, finetune=self.finetune, strict=self.strict) if self.finetune: - self.trainer.cur_epoch = 0 + trainer.cur_epoch = 0 else: - self.trainer.cur_epoch = last_epoch + trainer.cur_epoch = last_epoch self.logger.info( f'loaded checkpoint from {path}', ranks=[0]) diff --git a/colossalai/trainer/hooks/_log_hook.py b/colossalai/trainer/hooks/_log_hook.py index d4d84dff76f2..8693dd515d9d 100644 --- a/colossalai/trainer/hooks/_log_hook.py +++ b/colossalai/trainer/hooks/_log_hook.py @@ -6,35 +6,40 @@ import torch from torch.utils.tensorboard import SummaryWriter - +from typing import List +from decimal import Decimal from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.registry import HOOKS -from colossalai.trainer._trainer import Trainer -from colossalai.utils import get_global_multitimer, set_global_multitimer_status, report_memory_usage, is_dp_rank_0, \ - is_tp_rank_0, is_no_pp_or_last_stage +from colossalai.logging import DistributedLogger +from colossalai.utils import report_memory_usage, is_dp_rank_0, \ + is_tp_rank_0, is_no_pp_or_last_stage, MultiTimer from ._base_hook import BaseHook def _format_number(val): if isinstance(val, float): - return f'{val:.5f}' - elif torch.is_floating_point(val): - return f'{val.item():.5f}' + return f'{val:.5g}' + elif torch.is_tensor(val) and torch.is_floating_point(val): + return f'{val.item():.5g}' return val -class EpochIntervalHook(BaseHook): - def __init__(self, trainer: Trainer, interval: int = 1, priority: int = 1): - 
super().__init__(trainer, priority) +class LogByEpochHook(BaseHook): + def __init__(self, + logger, + interval: int = 1, + priority: int = 1): + super().__init__(priority) + self.logger = logger self._interval = interval - def _is_epoch_to_log(self): - return self.trainer.cur_epoch % self._interval == 0 + def _is_epoch_to_log(self, trainer): + return trainer.cur_epoch % self._interval == 0 @HOOKS.register_module -class LogMetricByEpochHook(EpochIntervalHook): +class LogMetricByEpochHook(LogByEpochHook): """Specialized Hook to record the metric to log. :param trainer: Trainer attached with current hook @@ -45,32 +50,35 @@ class LogMetricByEpochHook(EpochIntervalHook): :type priority: int, optional """ - def __init__(self, trainer: Trainer, interval: int = 1, priority: int = 10) -> None: - super().__init__(trainer=trainer, interval=interval, priority=priority) + def __init__(self, + logger, + interval: int = 1, + priority: int = 10) -> None: + super().__init__(logger, interval, priority) self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() and is_no_pp_or_last_stage() - def _get_str(self, mode): + def _get_str(self, trainer, mode): msg = [] - for metric_name, metric_calculator in self.trainer.states['metrics'][mode].items(): + for metric_name, metric_calculator in trainer.states['metrics'][mode].items(): msg.append( f'{metric_name} = {_format_number(metric_calculator.get_accumulated_value())}') msg = ', '.join(msg) return msg - def after_train_epoch(self): - if self._is_epoch_to_log(): - msg = self._get_str(mode='train') + def after_train_epoch(self, trainer): + if self._is_epoch_to_log(trainer): + msg = self._get_str(trainer=trainer, mode='train') if self._is_rank_to_log: self.logger.info( - f'Training - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}: {msg}') + f'Training - Epoch {trainer.cur_epoch} - {self.__class__.__name__}: {msg}') - def after_test_epoch(self): - if self._is_epoch_to_log(): - msg = self._get_str(mode='test') + def 
after_test_epoch(self, trainer): + if self._is_epoch_to_log(trainer): + msg = self._get_str(trainer=trainer, mode='test') if self._is_rank_to_log: self.logger.info( - f'Testing - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}: {msg}') + f'Testing - Epoch {trainer.cur_epoch} - {self.__class__.__name__}: {msg}') @@ -86,74 +94,79 @@ class TensorboardHook(BaseHook): """ def __init__(self, - trainer: Trainer, log_dir: str, - dp_rank_0_only: bool = True, - tp_rank_0_only: bool = True, + ranks: List = None, + parallel_mode: ParallelMode = ParallelMode.GLOBAL, priority: int = 10, ) -> None: - super().__init__(trainer=trainer, priority=priority) + super().__init__(priority=priority) # create log dir if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0: os.makedirs(log_dir, exist_ok=True) # determine the ranks to generate tensorboard logs - self._is_valid_rank_to_log = is_no_pp_or_last_stage() + self._is_valid_rank_to_log = False + if not gpc.is_initialized(parallel_mode): + self._is_valid_rank_to_log = True + else: + local_rank = gpc.get_local_rank(parallel_mode) - if dp_rank_0_only: - self._is_valid_rank_to_log = self._is_valid_rank_to_log and is_dp_rank_0() + if ranks is None or local_rank in ranks: + self._is_valid_rank_to_log = True - if tp_rank_0_only: - self._is_valid_rank_to_log = self._is_valid_rank_to_log and is_tp_rank_0() + # check for pipeline parallelism: only the last pipeline stage may write tensorboard logs + if gpc.is_initialized(ParallelMode.PIPELINE) and \ + not gpc.is_last_rank(ParallelMode.PIPELINE) and self._is_valid_rank_to_log: + raise ValueError("Tensorboard hook can only log on the last rank of pipeline process group") if self._is_valid_rank_to_log: # create workspace on only one rank - if gpc.is_initialized(ParallelMode.GLOBAL): - rank = gpc.get_global_rank() + if gpc.is_initialized(parallel_mode): + rank = gpc.get_local_rank(parallel_mode) else: rank = 0 # create workspace - log_dir = osp.join(log_dir, 
f'{parallel_mode}_rank_{rank}') os.makedirs(log_dir, exist_ok=True) self.writer = SummaryWriter( log_dir=log_dir, filename_suffix=f'_rank_{rank}') - def _log_by_iter(self, mode: str): - for metric_name, metric_calculator in self.trainer.states['metrics'][mode].items(): + def _log_by_iter(self, trainer, mode: str): + for metric_name, metric_calculator in trainer.states['metrics'][mode].items(): if metric_calculator.epoch_only: continue val = metric_calculator.get_last_step_value() if self._is_valid_rank_to_log: self.writer.add_scalar(f'{metric_name}/{mode}', val, - self.trainer.cur_step) + trainer.cur_step) - def _log_by_epoch(self, mode: str): - for metric_name, metric_calculator in self.trainer.states['metrics'][mode].items(): + def _log_by_epoch(self, trainer, mode: str): + for metric_name, metric_calculator in trainer.states['metrics'][mode].items(): if metric_calculator.epoch_only: val = metric_calculator.get_accumulated_value() if self._is_valid_rank_to_log: self.writer.add_scalar(f'{metric_name}/{mode}', val, - self.trainer.cur_step) + trainer.cur_step) - def after_test_iter(self, *args): - self._log_by_iter(mode='test') + def after_test_iter(self, trainer, *args): + self._log_by_iter(trainer, mode='test') - def after_test_epoch(self): - self._log_by_epoch(mode='test') + def after_test_epoch(self, trainer): + self._log_by_epoch(trainer, mode='test') - def after_train_iter(self, *args): - self._log_by_iter(mode='train') + def after_train_iter(self, trainer, *args): + self._log_by_iter(trainer, mode='train') - def after_train_epoch(self): - self._log_by_epoch(mode='train') + def after_train_epoch(self, trainer): + self._log_by_epoch(trainer, mode='train') @HOOKS.register_module -class LogTimingByEpochHook(EpochIntervalHook): +class LogTimingByEpochHook(LogByEpochHook): """Specialized Hook to write timing record to log. 
:param trainer: Trainer attached with current hook @@ -167,57 +180,61 @@ class LogTimingByEpochHook(EpochIntervalHook): """ def __init__(self, - trainer: Trainer, + timer: MultiTimer, + logger: DistributedLogger, interval: int = 1, priority: int = 10, log_eval: bool = True, ignore_num_train_steps: int = 0 ) -> None: - super().__init__(trainer=trainer, interval=interval, priority=priority) - set_global_multitimer_status(True) - self._global_timer = get_global_multitimer() + super().__init__(logger=logger, interval=interval, priority=priority) + self._timer = timer self._log_eval = log_eval self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() - self.ignore_num_train_steps = ignore_num_train_steps + + # extra handling to avoid the unstable readings of the first + # few training steps to affect the history mean time + self._ignore_num_train_steps = ignore_num_train_steps + self._is_train_step_history_trimmed = False def _get_message(self): msg = [] - for timer_name, timer in self._global_timer: + for timer_name, timer in self._timer: last_elapsed_time = timer.get_elapsed_time() if timer.has_history: - if timer_name == 'train-step': - timer._history = timer._history[self.ignore_num_train_steps:] + if timer_name == 'train-step' and not self._is_train_step_history_trimmed: + timer._history = timer._history[self._ignore_num_train_steps:] + self._is_train_step_history_trimmed = True history_mean = timer.get_history_mean() history_sum = timer.get_history_sum() msg.append( - f'{timer_name}: last elapsed time = {last_elapsed_time}, ' - f'history sum = {history_sum}, history mean = {history_mean}') + f'{timer_name}: last = {_format_number(last_elapsed_time)} s, mean = {_format_number(history_mean)} s') else: msg.append( - f'{timer_name}: last elapsed time = {last_elapsed_time}') + f'{timer_name}: last = {_format_number(last_elapsed_time)} s') msg = ', '.join(msg) return msg - def after_train_epoch(self): + def after_train_epoch(self, trainer): """Writes log after finishing 
a training epoch. """ - if self._is_epoch_to_log() and self._is_rank_to_log: + if self._is_epoch_to_log(trainer) and self._is_rank_to_log: msg = self._get_message() self.logger.info( - f'Training - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}: {msg}, num steps per epoch={self.trainer.steps_per_epoch}') + f'Training - Epoch {trainer.cur_epoch} - {self.__class__.__name__}: {msg}, num steps per epoch={trainer.steps_per_epoch}') - def after_test_epoch(self): + def after_test_epoch(self, trainer): """Writes log after finishing a testing epoch. """ - if self._is_epoch_to_log() and self._is_rank_to_log and self._log_eval: + if self._is_epoch_to_log(trainer) and self._is_rank_to_log and self._log_eval: msg = self._get_message() self.logger.info( - f'Testing - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}: {msg}') + f'Testing - Epoch {trainer.cur_epoch} - {self.__class__.__name__}: {msg}') @HOOKS.register_module -class LogMemoryByEpochHook(EpochIntervalHook): +class LogMemoryByEpochHook(LogByEpochHook): """Specialized Hook to write memory usage record to log. :param trainer: Trainer attached with current hook @@ -231,33 +248,34 @@ class LogMemoryByEpochHook(EpochIntervalHook): """ def __init__(self, - trainer: Trainer, + logger: DistributedLogger, interval: int = 1, priority: int = 10, - log_eval: bool = True + log_eval: bool = True, + report_cpu: bool = False ) -> None: - super().__init__(trainer=trainer, interval=interval, priority=priority) - set_global_multitimer_status(True) - self._global_timer = get_global_multitimer() + super().__init__(logger=logger, interval=interval, priority=priority) self._log_eval = log_eval self._is_rank_to_log = is_dp_rank_0() and is_tp_rank_0() - def before_train(self): + def before_train(self, trainer): """Resets before training. 
""" - if self._is_epoch_to_log() and self._is_rank_to_log: - report_memory_usage('before-train') + if self._is_epoch_to_log(trainer) and self._is_rank_to_log: + report_memory_usage('before-train', self.logger) - def after_train_epoch(self): + def after_train_epoch(self, trainer): """Writes log after finishing a training epoch. """ - if self._is_epoch_to_log() and self._is_rank_to_log: + if self._is_epoch_to_log(trainer) and self._is_rank_to_log: report_memory_usage( - f'After Train - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}') + f'After Train - Epoch {trainer.cur_epoch} - {self.__class__.__name__}', + self.logger) - def after_test(self): + def after_test(self, trainer): """Reports after testing. """ - if self._is_epoch_to_log() and self._is_rank_to_log and self._log_eval: + if self._is_epoch_to_log(trainer) and self._is_rank_to_log and self._log_eval: report_memory_usage( - f'After Test - Epoch {self.trainer.cur_epoch} - {self.__class__.__name__}') + f'After Test - Epoch {trainer.cur_epoch} - {self.__class__.__name__}', + self.logger) diff --git a/colossalai/trainer/hooks/_lr_scheduler_hook.py b/colossalai/trainer/hooks/_lr_scheduler_hook.py index ae9fcd2561cc..d5bbe75910d8 100644 --- a/colossalai/trainer/hooks/_lr_scheduler_hook.py +++ b/colossalai/trainer/hooks/_lr_scheduler_hook.py @@ -3,7 +3,6 @@ from colossalai.builder import build_lr_scheduler from colossalai.registry import HOOKS from ._metric_hook import MetricHook -from .._trainer import Trainer from ..metric import LearningRate @@ -22,50 +21,26 @@ class LRSchedulerHook(MetricHook): """ def __init__(self, - trainer: Trainer, - lr_scheduler_cfg: dict, - by_epoch: bool = True, + lr_scheduler, + by_epoch: bool, store_lr_in_state: bool = True, priority: int = 1, ): - super().__init__(trainer=trainer, priority=priority) + super().__init__(priority=priority) self.by_epoch = by_epoch + self.lr_scheduler = lr_scheduler + self.store_lr_in_state = store_lr_in_state - assert not ('warmup_epochs' in 
lr_scheduler_cfg and 'warmup_steps' in lr_scheduler_cfg), \ - 'Do not set both warmup_epochs and warmup_steps for lr_scheduler.' - warmup_steps = 0 - if by_epoch: - total_steps = trainer.max_epochs - if 'warmup_epochs' in lr_scheduler_cfg: - warmup_steps = lr_scheduler_cfg['warmup_epochs'] - elif 'warmup_steps' in lr_scheduler_cfg: - warmup_steps = lr_scheduler_cfg['warmup_steps'] - else: - total_steps = trainer.max_epochs * trainer.steps_per_epoch - if trainer.max_steps is not None: - total_steps = min(total_steps, trainer.max_steps) - if 'warmup_epochs' in lr_scheduler_cfg: - warmup_steps = lr_scheduler_cfg['warmup_epochs'] * trainer.steps_per_epoch - elif 'warmup_steps' in lr_scheduler_cfg: - warmup_steps = lr_scheduler_cfg['warmup_steps'] + def after_hook_is_attached(self, trainer): + trainer.states['metrics']['train']['lr'] = LearningRate(epoch_only=self.by_epoch, + initial_lr=self.lr_scheduler.get_last_lr()[0]) - lr_scheduler_cfg['total_steps'] = total_steps - lr_scheduler_cfg['warmup_steps'] = warmup_steps - lr_scheduler_cfg.pop('warmup_epochs', None) - - self.lr_scheduler = build_lr_scheduler( - lr_scheduler_cfg, trainer.engine.optimizer) - - if store_lr_in_state: - self.trainer.states['metrics']['train']['lr'] = LearningRate(epoch_only=by_epoch, - initial_lr=self.lr_scheduler.get_lr()[0]) - - def after_train_epoch(self): + def after_train_epoch(self, trainer): if self.by_epoch: self.lr_scheduler.step() - self.trainer.states['metrics']['train']['lr'].update(self.lr_scheduler.get_lr()[0]) + trainer.states['metrics']['train']['lr'].update(self.lr_scheduler.get_last_lr()[0]) - def after_train_iter(self, output: Tensor, label: Tensor, loss: Tensor): + def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor): if not self.by_epoch: self.lr_scheduler.step() - self.trainer.states['metrics']['train']['lr'].update(self.lr_scheduler.get_lr()[0]) + trainer.states['metrics']['train']['lr'].update(self.lr_scheduler.get_last_lr()[0]) diff --git 
a/colossalai/trainer/hooks/_metric_hook.py b/colossalai/trainer/hooks/_metric_hook.py index 50834d8ab705..aa2e22fa0825 100644 --- a/colossalai/trainer/hooks/_metric_hook.py +++ b/colossalai/trainer/hooks/_metric_hook.py @@ -5,7 +5,6 @@ from colossalai.registry import HOOKS from colossalai.utils import is_no_pp_or_last_stage from ._base_hook import BaseHook -from .._trainer import Trainer from ..metric import Loss, Accuracy1D, Accuracy2D, Accuracy, Accuracy2p5D, Accuracy3D @@ -22,16 +21,14 @@ class MetricHook(BaseHook): """ def __init__(self, - trainer: Trainer, priority: int, ): - super().__init__(trainer, priority) + super().__init__(priority) self._is_stage_to_compute = is_no_pp_or_last_stage() - self._check_metric_states_initialization() - def _check_metric_states_initialization(self): - if 'metrics' not in self.trainer.states: - self.init_runner_states('metrics', dict(train={}, test={})) + def _check_metric_states_initialization(self, trainer): + if 'metrics' not in trainer.states: + self.init_runner_states(trainer, 'metrics', dict(train={}, test={})) @HOOKS.register_module @@ -44,32 +41,35 @@ class LossHook(MetricHook): :type priority: int, optional """ - def __init__(self, trainer: Trainer, priority: int = 0): - super().__init__(trainer, priority) + def __init__(self, priority: int = 0): + super().__init__(priority) + + def after_hook_is_attached(self, trainer): + self._check_metric_states_initialization(trainer) if self._is_stage_to_compute: self.train_loss = Loss(epoch_only=False) self.test_loss = Loss(epoch_only=True) # register the metric calculator - self.trainer.states['metrics']['train'][ + trainer.states['metrics']['train'][ self.train_loss.__class__.__name__] = self.train_loss - self.trainer.states['metrics']['test'][ + trainer.states['metrics']['test'][ self.test_loss.__class__.__name__] = self.test_loss - def before_train_epoch(self): + def before_train_epoch(self, trainer): if self._is_stage_to_compute: self.train_loss.reset() - def 
after_train_iter(self, logits, label, loss): + def after_train_iter(self, trainer, logits, label, loss): if self._is_stage_to_compute: self.train_loss.update(loss) - def before_test_epoch(self): + def before_test_epoch(self, trainer): if self._is_stage_to_compute: self.test_loss.reset() - def after_test_iter(self, logits, label, loss): + def after_test_iter(self, trainer, logits, label, loss): if self._is_stage_to_compute: self.test_loss.update(loss) @@ -85,26 +85,27 @@ class Accuracy1DHook(MetricHook): :type priority: int, optional """ - def __init__(self, trainer: Trainer, priority: int = 10): - super().__init__(trainer, priority) + def __init__(self, priority: int = 10): + super().__init__(priority) + def after_hook_is_attached(self, trainer): + self._check_metric_states_initialization(trainer) if self._is_stage_to_compute: self.metric = Accuracy1D(epoch_only=True) # register the metric - self.trainer.states['metrics']['test'][ + trainer.states['metrics']['test'][ self.metric.__class__.__name__] = self.metric - def before_test(self): + def before_test(self, trainer): if self._is_stage_to_compute: self.metric.reset() - def after_test_iter(self, logits, label, *args): + def after_test_iter(self, trainer, logits, label, *args): if self._is_stage_to_compute: self.metric.update(logits, label) - @HOOKS.register_module class Accuracy2DHook(MetricHook): """Specialized hook class for :class:`Accuracy2D`. 
@@ -116,42 +117,46 @@ class Accuracy2DHook(MetricHook): :type priority: int, optional """ - def __init__(self, trainer: Trainer, priority: int = 0): - super().__init__(trainer, priority) + def __init__(self, priority: int = 0): + super().__init__(priority) + def after_hook_is_attached(self, trainer): + self._check_metric_states_initialization(trainer) if self._is_stage_to_compute: self.metric = Accuracy2D(epoch_only=True) # register the metric - self.trainer.states['metrics']['test'][ + trainer.states['metrics']['test'][ self.metric.__class__.__name__] = self.metric - def before_test(self): + def before_test(self, trainer): if self._is_stage_to_compute: self.metric.reset() - def after_test_iter(self, logits, label, *args): + def after_test_iter(self, trainer, logits, label, *args): if self._is_stage_to_compute: self.metric.update(logits, label) @HOOKS.register_module class Accuracy2p5DHook(MetricHook): - def __init__(self, trainer: Trainer, priority: int = 0): - super().__init__(trainer, priority) + def __init__(self, priority: int = 0): + super().__init__(priority) + def after_hook_is_attached(self, trainer): + self._check_metric_states_initialization(trainer) if self._is_stage_to_compute: self.metric = Accuracy2p5D(epoch_only=True) # register the metric - self.trainer.states['metrics']['test'][ + trainer.states['metrics']['test'][ self.metric.__class__.__name__] = self.metric - def before_test(self): + def before_test(self, trainer): if self._is_stage_to_compute: self.metric.reset() - def after_test_iter(self, logits, label, *args): + def after_test_iter(self, trainer, logits, label, *args): if self._is_stage_to_compute: self.metric.update(logits, label) @@ -167,26 +172,22 @@ class Accuracy3DHook(MetricHook): """ def __init__(self, - trainer: Trainer, - # input_parallel_mode: ParallelMode, - # weight_parallel_mode: ParallelMode, priority: int = 10): - super().__init__(trainer, priority) + super().__init__(priority) + def after_hook_is_attached(self, trainer): if 
self._is_stage_to_compute: self.metric = Accuracy3D(epoch_only=True) - # input_parallel_mode=input_parallel_mode, - # weight_parallel_mode=weight_parallel_mode) # register the metric - self.trainer.states['metrics']['test'][ + trainer.states['metrics']['test'][ self.metric.__class__.__name__] = self.metric - def before_test(self): + def before_test(self, trainer): if self._is_stage_to_compute: self.metric.reset() - def after_test_iter(self, logits, label, *args): + def after_test_iter(self, trainer, logits, label, *args): if self._is_stage_to_compute: self.metric.update(logits, label) @@ -201,20 +202,21 @@ class AccuracyHook(MetricHook): :type priority: int """ - def __init__(self, trainer: Trainer, priority: int = 0): - super().__init__(trainer, priority) + def __init__(self, priority: int = 0): + super().__init__(priority) + def after_hook_is_attached(self, trainer): if self._is_stage_to_compute: self.metric = Accuracy(epoch_only=True) # register the metric - self.trainer.states['metrics']['test'][ + trainer.states['metrics']['test'][ self.metric.__class__.__name__] = self.metric - def before_test(self): + def before_test(self, trainer): if self._is_stage_to_compute: self.metric.reset() - def after_test_iter(self, logits, label, *args): + def after_test_iter(self, trainer, logits, label, *args): if self._is_stage_to_compute: self.metric.update(logits, label) diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index 64aafab740e9..b155ad0a3048 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,23 +1,26 @@ from .activation_checkpoint import checkpoint -from .common import print_rank_0, sync_model_param_in_dp, is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage, is_using_ddp, is_using_pp, ConditionalContext +from .common import (print_rank_0, sync_model_param_in_dp, is_dp_rank_0, + is_tp_rank_0, is_no_pp_or_last_stage, is_using_ddp, + is_using_pp, conditional_context, is_model_parallel_parameter, + 
clip_grad_norm_fp32, count_zeros_fp32, copy_tensor_parallel_attributes, + param_is_not_tensor_parallel_duplicate) from .cuda import get_current_device, synchronize, empty_cache, set_to_cuda from .memory import report_memory_usage from .timer import MultiTimer, Timer +from .multi_tensor_apply import multi_tensor_applier +from .gradient_accumulation import accumulate_gradient +from .data_sampler import DataParallelSampler, get_dataloader -_GLOBAL_MULTI_TIMER = MultiTimer(on=False) - - -def get_global_multitimer(): - return _GLOBAL_MULTI_TIMER - - -def set_global_multitimer_status(mode: bool): - _GLOBAL_MULTI_TIMER.set_status(mode) - - -__all__ = ['checkpoint', 'print_rank_0', 'sync_model_param_in_dp', 'get_current_device', - 'synchronize', 'empty_cache', 'set_to_cuda', 'report_memory_usage', 'Timer', 'MultiTimer', - 'get_global_multitimer', 'set_global_multitimer_status', - 'is_dp_rank_0', 'is_tp_rank_0', 'is_no_pp_or_last_stage', - 'is_using_ddp', 'ConditionalContext', 'is_using_pp' +__all__ = ['checkpoint', + 'print_rank_0', 'sync_model_param_in_dp', 'is_dp_rank_0', + 'is_tp_rank_0', 'is_no_pp_or_last_stage', 'is_using_ddp', + 'is_using_pp', 'conditional_context', 'is_model_parallel_parameter', + 'clip_grad_norm_fp32', 'count_zeros_fp32', 'copy_tensor_parallel_attributes', + 'param_is_not_tensor_parallel_duplicate', + 'get_current_device', 'synchronize', 'empty_cache', 'set_to_cuda', + 'report_memory_usage', + 'Timer', 'MultiTimer', + 'multi_tensor_applier', + 'accumulate_gradient', + 'DataParallelSampler', 'get_dataloader' ] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index ce6432166328..ed4523c75c95 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -1,10 +1,23 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import torch +from torch._six import inf + +try: + import colossal_C +except: + pass + import torch.distributed as dist from contextlib import contextmanager from colossalai.context.parallel_mode import 
ParallelMode from colossalai.core import global_context as gpc +from .multi_tensor_apply import multi_tensor_applier +from colossalai.constants import IS_TENSOR_PARALLEL, TENSOR_PARALLEL_ATTRIBUTES, NUM_PARTITIONS +import torch.distributed as dist +from colossalai.context.parallel_mode import ParallelMode +from colossalai.core import global_context as gpc def print_rank_0(msg: str, logger=None): @@ -18,7 +31,6 @@ def print_rank_0(msg: str, logger=None): print(msg, flush=True) else: logger.info(msg) - # print(msg, flush=True) def sync_model_param_in_dp(model): @@ -54,9 +66,186 @@ def is_using_pp(): @contextmanager -def ConditionalContext(context_manager, enable=True): +def conditional_context(context_manager, enable=True): if enable: with context_manager: yield else: yield + + +def is_model_parallel_parameter(p): + return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL) + + +def _calc_l2_norm(grads): + norm = 0.0 + if len(grads) > 0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + norm, _ = multi_tensor_applier( + colossal_C.multi_tensor_l2norm, + dummy_overflow_buf, + [grads], + False # no per-parameter norm + ) + return norm + + +def _calc_lp(grads, norm_type): + norm = 0.0 + for grad in grads: + grad_norm = torch.norm(grad, norm_type) + norm += grad_norm ** norm_type + return norm + +# ======== Gradient Clipping ========= + + +def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters whose gradients + are in fp32. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. 
+ + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + params = [] + for param in parameters: + if param.grad is not None: + # Make sure the grads are in fp32 + assert param.grad.type() == 'torch.cuda.FloatTensor', \ + f'expected gradient to be dtype torch.cuda.FloatTensor, but got {param.grad.type()}' + params.append(param) + # Norm parameters. + max_norm = float(max_norm) + norm_type = float(norm_type) + + # Calculate norm. + if norm_type == inf: + total_norm = max(p.grad.data.abs().max() for p in params) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + ops = [] + # Take max across all model-parallel GPUs. + if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1: + ops.append(dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=gpc.get_group( + ParallelMode.TENSOR), + async_op=True)) + if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: + ops.append(dist.all_reduce(total_norm_cuda, + op=dist.ReduceOp.MAX, + group=gpc.get_group( + ParallelMode.PIPELINE), + async_op=True)) + for req in ops: + req.wait() + total_norm = total_norm_cuda[0].item() + else: + tensor_parallel_grads = [] + no_tensor_parallel_grads = [] + for p in params: + if is_model_parallel_parameter(p): + reductor = (gpc.get_world_size(ParallelMode.TENSOR) / getattr(p, NUM_PARTITIONS)) ** (1 / norm_type) + tensor_parallel_grads.append(p.grad.data / reductor) + else: + no_tensor_parallel_grads.append(p.grad.data) + if norm_type == 2.0: + tensor_parallel_norm = _calc_l2_norm( + tensor_parallel_grads) ** norm_type + no_tensor_parallel_norm = _calc_l2_norm( + no_tensor_parallel_grads) ** norm_type + else: + tensor_parallel_norm = 
_calc_lp(tensor_parallel_grads, norm_type) + no_tensor_parallel_norm = _calc_lp( + no_tensor_parallel_grads, norm_type) + # Sum across all model-parallel GPUs. + if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0: + dist.all_reduce(tensor_parallel_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.TENSOR)) + total_norm = tensor_parallel_norm + no_tensor_parallel_norm + if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1: + dist.all_reduce(total_norm, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.PIPELINE)) + total_norm = total_norm ** (1.0 / norm_type) + if isinstance(total_norm, torch.Tensor): + total_norm = total_norm.item() + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + grads = [p.grad.detach() for p in params] + dummy_overflow_buf = torch.cuda.IntTensor([0]) + multi_tensor_applier(colossal_C.multi_tensor_scale, + dummy_overflow_buf, + [grads, grads], + clip_coeff) + + return total_norm + + +def count_zeros_fp32(parameters): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + total_num_zeros = 0.0 + for param in parameters: + grad_not_none = param.grad is not None + is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) + if grad_not_none and is_not_tp_duplicate: + grad = param.grad.detach() + num_zeros = grad.numel() - torch.count_nonzero(grad) + total_num_zeros = num_zeros + total_num_zeros + + # Sum across all model-parallel GPUs. 
+ ops = [] + ops.append(dist.all_reduce(total_num_zeros, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.TENSOR), + async_op=True)) + ops.append(dist.all_reduce(total_num_zeros, + op=dist.ReduceOp.SUM, + group=gpc.get_group(ParallelMode.PIPELINE), + async_op=True)) + for req in ops: + req.wait() + total_num_zeros = total_num_zeros.item() + + return total_num_zeros + + +def copy_tensor_parallel_attributes(src_tensor, dst_tensor): + for attr in TENSOR_PARALLEL_ATTRIBUTES: + if hasattr(src_tensor, attr): + val = getattr(src_tensor, attr) + setattr(dst_tensor, attr, val) + + +def param_is_not_tensor_parallel_duplicate(param): + return (hasattr(param, IS_TENSOR_PARALLEL) and + getattr(param, IS_TENSOR_PARALLEL)) or ( + gpc.get_local_rank(ParallelMode.TENSOR) == 0) diff --git a/colossalai/utils/data_sampler/__init__.py b/colossalai/utils/data_sampler/__init__.py new file mode 100644 index 000000000000..12798a94c2d0 --- /dev/null +++ b/colossalai/utils/data_sampler/__init__.py @@ -0,0 +1,4 @@ +from .base_sampler import BaseSampler +from .data_parallel_sampler import DataParallelSampler, get_dataloader + +__all__ = ['BaseSampler', 'DataParallelSampler', 'get_dataloader'] diff --git a/colossalai/nn/data/sampler/base_sampler.py b/colossalai/utils/data_sampler/base_sampler.py similarity index 100% rename from colossalai/nn/data/sampler/base_sampler.py rename to colossalai/utils/data_sampler/base_sampler.py diff --git a/colossalai/nn/data/sampler/data_parallel_sampler.py b/colossalai/utils/data_sampler/data_parallel_sampler.py similarity index 68% rename from colossalai/nn/data/sampler/data_parallel_sampler.py rename to colossalai/utils/data_sampler/data_parallel_sampler.py index 2b3817e03a72..afd20add2d77 100644 --- a/colossalai/nn/data/sampler/data_parallel_sampler.py +++ b/colossalai/utils/data_sampler/data_parallel_sampler.py @@ -3,19 +3,21 @@ # adpated from torch.utils.data.DistributedSampler import math +import random +import numpy as np from typing import 
TypeVar, Iterator import torch -from torch.utils.data import Sampler, Dataset +from torch.utils.data import Sampler, Dataset, DataLoader from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.registry import SAMPLERS +from colossalai.registry import DATA_SAMPLERS T_co = TypeVar('T_co', covariant=True) -@SAMPLERS.register_module +@DATA_SAMPLERS.register_module class DataParallelSampler(Sampler): """A data sampler for distributed data parallelism @@ -66,6 +68,10 @@ def __iter__(self) -> Iterator[T_co]: g.manual_seed(self.seed + self.epoch) # type: ignore[arg-type] indices = torch.randperm(len(self.dataset), generator=g).tolist() + + # update for next epoch so that there is no need to call + # set_epoch manually + self.epoch += 1 else: indices = list(range(len(self.dataset))) # type: ignore[arg-type] @@ -100,3 +106,44 @@ def set_epoch(self, epoch: int) -> None: :type epoch: int """ self.epoch = epoch + + +def get_dataloader(dataset, shuffle=False, seed=1024, add_sampler=True, **kwargs): + '''Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not) + + .. 
note: when pipeline parallel is enabled, shuffle cannot be True + as it will result in mismatch between input data on the 1st + stage and label on the last stage + + :param dataset: a :class:utils.data.dataset dataset + :param seed: random worker seed, defaults to 1024 + :type seed: int, optional + :param add_sampler: add a DataParallelSampler when data parallel world size > 1, defaults to True + :type add_sampler: bool, optional + :return: a dataloader for the given dataset + :rtype: torch.utils.data.DataLoader + ''' + _kwargs = kwargs.copy() + + if add_sampler and gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1: + sampler = DataParallelSampler(dataset, shuffle=shuffle) + else: + sampler = None + + # Deterministic dataloader + def seed_worker(worker_id): + worker_seed = seed + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + random.seed(worker_seed) + + if sampler is None: + return DataLoader(dataset, + worker_init_fn=seed_worker, + shuffle=shuffle, + **_kwargs) + else: + return DataLoader(dataset, + sampler=sampler, + worker_init_fn=seed_worker, + **_kwargs) diff --git a/colossalai/utils/gradient_accumulation/__init__.py b/colossalai/utils/gradient_accumulation/__init__.py new file mode 100644 index 000000000000..342f360c1d0f --- /dev/null +++ b/colossalai/utils/gradient_accumulation/__init__.py @@ -0,0 +1,29 @@ +import torch.nn as nn +from typing import List +from colossalai.engine import BaseGradientHandler +from typing import Iterable +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler +from ._gradient_accumulation import GradAccumDataloader, GradAccumOptimizer, GradAccumLrSchedulerByStep, GradAccumGradientHandler + + +def accumulate_gradient(model: nn.Module, + optimizer: Optimizer, + dataloader: Iterable, + accumulate_size: int, + gradient_handlers: List[BaseGradientHandler] = None, + lr_scheduler: _LRScheduler = None): + optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, 
model=model) + dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size) + + if gradient_handlers is not None: + gradient_handlers = [GradAccumGradientHandler(handler, accumulate_size) for handler in gradient_handlers] + + if lr_scheduler is not None: + lr_scheduler = GradAccumLrSchedulerByStep(lr_scheduler, accumulate_size=accumulate_size) + + return optimizer, dataloader, gradient_handlers, lr_scheduler + + +__all__ = ['accumulate_gradient', 'GradAccumDataloader', 'GradAccumOptimizer', + 'GradAccumLrSchedulerByStep', 'GradAccumGradientHandler'] diff --git a/colossalai/utils/gradient_accumulation/_gradient_accumulation.py b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py new file mode 100644 index 000000000000..0aa25188a7d6 --- /dev/null +++ b/colossalai/utils/gradient_accumulation/_gradient_accumulation.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import torch.nn as nn +from torch import Tensor +from typing import Iterable, Any +from colossalai.nn.optimizer import ColossalaiOptimizer +from torch.nn.parallel.distributed import DistributedDataParallel +from torch.optim import Optimizer +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.data import DataLoader +from colossalai.utils import conditional_context +from colossalai.engine import BaseGradientHandler + + +class GradAccumOptimizer(ColossalaiOptimizer): + + def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None): + super().__init__(optim) + self.accumulate_size = accumulate_size + self.accumulate_step = 0 + + # handle pytorch ddp auto all reduce + self.model = model + self.is_torch_ddp = isinstance(self.model, DistributedDataParallel) + + def zero_grad(self, *args, **kwargs): + if self.accumulate_step == 0: + self.optim.zero_grad(*args, **kwargs) + + def step(self, *args, **kwargs): + if self.accumulate_step < self.accumulate_size: + return None + else: + self.accumulate_step = 0 + return 
self.optim.step(*args, **kwargs) + + def clip_grad_norm(self, model: nn.Module, max_norm: float): + if self.accumulate_step < self.accumulate_size: + pass + else: + self.optim.clip_grad_norm(model, max_norm) + + def backward(self, loss: Tensor): + self.accumulate_step += 1 + + if self.is_torch_ddp: + no_sync = self.accumulate_step < self.accumulate_size + with conditional_context(self.model.no_sync(), enable=no_sync): + scaled_loss = loss / self.accumulate_size + self.optim.backward(scaled_loss) + else: + scaled_loss = loss / self.accumulate_size + self.optim.backward(scaled_loss) + + def backward_by_grad(self, tensor: Tensor, grad: Tensor): + no_sync = self.is_torch_ddp and self.accumulate_step < self.accumulate_size + + if no_sync: + with self.model.no_sync(): + self.optim.backward_by_grad(tensor, grad) + else: + self.optim.backward_by_grad(tensor, grad) + + +class GradAccumDataloader(): + + def __init__(self, dataloader: Iterable, accumulate_size: int) -> None: + self.dataloader = dataloader + self.consume_remain_data = not isinstance(dataloader, DataLoader) + self.steps_per_epoch = len(dataloader) - len(dataloader) % accumulate_size + + def __getattr__(self, __name: str) -> Any: + return getattr(self.dataloader, __name) + + def __len__(self): + return self.steps_per_epoch + + def __iter__(self): + self._cur_step = 0 + self._dataiter = iter(self.dataloader) + return self + + def __next__(self) -> Any: + if self._cur_step < self.steps_per_epoch: + self._cur_step += 1 + + if self._cur_step == self.steps_per_epoch and self.consume_remain_data: + # this is to handle non standard pytorch dataloader + # such as dali dataloader + while True: + try: + _ = next(self._dataiter) + except StopIteration: + break + return next(self._dataiter) + else: + raise StopIteration + + +class GradAccumLrSchedulerByStep(_LRScheduler): + + def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None: + self.lr_scheduler = lr_scheduler + self.accumulate_size = 
accumulate_size + self.accumulate_step = 0 + + @staticmethod + def compute_effective_steps_per_epoch(dataloader: Iterable, accumulate_size: int): + return len(dataloader) // accumulate_size + + def __getattr__(self, __name: str) -> Any: + return getattr(self.lr_scheduler, __name) + + def step(self, *args, **kwargs): + self.accumulate_step += 1 + if self.accumulate_step < self.accumulate_size: + pass + else: + self.accumulate_step = 0 + self.lr_scheduler.step(*args, **kwargs) + + def get_lr(self): + return self.lr_scheduler.get_lr() + + def get_last_lr(self): + return self.lr_scheduler.get_last_lr() + + def print_lr(self, *args, **kwargs): + self.lr_scheduler.print_lr(*args, **kwargs) + + def state_dict(self) -> dict: + return self.lr_scheduler.state_dict() + + def load_state_dict(self, state_dict: dict) -> None: + self.lr_scheduler.load_state_dict(state_dict) + + +class GradAccumGradientHandler(): + + def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None: + assert isinstance(grad_handler, BaseGradientHandler), \ + f'expected grad_handler to be type BaseGradientHandler, but got {type(grad_handler)}' + self.grad_handler = grad_handler + self.accumulate_size = accumulate_size + self.accumulate_step = 0 + + def handle_gradient(self): + self.accumulate_step += 1 + if self.accumulate_step < self.accumulate_size: + pass + else: + self.accumulate_step = 0 + self.grad_handler.handle_gradient() diff --git a/colossalai/utils/memory.py b/colossalai/utils/memory.py index f60029b045c0..904ec894b605 100644 --- a/colossalai/utils/memory.py +++ b/colossalai/utils/memory.py @@ -8,27 +8,28 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger def bytes_to_GB(val, decimal=2): '''A byte-to-Gigabyte converter, defaultly using binary notation. 
- :param val: X bytes to convert + :param val: X bytes to convert :return: X' GB ''' return round(val / (1024 * 1024 * 1024), decimal) + def bytes_to_MB(val, decimal=2): '''A byte-to-Megabyte converter, defaultly using binary notation. - :param val: X bytes to convert + :param val: X bytes to convert :return: X' MB ''' return round(val / (1024 * 1024), decimal) -def report_memory_usage(message): +def report_memory_usage(message, logger=None, report_cpu=False): '''Calculate and print RAM usage (in GB) :param message: a prefix message to add in the log @@ -38,19 +39,24 @@ def report_memory_usage(message): if not gpc.is_initialized(ParallelMode.GLOBAL): raise EnvironmentError("No distributed environment is initialized") - # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports - gc.collect() - vm_stats = psutil.virtual_memory() - vm_used = bytes_to_GB(vm_stats.total - vm_stats.available) - gpu_allocated = bytes_to_MB(torch.cuda.memory_allocated()) gpu_max_allocated = bytes_to_MB(torch.cuda.max_memory_allocated()) gpu_cached = bytes_to_MB(torch.cuda.memory_reserved()) gpu_max_cached = bytes_to_MB(torch.cuda.max_memory_reserved()) - get_global_dist_logger().info( - f"{message} - GPU: allocated {gpu_allocated}MB, max allocated {gpu_max_allocated}MB, cached: {gpu_cached} MB, " - f"max cached: {gpu_max_cached}MB, CPU Virtual Memory: used = {vm_used}GB, percent = {vm_stats.percent}%") + full_log = f"{message} - GPU: allocated {gpu_allocated} MB, max allocated {gpu_max_allocated} MB, \ + cached: {gpu_cached} MB, max cached: {gpu_max_cached} MB" + + if report_cpu: + # python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports + gc.collect() + vm_stats=psutil.virtual_memory() + vm_used=bytes_to_MB(vm_stats.total - vm_stats.available) + full_log += f", CPU Virtual Memory: used = {vm_used} MB, percent = {vm_stats.percent}%" + + if logger is None: + logger = get_dist_logger() + 
logger.info(full_log) # get the peak memory to report correct data, so reset the counter for the next call if hasattr(torch.cuda, "reset_peak_memory_stats"): # pytorch 1.4+ diff --git a/colossalai/nn/multi_tensor_apply/__init__.py b/colossalai/utils/multi_tensor_apply/__init__.py similarity index 100% rename from colossalai/nn/multi_tensor_apply/__init__.py rename to colossalai/utils/multi_tensor_apply/__init__.py diff --git a/colossalai/nn/multi_tensor_apply/multi_tensor_apply.py b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py similarity index 100% rename from colossalai/nn/multi_tensor_apply/multi_tensor_apply.py rename to colossalai/utils/multi_tensor_apply/multi_tensor_apply.py diff --git a/colossalai/zero/__init__.py b/colossalai/zero/__init__.py new file mode 100644 index 000000000000..8fe3dcab9dcf --- /dev/null +++ b/colossalai/zero/__init__.py @@ -0,0 +1,28 @@ +import torch.nn as nn +from torch.optim import Optimizer +from colossalai.amp.naive_amp import NaiveAMPModel +from colossalai.utils import is_no_pp_or_last_stage + +from .zero_redundancy_optimizer_level_2 import ZeroRedundancyOptimizer_Level_2 +from .zero_redundancy_optimizer_level_3 import ZeroRedundancyOptimizer_Level_3 + + +def convert_to_zero(model: nn.Module, + optimizer: Optimizer, + level: int, + zero_config): + assert level == 2 or level == 3, 'Only ZERO Optimizer Level 2 and 3 are provided' + + if is_no_pp_or_last_stage(): + model = NaiveAMPModel(model, output_to_fp32=True) + else: + model = NaiveAMPModel(model, output_to_fp32=False) + + if level == 2: + optimizer = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, **zero_config) + else: + optimizer = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, module=model, **zero_config) + return model, optimizer + + +__all__ = ['convert_to_zero', 'ZeroRedundancyOptimizer_Level_2', 'ZeroRedundancyOptimizer_Level_3'] diff --git a/colossalai/nn/optimizer/loss_scaler.py b/colossalai/zero/loss_scaler.py similarity index 100% 
rename from colossalai/nn/optimizer/loss_scaler.py rename to colossalai/zero/loss_scaler.py diff --git a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py b/colossalai/zero/zero_redundancy_optimizer_level_2.py similarity index 99% rename from colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py rename to colossalai/zero/zero_redundancy_optimizer_level_2.py index 1a57c5876f01..f022aaa6fdca 100644 --- a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_2.py +++ b/colossalai/zero/zero_redundancy_optimizer_level_2.py @@ -21,11 +21,10 @@ from torch.optim import Optimizer from colossalai.core import global_context as gpc -from colossalai.registry import OPTIMIZER_WRAPPERS from colossalai.utils import report_memory_usage -from ._utils import is_model_parallel_parameter +from colossalai.utils.common import is_model_parallel_parameter from .loss_scaler import LossScaler, DynamicLossScaler -from ...context.parallel_mode import ParallelMode +from colossalai.context import ParallelMode # Toggle this to true to enable correctness test # with gradient partitioning and without @@ -74,7 +73,6 @@ def print_rank_msg(msg): print(f"rank {dist.get_rank()} - {msg}") -@OPTIMIZER_WRAPPERS.register_module class ZeroRedundancyOptimizer_Level_2(Optimizer): """ ZeroRedundancyOptimizer_Level_2 designed to reduce the memory footprint @@ -252,7 +250,7 @@ def __init__(self, self.nccl_start_alignment_factor = 2 assert ( - allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " + allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " self.all_reduce_print = False self.dtype = self.optimizer.param_groups[0]['params'][0].dtype @@ -760,7 +758,7 @@ def increment_value(dictionary, key): elif start_index > current_index and start_index < 
(current_index + param_size): assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index set_key_value_list(self.param_to_partition_ids[i], @@ -804,7 +802,7 @@ def get_param_id(self, param): def report_ipg_memory_usage(self, tag, param_elems): elem_count = self.elements_in_ipg_bucket + param_elems percent_of_bucket_size = ( - 100.0 * elem_count) // self.reduce_bucket_size + 100.0 * elem_count) // self.reduce_bucket_size if self.verbose: report_memory_usage( f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" @@ -1492,7 +1490,7 @@ def get_partition_info(self, tensor_list, partition_size, partition_id): params_in_partition.append(tensor) assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -1527,6 +1525,11 @@ def _model_parallel_all_reduce(self, tensor, op): op=op, group=self.model_parallel_group) + def clip_grad_norm(self, *args, **kwargs): + # dummy function to retain the same function interface + # as ColossalaiOptimizer for compatibility + pass + def get_grad_norm_direct(self, gradients, params, norm_type=2): """Clips gradient norm of an iterable of parameters. 
@@ -1800,7 +1803,7 @@ def step(self, closure=None): num_elements = shard_size assert shard_size * \ - num_shards <= partitioned_params[partition_id].numel() + num_shards <= partitioned_params[partition_id].numel() for shard_id in range(num_shards): @@ -2249,7 +2252,7 @@ def estimate_zero2_model_states_mem_needs(total_params, if cpu_offload: gpu_mem = 2 * total_params cpu_mem = total_params * \ - max(4 * total_gpus, 16) * additional_buffer_factor + max(4 * total_gpus, 16) * additional_buffer_factor else: gpu_mem = 4 * total_params + int(16 * total_params / total_gpus) cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor diff --git a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py b/colossalai/zero/zero_redundancy_optimizer_level_3.py similarity index 99% rename from colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py rename to colossalai/zero/zero_redundancy_optimizer_level_3.py index 4e54f3cd3e62..cf281d6b4e34 100644 --- a/colossalai/nn/optimizer/zero_redundancy_optimizer_level_3.py +++ b/colossalai/zero/zero_redundancy_optimizer_level_3.py @@ -28,10 +28,9 @@ from torch.optim import Optimizer from colossalai.core import global_context as gpc -from colossalai.registry import OPTIMIZER_WRAPPERS from colossalai.utils import report_memory_usage from .loss_scaler import LossScaler, DynamicLossScaler -from ...context.parallel_mode import ParallelMode +from colossalai.context import ParallelMode # Toggle this to true to enable correctness test # with gradient partitioning and without @@ -412,7 +411,7 @@ def fetch_sub_module(self, sub_module): ) params_to_fetch = [ param for _, - param in sub_module.named_parameters(recurse=False) + param in sub_module.named_parameters(recurse=False) ] # print([n for n,p in sub_module.named_parameters(recurse=False)]) @@ -422,7 +421,7 @@ def fetch_sub_module(self, sub_module): ) params_to_fetch += [ param for _, - param in sub_module.ds_external_parameters() + param in 
sub_module.ds_external_parameters() ] # for _, param in sub_module.named_parameters(recurse=False): for param in params_to_fetch: @@ -474,14 +473,14 @@ def release_sub_module(self, sub_module): ) params_to_release = [ param for _, - param in sub_module.named_parameters(recurse=False) + param in sub_module.named_parameters(recurse=False) ] if hasattr(sub_module, 'ds_external_parameters'): # print_rank_0(f"Releasing external parameters {sub_module.ds_external_parameters()}") params_to_release += [ param for _, - param in sub_module.ds_external_parameters() + param in sub_module.ds_external_parameters() ] # for _, param in sub_module.named_parameters(recurse=False): @@ -604,7 +603,6 @@ def backward(ctx, *args): INITIAL_MICRO_STEP_ID = -1 -@OPTIMIZER_WRAPPERS.register_module class ZeroRedundancyOptimizer_Level_3(Optimizer): """ ZeroRedundancyOptimizer_Level_3 designed to reduce the memory footprint @@ -718,7 +716,7 @@ def __init__(self, self.offload_optimizer_pin_memory = offload_optimizer_config[ OFFLOAD_OPTIMIZER_PIN_MEMORY] self.swap_optimizer = offload_optimizer_config[ - OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE + OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE self.offload_optimizer_fast_init = offload_optimizer_config[ OFFLOAD_OPTIMIZER_FAST_INIT] @@ -733,7 +731,7 @@ def __init__(self, self.offload_param_pin_memory = offload_param_config[ OFFLOAD_PARAM_PIN_MEMORY] self.params_in_nvme_and_cpu = offload_param_config[ - OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE + OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU] if self.verbose: print_rank_0( @@ -1360,7 +1358,7 @@ def _create_fp32_partitions(self): if self.params_in_nvme_and_cpu and tensor is None: num_swap_from_nvme_partitions += 1 swap_from_nvme_memory_usage += ( - fp32_element_size * num_elements) + fp32_element_size * num_elements) if self.offload_optimizer_fast_init: sub_group_partitions = self._get_sub_group_partitions( i) @@ 
-1380,7 +1378,7 @@ def _create_fp32_partitions(self): else: num_swap_from_cpu_partitions += 1 swap_from_cpu_memory_usage += ( - fp32_element_size * num_elements) + fp32_element_size * num_elements) swappable_fp32_tensors.append( self.fp32_partitioned_groups_flat[i]) swappable_fp16_src_tensors.append( @@ -1944,7 +1942,7 @@ def increment_value(dictionary, key): elif start_index > current_index and start_index < (current_index + param_size): assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index set_key_value_list(self.param_to_partition_ids[i], @@ -2003,7 +2001,7 @@ def get_param_id(self, param): def report_ipg_memory_usage(self, tag, param_elems): elem_count = self.elements_in_ipg_bucket + param_elems percent_of_bucket_size = ( - 100.0 * elem_count) // self.reduce_bucket_size + 100.0 * elem_count) // self.reduce_bucket_size report_memory_usage( f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}") @@ -2200,7 +2198,7 @@ def partition_previous_reduced_grads(self): if self.offload_optimizer: allocate_grads_in_partition = self.grads_in_partition is None \ - and self.gradient_accumulation_steps > 1 + and self.gradient_accumulation_steps > 1 else: allocate_grads_in_partition = self.grads_in_partition is None @@ -2308,7 +2306,7 @@ def reduce_ipg_grads(self, extra_param=None): self.partition_previous_reduced_grads() params_to_reduce = [param for i, param, - param_id in self.params_in_ipg_bucket] + param_id in self.params_in_ipg_bucket] # print(f"Params in ipg bucket {self.params_in_ipg_bucket}") # print(f"Reducing {[(debug_param2name_id_shape(param), param.grad) for param in params_to_reduce]}") # exit(0) @@ -2522,7 +2520,7 @@ def get_partition_info(self, tensor_list, partition_size, 
partition_id): params_in_partition.append(tensor) assert ( - first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -2557,6 +2555,11 @@ def _model_parallel_all_reduce(self, tensor, op): op=op, group=self.model_parallel_group) + def clip_grad_norm(self, *args, **kwargs): + # dummy function to retain the same function interface + # as ColossalaiOptimizer for compatibility + pass + def get_grad_norm_direct(self, gradients, params, norm_type=2): """Clips gradient norm of an iterable of parameters. @@ -2824,7 +2827,7 @@ def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names=set( self.optimizer_swapper.swap_out_optimizer_state( parameter=self.fp32_partitioned_groups_flat[sub_group_id], async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is - not None) + not None) self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) if self.verbose: @@ -3175,7 +3178,7 @@ def _get_lean_tensors(self, padded_flattened_tensor, group_tensors, paddings): individual_tensors = self.unflatten( padded_flattened_tensor, group_tensors) lean_lengths = [t.numel() - pad for t, - pad in zip(group_tensors, paddings)] + pad in zip(group_tensors, paddings)] lean_tensors = [t[:len] for t, len in zip(individual_tensors, lean_lengths)] # print()(f'rank {dist.get_rank()}: lean_tensors = {[t.numel() for t in lean_tensors]}') diff --git a/docs/run_demo.md b/docs/run_demo.md index 6d8c5b49a192..2b0c4bdf3444 100644 --- a/docs/run_demo.md +++ b/docs/run_demo.md @@ -32,13 +32,13 @@ realizes the training process. 
```python import colossalai from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger from colossalai.trainer import Trainer def run_trainer(): engine, train_dataloader, test_dataloader = colossalai.initialize() - logger = get_global_dist_logger() + logger = get_dist_logger() logger.info("engine is built", ranks=[0]) diff --git a/docs/run_demo_zh.md b/docs/run_demo_zh.md index 54839760d430..5eadef6f2c70 100644 --- a/docs/run_demo_zh.md +++ b/docs/run_demo_zh.md @@ -24,13 +24,13 @@ HOST=xxx.xxx.xxx.xxx srun ./scripts/slurm_dist_train.sh ./examples/run_trainer.p ```python import colossalai from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger from colossalai.trainer import Trainer def run_trainer(): engine, train_dataloader, test_dataloader = colossalai.initialize() - logger = get_global_dist_logger() + logger = get_dist_logger() logger.info("engine is built", ranks=[0]) trainer = Trainer(engine=engine, diff --git a/docs/trainer_engine.md b/docs/trainer_engine.md index 88b872826aca..c2abf1808d0a 100644 --- a/docs/trainer_engine.md +++ b/docs/trainer_engine.md @@ -36,7 +36,7 @@ from colossalai.engine import Engine model = models.resnet18() criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters()) -schedule = colossalai.engine.NoPipelineSchedule() +schedule = colossalai.engine.NonPipelineSchedule() MyEngine = Engine( model=model, diff --git a/docs/trainer_engine_zh.md b/docs/trainer_engine_zh.md index 737d6745bb58..5729a05994c3 100644 --- a/docs/trainer_engine_zh.md +++ b/docs/trainer_engine_zh.md @@ -31,7 +31,7 @@ model = models.resnet18() criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model) lr_scheduler = colossalai.nn.lr_scheduler.CosineAnnealingLR(optimizer, 1000) -schedule = colossalai.engine.NoPipelineSchedule() +schedule = 
colossalai.engine.NonPipelineSchedule() MyEngine = Engine( model=model, diff --git a/examples/colossal_cifar_demo.ipynb b/examples/colossal_cifar_demo.ipynb index 221707bbbdc8..266fd2543a16 100644 --- a/examples/colossal_cifar_demo.ipynb +++ b/examples/colossal_cifar_demo.ipynb @@ -1,20 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "colossal_cifar_demo.ipynb", - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "markdown", @@ -27,6 +11,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -34,14 +19,10 @@ "id": "vP7LvCpG23a2", "outputId": "b37f7203-8a02-4736-c527-603f2bb34d7d" }, - "source": [ - "!pip install ColossalAI deepspeed" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Requirement already satisfied: ColossalAI in /usr/local/lib/python3.7/dist-packages (0.1)\n", "Requirement already satisfied: deepspeed in /usr/local/lib/python3.7/dist-packages (0.5.4)\n", @@ -60,10 +41,14 @@ "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from triton->deepspeed) (3.3.0)\n" ] } + ], + "source": [ + "!pip install ColossalAI deepspeed" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -71,24 +56,23 @@ "id": "UVKEurtS4SFS", "outputId": "99fb6050-5da7-4f27-b4eb-9b3ccf830efb" }, - "source": [ - "import colossalai\n", - "from colossalai.engine import Engine, NoPipelineSchedule\n", - "from colossalai.trainer import Trainer\n", - "from colossalai.context import Config\n", - "import torch" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Please install apex to use 
FP16 Optimizer\n", "Apex should be installed to use the FP16 optimizer\n", "apex is required for mixed precision training\n" ] } + ], + "source": [ + "import colossalai\n", + "from colossalai.engine import Engine, NonPipelineSchedule\n", + "from colossalai.trainer import Trainer\n", + "from colossalai.context import Config\n", + "import torch" ] }, { @@ -102,6 +86,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -109,24 +94,10 @@ "id": "8yF7Lc-K7NAS", "outputId": "01312349-a8b0-4de4-9103-7d1b48e6cc36" }, - "source": [ - "parallel_cfg = Config(dict(parallel=dict(\n", - " data=dict(size=1),\n", - " pipeline=dict(size=1),\n", - " tensor=dict(size=1, mode=None),\n", - ")))\n", - "colossalai.init_dist(config=parallel_cfg,\n", - " local_rank=0,\n", - " world_size=1,\n", - " host='127.0.0.1',\n", - " port=8888,\n", - " backend='nccl')" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0\n", "colossalai - torch.distributed.distributed_c10d - 2021-10-15 03:27:51,598 INFO: Rank 0: Completed store-based barrier for 1 nodes.\n", @@ -137,13 +108,26 @@ ] }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "process rank 0 is bound to device 0\n", "initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1124,the default parallel seed is ParallelMode.DATA.\n" ] } + ], + "source": [ + "parallel_cfg = Config(dict(parallel=dict(\n", + " data=dict(size=1),\n", + " pipeline=dict(size=1),\n", + " tensor=dict(size=1, mode=None),\n", + ")))\n", + "colossalai.init_dist(config=parallel_cfg,\n", + " local_rank=0,\n", + " world_size=1,\n", + " host='127.0.0.1',\n", + " port=8888,\n", + " backend='nccl')" ] }, { @@ -157,13 
+141,24 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "id": "ZyGhyD47-dUY", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "ZyGhyD47-dUY", "outputId": "98bbf2d1-a1c4-4bb4-b6df-600777b1e8f5" }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Files already downloaded and verified\n", + "Files already downloaded and verified\n" + ] + } + ], "source": [ "transform_cfg = [\n", " dict(type='ToTensor'),\n", @@ -179,17 +174,6 @@ "\n", "testset = colossalai.nn.data.CIFAR10Dataset(transform_cfg, root='./data', train=False)\n", "testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Files already downloaded and verified\n", - "Files already downloaded and verified\n" - ] - } ] }, { @@ -203,9 +187,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "cQ_y7lBG09LS" }, + "outputs": [], "source": [ "import torch.nn as nn\n", "import torch.nn.functional as F\n", @@ -232,9 +218,7 @@ "\n", "\n", "model = Net().cuda()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -247,6 +231,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -254,6 +239,18 @@ "id": "YtaDoCax1BCf", "outputId": "b33b1641-03d8-4597-c8c2-1a4c1d61e9b0" }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n", + "colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n", + "colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n", + "colossalai - rank_0 - 2021-10-15 
03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n" + ] + } + ], "source": [ "import torch.optim as optim\n", "\n", @@ -270,19 +267,6 @@ "trainer = Trainer(engine=engine,\n", " hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],\n", " verbose=True)" - ], - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "colossalai - rank_0 - 2021-10-15 03:27:56,018 WARNING: No gradient handler is set up, please make sure you do not need to all-reduce the gradients after a training step.\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,024 INFO: build LogMetricByEpochHook for train, priority = 1\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,026 INFO: build LossHook for train, priority = 10\n", - "colossalai - rank_0 - 2021-10-15 03:27:56,029 INFO: build AccuracyHook for train, priority = 10\n" - ] - } ] }, { @@ -296,6 +280,7 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -303,22 +288,10 @@ "id": "w-J3IP-J1sfx", "outputId": "bdb76939-04f1-4124-ce5e-3af44c0d902c" }, - "source": [ - "num_epochs = 10\n", - "test_interval = 1\n", - "trainer.fit(\n", - " train_dataloader=trainloader,\n", - " test_dataloader=testloader,\n", - " max_epochs=num_epochs,\n", - " display_progress=True,\n", - " test_interval=test_interval\n", - " )" - ], - "execution_count": null, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "[Epoch 0 train]: 0%| | 0/391 [00:00`_. 
diff --git a/tests/test_config/sample_config.py b/tests/test_config/sample_config.py index e48c70e142bb..08ca108281b9 100644 --- a/tests/test_config/sample_config.py +++ b/tests/test_config/sample_config.py @@ -1,12 +1,10 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -import os -from pathlib import Path train_data = dict( dataset=dict( type='CIFAR10Dataset', - root=Path(os.environ['DATA']), + root='/path/to/data', download=True, transform_pipeline=[ dict(type='RandomResizedCrop', size=224), diff --git a/tests/test_context/test_2d_init.py b/tests/test_context/test_2d_init.py index 24e0749aeaee..d373964f8015 100644 --- a/tests/test_context/test_2d_init.py +++ b/tests/test_context/test_2d_init.py @@ -7,7 +7,7 @@ import pytest import torch.multiprocessing as mp -from colossalai import init_dist +from colossalai import launch from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc @@ -58,22 +58,22 @@ def check_2d_parallel_rank(rank): assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) == 1 -def init_2d(local_rank, world_size, backend, port, host): +def init_2d(rank, world_size, backend, port, host): dist_args = dict( config=CONFIG_PATH, - local_rank=local_rank, + rank=rank, world_size=world_size, backend=backend, port=port, - host=host + host=host, + verbose=True ) - init_dist(**dist_args) - - check_tensor_parallel_rank(local_rank) - check_data_parallel_rank(local_rank) - check_2d_parallel_rank(local_rank) - check_pipeline_parallel_rank(local_rank) + launch(**dist_args) + check_tensor_parallel_rank(rank) + check_data_parallel_rank(rank) + check_2d_parallel_rank(rank) + check_pipeline_parallel_rank(rank) gpc.destroy() diff --git a/tests/test_context/test_2p5d_init.py b/tests/test_context/test_2p5d_init.py index 26de7f7ff577..c071d86e7cd1 100644 --- a/tests/test_context/test_2p5d_init.py +++ b/tests/test_context/test_2p5d_init.py @@ -9,7 +9,7 @@ from colossalai.context.parallel_mode import ParallelMode from 
colossalai.core import global_context as gpc -from colossalai.initialize import init_dist +from colossalai.initialize import launch CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2p5d_init.py').absolute() @@ -82,20 +82,21 @@ def check_2p5d_parallel_rank(rank): assert xp_rank == i -def init_2halfd(local_rank, world_size, backend, port, host): +def init_2halfd(rank, world_size, backend, port, host): dist_args = dict( config=CONFIG_PATH, - local_rank=local_rank, + rank=rank, world_size=world_size, backend=backend, port=port, - host=host + host=host, + verbose=True ) - init_dist(**dist_args) - check_data_parallel_rank(local_rank) - check_pipeline_parallel_rank(local_rank) - check_tensor_parallel_rank(local_rank) - check_2p5d_parallel_rank(local_rank) + launch(**dist_args) + check_data_parallel_rank(rank) + check_pipeline_parallel_rank(rank) + check_tensor_parallel_rank(rank) + check_2p5d_parallel_rank(rank) gpc.destroy() diff --git a/tests/test_context/test_3d_init.py b/tests/test_context/test_3d_init.py index 0fba98bff146..a1c48a9b77b3 100644 --- a/tests/test_context/test_3d_init.py +++ b/tests/test_context/test_3d_init.py @@ -9,7 +9,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.initialize import init_dist +from colossalai.initialize import launch CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_3d_init.py').absolute() @@ -74,21 +74,21 @@ def check_3d_parallel_rank(rank): assert op_rank == i -def init_3d(local_rank, world_size, backend, port, host): +def init_3d(rank, world_size, backend, port, host): dist_args = dict( config=CONFIG_PATH, - local_rank=local_rank, + rank=rank, world_size=world_size, backend=backend, port=port, - host=host + host=host, + verbose=True ) - init_dist(**dist_args) - check_tensor_parallel_rank(local_rank) - check_3d_parallel_rank(local_rank) - check_data_parallel_rank(local_rank) - check_pipeline_parallel_rank(local_rank) - 
print('pass') + launch(**dist_args) + check_tensor_parallel_rank(rank) + check_3d_parallel_rank(rank) + check_data_parallel_rank(rank) + check_pipeline_parallel_rank(rank) gpc.destroy() diff --git a/tests/test_data/test_cifar10_dataset.py b/tests/test_data/test_cifar10_dataset.py index 10b79dd03cdb..569cea2ca1ed 100644 --- a/tests/test_data/test_cifar10_dataset.py +++ b/tests/test_data/test_cifar10_dataset.py @@ -5,39 +5,50 @@ from pathlib import Path import pytest +from torchvision import transforms from torch.utils.data import DataLoader -from colossalai.builder import build_dataset +from colossalai.builder import build_dataset, build_transform from colossalai.context import Config -train_data = dict( +TRAIN_DATA = dict( dataset=dict( - type='CIFAR10Dataset', + type='CIFAR10', root=Path(os.environ['DATA']), train=True, - download=True, - transform_pipeline=[ - dict(type='ToTensor'), - dict(type='Normalize', - mean=(0.5, 0.5, 0.5), - std=(0.5, 0.5, 0.5)) - ]), - dataloader=dict(batch_size=4, shuffle=True, num_workers=2) + download=True + ), + dataloader=dict(batch_size=4, shuffle=True, num_workers=2), + transform_pipeline=[ + dict(type='ToTensor'), + dict(type='Normalize', + mean=(0.5, 0.5, 0.5), + std=(0.5, 0.5, 0.5) + ) + ] ) @pytest.mark.cpu def test_cifar10_dataset(): - global train_data - config = Config(train_data) - dataset = build_dataset(config.dataset) - dataloader = DataLoader(dataset=dataset, **config.dataloader) + config = Config(TRAIN_DATA) + dataset_cfg = config.dataset + dataloader_cfg = config.dataloader + transform_cfg = config.transform_pipeline + + # build transform + transform_pipeline = [build_transform(cfg) for cfg in transform_cfg] + transform_pipeline = transforms.Compose(transform_pipeline) + dataset_cfg['transform'] = transform_pipeline + + # build dataset + dataset = build_dataset(dataset_cfg) + + # build dataloader + dataloader = DataLoader(dataset=dataset, **dataloader_cfg) data_iter = iter(dataloader) img, label = data_iter.next() - 
assert isinstance(img, list) and isinstance(label, list), \ - f'expected the img and label to be list but got {type(img)} and {type(label)}' - if __name__ == '__main__': test_cifar10_dataset() diff --git a/tests/test_data/test_data_parallel_sampler.py b/tests/test_data/test_data_parallel_sampler.py index 056f0441a8b9..2f2e275c4e80 100644 --- a/tests/test_data/test_data_parallel_sampler.py +++ b/tests/test_data/test_data_parallel_sampler.py @@ -12,54 +12,54 @@ from torch.utils.data import DataLoader import colossalai -from colossalai.builder import build_dataset, build_data_sampler -from colossalai.context.parallel_mode import ParallelMode +from colossalai.builder import build_dataset, build_data_sampler, build_transform +from torchvision import transforms +from colossalai.context import ParallelMode, Config from colossalai.core import global_context as gpc +from colossalai.utils import get_dataloader -CONFIG = dict( - train_data=dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=True, - download=True, +CONFIG = Config( + dict( + train_data=dict( + dataset=dict( + type='CIFAR10', + root=Path(os.environ['DATA']), + train=True, + download=True, + ), + dataloader=dict( + batch_size=8, + ), transform_pipeline=[ dict(type='ToTensor'), dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) ] ), - dataloader=dict( - num_workers=2, - batch_size=8, - sampler=dict( - type='DataParallelSampler', - ) - ) - ), - parallel=dict( - pipeline=dict(size=1), - tensor=dict(size=1, mode=None), - ), - seed=1024, -) + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None), + ), + seed=1024, + )) -def run_data_sampler(local_rank, world_size): +def run_data_sampler(rank, world_size): dist_args = dict( config=CONFIG, - local_rank=local_rank, + rank=rank, world_size=world_size, backend='gloo', port='29503', host='localhost' ) - colossalai.init_dist(**dist_args) + colossalai.launch(**dist_args) print('finished initialization') + 
transform_pipeline = [build_transform(cfg) for cfg in gpc.config.train_data.transform_pipeline] + transform_pipeline = transforms.Compose(transform_pipeline) + gpc.config.train_data.dataset['transform'] = transform_pipeline dataset = build_dataset(gpc.config.train_data.dataset) - sampler_cfg = gpc.config.train_data.dataloader.pop('sampler') - sampler = build_data_sampler(sampler_cfg, dataset) - dataloader = DataLoader(dataset=dataset, sampler=sampler, **gpc.config.train_data.dataloader) + dataloader = get_dataloader(dataset, **gpc.config.train_data.dataloader) data_iter = iter(dataloader) img, label = data_iter.next() img = img[0] diff --git a/tests/test_data/test_deterministic_dataloader.py b/tests/test_data/test_deterministic_dataloader.py index 9cfd6c4fc7e9..237c92b77b72 100644 --- a/tests/test_data/test_deterministic_dataloader.py +++ b/tests/test_data/test_deterministic_dataloader.py @@ -9,56 +9,70 @@ import torch.cuda import torch.distributed as dist import torch.multiprocessing as mp +from torchvision import transforms from torch.utils.data import DataLoader import colossalai -from colossalai.builder import build_dataset -from colossalai.context.parallel_mode import ParallelMode +from colossalai.builder import build_dataset, build_transform +from colossalai.context import ParallelMode, Config from colossalai.core import global_context as gpc -CONFIG = dict( - train_data=dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=True, - download=True, +CONFIG = Config( + dict( + train_data=dict( + dataset=dict( + type='CIFAR10', + root=Path(os.environ['DATA']), + train=True, + download=True, + ), + dataloader=dict( + num_workers=2, + batch_size=2, + shuffle=True + ), transform_pipeline=[ dict(type='ToTensor'), dict(type='RandomCrop', size=32), dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) ] ), - dataloader=dict( - num_workers=2, - batch_size=2, - shuffle=True - ) - ), - parallel=dict( - pipeline=dict(size=1), - 
tensor=dict(size=1, mode=None), - ), - seed=1024, + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None), + ), + seed=1024, + ) ) -def run_data_sampler(local_rank, world_size): +def run_data_sampler(rank, world_size): dist_args = dict( config=CONFIG, - local_rank=local_rank, + rank=rank, world_size=world_size, backend='gloo', port='29499', host='localhost' ) - colossalai.init_dist(**dist_args) - gpc.set_seed() - + colossalai.launch(**dist_args) print('finished initialization') - dataset = build_dataset(gpc.config.train_data.dataset) - dataloader = DataLoader(dataset=dataset, **gpc.config.train_data.dataloader) + dataset_cfg = gpc.config.train_data.dataset + dataloader_cfg = gpc.config.train_data.dataloader + transform_cfg = gpc.config.train_data.transform_pipeline + + # build transform + transform_pipeline = [build_transform(cfg) for cfg in transform_cfg] + transform_pipeline = transforms.Compose(transform_pipeline) + dataset_cfg['transform'] = transform_pipeline + + # build dataset + dataset = build_dataset(dataset_cfg) + + # build dataloader + dataloader = DataLoader(dataset=dataset, **dataloader_cfg) + data_iter = iter(dataloader) img, label = data_iter.next() img = img[0] diff --git a/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py b/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py deleted file mode 100644 index c97ed18044e2..000000000000 --- a/tests/test_data_pipeline_tensor_parallel/configs/vit_2d.py +++ /dev/null @@ -1,150 +0,0 @@ -import os -from pathlib import Path - -from colossalai.engine import AMP_TYPE - -BATCH_SIZE = 256 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - 
std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - # num_workers=1, - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=2), - tensor=dict(size=4, mode='2d'), -) - -fp16 = dict( - mode=AMP_TYPE.PARALLEL, -) - -engine = dict( - schedule=dict( - num_microbatches=2 - ) -) - -hooks = [ - dict( - type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict( - type='LinearWarmupLR', - warmup_steps=5 - ) - ), -] -num_epochs 
= 60 - -logging = dict( - root_path='test_vit_2d_log' -) diff --git a/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py b/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py deleted file mode 100644 index fd9c89eb434f..000000000000 --- a/tests/test_data_pipeline_tensor_parallel/configs/vit_2p5d.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -from pathlib import Path - -BATCH_SIZE = 250 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=0, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=0, - shuffle=True - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2p5D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2p5D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2p5D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2p5D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2p5D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - 
type='ViTSelfAttention2p5D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2p5D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2p5D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2p5D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=2), - tensor=dict(size=4, depth=1, mode='2.5d'), -) - -hooks = [ - dict( - type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict( - type='LinearWarmupLR', - warmup_steps=5 - ) - ), -] - -engine = dict( -schedule = dict( - num_microbatches=2 -) -) - -num_epochs = 60 diff --git a/tests/test_data_pipeline_tensor_parallel/run_cifar10_vit2d_with_pipeline.py b/tests/test_data_pipeline_tensor_parallel/run_cifar10_vit2d_with_pipeline.py new file mode 100644 index 000000000000..529fedf5ace0 --- /dev/null +++ b/tests/test_data_pipeline_tensor_parallel/run_cifar10_vit2d_with_pipeline.py @@ -0,0 +1,139 @@ +from pathlib import Path +from colossalai.amp.amp_type import AMP_TYPE +from colossalai.context.parallel_mode import ParallelMode +from colossalai.logging import get_dist_logger +import colossalai +import torch +import os +from colossalai.builder import PipelineModelInitializer +from colossalai.core import global_context as gpc +from colossalai.utils import get_dataloader, MultiTimer +from colossalai.nn.loss import CrossEntropyLoss2D +from colossalai.trainer.metric import Accuracy2D +from colossalai.trainer import metric, hooks, Trainer +from colossalai.utils.gradient_accumulation import GradAccumLrSchedulerByStep +from colossalai.engine.schedule import PipelineSchedule +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from colossalai.nn import LinearWarmupLR +from tqdm 
import tqdm +import vit_t_2d + +BATCH_SIZE = 16 +NUM_EPOCHS = 60 +WARMUP_EPOCHS = 5 +CONFIG = dict( + parallel=dict( + pipeline=2, + tensor=dict(size=4, mode='2d') + ), + fp16=dict( + mode=AMP_TYPE.TORCH + ), + gradient_accumulation=2 +) + + +def main(): + parser = colossalai.get_default_parser() + args = parser.parse_args() + colossalai.launch_from_slurm(config=CONFIG, + host=args.host, + port=29500) + + logger = get_dist_logger() + # if gpc.get_global_rank() == 0: + # logger.log_to_file('./logs/cifar10_2d_vit', + # suffix='cifar10_2d_vit_ddp1_torch_amp_grad_accum_2_clip_grad_1', mode='w') + + # build vit-t-32 + initializer = PipelineModelInitializer(vit_t_2d.model_cfg, num_chunks=1) + model = initializer.initialize() + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.RandomCrop(size=32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[ + 0.2023, 0.1994, 0.2010]), + ] + ) + ) + + test_dataset = CIFAR10( + root=Path(os.environ['DATA']), + train=False, + transform=transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[ + 0.2023, 0.1994, 0.2010]), + ] + ) + ) + + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + add_sampler=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + ) + + test_dataloader = get_dataloader(dataset=test_dataset, + add_sampler=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + ) + + # build criterion + criterion = CrossEntropyLoss2D() + + # optimizer + optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0) + + # lr_scheduler + steps_per_epoch = GradAccumLrSchedulerByStep.compute_effective_steps_per_epoch(train_dataloader, accumulate_size=2) + total_steps = steps_per_epoch * NUM_EPOCHS + warmup_steps = steps_per_epoch * WARMUP_EPOCHS + 
lr_scheduler = LinearWarmupLR(optimizer, total_steps=total_steps, warmup_steps=warmup_steps) + + engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize( + model, optimizer, criterion, train_dataloader, test_dataloader, lr_scheduler) + + timer = MultiTimer() + + schedule = PipelineSchedule(num_microbatches=4) + + trainer = Trainer( + engine=engine, + timer=timer, + logger=logger, + schedule=schedule + ) + + hook_list = [ + hooks.LossHook(), + hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False), + hooks.Accuracy2DHook(), + hooks.LogMetricByEpochHook(logger), + ] + + trainer.fit( + train_dataloader=train_dataloader, + epochs=NUM_EPOCHS, + test_dataloader=test_dataloader, + test_interval=1, + hooks=hook_list, + display_progress=True + ) + + +if __name__ == '__main__': + main() diff --git a/tests/test_data_pipeline_tensor_parallel/test.sh b/tests/test_data_pipeline_tensor_parallel/test.sh index 1c6012a5239f..0796e23cb013 100644 --- a/tests/test_data_pipeline_tensor_parallel/test.sh +++ b/tests/test_data_pipeline_tensor_parallel/test.sh @@ -1,4 +1,3 @@ #!/usr/bin/env sh -test_file=$1 -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 +python run_cifar10_vit2d_with_pipeline.py --host $HOST diff --git a/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py b/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py deleted file mode 100644 index b68a58cea3e3..000000000000 --- a/tests/test_data_pipeline_tensor_parallel/test_vit_2d/test_vit_2d.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather - -CONFIG_PATH = 
Path(__file__).parent.parent.joinpath('configs/vit_2d.py') - - -def eval(engine, test_dataloader): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - num_steps = len(test_dataloader) - data_iter = iter(test_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - - if gpc.is_last_rank(ParallelMode.PIPELINE): - # loss = sum(loss) - accumulated_loss += loss.detach().cpu().numpy() - - output = _gather( - output, - ParallelMode.PARALLEL_2D_ROW, - 1 - ) - output = _gather( - output, - ParallelMode.PARALLEL_2D_COL, - 0, - ) - output = torch.argmax(output, dim=-1) - correct = torch.sum(label == output) - correct_sum += correct - total_sum += label.size(0) - avg_loss = accumulated_loss / num_steps - return correct_sum, total_sum, avg_loss - - -def train(engine, train_dataloader): - engine.train() - accumulated_loss = 0 - num_steps = len(train_dataloader) - data_iter = iter(train_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - - if gpc.is_last_rank(ParallelMode.PIPELINE): - accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / num_steps - return avg_loss - - -@pytest.mark.dist -@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_2d_parallel_vision_transformer(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) - logger = get_global_dist_logger() - - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine, train_dataloader) - if gpc.is_last_rank(ParallelMode.PIPELINE): - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) - if gpc.is_last_rank(ParallelMode.PIPELINE): - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: {correct_sum / total_sum}') - - -if __name__ == 
'__main__': - test_2d_parallel_vision_transformer() diff --git a/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py b/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py deleted file mode 100644 index 70857f1e8d9a..000000000000 --- a/tests/test_data_pipeline_tensor_parallel/test_vit_2p5d/test_vit_2p5d.py +++ /dev/null @@ -1,89 +0,0 @@ -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather - -CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py') - - -def eval(engine, test_dataloader): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - num_steps = len(test_dataloader) - data_iter = iter(test_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - - if gpc.is_last_rank(ParallelMode.PIPELINE): - accumulated_loss += loss.detach().cpu().numpy() - - output = _gather( - output, - ParallelMode.PARALLEL_2P5D_ROW, - 1 - ) - output = _gather( - output, - ParallelMode.PARALLEL_2P5D_COL, - 0, - ) - output = _gather( - output, - ParallelMode.PARALLEL_2P5D_DEP, - 0, - ) - output = torch.argmax(output, dim=-1) - correct = torch.sum(label == output) - correct_sum += correct - total_sum += label.size(0) - avg_loss = accumulated_loss / num_steps - return correct_sum, total_sum, avg_loss - - -def train(engine, train_dataloader): - engine.train() - accumulated_loss = 0 - num_steps = len(train_dataloader) - data_iter = iter(train_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - - if gpc.is_last_rank(ParallelMode.PIPELINE): - accumulated_loss += loss.detach().cpu().numpy() - - avg_loss = accumulated_loss / num_steps - return avg_loss - - -@pytest.mark.dist 
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_2p5d_parallel_vision_transformer(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) - logger = get_global_dist_logger() - - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine, train_dataloader) - if gpc.is_last_rank(ParallelMode.PIPELINE): - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) - if gpc.is_last_rank(ParallelMode.PIPELINE): - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: {correct_sum / total_sum}') - - -if __name__ == '__main__': - test_2p5d_parallel_vision_transformer() diff --git a/tests/test_data_pipeline_tensor_parallel/vit_t_2d.py b/tests/test_data_pipeline_tensor_parallel/vit_t_2d.py new file mode 100644 index 000000000000..5be7a575a590 --- /dev/null +++ b/tests/test_data_pipeline_tensor_parallel/vit_t_2d.py @@ -0,0 +1,74 @@ + +import sys +from pathlib import Path +repo_path = str(Path(__file__).absolute().parents[2]) +sys.path.append(repo_path) + +try: + import model_zoo.vit.vision_transformer_from_config +except ImportError: + raise ImportError("model_zoo is not found, please check your path") + +IMG_SIZE = 32 +PATCH_SIZE = 4 +DIM = 512 +NUM_ATTENTION_HEADS = 8 +NUM_CLASSES = 10 +DEPTH = 6 + +model_cfg = dict( + type='VisionTransformerFromConfig', + tensor_splitting_cfg=dict( + type='ViTInputSplitter2D', + ), + embedding_cfg=dict( + type='ViTPatchEmbedding2D', + img_size=IMG_SIZE, + patch_size=PATCH_SIZE, + embed_dim=DIM, + ), + token_fusion_cfg=dict( + type='ViTTokenFuser2D', + img_size=IMG_SIZE, + patch_size=PATCH_SIZE, + embed_dim=DIM, + drop_rate=0.1 + ), + norm_cfg=dict( + type='LayerNorm2D', + normalized_shape=DIM, + eps=1e-6, + ), + block_cfg=dict( + type='ViTBlock', + attention_cfg=dict( + 
type='ViTSelfAttention2D', + hidden_size=DIM, + num_attention_heads=NUM_ATTENTION_HEADS, + attention_dropout_prob=0., + hidden_dropout_prob=0.1, + ), + droppath_cfg=dict( + type='VanillaViTDropPath', + ), + mlp_cfg=dict( + type='ViTMLP2D', + in_features=DIM, + dropout_prob=0.1, + mlp_ratio=1 + ), + norm_cfg=dict( + type='LayerNorm2D', + normalized_shape=DIM, + eps=1e-6, + ), + ), + head_cfg=dict( + type='ViTHead2D', + hidden_size=DIM, + num_classes=NUM_CLASSES, + ), + embed_dim=DIM, + depth=DEPTH, + drop_path_rate=0., +) diff --git a/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py b/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py index f845d98420fb..1415bcb85e92 100644 --- a/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py +++ b/tests/test_engine/configs/non_pipeline_resnet_apex_amp.py @@ -1,7 +1,6 @@ import os from pathlib import Path -from colossalai.engine import AMP_TYPE BATCH_SIZE = 128 IMG_SIZE = 224 @@ -9,34 +8,9 @@ NUM_CLASSES = 10 NUM_ATTN_HEADS = 12 -# resnet 18 -model = dict(type='VanillaResNet', - block_type='ResNetBasicBlock', - layers=[2, 2, 2, 2], - num_cls=10) parallel = dict( pipeline=dict(size=1), tensor=dict(size=1, mode=None) ) - -train_data = dict(dataset=dict(type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - download=True, - transform_pipeline=[ - dict(type='Resize', - size=(IMG_SIZE, IMG_SIZE)), - dict(type='ToTensor'), - dict(type='Normalize', - mean=(0.5, 0.5, 0.5), - std=(0.5, 0.5, 0.5)) - ]), - dataloader=dict(batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - drop_last=True)) - -optimizer = dict(type='Adam', lr=0.001) - -loss = dict(type='CrossEntropyLoss') fp16 = dict(mode=AMP_TYPE.APEX) diff --git a/tests/test_engine/test.sh b/tests/test_engine/test.sh index 24d0c54231ee..0d90c8e55da4 100644 --- a/tests/test_engine/test.sh +++ b/tests/test_engine/test.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh test_file=$1 -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST 
--port 29500 \ No newline at end of file +python $test_file --world_size $SLURM_NPROCS --host $HOST --port 29500 --rank $SLURM_PROCID \ No newline at end of file diff --git a/tests/test_engine/test_engine/test_engine_apex_amp.py b/tests/test_engine/test_engine/test_engine_apex_amp.py new file mode 100644 index 000000000000..ff9c9f9bf18e --- /dev/null +++ b/tests/test_engine/test_engine/test_engine_apex_amp.py @@ -0,0 +1,114 @@ +# !/usr/bin/env python +# -*- encoding: utf-8 -*- + +import colossalai +import os +import pytest +import torch +import os.path as osp +from pathlib import Path +import torch.nn as nn + +from torchvision import transforms +from torch.optim import Adam +from colossalai.core import global_context as gpc +from colossalai.amp import AMP_TYPE +from colossalai.logging import get_dist_logger +from colossalai.utils import report_memory_usage, get_dataloader +from colossalai.initialize import get_default_parser +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + + +# Config +BATCH_SIZE = 128 +IMG_SIZE = 224 +DIM = 768 +NUM_CLASSES = 10 +NUM_ATTN_HEADS = 12 + +CONFIG = dict( + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ), + fp16=dict(mode=AMP_TYPE.APEX), + clip_grad_norm=1.0 +) + + +def run_no_pipeline(): + parser = get_default_parser() + args = parser.parse_args() + + # init dist env + colossalai.launch( + config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + 
pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + logger = get_dist_logger() + rank = torch.distributed.get_rank() + + engine.train() + for img, label in train_dataloader: + engine.zero_grad() + img = img.cuda() + label = label.cuda() + output = engine(img) + loss = engine.criterion(output, label) + engine.backward(loss) + engine.step() + break + + logger.info('Rank {} returns: {}'.format(rank, loss.item())) + + gpc.destroy() + logger.info('Test engine finished') + report_memory_usage("After testing") + + +@pytest.mark.skip("This test should be invoked using the test.sh provided") +@pytest.mark.dist +def test_engine(): + run_no_pipeline() + + +if __name__ == '__main__': + test_engine() diff --git a/tests/test_engine/test_engine/test_engine_naive_amp.py b/tests/test_engine/test_engine/test_engine_naive_amp.py new file mode 100644 index 000000000000..dd75b93596cc --- /dev/null +++ b/tests/test_engine/test_engine/test_engine_naive_amp.py @@ -0,0 +1,113 @@ +import colossalai +import os +import pytest +import torch +import os.path as osp +from pathlib import Path +import torch.nn as nn + +from torchvision import transforms +from torch.optim import Adam +from colossalai.core import global_context as gpc +from colossalai.amp import AMP_TYPE +from colossalai.logging import get_dist_logger +from colossalai.utils import report_memory_usage, get_dataloader +from colossalai.initialize import get_default_parser +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + + +# Config +BATCH_SIZE = 128 +IMG_SIZE = 224 +DIM = 768 +NUM_CLASSES = 10 +NUM_ATTN_HEADS = 12 + +CONFIG = dict( + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ), + fp16=dict( + mode=AMP_TYPE.NAIVE, + 
clip_grad=1.0 + ) +) + + +def run_no_pipeline(): + parser = get_default_parser() + args = parser.parse_args() + + # init dist env + colossalai.launch( + config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + logger = get_dist_logger() + rank = torch.distributed.get_rank() + + engine.train() + for img, label in train_dataloader: + engine.zero_grad() + img = img.cuda() + label = label.cuda() + output = engine(img) + loss = engine.criterion(output, label) + engine.backward(loss) + engine.step() + break + + logger.info('Rank {} returns: {}'.format(rank, loss.item())) + + gpc.destroy() + logger.info('Test engine finished') + report_memory_usage("After testing") + + +@pytest.mark.skip("This test should be invoked using the test.sh provided") +@pytest.mark.dist +def test_engine(): + run_no_pipeline() + + +if __name__ == '__main__': + test_engine() diff --git a/tests/test_engine/test_engine/test_engine_no_amp.py b/tests/test_engine/test_engine/test_engine_no_amp.py new file mode 100644 index 000000000000..f8392c98ab23 --- /dev/null +++ b/tests/test_engine/test_engine/test_engine_no_amp.py @@ -0,0 +1,110 @@ +import colossalai +import os +import pytest 
+import torch +import os.path as osp +from pathlib import Path +import torch.nn as nn + +from torchvision import transforms +from torch.optim import Adam +from colossalai.core import global_context as gpc +from colossalai.amp import AMP_TYPE +from colossalai.logging import get_dist_logger +from colossalai.utils import report_memory_usage, get_dataloader +from colossalai.initialize import get_default_parser +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + + +# Config +BATCH_SIZE = 128 +IMG_SIZE = 224 +DIM = 768 +NUM_CLASSES = 10 +NUM_ATTN_HEADS = 12 + +CONFIG = dict( + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ), + clip_grad_norm=1.0 +) + + +def run_no_pipeline(): + parser = get_default_parser() + args = parser.parse_args() + + # init dist env + colossalai.launch( + config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + logger = get_dist_logger() + rank = torch.distributed.get_rank() + + engine.train() + for img, label in train_dataloader: + engine.zero_grad() + img = img.cuda() + label = label.cuda() + output = engine(img) + loss = engine.criterion(output, label) + 
engine.backward(loss) + engine.step() + break + + logger.info('Rank {} returns: {}'.format(rank, loss.item())) + + gpc.destroy() + logger.info('Test engine finished') + report_memory_usage("After testing") + + +@pytest.mark.skip("This test should be invoked using the test.sh provided") +@pytest.mark.dist +def test_engine(): + run_no_pipeline() + + +if __name__ == '__main__': + test_engine() diff --git a/tests/test_engine/test_engine/test_engine_torch_amp.py b/tests/test_engine/test_engine/test_engine_torch_amp.py new file mode 100644 index 000000000000..fdafd494ce1b --- /dev/null +++ b/tests/test_engine/test_engine/test_engine_torch_amp.py @@ -0,0 +1,111 @@ +import colossalai +import os +import pytest +import torch +import os.path as osp +from pathlib import Path +import torch.nn as nn + +from torchvision import transforms +from torch.optim import Adam +from colossalai.core import global_context as gpc +from colossalai.amp import AMP_TYPE +from colossalai.logging import get_dist_logger +from colossalai.utils import report_memory_usage, get_dataloader +from colossalai.initialize import get_default_parser +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + + +# Config +BATCH_SIZE = 128 +IMG_SIZE = 224 +DIM = 768 +NUM_CLASSES = 10 +NUM_ATTN_HEADS = 12 + +CONFIG = dict( + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ), + fp16=dict(mode=AMP_TYPE.TORCH), + clip_grad_norm=1.0 +) + + +def run_no_pipeline(): + parser = get_default_parser() + args = parser.parse_args() + + # init dist env + colossalai.launch( + config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + 
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + logger = get_dist_logger() + rank = torch.distributed.get_rank() + + engine.train() + for img, label in train_dataloader: + engine.zero_grad() + img = img.cuda() + label = label.cuda() + output = engine(img) + loss = engine.criterion(output, label) + engine.backward(loss) + engine.step() + break + + logger.info('Rank {} returns: {}'.format(rank, loss.item())) + + gpc.destroy() + logger.info('Test engine finished') + report_memory_usage("After testing") + + +@pytest.mark.skip("This test should be invoked using the test.sh provided") +@pytest.mark.dist +def test_engine(): + run_no_pipeline() + + +if __name__ == '__main__': + test_engine() diff --git a/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py b/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py deleted file mode 100644 index 98c2b807256d..000000000000 --- a/tests/test_engine/test_non_pipeline_engine/test_engine_apex_amp.py +++ /dev/null @@ -1,46 +0,0 @@ -# !/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os.path as osp - -import pytest -import torch - -from colossalai import initialize -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.utils import report_memory_usage - -NUM_BATCH = 128 -NUM_MICRO = 6 - -BATCH_SIZE = 32 -SEQ_LENGTH = 128 -HIDDEN_SIZE = 512 - -DIR_PATH = osp.dirname(osp.realpath(__file__)) -NO_PIPE_CONFIG_PATH = osp.join(DIR_PATH, '../configs/non_pipeline_resnet_apex_amp.py') - - -def 
run_no_pipeline(config): - engine, train_dataloader, test_dataloader = initialize(config) - logger = get_global_dist_logger() - rank = torch.distributed.get_rank() - - engine.train() - output, label, loss = engine.step(iter(train_dataloader)) - logger.info('Rank {} returns: {}'.format(rank, loss.item())) - - gpc.destroy() - logger.info('Test engine finished') - report_memory_usage("After testing") - - -@pytest.mark.skip("This test should be invoked using the test.sh provided") -@pytest.mark.dist -def test_engine(): - run_no_pipeline(NO_PIPE_CONFIG_PATH) - - -if __name__ == '__main__': - test_engine() diff --git a/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py b/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py deleted file mode 100644 index effb65e02441..000000000000 --- a/tests/test_engine/test_non_pipeline_engine/test_engine_no_amp.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os.path as osp - -import pytest -import torch - -from colossalai import initialize -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.utils import report_memory_usage - -NUM_BATCH = 128 -NUM_MICRO = 6 - -BATCH_SIZE = 32 -SEQ_LENGTH = 128 -HIDDEN_SIZE = 512 - -DIR_PATH = osp.dirname(osp.realpath(__file__)) -NO_PIPE_CONFIG_PATH = osp.join(DIR_PATH, '../configs/non_pipeline_resnet.py') - - -def test_no_pipeline(config): - print('Test no pipeline engine start') - - engine, train_dataloader, test_dataloader = initialize(config) - logger = get_global_dist_logger() - - rank = torch.distributed.get_rank() - - engine.train() - output, label, loss = engine.step(iter(train_dataloader)) - logger.info('Rank {} returns: {}'.format(rank, loss.item())) - - gpc.destroy() - logger.info('Test engine finished') - report_memory_usage("After testing") - - -@pytest.mark.skip("This test should be invoked using the test.sh provided") -@pytest.mark.dist -def 
test_engine(): - test_no_pipeline(NO_PIPE_CONFIG_PATH) - - -if __name__ == '__main__': - test_engine() diff --git a/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py b/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py deleted file mode 100644 index a4c496a7db8b..000000000000 --- a/tests/test_engine/test_non_pipeline_engine/test_engine_torch_amp.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os.path as osp - -import pytest -import torch - -from colossalai import initialize -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.utils import report_memory_usage - -NUM_BATCH = 128 -NUM_MICRO = 6 - -BATCH_SIZE = 32 -SEQ_LENGTH = 128 -HIDDEN_SIZE = 512 - -DIR_PATH = osp.dirname(osp.realpath(__file__)) -NO_PIPE_CONFIG_PATH = osp.join(DIR_PATH, '../configs/non_pipeline_resnet_torch_amp.py') - - -def test_no_pipeline(config): - print('Test no pipeline engine start') - - engine, train_dataloader, test_dataloader = initialize(config) - logger = get_global_dist_logger() - rank = torch.distributed.get_rank() - - engine.train() - output, label, loss = engine.step(iter(train_dataloader)) - logger.info('Rank {} returns: {}'.format(rank, loss.item())) - - gpc.destroy() - logger.info('Test engine finished') - report_memory_usage("After testing") - - -@pytest.mark.skip("This test should be invoked using the test.sh provided") -@pytest.mark.dist -def test_engine(): - test_no_pipeline(NO_PIPE_CONFIG_PATH) - - -if __name__ == '__main__': - test_engine() diff --git a/tests/test_engine/test_pipeline_engine/test_engine.py b/tests/test_engine/test_pipeline_engine/test_engine.py deleted file mode 100644 index 9d6c9f59f206..000000000000 --- a/tests/test_engine/test_pipeline_engine/test_engine.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os.path as osp - -import pytest -import torch - -from 
colossalai import initialize -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger - -NUM_BATCH = 128 - -BATCH_SIZE = 32 -SEQ_LENGTH = 128 -HIDDEN_SIZE = 512 - -DIR_PATH = osp.dirname(osp.realpath(__file__)) -PIPE_CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_resnet.py') - - -def run_pipeline(config): - engine, train_dataloader, test_dataloader = initialize(config) - logger = get_global_dist_logger() - rank = torch.distributed.get_rank() - - engine.train() - outputs, labels, loss = engine.step(iter(train_dataloader)) - if gpc.is_last_rank(ParallelMode.PIPELINE): - logger.info('losses: {}'.format(rank, loss.item())) - - gpc.destroy() - logger.info('Test engine pipeline finished') - - -@pytest.mark.skip("This test should be invoked using the test.sh provided") -@pytest.mark.dist -def test_engine(): - run_pipeline(PIPE_CONFIG_PATH) - - -if __name__ == '__main__': - test_engine() diff --git a/tests/test_fp16_optimizer/configs/vit_2d.py b/tests/test_fp16_optimizer/configs/vit_2d.py deleted file mode 100644 index 6283dea9b2d3..000000000000 --- a/tests/test_fp16_optimizer/configs/vit_2d.py +++ /dev/null @@ -1,143 +0,0 @@ -import os -from pathlib import Path - -from colossalai.engine import AMP_TYPE - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - 
train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=4, mode='2d'), -) - -fp16 = dict( - mode=AMP_TYPE.PARALLEL, - initial_scale=2 ** 4 -) - -num_epochs = 60 - - -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_steps=5, - total_steps=num_epochs -) - diff --git a/tests/test_fp16_optimizer/test.sh b/tests/test_fp16_optimizer/test.sh deleted file mode 100644 index 24d0c54231ee..000000000000 --- a/tests/test_fp16_optimizer/test.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env sh -test_file=$1 - -python $test_file 
--local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file diff --git a/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py b/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py deleted file mode 100644 index 45c36f3843d2..000000000000 --- a/tests/test_fp16_optimizer/test_vit_2d/test_vit_2d.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.builder import build_lr_scheduler -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather - -CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_2d.py') - - -def eval(engine, test_dataloader): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - num_steps = len(test_dataloader) - data_iter = iter(test_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - - output = _gather( - output[0], - ParallelMode.PARALLEL_2D_ROW, - 1 - ) - output = _gather( - output, - ParallelMode.PARALLEL_2D_COL, - 0, - ) - output = torch.argmax(output, dim=-1) - correct = torch.sum(label[0] == output) - correct_sum += correct - total_sum += label[0].size(0) - avg_loss = accumulated_loss / num_steps - return correct_sum, total_sum, avg_loss - - -def train(engine, train_dataloader, lr_scheduler): - engine.train() - accumulated_loss = 0 - num_steps = len(train_dataloader) - data_iter = iter(train_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.squeeze(0).detach().cpu().numpy() - avg_loss = accumulated_loss / num_steps - lr_scheduler.step() - return avg_loss - - -@pytest.mark.dist -@pytest.mark.skip("This test should 
be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_2d_parallel_vision_transformer(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) - lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, engine.optimizer) - logger = get_global_dist_logger() - - logger.info('start training') - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine, train_dataloader, lr_scheduler) - - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: {correct_sum / total_sum}') - - -if __name__ == '__main__': - test_2d_parallel_vision_transformer() diff --git a/tests/test_layers/test.sh b/tests/test_layers/test.sh index 24d0c54231ee..da5afd5aede6 100644 --- a/tests/test_layers/test.sh +++ b/tests/test_layers/test.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh test_file=$1 -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file +python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_layers/test_1d/test_1d.py index 2d6436b05a62..533376999a5c 100644 --- a/tests/test_layers/test_1d/test_1d.py +++ b/tests/test_layers/test_1d/test_1d.py @@ -4,7 +4,7 @@ import pytest from colossalai.core import global_context as gpc -from colossalai.initialize import init_dist +from colossalai.initialize import launch, get_default_parser from test_layer import * CONFIG = dict( @@ -28,12 +28,19 @@ def check_layer(): check_embed() check_head() + @pytest.mark.dist @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_1d(): - init_dist(config=CONFIG) - gpc.set_seed() - + 
parser = get_default_parser() + args = parser.parse_args() + launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) + check_layer() gpc.destroy() diff --git a/tests/test_layers/test_1d/test_layer.py b/tests/test_layers/test_1d/test_layer.py index 3fa9eee9dc94..682a4257adec 100644 --- a/tests/test_layers/test_1d/test_layer.py +++ b/tests/test_layers/test_1d/test_layer.py @@ -1,4 +1,3 @@ -from colossalai.nn.optimizer.zero_redundancy_optimizer_level_2 import print_rank_msg from tests.test_layers.test_3d.common import IMG_SIZE import torch import torch.distributed as dist @@ -142,6 +141,7 @@ def check_linear_row(): print_rank_0('linear_row no parallel_input backward: pass') + class Testvithead(torch.nn.Module): def __init__(self, in_features, out_features, bias=True): super().__init__() @@ -152,6 +152,7 @@ def forward(self, x): x = self.linear(x) return x + def check_head(): device = get_current_device() dtype = torch.float32 @@ -159,7 +160,7 @@ def check_head(): i = gpc.get_local_rank(ParallelMode.PARALLEL_1D) - head = ViTHead1D(INPUT_SIZE,NUM_CLASSES,dtype=dtype) + head = ViTHead1D(INPUT_SIZE, NUM_CLASSES, dtype=dtype) torch.nn.init.zeros_(head.linear.bias) torch.nn.init.ones_(head.linear.weight) head = head.to(device) @@ -185,7 +186,7 @@ def check_head(): A_master.requires_grad = True C_master = layer(A_master) # C = torch.chunk(C_master, DEPTH, dim=0)[i] - print_rank_msg('Rank {} head forward: {}'.format(i, check_equal(out, C_master))) + print_rank_0('Rank {} head forward: {}'.format(i, check_equal(out, C_master))) grad_shape = C_master.shape grad_master = torch.randn(grad_shape, @@ -198,7 +199,7 @@ def check_head(): out.backward(grad_master) # bwd_end = time.time() # print_rank_0('head backward: pass | {:.3f} s'.format(bwd_end - bwd_start), - # logger) + # logger) C_master.backward(grad_master) A_grad = A_master.grad @@ -207,7 +208,6 @@ def check_head(): i, check_equal(A_grad, A.grad))) - 
class Testvitembed(torch.nn.Module): def __init__(self, img_size: int, patch_size: int, in_chans: int, embed_size: int, drop_prob: float) -> None: @@ -230,6 +230,7 @@ def forward(self, x): x = self.pos_drop(x + self.pos_embed) return x + def check_embed(): device = get_current_device() dtype = torch.float32 @@ -276,7 +277,7 @@ def check_embed(): # logger.info('Rank {} embed forward (cls): {}'.format( # rank, check_equal(out_cls, C_cls))) # C = C_master[:, 1:] - print_rank_msg('Rank {} embed forward: {}'.format(i, check_equal(out, C_master))) + print_rank_0('Rank {} embed forward: {}'.format(i, check_equal(out, C_master))) grad_shape = C_master.shape grad_master = torch.randn(grad_shape, @@ -297,7 +298,7 @@ def check_embed(): C_master.backward(grad_master) A_grad = A_master.grad - print_rank_msg('Rank {} embed backward (input_grad): {}'.format(i, check_equal(A_grad, A.grad))) + print_rank_0('Rank {} embed backward (input_grad): {}'.format(i, check_equal(A_grad, A.grad))) print_rank_0('Rank {} embed backward (cls_grad): {}'.format( i, check_equal(layer_master.cls_token.grad, layer2.cls_token.grad))) @@ -305,14 +306,15 @@ def check_embed(): print_rank_0('Rank {} embed backward (pos_embed_grad): {}'.format( i, check_equal(layer_master.pos_embed.grad, layer2.pos_embed.grad))) - print_rank_msg('Rank {} embed backward (proj_weight_grad): {}'.format( + print_rank_0('Rank {} embed backward (proj_weight_grad): {}'.format( i, check_equal(layer_master.proj.weight.grad, layer.proj.weight.grad))) - print_rank_msg('Rank {} embed backward (proj_bias_grad): {}'.format( + print_rank_0('Rank {} embed backward (proj_bias_grad): {}'.format( i, check_equal(layer_master.proj.bias.grad, layer.proj.bias.grad))) return fwd_end - fwd_start, bwd_end - bwd_start + def check_attention(): device = get_current_device() dtype = torch.float32 @@ -321,10 +323,9 @@ def check_attention(): i = gpc.get_local_rank(ParallelMode.PARALLEL_1D) - layer = ViTSelfAttention1D( - HIDDEN_SIZE, - 
NUM_ATTENTION_HEADS, + HIDDEN_SIZE, + NUM_ATTENTION_HEADS, 0.5, 0.5 ).to(device=device) @@ -379,6 +380,7 @@ def check_mlp(): assert A.grad.shape == A.shape print_rank_0('mlp backward: pass') + def check_patch_embedding(): device = get_current_device() dtype = torch.float32 @@ -400,7 +402,7 @@ def check_patch_embedding(): A.requires_grad = True out = layer(A) - print('output size: ',out.size()) + print('output size: ', out.size()) assert out.shape == (BATCH_SIZE, 4, HIDDEN_SIZE) print_rank_0('patch embedding forward: pass') diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_layers/test_2d/test_2d.py index 994b2d37aaf5..f1b683b9f9d8 100644 --- a/tests/test_layers/test_2d/test_2d.py +++ b/tests/test_layers/test_2d/test_2d.py @@ -4,7 +4,7 @@ import pytest from colossalai.core import global_context as gpc -from colossalai.initialize import init_dist +from colossalai.initialize import launch, get_default_parser from test_layer import check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer from test_operation import check_AB, check_ABT, check_ATB @@ -36,8 +36,14 @@ def check_layer(): @pytest.mark.dist @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2d(): - init_dist(config=CONFIG) - gpc.set_seed() + parser = get_default_parser() + args = parser.parse_args() + launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) check_operations() check_layer() gpc.destroy() diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_layers/test_2p5d/test_2p5d.py index 488d38d87ec6..bad2a9a04daf 100644 --- a/tests/test_layers/test_2p5d/test_2p5d.py +++ b/tests/test_layers/test_2p5d/test_2p5d.py @@ -1,7 +1,7 @@ import pytest from colossalai.core import global_context as gpc -from colossalai.initialize import init_dist +from colossalai.initialize import launch, get_default_parser from test_layer import 
check_linear, check_layernorm, check_attention, check_mlp, check_transformerlayer from test_operation import check_AB, check_ABT, check_ATB @@ -30,8 +30,14 @@ def check_layer(): @pytest.mark.dist @pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") def test_2p5d(): - init_dist(config=CONFIG) - gpc.set_seed() + parser = get_default_parser() + args = parser.parse_args() + launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) check_layer() check_operations() gpc.destroy() diff --git a/tests/test_layers/test_2p5d/test_operation.py b/tests/test_layers/test_2p5d/test_operation.py index 5ffaafe2cca6..2342db3bb384 100644 --- a/tests/test_layers/test_2p5d/test_operation.py +++ b/tests/test_layers/test_2p5d/test_operation.py @@ -16,7 +16,7 @@ def check_AB(): pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size( ParallelMode.PIPELINE) tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR) - + dtype = torch.float i = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL) j = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW) @@ -41,11 +41,10 @@ def check_AB(): out_shape = (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, 4 * HIDDEN_SIZE // TESSERACT_DIM) out = Matmul_AB_2p5D.apply( A, B, - TESSERACT_DIM, TESSERACT_DEP, out_shape, + TESSERACT_DIM, out_shape, i, j, k, ParallelMode.PARALLEL_2P5D_ROW, ParallelMode.PARALLEL_2P5D_COL, - ParallelMode.PARALLEL_2P5D_DEP, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, @@ -93,7 +92,7 @@ def check_ABT(): pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size( ParallelMode.PIPELINE) tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR) - + dtype = torch.float device = get_current_device() @@ -119,11 +118,10 @@ def check_ABT(): out = Matmul_ABT_2p5D.apply( C, B, - TESSERACT_DIM, TESSERACT_DEP, 
(BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, HIDDEN_SIZE // TESSERACT_DIM), + TESSERACT_DIM, (BATCH_SIZE // TESSERACT_DIM, SEQ_LENGTH, HIDDEN_SIZE // TESSERACT_DIM), i, j, k, ParallelMode.PARALLEL_2P5D_ROW, ParallelMode.PARALLEL_2P5D_COL, - ParallelMode.PARALLEL_2P5D_DEP, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, @@ -169,7 +167,7 @@ def check_ATB(): pipeline_parallel_size = 1 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_world_size( ParallelMode.PIPELINE) tensor_parallel_size = gpc.get_world_size(ParallelMode.TENSOR) - + device = get_current_device() dtype = torch.float @@ -195,11 +193,10 @@ def check_ATB(): out = Matmul_ATB_2p5D.apply( A, C, - TESSERACT_DIM, TESSERACT_DEP, (HIDDEN_SIZE // TESSERACT_DIM, 4 * HIDDEN_SIZE // TESSERACT_DIM), + TESSERACT_DIM, (HIDDEN_SIZE // TESSERACT_DIM, 4 * HIDDEN_SIZE // TESSERACT_DIM), i, j, k, ParallelMode.PARALLEL_2P5D_ROW, ParallelMode.PARALLEL_2P5D_COL, - ParallelMode.PARALLEL_2P5D_DEP, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size, diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_layers/test_3d/test_3d.py index 7c1212c20f4a..b05fc672a3c9 100644 --- a/tests/test_layers/test_3d/test_3d.py +++ b/tests/test_layers/test_3d/test_3d.py @@ -1,10 +1,11 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from colossalai.initialize import init_dist +from colossalai.initialize import launch, get_default_parser from test_layer import * from test_operation import * +from colossalai.logging import get_dist_logger CONFIG = dict(parallel=dict(pipeline=1, tensor=dict(mode='3d', size=8)), seed=0) @@ -17,11 +18,10 @@ # check_add() # check_mul() # check_sum() -# check_pooler() def check_layer(): - logger = get_global_dist_logger() + logger = get_dist_logger() liear_fwd_time, linear_bwd_time = check_linear() norm_fwd_time, norm_bwd_time = check_layernorm() attn_fwd_time, attn_bwd_time = check_attention() @@ -40,11 +40,16 @@ def check_layer(): def _test_main(): # init dist - 
init_dist(CONFIG) - logger = get_global_dist_logger() + parser = get_default_parser() + args = parser.parse_args() + launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) + logger = get_dist_logger() logger.info('Distributed environment is initialzied.', ranks=[0]) - - global_context.set_seed() torch.backends.cudnn.benchmark = True # check operation diff --git a/tests/test_layers/test_3d/test_layer.py b/tests/test_layers/test_3d/test_layer.py index 4c661ed658bc..92720e42c5dd 100644 --- a/tests/test_layers/test_3d/test_layer.py +++ b/tests/test_layers/test_3d/test_layer.py @@ -7,7 +7,7 @@ import numpy as np from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger from colossalai.registry import LAYERS, LOSSES from colossalai.utils import get_current_device, print_rank_0 from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env @@ -18,7 +18,7 @@ def check_linear(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() device = get_current_device() dtype = torch.float32 INPUT_SIZE = HIDDEN_SIZE @@ -34,8 +34,8 @@ def check_linear(): layer = LAYERS.get_module('Linear3D')(INPUT_SIZE, OUTPUT_SIZE, - # ParallelMode.PARALLEL_3D_INPUT, - # ParallelMode.PARALLEL_3D_WEIGHT, + # ParallelMode.PARALLEL_3D_INPUT, + # ParallelMode.PARALLEL_3D_WEIGHT, dtype=dtype, bias=True) # torch.nn.init.zeros_(layer.bias) @@ -120,7 +120,7 @@ def check_linear(): def check_layernorm(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() device = get_current_device() dtype = torch.float32 INPUT_SIZE = HIDDEN_SIZE @@ -141,7 +141,7 @@ def check_layernorm(): norm = norm.to(device) norm_master = torch.nn.LayerNorm(INPUT_SIZE, eps=1e-6) norm_master = 
norm_master.to(device) - + weight_master = norm_master.weight.data torch.distributed.broadcast(weight_master, src=0) weight = torch.chunk(weight_master, DEPTH)[k] @@ -208,7 +208,7 @@ def check_layernorm(): bias_grad = torch.chunk(bias_grad, DEPTH)[k] logger.info('Rank {} layernorm backward (weight_grad): {}'.format( rank, check_equal(bias_grad, norm.weight.grad))) - + bias_grad = norm_master.bias.grad bias_grad = torch.chunk(bias_grad, DEPTH)[k] logger.info('Rank {} layernorm backward (bias_grad): {}'.format( @@ -220,7 +220,7 @@ def check_layernorm(): def check_attention(): rank = torch.distributed.get_rank() device = get_current_device() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float32 INPUT_SIZE = HIDDEN_SIZE NUM_ATTENTION_HEADS = 2 @@ -277,7 +277,7 @@ def check_attention(): def check_mlp(): rank = torch.distributed.get_rank() device = get_current_device() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float32 INPUT_SIZE = HIDDEN_SIZE @@ -337,7 +337,7 @@ def forward(self, x): def check_head(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() device = get_current_device() dtype = torch.float32 INPUT_SIZE = HIDDEN_SIZE @@ -495,7 +495,7 @@ def forward(self, x): def check_embed(): rank = torch.distributed.get_rank() device = get_current_device() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float32 input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D) @@ -632,7 +632,7 @@ def check_embed(): def check_loss(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() device = get_current_device() dtype = torch.float32 @@ -645,7 +645,7 @@ def check_loss(): k = C_rank = global_context.get_local_rank(output_parallel_mode) criterion = LOSSES.get_module('CrossEntropyLoss3D')() - # ParallelMode.PARALLEL_3D_INPUT, ParallelMode.PARALLEL_3D_WEIGHT) + # 
ParallelMode.PARALLEL_3D_INPUT, ParallelMode.PARALLEL_3D_WEIGHT) criterion_master = torch.nn.CrossEntropyLoss() out_shape = (BATCH_SIZE, NUM_CLASSES) diff --git a/tests/test_layers/test_3d/test_operation.py b/tests/test_layers/test_3d/test_operation.py index 05acb7f585db..a0c34432cd3a 100644 --- a/tests/test_layers/test_3d/test_operation.py +++ b/tests/test_layers/test_3d/test_operation.py @@ -3,7 +3,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger from colossalai.nn.layer.parallel_3d._operation import * from colossalai.utils import get_current_device @@ -12,7 +12,7 @@ def check_AB(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float j = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT) i = global_context.get_local_rank(ParallelMode.PARALLEL_3D_WEIGHT) @@ -83,7 +83,7 @@ def check_AB(): def check_ABT(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT) @@ -152,7 +152,7 @@ def check_ABT(): def check_ATB(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() device = get_current_device() dtype = torch.float @@ -222,7 +222,7 @@ def check_ATB(): def check_add(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT) @@ -296,7 +296,7 @@ def check_add(): def check_mul(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT) @@ -370,7 +370,7 @@ def check_mul(): def check_sum(): 
rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT) @@ -417,7 +417,7 @@ def check_sum(): def check_reduce(): rank = torch.distributed.get_rank() - logger = get_global_dist_logger() + logger = get_dist_logger() dtype = torch.float j = A_rank = global_context.get_local_rank(ParallelMode.PARALLEL_3D_INPUT) diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_layers/test_sequence/test_sequence.py index 16122f93ab00..64a42a653f61 100644 --- a/tests/test_layers/test_sequence/test_sequence.py +++ b/tests/test_layers/test_sequence/test_sequence.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -from colossalai.initialize import init_dist -from colossalai.logging import get_global_dist_logger +from colossalai.initialize import launch, get_default_parser +from colossalai.logging import get_dist_logger from test_layer import * CONFIG = dict( @@ -19,11 +19,17 @@ def check_layer(): def _test_main(): # init dist - init_dist(CONFIG) - logger = get_global_dist_logger() + parser = get_default_parser() + args = parser.parse_args() + launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) + logger = get_dist_logger() logger.info('Distributed environment is initialzied.', ranks=[0]) - gpc.set_seed() torch.backends.cudnn.benchmark = True # check layers diff --git a/tests/test_lr_scheduler/test_lr_scheduler.py b/tests/test_lr_scheduler/test_lr_scheduler.py deleted file mode 100644 index 012ea4476ab6..000000000000 --- a/tests/test_lr_scheduler/test_lr_scheduler.py +++ /dev/null @@ -1,69 +0,0 @@ -# from colossal.components.optimizer.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmupLR, FlatAnnealingLR, FlatAnnealingWarmupLR -# from colossal.components.optimizer.lr_scheduler import LinearWarmupLR -# from 
colossal.components.optimizer.lr_scheduler import MultiStepLR, MultiStepWarmupLR -# from colossal.components.optimizer.lr_scheduler import OneCycleLR -# from colossal.components.optimizer.lr_scheduler import PolynomialLR, PolynomialWarmupLR -import matplotlib.pyplot as plt -import pytest -from torch.optim import SGD -from torchvision.models import resnet18 - -from colossalai.builder import build_lr_scheduler - -NUM_EPOCHS = 5 -NUM_STEPS_PER_EPOCH = 10 - -cfg = { - 'warmup_steps': 5 -} - - -def init_cfg(name, **kwargs): - return { - 'type': name, - **cfg, - **kwargs - } - - -def test_scheduler(optimizer, scheduler_name, **kwargs): - for group in optimizer.param_groups: - group['lr'] = 0.1 - config = init_cfg(scheduler_name, **kwargs) - scheduler = build_lr_scheduler(config, - optimizer, NUM_EPOCHS * NUM_STEPS_PER_EPOCH, NUM_STEPS_PER_EPOCH) - x = [] - y = [] - for epoch in range(NUM_EPOCHS): - for i in range(NUM_STEPS_PER_EPOCH): - step = epoch * NUM_STEPS_PER_EPOCH + i - lr = optimizer.param_groups[0]['lr'] - x.append(step) - y.append(lr) - scheduler.step() - print(y) - plt.plot(x, y) - plt.show() - - -@pytest.mark.skip("This test is skipped as it requires visualization, " - "You can visualize the test output plots on your local environment") -def test(): - model = resnet18() - optimizer = SGD(model.parameters(), lr=1.0) - test_scheduler(optimizer, 'CosineAnnealingLR') - test_scheduler(optimizer, 'CosineAnnealingWarmupLR') - test_scheduler(optimizer, 'FlatAnnealingLR') - test_scheduler(optimizer, 'FlatAnnealingWarmupLR') - test_scheduler(optimizer, 'LinearWarmupLR') - test_scheduler(optimizer, 'MultiStepLR', milestones=[1, 3]) - test_scheduler(optimizer, 'MultiStepWarmupLR', milestones=[1, 3]) - test_scheduler(optimizer, 'MultiStepWarmupLR', - milestones=[1, 3], warmup_epochs=1) - test_scheduler(optimizer, 'PolynomialLR', power=2.0) - test_scheduler(optimizer, 'PolynomialWarmupLR', power=2.0) - test_scheduler(optimizer, 'OneCycleLR') - - -if __name__ == '__main__': 
- test() diff --git a/tests/test_models/test_vanilla_resnet/test_vanilla_resnet.py b/tests/test_models/test_vanilla_resnet/test_vanilla_resnet.py deleted file mode 100644 index bc9144fe05d4..000000000000 --- a/tests/test_models/test_vanilla_resnet/test_vanilla_resnet.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import pytest -import torch -import torchvision.models as models - -from colossalai.builder import build_model - -NUM_CLS = 10 - -RESNET18 = dict( - type='VanillaResNet', - block_type='ResNetBasicBlock', - layers=[2, 2, 2, 2], - num_cls=NUM_CLS -) - -RESNET34 = dict( - type='VanillaResNet', - block_type='ResNetBasicBlock', - layers=[3, 4, 6, 3], - num_cls=NUM_CLS -) - -RESNET50 = dict( - type='VanillaResNet', - block_type='ResNetBottleneck', - layers=[3, 4, 6, 3], - num_cls=NUM_CLS -) - -RESNET101 = dict( - type='VanillaResNet', - block_type='ResNetBottleneck', - layers=[3, 4, 23, 3], - num_cls=NUM_CLS -) - -RESNET152 = dict( - type='VanillaResNet', - block_type='ResNetBottleneck', - layers=[3, 8, 36, 3], - num_cls=NUM_CLS -) - - -def compare_model(data, colossal_model, torchvision_model): - colossal_output = colossal_model(data) - torchvision_output = torchvision_model(data) - assert colossal_output[ - 0].shape == torchvision_output.shape, f'{colossal_output[0].shape}, {torchvision_output.shape}' - - -@pytest.mark.cpu -def test_vanilla_resnet(): - """Compare colossal resnet with torchvision resnet""" - # data - x = torch.randn((2, 3, 224, 224)) - - # resnet 18 - col_resnet18 = build_model(RESNET18) - col_resnet18.build_from_cfg() - torchvision_resnet18 = models.resnet18(num_classes=NUM_CLS) - - compare_model(x, col_resnet18, torchvision_resnet18) - - # resnet 34 - col_resnet34 = build_model(RESNET34) - col_resnet34.build_from_cfg() - torchvision_resnet34 = models.resnet34(num_classes=NUM_CLS) - - compare_model(x, col_resnet34, torchvision_resnet34) - - # resnet 50 - col_resnet50 = build_model(RESNET50) - 
col_resnet50.build_from_cfg() - torchvision_resnet50 = models.resnet50(num_classes=NUM_CLS) - - compare_model(x, col_resnet50, torchvision_resnet50) - - # resnet 101 - col_resnet101 = build_model(RESNET101) - col_resnet101.build_from_cfg() - torchvision_resnet101 = models.resnet101(num_classes=NUM_CLS) - - compare_model(x, col_resnet101, torchvision_resnet101) - - # # resnet 152 - col_resnet152 = build_model(RESNET152) - col_resnet152.build_from_cfg() - torchvision_resnet152 = models.resnet152(num_classes=NUM_CLS) - - compare_model(x, col_resnet152, torchvision_resnet152) - - -if __name__ == '__main__': - test_vanilla_resnet() diff --git a/tests/test_models/test_vision_transformer/configs/vit_1d.py b/tests/test_models/test_vision_transformer/configs/vit_1d.py deleted file mode 100644 index 40e28f6f186f..000000000000 --- a/tests/test_models/test_vision_transformer/configs/vit_1d.py +++ /dev/null @@ -1,137 +0,0 @@ -import os -from pathlib import Path - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -NUM_CLASSES = 10 -DEPTH = 6 -LOG_NAME = 'vit1D_cifar10_tp=2_selfattention_V2' - -# # ViT Base -# BATCH_SIZE = 512 -# IMG_SIZE = 224 -# PATCH_SIZE = 16 -# DIM = 384 -# NUM_ATTENTION_HEADS = 6 -# NUM_CLASSES = 100 -# DEPTH = 12 -# LOG_NAME = 'vit1D_imagenet100' - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - download = True, - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ]), - dataloader=dict(batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True)) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 
0.4465], - std=[0.2023, 0.1994, 0.2010]), - ]), - dataloader=dict(batch_size=400, - pin_memory=True, - num_workers=4, - shuffle=True)) - -optimizer = dict(type='Adam', lr=0.001, weight_decay=0) - -loss = dict(type='CrossEntropyLoss1D', ) - -model = dict( - type='VisionTransformerFromConfig', - embedding_cfg=dict( - type='ViTPatchEmbedding1D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict(type='ViTTokenFuser1D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1), - norm_cfg=dict( - type='LayerNorm', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention1DV2', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict(type='VanillaViTDropPath', ), - mlp_cfg=dict(type='ViTMLP1D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1), - norm_cfg=dict( - type='LayerNorm', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead1D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=2, mode='1d'), -) - -hooks = [ - dict(type='LogMetricByEpochHook'), - # dict(type='LogTimingByEpochHook'), - # dict(type='LogMemoryByEpochHook'), - dict(type='TensorboardHook', log_dir=f'./tests/test_models/test_vision_transformer/test_vit_1d/tb_logs_{LOG_NAME}'), - dict( - type='Accuracy1DHook', - ), - dict(type='LossHook'), - # dict(type='TensorboardHook', log_dir='./tfb_logs'), - # dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), - # dict(type='LoadCheckpointHook', epoch=20, checkpoint_dir='./ckpt') -] - -logging = dict( - root_path=f"./tests/test_models/test_vision_transformer/test_vit_1d/{LOG_NAME}" -) - -lr_scheduler = dict(type='LinearWarmupLR', warmup_epochs=5) - -num_epochs = 70 - -seed = 42 \ No 
newline at end of file diff --git a/tests/test_models/test_vision_transformer/configs/vit_2d.py b/tests/test_models/test_vision_transformer/configs/vit_2d.py deleted file mode 100644 index 1fd1102fba88..000000000000 --- a/tests/test_models/test_vision_transformer/configs/vit_2d.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -from pathlib import Path - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ]), - dataloader=dict(batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True)) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ]), - dataloader=dict(batch_size=400, - pin_memory=True, - num_workers=4, - shuffle=True)) - -optimizer = dict(type='Adam', lr=0.001, weight_decay=0) - -loss = dict(type='CrossEntropyLoss2D', ) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict(type='ViTInputSplitter2D', ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict(type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - 
attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict(type='VanillaViTDropPath', ), - mlp_cfg=dict(type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=4, mode='2d'), -) - -num_epochs = 60 - -lr_scheduler = dict(type='LinearWarmupLR', warmup_steps=5, total_steps=num_epochs) diff --git a/tests/test_models/test_vision_transformer/configs/vit_2d_imagenet.py b/tests/test_models/test_vision_transformer/configs/vit_2d_imagenet.py deleted file mode 100644 index 8cac68b06a43..000000000000 --- a/tests/test_models/test_vision_transformer/configs/vit_2d_imagenet.py +++ /dev/null @@ -1,105 +0,0 @@ -from colossalai.engine import AMP_TYPE - -BATCH_SIZE = 128 -LEARNING_RATE = 0.001 -IMG_SIZE = 224 -PATCH_SIZE = 16 -DIM = 2048 -NUM_ATTENTION_HEADS = 16 -NUM_CLASSES = 1000 -DEPTH = 48 -NUM_EPOCHS = 300 - -parallel = dict( - data=4, - pipeline=1, - tensor=dict(size=1, mode='2d'), -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict(type='ViTInputSplitter2D', ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict(type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict(type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - checkpoint=True), - droppath_cfg=dict(type='VanillaViTDropPath', ), - mlp_cfg=dict(type='ViTMLP2D', - in_features=DIM, - 
dropout_prob=0.1, - mlp_ratio=4, - checkpoint=True), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -optimizer = dict( - type='AdamW', - lr=3e-3, - weight_decay=0.3, -) - -loss = dict(type='CrossEntropyLoss2D', reduction=True) - -clip_grad = 1.0 - -num_epochs = NUM_EPOCHS - -fp16 = dict(mode=AMP_TYPE.PARALLEL, initial_scale=2**8) - -# this engine config can be ignored if you want to use default values -engine = dict( - # schedule=None, - schedule=dict(num_microbatches=4), - gradient_handlers=None, - gradient_accumulation=1, - gradient_clipping=1.0, -) - -hooks = [ - dict(type='LogMetricByEpochHook'), - dict(type='LogMemoryByEpochHook'), - dict(type='LogTimingByEpochHook'), - dict(type='Accuracy2DHook'), - dict(type='LossHook'), - dict(type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict(type='CosineAnnealingWarmupLR', - warmup_steps=32)) -] - -logging = dict( - root_path= - f"./vit_2d_imagenet1k_bs{BATCH_SIZE}_{fp16['mode']}_clip_grad{clip_grad}") diff --git a/tests/test_models/test_vision_transformer/configs/vit_2p5d.py b/tests/test_models/test_vision_transformer/configs/vit_2p5d.py deleted file mode 100644 index 3c16d684a8b1..000000000000 --- a/tests/test_models/test_vision_transformer/configs/vit_2p5d.py +++ /dev/null @@ -1,130 +0,0 @@ -import os -from pathlib import Path - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - 
batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=0, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=400, - pin_memory=True, - num_workers=0, - shuffle=True - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2p5D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2p5D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2p5D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2p5D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2p5D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2p5D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2p5D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2p5D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2p5D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=4, depth=1, mode='2.5d'), -) - -num_epochs = 60 - -lr_scheduler = dict(type='LinearWarmupLR', warmup_steps=5, total_steps=num_epochs) diff --git a/tests/test_models/test_vision_transformer/configs/vit_3d.py b/tests/test_models/test_vision_transformer/configs/vit_3d.py deleted file 
mode 100644 index 5dffcb753b5a..000000000000 --- a/tests/test_models/test_vision_transformer/configs/vit_3d.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os -from pathlib import Path - -# from colossalai.context import ParallelMode -from colossalai.engine import AMP_TYPE -from torchvision.transforms import AutoAugmentPolicy - -IMG_SIZE = 32 -PATCH_SIZE = 4 -EMBED_SIZE = 256 -HIDDEN_SIZE = 256 -NUM_HEADS = 4 -NUM_CLASSES = 10 -NUM_BLOCKS = 7 -DROP_RATE = 0.1 - -BATCH_SIZE = 512 -LEARNING_RATE = 0.001 -WEIGHT_DECAY = 3e-2 - -DATASET_PATH = Path(os.environ['DATA']) - -model = dict( - type='VisionTransformerFromConfig', - embedding_cfg=dict( - type='ViTPatchEmbedding3D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - in_chans=3, - embed_size=EMBED_SIZE, - drop_prob=DROP_RATE, - ), - block_cfg=dict( - type='ViTBlock', - norm_cfg=dict( - type='LayerNorm3D', - normalized_shape=HIDDEN_SIZE, - eps=1e-6, - # input_parallel_mode=ParallelMode.PARALLEL_3D_INPUT, - # weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, - ), - attention_cfg=dict( - type='ViTSelfAttention3D', - hidden_size=HIDDEN_SIZE, - num_attention_heads=NUM_HEADS, - attention_probs_dropout_prob=0., - hidden_dropout_prob=DROP_RATE, - ), - droppath_cfg=dict(type='VanillaViTDropPath', ), - mlp_cfg=dict( - type='ViTMLP3D', - hidden_size=HIDDEN_SIZE, - mlp_ratio=2, - hidden_dropout_prob=DROP_RATE, - hidden_act='gelu', - ), - ), - norm_cfg=dict(type='LayerNorm3D', - normalized_shape=HIDDEN_SIZE, - eps=1e-6, - # input_parallel_mode=ParallelMode.PARALLEL_3D_INPUT, - # weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, - ), - head_cfg=dict( - type='ViTHead3D', - in_features=HIDDEN_SIZE, - num_classes=NUM_CLASSES, - ), - embed_dim=HIDDEN_SIZE, - depth=NUM_BLOCKS, - drop_path_rate=0., -) - -loss = dict(type='CrossEntropyLoss3D', - # input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT, - # weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, - # reduction=True, - ) -# 
loss = dict(type='CrossEntropyLoss', label_smoothing=0.1) - -optimizer = dict(type='AdamW', lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=DATASET_PATH, - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - # dict(type='RandomHorizontalFlip'), - dict(type='AutoAugment', policy=AutoAugmentPolicy.CIFAR10), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ]), - dataloader=dict(batch_size=BATCH_SIZE, - pin_memory=True, - shuffle=True, - num_workers=1)) - -test_data = dict(dataset=dict(type='CIFAR10Dataset', - root=DATASET_PATH, - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ]), - dataloader=dict(batch_size=1000, - pin_memory=True)) - -parallel = dict( - data=1, - pipeline=1, - tensor=dict(mode='3d', size=8), -) - -clip_grad = 1.0 - -engine = dict( - schedule=None, - gradient_handlers=None, - gradient_accumulation=1, - gradient_clipping=clip_grad, -) - -num_epochs = 200 - -hooks = [ - dict(type='LogMetricByEpochHook'), - dict(type='LogMemoryByEpochHook'), - dict( - type='Accuracy3DHook', - # input_parallel_mode=ParallelMode.PARALLEL_3D_OUTPUT, - # weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT, - ), - dict(type='LossHook'), - dict(type='LRSchedulerHook', - by_epoch=False, - lr_scheduler_cfg=dict(type='CosineAnnealingWarmupLR', - warmup_epochs=10, - eta_min=1e-5)), -] - -# fp16 = dict(mode=AMP_TYPE.TORCH, init_scale=2**6) - -logging = dict( - root_path= - f"./vit_3d_cifar10_bs{BATCH_SIZE}_lr{LEARNING_RATE}_clip_grad{clip_grad}" -) diff --git a/tests/test_models/test_vision_transformer/configs/vit_3d_imagenet.py b/tests/test_models/test_vision_transformer/configs/vit_3d_imagenet.py deleted file mode 100644 index 175c87f0acbc..000000000000 --- 
a/tests/test_models/test_vision_transformer/configs/vit_3d_imagenet.py +++ /dev/null @@ -1,119 +0,0 @@ -from colossalai.engine import AMP_TYPE -from colossalai.context import ParallelMode - -### VIT-S/16 -IMG_SIZE = 224 -PATCH_SIZE = 16 -EMBED_SIZE = 384 -HIDDEN_SIZE = 384 -MLP_RATIO = 4 -NUM_HEADS = 6 -NUM_CLASSES = 1000 -DROP_RATE = 0.1 -DEPTH = 12 -### - -# ### ViT-L/16 -# IMG_SIZE = 224 -# PATCH_SIZE = 16 -# EMBED_SIZE = 10240 -# HIDDEN_SIZE = 10240 -# MLP_RATIO = 4 -# NUM_HEADS = 64 -# NUM_CLASSES = 1000 -# DROP_RATE = 0.1 -# DEPTH = 64 -# ### - -BATCH_SIZE = 4096 - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=8, mode='3d'), -) - -optimizer = dict( - type='AdamW', - lr=3e-3, - weight_decay=0.3, -) - -loss = dict(type='CrossEntropyLoss3D', reduction=True) - -model = dict( - type='VisionTransformerFromConfig', - embedding_cfg=dict( - type='ViTPatchEmbedding3D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - in_chans=3, - embed_size=EMBED_SIZE, - drop_prob=DROP_RATE, - init_method='jax', - ), - block_cfg=dict( - type='ViTBlock', - norm_cfg=dict( - type='LayerNorm3D', - normalized_shape=HIDDEN_SIZE, - eps=1e-6, - ), - attention_cfg=dict(type='ViTSelfAttention3D', - hidden_size=HIDDEN_SIZE, - num_attention_heads=NUM_HEADS, - attention_probs_dropout_prob=0., - hidden_dropout_prob=DROP_RATE, - checkpoint=True, - init_method='jax'), - droppath_cfg=dict(type='VanillaViTDropPath', ), - mlp_cfg=dict(type='ViTMLP3D', - hidden_size=HIDDEN_SIZE, - mlp_ratio=4, - hidden_dropout_prob=DROP_RATE, - hidden_act='gelu', - checkpoint=True, - init_method='jax'), - ), - norm_cfg=dict(type='LayerNorm3D', normalized_shape=HIDDEN_SIZE, eps=1e-6), - head_cfg=dict( - type='ViTHead3D', - in_features=HIDDEN_SIZE, - num_classes=NUM_CLASSES, - init_method='jax', - ), - embed_dim=HIDDEN_SIZE, - depth=DEPTH, - drop_path_rate=0., -) - -clip_grad = 1.0 - -engine = dict( - schedule=None, - gradient_handlers=None, - gradient_accumulation=1, - gradient_clipping=clip_grad, -) - 
-num_epochs = 300 - -hooks = [ - dict(type='LogMetricByEpochHook'), - dict(type='LogMemoryByEpochHook'), - dict(type='LogTimingByEpochHook'), - dict(type='Accuracy3DHook', ), - dict(type='LossHook'), - dict(type='LRSchedulerHook', - by_epoch=True, - lr_scheduler_cfg=dict(type='CosineAnnealingWarmupLR', - warmup_steps=32, - eta_min=1e-5)), -] - -fp16 = dict(mode=AMP_TYPE.TORCH, ) - -logging = dict( - root_path= - f"./vit_3d_imagenet1k_bs{BATCH_SIZE}_{fp16['mode']}_clip_grad{clip_grad}") - -seed = 42 \ No newline at end of file diff --git a/tests/test_models/test_vision_transformer/configs/vit_vanilla.py b/tests/test_models/test_vision_transformer/configs/vit_vanilla.py deleted file mode 100644 index 7602fd0c8427..000000000000 --- a/tests/test_models/test_vision_transformer/configs/vit_vanilla.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch.nn as nn - -IMG_SIZE = 224 -DIM = 768 -NUM_CLASSES = 1000 -NUM_ATTN_HEADS = 12 - -model = dict( - type='VisionTransformerFromConfig', - embedding_cfg=dict( - type='VanillaViTPatchEmbedding', - img_size=IMG_SIZE, - patch_size=16, - in_chans=3, - embed_dim=DIM - ), - norm_cfg=dict( - type='LayerNorm', - eps=1e-6, - normalized_shape=DIM - ), - block_cfg=dict( - type='ViTBlock', - checkpoint=True, - attention_cfg=dict( - type='VanillaViTAttention', - dim=DIM, - num_heads=NUM_ATTN_HEADS, - qkv_bias=True, - attn_drop=0., - proj_drop=0. - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='VanillaViTMLP', - in_features=DIM, - hidden_features=DIM * 4, - act_layer=nn.GELU, - drop=0. 
- ), - norm_cfg=dict( - type='LayerNorm', - normalized_shape=DIM - ), - ), - head_cfg=dict( - type='VanillaViTHead', - in_features=DIM, - intermediate_features=DIM * 2, - out_features=NUM_CLASSES - ), - depth=12, - drop_path_rate=0., -) diff --git a/tests/test_models/test_vision_transformer/test.sh b/tests/test_models/test_vision_transformer/test.sh deleted file mode 100644 index 1c6012a5239f..000000000000 --- a/tests/test_models/test_vision_transformer/test.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env sh -test_file=$1 - -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 diff --git a/tests/test_models/test_vision_transformer/test_vit_1d/test_vit_1d.py b/tests/test_models/test_vision_transformer/test_vit_1d/test_vit_1d.py deleted file mode 100644 index 6e7cef3af264..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_1d/test_vit_1d.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.engine import Engine -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather -from colossalai.trainer import Trainer - -CONFIG_PATH = Path(__file__).parent.parent.joinpath('configs/vit_1d.py') - - -def eval(engine): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() - accumulated_loss += loss.detach().cpu().numpy() - if isinstance(output, (list, tuple)): - output = output[0] - if isinstance(label, (list, tuple)): - label = label[0] - output = torch.argmax(output, dim=-1) - correct = torch.sum(label == output) - correct_sum += correct - total_sum += label.size(0) - avg_loss = accumulated_loss / 
engine.schedule.num_steps - return correct_sum, total_sum, avg_loss - - -def train(engine): - engine.train() - accumulated_loss = 0 - - for i in range(engine.schedule.num_steps): - output, label, loss = engine.step() - accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / engine.schedule.num_steps - return avg_loss - - -@pytest.mark.dist -@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_1d_parallel_vision_transformer(): - # init dist - model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize( - CONFIG_PATH) - logger = get_global_dist_logger() - - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - - logger.info('start training') - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine) - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine) - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: {correct_sum / total_sum}') - -def train(): - model, train_dataloader, test_dataloader, criterion, \ - optimizer, schedule, lr_scheduler = colossalai.initialize(CONFIG_PATH) - - logger = get_global_dist_logger() - engine = Engine(model=model, - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - criterion=criterion, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - schedule=schedule) - logger.info("Engine is built", ranks=[0]) - - trainer = Trainer(engine=engine, hooks_cfg=gpc.config.hooks, verbose=True) - logger.info("Trainer is built", ranks=[0]) - - logger.info("Train start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - max_epochs=gpc.config.num_epochs, - 
display_progress=True, - test_interval=1) - -if __name__ == '__main__': - train() diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-3/acc-2D-lr1e-3.jpg b/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-3/acc-2D-lr1e-3.jpg deleted file mode 100644 index 541ef9c5515486f15edb946484fd95d2e2118f02..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 29576 zcmeFZ2Ut^U+BOtx<^Co}!DK{?<53c~`Pff5I9rRQ%^Iab~xY1cJyMs)tTN&`&qSLYWDjo7V~e_?EkIUf7NRoBE-W5b{>x? z1O{O-3leuh{?GBBZ39dejF^w2p0nnoZvk?stIg6-zHbO;zJx0VX=%EK*4}>RF%3Go zspw+svfewgLbRfeBFIWh+MbLWs*3IpSES8XJ#-C>GQ~K?9zsj9`H@{pJm8H+I&woe zkWE#Z?(I_&GkrU&`@cV{i?{Q&AP>CGK0h~z(Rys%802@kd7WpQ$Y2^2ag55>is5$y zI1o~Nst!u{i@~+a*5=M5KMuX3Xr8&pW2y78Bvw_ZTp|5Db}PyYGqy$+xvAB(7oiG# zB~D2IPMsea0SSo1q$n$j$yB%NO2FGYW+{3vgvKu23<}45DAi7L((a5_p^K)m1tit7 zjdl~E3IMJdEl*+E?=eY{ZgZ9DWWY}P9qDEi^f&0|Ys=m{h>hMCjmo&!kEVJ+Yei4Yd`%_^CBC>3Q6|bJUq7lfK=FyH)v>=61{1?0c4$ z!hsMA+~ipIn9g9scd?#*hmLX}r+}+rq{D2k#kr_}`1FOI2(7&A;06H){Jd9CMZ`sB zQH69a5#`N+jMvkHfi9X2o1Z~kzRiKiFpWvZR%d}a_t!1n1wQ4jMmPAUqwQ4o7U-Fr z(Y~I@Lo9dXF_ecuue-2^rz(ojNq$VOP{t`HEX}Q+W|Svi-3RS(P+sx#&`o$Y-~iQ+ zaxK6M>OdkAcOE;pSvDXJ=8GRQ>_kg=0#IVWcQ>@j$iNiDV3YN)|kgmUTinDbZhg}@27<9^=7-eKEt z{=NLk3I01#_meBNkY74+CBp$jI0o4Eo z%1rJ;;1jku8>*WZ2nghAL3&1Z;N`u4gnu$tu zjwf>mdW;{-I7mTjXR`9?%T?>}l@TRR>To<|opzAe$rLA(@XZ)tPueYUtE6eowAj~8 z3c#?yi#gpH`YRKg3CcD5n$g^>XDH%RB~7Y2MKXY) zfb8NxD(%f3=5;6N@b@^zr4mKxA^p^`rgm+wxA@^H(BM*Fyjf7)j(+#nL*!*@jUrh>qV z$Zfco5g$Vt)#8WsqNvdC(fprGTd+;SJzFB@Tgw9A4OY&^mo-&R=bXk4kn-$G{C7Oe z4%n`@lCzUe&fxm)*n%3zk&K$8X=>$+>rAd1KvezCatN@stSp?4JM&3*f0TB*zhCH% znc1F>_ovR93gun-6z8z&Xj1tKah%E*-vBQ}7>_W63rD}QRZ3R?k-Z4zTTd@{0) zDhk&oNfo}}Kw6Downr$IH4dYTlRUx_9-2Rz`FPHAr@MOorkl}*uaJ|_H_P$Q?r28Q zKLGYuMnrwYnrl(pzK?!PAC=8s?^x(?m>~4387uFzfO?yChLZ4!Fd-&S{%AP56E3Qw zDQ!KZD`+Kr@XSzkxZl%;ckj~OYhre*UaYV@Sn&5DM10EHSF7@ccbP!%2UqbW!l3fK!k&9%t+5r%D2k+= 
z%z<2krobybnc`Sd+}9}|4y2ql+*pe6S(Tu?9SY%}`DAop_Fd)aikgChYYlP!*M%X- zhlpdWm)NV|G(j2-m`EWUSX8v4wIlV7cRi3mil z93h3_u(50yK*1n~IS^qJ9Yi$iKH}hOUzjZ7>{T)WnhFyxZlq~8WvNj5U%NMCp06QX zeiu>Wt8WowCcek2G?(SUN^a125iU(X0lM zy8#kk>t{!Je2|acjljvhXBM}0Dq98U96Ujr*it7HETbZMi-ZRDs8d22CW!r&tU~NH z6reUnoo}u5^$R13$RZ8{7`{X`D(gNYFe3CaUFvKVf1{IGY|oW5JymhKbEqoyrRAqc zJc@b@JN~3vJ0;h#E#NLvpK6m}be87Zk!$@iU~4|c^*qTmZW~&8Sf(Ixi!%AznHSro ze7=hi*66*g2W-$ix9mjN5RJ`AIq}taabSx0aZ-|sy7fK9PQK@Cn_N?V(IZ1&jVOzU zMGtc?+|wH}bPQJFdMZ-;-8#U6rj_JLf{FNCcN_JQDxVmgbhc1BJe_`A^)Bv?gWVRD z65Z9#rA|h$dMbf_a|Gpu7YjFRVJiVvw2k@ZNVDqD2o~C(9@A-_G4}fTyT|!;*nB=>a@mlc)`(X(X8C&Hg6vTP-n1kbF&9legA(-d+f_4&^^X=CCgOIR z^Y2`{C0bKKe9>}9CH)Ar2r7xara{2R%N5_nUV=fq&{f(ARRLnp6J7!ER;XA=scFCG z{`&U;T1Dv@6CK?Cckkq^CEeE52PY!EzK+#^a))nX4?CiT<8Q0cqN^t|fNQ?6aqjk~ zjd_EZaia%_i3j?g-7fx}hq{)S#?$aI0{t9ylT*N*Ds4W5A+@~#=O^cU+_z}#`4GR@ zm20OcTxJm@EDS}=g>N&m#GqQVrE$7|X5LYR4rk8Jie3#bYGOum~5tK_LwN&+Cuwn69i`rt%j-4i zA3V|mqCf1Ptba7-SzDPM<$T=eg}Bajjkp8*WCb&uyU6N^OgJkG=4tiQ_*7z-x=Bzz zHesrK>UIcL2q%IsMyJ{U@h017%o- zRCGI@uoVX)i-gMng<--pp%%BM$cD-%L7`3v``m9E&K3#})Zf0zw7z3_ks7OtUeHqB zp8Qt6gr}<-oS-9=vs8Fz6Hhs-62xw3Zz=nprIEGQLfGVWTO3M`DK| z!Gdyeg`DAHcJE<*sj4@P{NjVd3B13?Kk243UX0v#7_+Ji71F);v;PyxjL zMj&55LB|(Z_H_0w{cLzjU(EE9W?5xdu;PS3dTZb8ttoi%cZjX2#Q*R({=z7IVYN#r zPVv&^RLFPEtp1 zISk7`>fYkN)8clMLYQd$y*QzzQJ%dvx{Fw98@vj`+Sy6J3A($mW;pZXC%(Qa$6ol~ zU+CjNgiIMHIgoxAl$Quo9l^_ieC&Zl%bOx*fAbRb@9**dr<oxLVHSzW-}~hZe@pg99GSxP63y-#~s-jWX2KH zv>{QO?cv3N+)-u~p%%1YReWsmCTv zWoN?TC`tsk#?TbEMY@U-;l%^fZ3j-u-c8|x?P_Hju%4hir*1<b^?;bDM7%2 z(ZdM1gD{*AP5O_=Z&)g$qOk%Qvy>4-45c zN%SlZP<#e_QdA6+_k>^HoX1W=wuMW-)I^A)8g$FJwuS_i6UzVj9%?V588I@u-ncE^3uW)1`=itNXJ&w-Bfa3K6sT&UH9=FI)l z6b%l0F+GKihgNAX(TD*`s4I2R+H7$c|Aj@gyL)mK z+&h6;as7a!B5!yOQ-Y7K3-mFrgom}lEA->{B9*+RAoD{ZzSxAo&$^xT#-8j)D9M>f z>$t}^I1rCie(UN6zk{dBeP7Sw4;flrl4bcR?iagP-e|lh&N!Ik0>(t>rvcrtqABKf)&mno zz%F{LZ&%fY27O|wp!C2@{Ryk$n=N7wa<3%n-g_++qa%E;P|5H2Tcmg4;?sECRNOo9 zaOdfqTRH(-z0;xs!^dyL7PWhF)xFJnp69M`vUJW;qahXOuCElcRC^JAmRQ?@=MG

-Lu($WMRdb$K%vM&`F>a@yaw z9#21VFY-OABCHc5J{C?OqnBE!%N@DBk7TS39ci91uXN>0?Kj;k?+z4y2eCY4cUX67 zbaZs##=eEztB)OqhiQ8q=u|UNWm6e@dxc#fnMve=8~*;``Q0yKsuTLN)mg><-@k&) z#aa4AAdI#(TIsM#zXuHu7QnN@QsAMKc&PXToW!w~z$b;qyb-m(HP}&qKPiqV#S-D8 zwe-UPoVwZu-O?n7?7|8$4X0sJX6`hbJ38823-sI@6>&mW{V4@;m*4X8+o?LNpEfuL z-GaT8Hu{K;qR9a`qW)4CwX((F-3$96BNZycdcgmP$|in4qFKwP)aurMrM`jV=%r-_ z>?=}fgsJk1DM$^S%px@+jf30eu8!Q%RIMG>wHW71-B{F1E^Gf#T~^BfOo&K0b{t6w zoI(BW*%hVTz8nJuy`~pzGZu$VLv2B5lTjj8feI^LCfc7gs}0q&w%Q%%{a_$vXDOVV zOmhLjw#X=%ew_6%)R_`S_h)4_$umuw1}VeBCs&)(TgL?YuC5d}E~M z?33q(-dgAqr%ogBjz0XR_p+*HGJBe-M>oZYIIRr9=q3K2TmRa7 z|HB4JT&3?~6;YOE-`W5g@trGndKX3F5xmK_&>K=-JPUg?+Cg~z7%pKHMZ>cBe^3p{ zuyz|!6AURpim6KdR!LHHunI5G536;$CT_jkBFk3aV(CV~%VZ)%WBi}9@g=%8>zgM~ zBYYglCu^HCKIj$Uw;af85f0>OE@Zuy%^zLj>J6uwFORL+m1kc0gv6Ujg`;K1ai%pb zbB%K&&2p!*`?`W(`t-klFZh1Efmq(@vKfNpZs1|_qdAaW*8y-odQhfOtOEqcXY5ZL z2z4Ch`3W_hj+%DzTX_z`omjRMl(72PbS4_Tgr*aa90+~?WDTxwqut&ncR%)mq4uLn zW^$lp6UKCK_I_Fa9?FIW2Jik3WEP|HIS~A}QVt|GDUGd_1~|ThGTDn*;1E~`(${%3 zFtN;XhO-?@K<=WK0~xC3KxQKwwU^R15wz^;HwVj-#d-*$ z5^4GsC}C}Dzh?hFfLKU8U{8Y;px z0Tv>r@$!T(QxfVl;m1nC!yj)rq?E#q8%rAM2Lk5G-Gg4XNPNVzKPef}ikH%e*t?TDgUB6UmA11=n-2NN72|}tJITDuv1L__LZhF1ikvK8wm%H{ z>ewIh<#n#cl~uL+xca(?!c+GSZ=`4v>vWxMamS; z@Oj5OKlaEmcXHD81kQN2?U=W_n4hVc=kCUvFbC|k+ZWU-<0UxBNMsHqsgN!CEuI5` zy~D7#YQcbav}Is-Ku3x=v3RLGG`iF|Jl8i(3@AKK9Zr+J&2&Q;hpdfkWHmMjj6dDJ zK$X5&F`@mjJcmDLj|Q=fhmK>FnFt~jSQTsu?`?@7U`R#=hTcKKADl(@hcdet1 z8T*)uz$L2Z>7fwqJ17OjRr0Ra?4IBdBTt$hqixkGBc+<0O^d)@@G@TQgzE zjY|JCO!c1m~?e>QY*~ zd}qSXq>z9`-#jAHls4!BwXTUdj=tvcHUNl{Zb6GHVmCEyMz{cXu9GY&;8#J(v`F0-CXXPgZ;kvO;4tn{TIUniqh zSx=oHTkzcSv7JjluJ`kxGaM0-Lz1YR$gZk$zZ=ImybJrz1w%D{ow(lFj;1;UjB2;8 zp~h~Y&@XhIGAizKI+X0m-gpJZi7~l8O3E-V;1YwM88WhzlRan?Yx+vd z*j;;v_IKit`HtS5N}k7TU1z;e3qc8DB%$mBv-V{5as5XeNSDgN#giJdcpxI+?zOrR z7%>l4b!vO`Ts44H=b?$+vf@BQPE%u;8SD;p)gJ5d&um%6dFm={V+eK8gS8vpe+HFx z3^lCqsLVOCqky@9wzY+8f%vco69AhL1ao-|R^d0UFa1otl|i;iphOoe06HGqUc3b5 z2Z=zKn3lHu&B^^uHGSJ{f>yrqEp!g2Eh+RN&e;BVqRWBQ$)H}zS^*xj3j$zJ4Y6u3z}TP 
zcw#fFu!sXm!vMQc?c!f&k6uH6Xv7j>t;^zw?EnfIYvfD|bEP$sT&)iJHYs1#*LrhtvYn8& zLF7O-_+KmvpJgAdh5i6pwD00Bd{6s$xPwYtZ^rm859-Yb1 zNaG%BpO*J?Gd^pbem*)Q4Ey$7R`IEWtS9c5yJel|HsfScd{ToW*qO#yAd9Bb895c# zU_-*j8t4;#7;_J@szaks3R=Xy+ta-nr#sY2C||w2rE>q|tuTm4h0m|+1f&4j8UbAk#ayi{R*|X8MLYhp?JG~wmeglChju{9kI?UmT=5%{MaE6*Ekcmtof9m ze%a0Nj(hN7JC%)jdNK`8NuOSz5LpkKc$tT2{%FnH#rAcy+IB$-w{Jx7$}3xf*9hsV zMwdtUdm}$%FPfl@gg_jbpM~($pOvkXnoL7D5axDblLW+olG&0e!>}W1lX{9 zrKHpOcRV6W-|j!>pImHKy$NYgxd&*UtxE8F)euo?E+$-FxW(mVYOJQLI_;_OhK2}R z)Hs-(xV4l6L6(7_;xm+*@D=_2Oemc3BY*?x_`(yEhyxLW4H$Y9#DZpVsuac6H)kqG z&a)&G9oZ}}^G$mP!O)zEQ?1Zcwn)zOa;Hd~n>;P=mQktd1~l=zVYFGVxeLYP&a@)G z>><@F9+gWdkEt<&6>v0=VZ4BaFeL+s^4*@b^;9DzsVZxwaFibIn?G} z*)0$kXE%I)e5Ys$7X*G^NXR}~obJJT#MCXo++iMOm;fp^137LLBT}v1t=WM`bggu5 zX==ypy@VX#b|UOO>yEfQx&759-xMrx;s<_XmjQ;gQOkjlLFcCa8r64n2(>(fX=?(!{*vUYVSJk^gn$nqziY`ix%JrH2<(U2a})PQG;>CJbUt z{MZO_@=;VFVh6xS#0ZQn8eB6#=KF#~$oTorP*45`xBz9GTzrCXd&_yko+iXwo+wYx z|F+v61D*_Zb~lW-N-?+<%LntoK*Fu8Xa*FT)}v|k=Fw5-MV#+zE%%t67dpmNp@;kx z8}WbQpDsjq5lf0^i2?I{$a2 z;4)I@Kz87_aUi4isPB1uKEG6ge$r&Zd{y^Q4&ZA+3oJL7t>}U6GGemd*eFa`4yXvu(!uThVxFNp+;oVA%z-6aTxRJKOb6;9M z*c4b}>(t!gqVcT*p&Hg^8l~Raja8BMk}C~x>e+fqmy(p$72_?E`A&){s%-sWdiqxE# z0I`Kx)bfPIqwHbb#dh|^12>5KWl)0HOZdA`IV0bzjH6(Yk0VUINl%bt%w-XTQThB4 z>Rbd*w`$Uhcu4&>mt>Dq8UjUGQCp&}MTJcFIZvVQU#48`>$NlunTr&6)}#MW6sp@( z`Bj7cP5SF@jn6;MBn3rBI6N~5V@lRf@4b;k!pI@(bNMLVVG99$Ud6(Hm`VV$<8m24}OWBombe5IJZjw%`yz`D% zlsyNF=Vf287JXobmJ!^AZ4#1Xrzh~;M{(b-->;&CWA#3YjGNPc0EP^huPs8q-*36W zj|P(Fv;7S`)4Q4mqBYX94p83Lp1PR9f#^+CltO%amQYvFEk0#y?DRq9 z9(Sr|OYKtj3{RF^%DCc)$iSX>ukDgoW_<}*&$KZ$dg25%WtaIGSJ%Vd z)<5>#p%n4(*)BshN@0m?7v|(kb|3Ddv=&?nbki`^92o0ebqdVB2ezz?s@#7ecJAY| z$Ef*1`56wRb~59|m-S1m;pu%Qv-98y{*q}yB@w6h@3op+VCNFfcrN~ z-OnKlrjj%EWljxsFD<*){Az)kD2bB3d?hDaTh|OrB>$E#89P!>2))|(!R)9c4#3ba zvT~WBA=*q|hDo@J;}e%dXgg}zvt_g^URHOa&(vV3q<$bXX~hp|&FiG&+cBj4tf3#3 z>tTMI$wh>5jkDsLC;9tz&hJlCw;Z@!b1L$(E_d~pg6DSkXM5Dc<}q_Fm$sN^@9FNg 
z`Oso2aqj-FoxS8gfe}ZPxIm{3BB%TI$Q~5^Xv0np2|3ieXq(S@2=Ksu4U=N_S!KOVCzJS5ICX8lWV`|AK-|@JtrfTm^DSP$C zv#QJ5uKQdstj|iMsF_YSYw=5d0@CP5SdSVmnkbu+mg5|ps-SM)AcmI z1S8j%zVB|9r*s!R4l{Vt)AoiCZ+Kg@qVwL?iCbxBhhQ`EV8YG?%7Gl!H9qsFyfD=W&6~5tFjmay_wQQRTua5CuKWDsULEWn=JExjU5p2hPwd9O zf*qZnh~C;lZ*;Uxe|zK}X4f;SB0V-{vyJ3jWPCFg*fW{$!Inc2 zoSPTC!#6n^d{o*j3U@ZM`AB`qUg}CFtA-L|V~n97asahd?2kbc;pn54^&?F87gwvX zia;)^dNHS8e@e}P192#3ytsKpK1noJ<@N_^()eYKJ&pONH$b_TUE-`;qFuD+TPxW| z?$AnHUr2p#Q5DG1s;GZtxm-7q;R znh3vZuNj5QQ=*;D8@EaLF0XmAkIgD$y|k8(wr|<|y3g&hzt+NHWZ7;F-H;%-2%C#3 z4A{Jy1mdT|TBd};gBW&4a@Hc1;sU%?7q=d(JdtRDKeUh+IpS<8X}@QvOv@)DD4O8M zqSTlgoGZa)KR)MpB?BI0iR1!}h1`D|c>ij0_(aB@<3R5Db+WFbSeria-9)vhpk^iE zm2lR*0Dja$-f+_$Odjg*=g%4NQUCb(Yi=>`>8~fJ4i^#n8&j_oWbjl2m<6SDUk;=~ zcv_ovaD3#`;J=sZcNu9%C8Gq9o5PQxln`#T$w$*HwXJf?c5k;RmOr_RjWjn-=W-vF zTTcy0MpvVl63QNZSdT-xB5_!OtCw?7;-OK;a+_+UkLir^Z&o#auBtShb-yE8LFocq zn$Nykz$&8K0<&}2jw17%B~|<=rDq>-_@(9Hp{Qd9UF%KhX%Uu1i{HV~p2=!75}^jS z?`KK^XQz|!cc&z@?+`?IfWX+sy3UR6t~(CH`?jdoe&pJ86T5E)seo+FGX_S;@~~7e zZn71T90%&1YchL3H7PEvD=AUg5uNufrR{FrbX_UVKLB}UjG|v-Wg@q*KOm(tacF*o z=B$hBlZ~c&1RI(E((3U;Ep?UP3`;|^ZDKb{dLpeu@cZ*)VvBOrFoh-T!(p==2s$u2 zbIzBPb38-#Dk5aarR#;Wy)|m9aUFJkCMQDc99t$-UT7boHJeV`8(0u`!nFC!Aqdx} zvn2%$XFm(`um858`Y2lZ zY^?l)X{y%#tl7ix?ENnNS+y0e(hlQ^iyKgDV3nkPvq=i@N69cPY1t2-E;!)U#_yCx z-aAqJW@3ih=Hg-F(z$47`gfp?VdWDHK-v65)P#nk+!mJ_fkM)kot2;V+L-Ov;MzHm z_N`=-g5vAvHAZ|tmD)f6igeyY4LOY7ri)FU++?Wt<_F6U@tk^2d3T!0%< zV8hCKAcP{ zjGRYq@RRIpl{MQEV}0%eJiN!@mIbO_Dk_so*7i%lJuppe*YKAUxG5kqnC?Vkc=U|G$_NyS1F3PP+?2!60NR|=No!zipV!# z3E;S+zxeUXY9Xm$e%H^&iYLcL;)zpQMly1DW217@Ald@sc=bla17@#`8ci<+JgTTk~hb)7l(n^WV}MKl=3)RbF@K zvsgz1O>YH%tldVaOs8>$U&gf{1wWOA^r)Yzwu*+?RNPfPbD~h-W#Q9jR8%Ze6D&^a zuo6G@bgXaDf`RKx=Y@^v$48Z8cFG7mMP&}II-PT;eHAM-OODuun@*eGZBiNS))iG= zfC@E@ z{up@32x6XL6&84d8gqB^$|}I|wRMT5Waq7GBx*_xM(M@#_peO445ubsPYyhnW%rhs zUrjEHMpL7X^r3v=qhZZ8@M?2hBeabM5T+*+qRFtWK$=D3T>s*tvsF}ropXdlAs4Kt z>NS_`!a24vs>KlNjvfyn@iF%$CUuqr5H3^~(axt-vK>8Z^9~go;BL?#!zrW8Z)sXd9WE)gldduVv^qy&pf8 
z%}`!P1-(EwjQM-)!+yoi zuHG98skvEz^OY0#Cw%as(hq`8MTRJVhR^d~W-5J40Y&QZ>9D2G)SP30Jk_zav>@CW z=WuMmd_Zc^`{Vv^d#$Vt8!GA#9@NP=Xjm!bp_=Kc_G>cv4|SZs47fdt$Ira!zos^% z>pbdLR#xEcujRj!U%{eq@5jzK$WKSM%d+;0@Uz!lrL8D$JfXN{iic?#K=qxjjSE0- zXPy}nXYX|5rLGIBM#zeN{&1kC9CBQ2g8bO8Tf~?K=D%pQ^j@0oatj(zpLT`ehASVz zC78a0WtNpyY^H&aLiBSmwRQBK*{5#JC~Z)Jz8$fHb-%$d(4A-{pF{~bhcawKY7Spr z{h{d=6sD7TDx!mAaWrmIW=vc8no11rYP$XY#nsPtr}15bzUMLDFws9|obrUqM_=`y z&U2&Kc-og@bghlmIFJTb`@oYg5stWkQO8cSoY7TJF!B|El1kyyzBS;&Bbdmw14hQv zODobJK;q{}v)20APwNjHpl3nzc;d}bxdP?56Gv}84T0Q{Sk%CGxYG_TsM~(2O17)I zTyLgf3e?v7<)sR0<1TI=+WD&b{-cIxj1M-BvlY&&_iRliJ8s1N+j+8I<=6gvXYe0` z^rlf!SD#<7E?LwMbS4nm`fZF>Hd*wUT>!Ix5PnrEBwuY*FQZ;%xVlUQ7im0l*XZa= z=h4yggod@+oZu~_7*)HE-+FGogs5gVb2sTHLP1y4;@A+nvu$-T7rHe6)U7JeR3gJ^ z0@(Pb*K*lRc5ixoxhe|9I`-87orootLbpAq#>XvdhoHx%a8B7hz7Dr?WgEgG#66d& zwY=B*;y>b+MbVA5-3)Quq|&E`olnO=h5CkR4f^>4y(_@o-E&LoE@e}JlEg#ypj-^| zbYMpus50G&VM?IcGhlGTi&@5o&AS3IpuF$_7=2NL;fw?rQ)XmEI}*wV=(nOp8uni$ zVtK-Y@Rnfydy}5Q4YYsxac+@m`!kRqYpR(}=WN|J747c&aGHWXF4G}un^{8zFO$RCxa0@GLjA+RLb%>IO}nrg8b3vP+;;6P-V zz5zvqIt_OZY(qWv9;$oS4tTD^Oo)ujY+|bKeED=5L7FYF9oa`-&;=D$e6J6u;pi3D zXxew5!d5{Dure>c^+obM3*{sBoyE7i_c#_&7Q4j}q}bN9lFxoqQCr z*Y34vkaWwd3isQu*Fq~{6g0!T^$M%PM5Ud&);w}T8n{5zxVNSs=xO50;y?^5lMk5J z$tZD)!jB?^SdZ9z>>i}F(l%fxb;|ma<`$?|)cIPQ5OcF!kB^!9_2r#Dl&^&hE5Ta> z+Qr@|eAgI$SmQiyrf4U#>+#gsg9c<>id82&x|J6HEH4xNWa2Uhl2v%HrAe5I!QlnG z*?d&9SihnLIs2E+-XQ$Br1BT|WAbb!&)U1sG`!+|vHEVqnyA3!OOi6Bsvvq+j`=+p z=J$T*PsTdGFo1tG)==vd&2b2df>o zi%yiTBg+{Y>@JuP>Z%BzbwT)He_c&q`y7hNE}F+Hwts$^tC*FQPQ1E&lH{rBAe+4x z9!(V8&d)w;Pb+VjEH7F=S$w{;D&#%~Qm+V)b(Fo^v&YC}v&$Q&(=UfVlWm?A^>v@e zG5Y%gQI(h_zG*xJs53EU_nBzD1s>AuLLDEqjg#)w4#;D&F8Q0i7q#4R9`gC*l8{5w zor1*G!0?IF2AHZcjmDf4IajhWtO_i(KYpi%D`;`&ROZWTPB~0Ds(qW8av-37de0A` ze~9<|TMzsvdQ)T|rgNEKjlFabjAz+4Am1GWtMUP&*rI6QLRrzIu*4!L$!)a+XIc>1bI#Gr;agj6Q zx6@t1rFylu5PWEhxz`eLOPP(V_Q5iPOX#Jq@>1tNj1(h3W2^8KVIl_DP6tf^5JXzR zlhlf+nYdbmte&jS(Qlesy3jYrqjiGfU>+Gdb#FuOi+ny1&oc|IJ^L%&0$c>y3>PA? 
zQefAWwUHYftUOq?l<{fVDgY!s1_>@#1qXOp3)?b&}0t+`DDGF zm#A1=_ra1w7Xuk%nINxVr2p|_<8VViVVW|}lqjXA0y%y(aS|+{05AqUf^$4B7asRr z(Y{PmmvT&^XVq25J)|V{!)BT5AC9^xuxzMl;9_tN_9G`oLHI#qej z`|g*1*i;sFJjkXoDzkOA&M#Xdx5!G3*f`p;pi`uI<#X=4M2E(@*WugR=OT(G#j~Gi zR26GS+EoVn@9W>~YIwLd3ZC7Gr(OV;LSnsS*q=i&;qpX}q@ZBoXHqD__{4tn`WHA$ z{t)|HhhnpGaRtb**(3JQQYIqt6muGyB;Ed~{rcQg8(S?~e$%B%4d0-q^LXmAa?`Vd z%7{6=ya!k1G#KgNfFuUwg0co~v(g>SgCi2N7r+!!*I{iYS5OOzxOd(}8cf2&saCBo zi0IpSJpD}LBGo%K<=zfSVmh}WuI1fz5blqbhZJb8iV}`l2AQS$hieF*Yj3cejUz)A z`eSa3SzgPia7(xVk{f5v|4~ThZ}0ObBOBzEuW(SU-Mj4#2>Rm?C!!`^p1W)5&)AmIt49*vvAAnP~6x|J0HTXPxZosi7=F?cGAmJoM$eSMI zt|7PmdJrK^cgpsNk`2QEi33+DYA+R)E-EPl>O{eM>$B(H#M-xmHt)ROnR?0Elc%?q) znT<6o#$=NVyBA%NcDqT*cmJW`XXWgEnrg=5_ZumFtHqNxd+pKPU5hG`OFFioWZ3kS zn_B21;;bL`5tb=(BK4!1T(*dyfCO7cHKzbfjm0ntAm9GR;GYuTzZXdTv+w=y1X^NOt?+!FilU%xqfcHVsXhO>D=bj=dtiDL_gqr7XVb4k28xut0arbp*anwNGp;_Kz;;4+Xal`rU}R_Ul1sr&5p zL*_GhF$}{aLICA)J7Nl$4{t#f6l>=iZ0ss%aR1q}|9Up%U;WMh&g{#25`78iA!B93 zA6c?uv7W-`aAdS1;UJLY6t5cYezbS&+eBKuN^*fvY*&keSJ&5O;-Y)`p8KxZWwlk8 ztM8*T;87J$Re@w$6;;O5;pFm(g4+9e>4~cW-y8!@^rCQ14Pd4LOX)6V9;1Hu)o#V! 
zava(3y^+3N2<{IubL16GDMj(^_y;m@Q$<4?>Nc{QdYp*A2?7UvT%>l(@|}0*tNOkh z?&p2?SUT%nL&dhu3yR%_N=u!^Cl|k7>z1|>ci2}@b~1N1CTg5g!+|uaO`u;ot0-Xj z90kqJzFu}HoO|T#*0l@st$nBDWY6;Uf<%p8WAOaX7j*m)*Z<5==fBiwdwwknJv;bo ze&MDOxO@b3ntvy8MXZ7z^f$CDzf2M%JiVIr9HGhB#gxGUXpB=eYt>*ZJotF5nxC_6 zjv@u#IpbzZQWSIWlUTskGJL0#l}MiDDx8HPc7TG?3_g zs6oZAwDYBRg@#DQ>9sX!yQ+mdsLEH@F1TyJ%@L;nLz*^QUc+yl7 zjQ9I!wV#!3g=b>##!3gjdRW3fi>>$-p~yg2OeY3V4_gXmSTn(ZIPM7zLtI?kXsD5* z8x$VJ_((Zy3_TOmc|-7V7_Jp!1Ft*XOvBGw&0_O-*BJf@4??3Q4n zrnnFI;431wsF&fYmv=YYPq5{BibvmG?>ja7#m-5ot{#CJp<0X)D*4DRbTtOd&0kJA z4z9+LQiO)0<3=Urm6%yEU0jr*$;T?&_`uK4XZG1-1( zBKXV3uPD5KZ;F{XquCm8V6Tp7S}vgl&U0aP= z-4EOLz(4*}z}7EPA`7z@Dou*Ig2(?Op6{op{NLyOdT8jo6s97P<=j5C0klBw#wqYe2j#Mn6fMb}l z%ZL?BplGd~@!Cm^QH?nfK!ftRePqcQMcwGI%TYQ&JDpVEoVDm=^`)qJvllwTOsW&*N%sgU%%g}0~z^33@LFS~(;Z#{sV!*oDV?O`z{k}!jC^A@)qBQt4l7$*gF-9Z}VXj-6K zhxs@16ZlM=W%S@o4p}w44n&4G&tdaI$ZLa1sM{BQ?3A!|J4^u?k$>+LPi!W1#>8LkKiz#Ie%Sn3Gn!VARyVIxO!HwO~$27Q-#v_C|!d&JL; zEqKmbt+qVTDy`B~XYK;?!KCS(-Fhnf?6*U${%R~>^+0MXiERv6c``!S{V0)!-6S*= zNOBojGE#s26}gLs##x7%J~}_hdmne^-NR79Lk`^`Ydr6(gV1m7&r6bSW-%Q9<($74 zPujmGb!&14LO}zMW=al_wC{@a8p%>KPksVP#XP^T>)L@!7WlbeMjc%Q6v|@g&sfi! zG@xh-O0+D5OPdv|d1@z0WS?VscGQDYu5N>aDC*%Bq-CC#H{+wn)||*xIKX?jd(~?r z0~Dkv&}04?H}FSa`;)l_o_6HFts-gUX_?|Rkp*~2>PaRVfE6yq8x#Kdm2=w&Iv~&V zo!$*F0<0m3*QIVtN4!lqqwg>g@%m*VWEP=LwV6)4#a0Ju$=Y$ugLJqUun_Wt=1@c* zLsMEmNZlD(^Pf9ZW445?xBvXOh8*jv^hXk4T-${*ZyMyT2ebJVw$?IqzD`Nr?V?S! 
zeMZ4oVe`cggJ-^RAn&0?QD}-FNYVq290-INKrYw96#(XTO%C|ulCLsEhJ1U$sqwIB z8%X1;&;YOK5^2^Yd!8SW~S8;sOTHU<*YO*`D#bVWax^(a4 zb7iZ~Gq>GW?$2tsw}ETPR?{qg|BABziuM29-~8bm!9NTu{yGehToMWO)T|2HwkEa2s+!yUP|IPDdTw`7vaDY)AP#eb3F%AUW^A8KAe&%Y@e)GRKO;Y|t z57`UsBx?g2+8s6ta4EKuHpPm{a~dXSgbJ9>u7_2AtJJ1H%sJX0imLJh?bi#$W0X&X z;equP{crMgOd+cEXW$C9eBT!~9<+cd8{k>DitD?dijg4+Lm)s1Q52bpBtl4N zL4`mLC?3t=JHA`5%3E!Jc(1IwRqsnGwLk1#=j^@q+H39g|DCyfxg`>`L@SKRQ(vDq zLkLaQTp$Do`A+M0q+1VW+duU756wSx5ZVM9K?sLtG#qpQEJYUVc11Z67HJ`-kf)t( z7-92g21Aw4Hy?MTQ|Gf-(p-nzlqr@)!~tWXf*UailkNS=?a?R8^(jiS`#k9jtPMX% z`cwg7G5*$81pgNj2rBh$ftJ9V_atZ)$=wvtZvh`I`ffAqUmTpjclJXEsGk&Z<@$wu zz9$)shHO;9tbG=yG-Uf5Hg_qsXw4AjeFHSzRmy%yTf$=F`8aeW>Ff#}h;(j#UF=#+ z8T#m7XTR|C3rmU#G)`OpBiNh1o&BE8wuWm75@R~PY!C2p2?*CCM z_#t*YM;VH1$4GrKd_NJr1kgf9V$QDE)W;-=shWyw%C7oO?v3~uDDCOtfMAK%|C%g3 zI!8Y;HFqlZOhbk$KjGg5J&<(l`|l~7+u8+MTD8Grzrg7E%`hYN95hx7yVV5w-oASg z^?L(&0&m)32zy4FcEF zA+`%gIGy8`;PHowuhj(=o~v}<7`Bzdw-8;;2Ox>$6$uojX+^$W{kXqEzkAtLw@V(w zMu)^X+qB1$ZXX(p4-faPR#v#{g!!7XX8G>eSLplMCqBt@fXy*#w{z2E^}UG5Gl#1y?G ztw$BcuSRgXoqD*_*-!NIEN|Yat>4jX;CbOX66K*faPS*wi2h$A&Wc)A1o|ST#vvUY zjz8;wePFlW9DnAd*nn&HD)p71A6h#~4?o?a>FXISfVSR9kM zTN;snwBnaz!lr42$K9I@(z4*g6OFJ_@!!0>-@d?*5ZWP)Qf-|U6^!7Wg+z4+AF{_rs z$*d!kWyC&|mZHen5TLd>e1X-FQ&DEbZkDu+w)I1n1KaP; zW!2!h@N^OlD(6OL4+#blXfiW?7Rav4vHMX-x`2UnVVL9w_bt0$cZl0>UJ77bLqN>mt>> zxWSd?KOO$F8)o5O!?Wdn*=`$QUH^-yn56*k-|P^r~d#R z6YZPVvpU%{jJLpbtD!}a%3vpfD+DqSSphOyp$tcQ zoM`ON$iW%iw;{)}arvz&hfQ^YZv1lplb9AN2tD?jmVgSbrA%4hlbcD&l6|Ue?zO!{ z3(h=7H5i9d8;JdIo9GngbVJ-%RC8v>?(~n-&FqZx)%M+=P&8@Em;oC}hE5spA#Ltr zCN5lz)SMh_BIZ7Lxq0)8bC));VNc$@d-A5)#?j`n<~{gRJZN6pt}!#j+pfrEMMB#0 zU%Q<(q=N#H1(|>>N~&Y-G#kHF`pzKWMRtnH$G)dg3vs=#xY#m5an3v8F`x128*Z$4 zNYHF6`2|1J1?A&)baylRm@ZHhm(^qoV9rUbr$+cYi!qaax&g0}ZD2iSt$FpkR~hB2 zpP2Jj?eG8QRb#NyImR0*CVZ~nMpq1PQbuRo4M@*8;Amk>%((`v|zkBHcA)J(lxcT`iiTSo`_pY?~ zbz*Wg;GW>f$O25N(i(rM{`yc-P!h%)Wy&OHe+iWN&`5GR#G5Iq8tJ)QCwboxm}yn<>~2}XA&;l-3o?B<@GK#HW-3Lt 
zYGZCO!3((469FvB9jU|K1xc-r7fa&iU03By?k^PVr5AXnoG=xX)ZN^_Y1XK#wg5!T zS6^l$RldL=-9Lj%sX394LLn)=?%Mig93ITA1j#BrG=npXl0ezyTUb|pNQ3QiY z&sE9xLX@w6^QNPtc{Ek6V#O!6atT~I!$qsJMhk>$&f0)M>&F7)a=gfATQ_a_mBba{ z%sEe&#(NpbS>n{g#mBqWdY89bzWLu^g8mOl9crggo756PbdJPmJ9~*KX%zJmo*XH{ zXYva5$<2IhbJvQ~NIRp%LwU1}MlUvczWACzhO z`|7KkE(ioDYod^{l4&H12_|R_Fi%JYB5dm2rf!imuM}`d+1&n<{|Nw6u1jTytOT!DfIJag;soHE9W64M zKGqpb_fe$U8(hV06J^mo4*G z`l)yV-lb9UxUk*xwsF^QNA?IMOW7vLmJlUnn@X}JDKa!nktN%NP{ha*5@RRZ*eOY} zuVY4~ESW*$%53k|v;5zC`MuBoe(wACc|Py^e*VKbGuL&^oY#4r=W!nAaeTkWIjl+6 z0z~k%xs^GDjg1X*4g7ky8; zoI(eVnCup|zrrOKxKHar$_sA!lT{re4#Q-HqgQW7^6-j^?H89&R8m$ssH&}_t9R_U zzUe75a|=tW)921RUU0g2$=TzY=XEb{AKyDc!FNNDpu4oJxR~VeEKXa zzo4+_WpPRAtLmECy84F3H%*;g-95d1{cqoW8W|lMpO~DQCXyCEfBCxf?fWuiV{_{# zbsPA#^II-92>ZW_1wQ|~V1FamUQn)GzsG>r@}jDPNB*b-S>)>NVO~)MZK5LOw`l(+**_*&T4PJcN~_a2{KIXhbM ziTeOo=X8GLr{lbD6a?%XI3eL_B=kO{SBJ4oD6N8i8M)mQ&z>D>$}wMu2c zN+DISB=7d6xLt2pkSIbeS37zd)|H0rTHxv(;s%y18ya=DC2&zKR(_>3W-8)GYU@Ag zmqARXZT(0?z#&2p<~~-qO$4a#})(SYEpEC=23YwN?*FW4)Tan5N6kd4~l#~kTIu4kadR8`iKo{;WQ=Gb;{sU7c>gI4YUg<0%Y1c z>Bq*l=q&p2t*l&*6}T43<9uowdE7pED@$w*O;ZF4Nbd=3hB`ERl>R6xI6yna6{*qJ zJ1*=yY_(r;`1PlATGi24OAuQy9?b%r8&W*j9_=8Q51 z@MQOQYSMmIHJ@rJ3Yik+c|PO&e_VBo`O0@6b7cD5lE}M=e&@_6{6bEIA;3w~`__bA zouh0M35kZt9j>l`IAhNEdAPJsjcZ3pP*V}fd_s_rpWhN9R{55lP37*n#!?7FolyYy z){I6}W7~)JMGRvl>6T4)(T1=>tCAI`Uai%tVz#|pn(|fWPf67@m3*EUa_40l|ynQ7xd0hYNJVFtzX=P% zNdnv`T%-oN5DU^bjqXD5xFh3P5R=g$O(9p{W~aQuba_VR>nXzAOum3yqUc6-9!HJr zSuL}$+y~&d6&aBz8*;1#uu>Q?Va%4+rN86Cf?S)23A;SfP<;NX?}qH$S4Lhl$A<2& z7aK*UyHwc6w4sk+0(7TRzb3utmV5cjPPt-jqV7tyBR?&#XIj2FTu9<8GFO^6-IwyV zs4JwcZXAUFS@z*t{1E}N?rXkAVBmGE-KLu(>QN83R@#L0W@XH~h zk@|rw?%9~Agsil5(CJ)wOt2)B3M%|}1I zX&@j_5f{DG^w2UXJy_UOQslD9OeD89+s+snLw(H1g?qCFMR1^yWUr)%9>0J)_gE0_ z%tvcFFgc$0WgIwaD9qRL@>6*!t_`e@fvvi7pIxK_65XA92(t)_b~#PHSzwGMG0q@k zCTG5d+>$=Qzy+fHHZZcV0Mq69f!x z$U_J+7v{#+N>F{h^^A&!&f1=Ic}VD|0fA!gVyQZk!0jWO*tG*>u|-%r8c@cr#ulS4 zQN*Hxyjy%IeN17*G`ky+P5e~{gIbVmeCKKQ4f)bS?EV+mUDfg+csm92Iv4gVV`& 
zbYTMZ?gUC0Y(In&(pMfpZDi!u%~^EAc=JB}B*bFb=&C`%Zw@00Gr|TxbDe9}pcFQU z4jgoxO;i2+uqWJrW{cdU4w29As50N8YcP?haOroh(!q2DMI!(zDS8f;D!13CAvc0P z(3mDOn6TY|%9F{#Ej#J!M&B!&@)47GIG&MeBNF|Rp!+4=nma9!{m$OnYY$Sj%uq=z z2vm_#Y%C1W+C>rdv2@|_bs{#ZO%eH_uQLo9uMJwOafzS3*=2+y*86qTtdVD33O_+L z2V?vmT6prGxa_y`qLfcsn1<-a1bBR&4xLz?kbl?E`Q!Gr%LTrhn}Wy0NqD&7UP@Uz zp6sMKjq7AV#1w(Fd>=#&z5_3*Q9%CMyL2H&jqoT}z0NCM+GXInK$hq8*M#e?yWczy zE&hDU8S)eU2+_cTtOdf?3MgFElZFf_J8iC1&lM@(-9 zl|Al>=u=U6SCIZaYaQ(Yo9VJ*yh5E@42Yc92jDo=QJ|Fk>FX!B6v~Jq{a#GVr7I{& zv3#w3tXbQsK26nlDvd)+tNK7M%5TB08xc!@!WU}sbx9K#dPsqHpbdG1_#(!}<9z(o zH6EEajio6ufho^}`hQlaEj0A(K~pnWkjNT3icyWf(kg69*^X`5OI9`oLOfns;j-2F zx|zy04lDLq`|I|<$Z>V)&<=ci$VbK8)Ah~?m>qKtTL;CkAlG5QIW+nHcL^f`{SX`@ zBSW^WV^jqk%C*I!sTlo>hHuW@SbXR9dgkM`FOrFOcfj89r>O$xdJ+~i_eG4rpj&If zbStjdG_y8JdZLqm+50L({aevR0ylHK9%S%`CMqRHAIO9sh?vV&WfU4-S%6PvpJ$XX z1(06tOgY4iUB~duH|9P{Vbjmrj0YO0&)ig~sa5^*KHg)apKad{gUV`jhXE5pm!w$G z_mxq{fC&;VW=FS-6M5x!^&jSZTK7G0B767syYdX6*EUdS$^5wg|F z&g7M(?7#q(mr*o9`Xxrp54t3AeZLDbcc}v|h-;I(#F{?^zfhkBPQX$|rsr6MCenQ>$PR+Ct{cx<>=x}mq#QK7%gh+DZQCQoc|*8% z;g;W97Q}xA9#CwFRkzB=et8HE_hY@Zw>R0IVPi@#=bZy}&>r5v$%0rzUK5LAWRfKyilG;(>Kr+DO0 zhQipHdm0Uc)EgxxVY7*x#w(PK^@Yqn&FC)(P21RC4rc{iI@KFR{5o1fr-ijLI$ij49C;w|I^W{)fb&6c>%`I*pH$ zlgoqmyw5UjDDIbk)8G=v6aN+dOkC*RAHt zwPU4PEb;*1zPrBCbf4nhk7t5?eu#Z(Yt$2-b!nYOrAfMVHNtLwWHZ}$_Ma~zo65a* zGk2)Tj9fzxe-bp2P}P%+C0B2gG;)Y)=>6XbLZs%g(PfDzVgAW^I}TH?_|UHv6i34Y{rX1w1;P$r@C6A zHOkQzz(}tw7oW zZD=q3SYuRg)ruTd&8|jgENQvC7Ip=ntk&qc@nlF?CG$piugQ+-#kQCaop=34V@%ke zPCYZ>;!4Sa)S00SXngd&3>?}AJ6T2)Ta#SG0IHymBx&T=hu@hx0_#ej(I5wJXUhC}N4?R+8?2&_uLDV>{E|Jf5=&>MsjQ60}$aW7= z$6tvH6cfSwBQJdD>?QX%!v-U{$B*l2z6&|nzd-r#H0>qo03#LcX*acfepF}ZKE6h? 
zYn{V=ner>FeCeT~xW~(ZuxB5p+~*av_%~yevto(3)exFIYColQsGAT)s2=LV@Bpb4 zX$>%9!?90ZMA+iCqlP%t*%oI@64L5^ypyY@zW!f$1Q z%MJ^|LtsO19z6jwpLq5iR^_!C2+9?2Lm4n(q#27(0#iS1x{#ei?O`bA7f+u*+cVv% zG;3$jaJ_}b+@C-NotVN#=0_M{3-GZZ&$8geIJ^&H#~QQ7#)3RtzmNa*ED!2LX;>Ym z9iv+Vj|%AsAc+PCl)GirKb}>JCM5-QW`u+#KYx+H#`(Ugw(@@Wb9n(PqvdcuDhl8s zB2n@IL~Nvt;79?0-Zu>FIfT;j<0emA^F%sU^6aPWM_$UmBGBI7V{6hA3jIfAl^LRV zEHrcsVIn|(_95S5ac2NOFmjUS>V%kVJM_{2r=*?n(WhTFZtfik%XO@J9Jbe)d5~Nj z-ExOGGsFeVbU8;<6)x+p?S4_TT<;U;A1oaH?Q{GI$I)$mZk+}A-)rFTAF3)OthfU+ z6-D)7JeO^*2jdSf3_Ft7U}Mi#1Rps)H={w}Hh<)_0wFQ?O~u#$h8c2mvHDcUP+dAX z5j1dbMsg-3a^u4fk<^2vU{y-f;RESU9-oioR*;~;q38}x{6d!T-icL|6VVP(Mz^W)6-K&bEs9|hB+BAXC1!x?63Qfi$B`qf4 zl|SQj{KaSAT6sbmngsSbmTVjF z$X#Q!O%`R&xZlRJvO}#2Rh-9_ub_Er6PlY>VhwVAUS`bYI$gCo)SYT;Rk2CJt^Oo) zEjBzR(IMoP%{7VF%AJPkP1GO#_M&wH+t%~ z8N_&vjCekr*CzWXmrw_CCKG47zwEX;w*T|jmVtlRN<@(H(WoN}80aOyq9a18@{@Ls zK4+BOr>$n~hKBk_5z7`*msM3(ArQx%U;lWm1>U&l_`f$w1%K2m$c}FO|Ec;ftofhQ zAtgBYRm`;e{0EnZU{KkJLMzH&hX5h%7yHFokQbnjm73pzVDj`(O#ASXATEYQu(uth zpQi{fGnFU-opu5%#s|i(HN?w@+zWo$q5bK}BS+z&*JQb4~TKxuJ8#hIi4jmHf4e+}Azb5O*bYq(^Kg zeC=hWT&l{0j9#7QYU>L=Y=@-!^B%-g2=VBu?X@Fd)Fw``=mD|8TY#EgM*>1HVo)F7 ziRb<F%qbjt44H&U)+OR+*L(@#X}6RB#Rhhhx4U&MUbo{hVHH%oED z$wOirF)c|w!g!4K`Cd*pqggTs5Ku!jx^U8ZcV_q^eD{>^olD()cNFXbg`bW#C>&%r zoG3jnQB04z310!B+iyzu4cVDI;P|M#Vq~!(t5Ha67UT@IZc7KX$%1fz7|avLh(R>f zq(zYhd3n!yN8&!O+7b>>g>{1w5;y+Z9t5eO%WfaSM{gcBUR##^XzSN$!;H0i{930h znW91)Mrz;GLWxZeG|E3irQ*b;?fAL-b;b-25u;+{cC6kRCmZ4rj*1?s`y1^f&+YNQ zvi}`JB2#N*xMwzepS^4GMSHV-kUy!g4a|IJ?{PmcHgjQ2^WohPAI@6Q2^_E7FUxKs z_gh7bvd$f*$d6bS1m1*W3TVKArnh*|k(i;0^})#H=o&at?md1IJ`>?J$(*7dqhsL% zzv#H5H`nmGvx`||76h1GXSg7kPGt>zkvpR<)XHdlu50rl0hG0}p)i7G9uMxmOi|ES~99?*1F*fFh(&$+`aseVp z-?qr8aFsG?Bd3ze<<#NH_C!iQ!<_&1JJhThtZXdkSAkk^Lenn;&HkT-D8 zx!0f#_#1H%&nYOyM)Fdp=A_?3Upry?9;q!hA7xJox()Q5tpeVZ5oYI|p}Ty=94_`4 z&5aKF-{>9FHRU<=pg&`;)T?TNJ51hllsnyceqqvf4A>D8^phhxu&FL$&clY>qZFq# z$5-Dfr{=9a)W4)zysXv3lT&RM2b}cp_jJehL!0mL+h9Ift{Fl#F-pMAG-Lto^d33& 
z(^i*s4qU&SdP(kVLv+OZPd}1scJ+HYs;)k8yZGH5Dc@Buno6(>eZ4P6H>KM-XEgDI z_%X+;@53P+x%ZuxFl#2Xv-CUTL73!u{b6GjWR&S+aCo)FdZ!(86DnL?Obb_PG@1{L zsyj+?Yd8(W@r($u@q2{y9~lF#kC2L2`)HP^1LU>XH8qNI&tsH32n}3$g&c=TAbXeW zAFk-TN+^u)Bt3CBVso4Y8L6<(uixA4_;_{pK?(aBVz-YwKV|))HeSkG7)Ujc_OOeg zelT*ZkObpM9lq`d+)l(fi#B;W>dQx5P1 zd>t+UeoYgM1BLLucJbrRnsMkW3KYA#~=aMp1p7O+WW+%mp{&26!! z2=_4g(1}dJVdlZ*Hcs+32ELGgA!0veJt@e>bk!-$786MHmoVR`*qN&9d9Rz{jy#aI zhE-Ebzs$CCi3Q=^Hf2Gub51M>ml%xh2KyVCmeQB#+A<|*?|>;N)sunKakJ}ca3-p``4~Y~tzJ$i>-m;GtpZ;nPl-$TpzvZB2geF<^!R}YlRTw#R z4+`JOu?T6@tv4ePYIHpceoZ|@vL#z;8ejf5=)=D78z?WuuUN z>lEE+_{aufIgcuTu@Te&*h{4IkBya%2EOpSn=L|a?MhD~-TaMd9iKt(i^6#OVp09HAX(oVyZpW=wtkb+AFUUkSo~b4iTSr+r(&iBJ0h z$&z{1gnhI{o&}Ku-CPD50j_A@RoMIB3n}=Gc-$0@*{}Byzcu8CFm)P&lPw9q%$!yy zKxRy`6S3n}hkH@-U-I!KI(ENy&!Q)|Sdfo4c4vLD8+$-gT>)~&d07xL6i^7D=a*6a zK%Lw})MOl;p7=PnIYbxIW7<2?bU^rXH#w8}Y|ZbRFN9(Ts4PX}iANaG5eL7^xUZ{q zY&+$7v{s6NQ?CC~#Q@cV;zU=5*Zc-{cfmLs4aL=O#c5m>(Vjt7U9ps-lD!>YqU}U@Xn6MX_fl=le-Jds%2ZMDQVNXuB zM$*)Y|sTxfy)O**M*lOq4uY%Hg-;mo}R&dI6p z%yU^|<%(^Xw~y|+w$^CxIowry=4_RNnKea{dXAA9aTs_9jU$M`uM;3Zakp+0Dnqns zJ&rEz8Gdh1Vf;JlisA6l*Jmmg@(e8Ra@_WNK-Q%1TcIieDFLr%fpeq}c`NyV?RNJM z%@*O;Q}sm!&MNm3)^u*F3od`mul8JhxbQ3GLF!5_14PJ%9{2^5N$+5=u0Z9U%0cp) z|DmO9+Ety(aFe2QQrFA`>yiq!2C8=3>jk8U+}9*9hrq@Cr;E#q^tlD>!<&~!>2ih# zOHBib%}&;>96XRup1TdLu5na9xG|NmjoYC2>4T%rXS{#8sXiUIj1|z9%w}-eycwxA zm`Rf1%C@a}b1{Q7&!yJ1zv$u>sDxm}fyBcGPBM4M*;G0sJwk`>kg$M%&~opU@*_=k z4UO7ts320)X?VmbW>;HKhOn<_WluzjxV5C$V@?B%)F^^%ho9E#1yckJ-r1gGtP&?L^W|m*nH9Ko{%gc%XTBvxG z`2GnP*@?+(zmR~Q>T>{AXczsa17aspzLbCmD7Xi)oR>*matzmVAHAV|prN`!OwTFA z&foKEQ)g0Yb>k_M=(Qj4g#v6?4`BgS*|C#!h4y*Kfg@rAJ)^DNIxQ#2n2BckR--+V%Op|;x& zfBIo)bc4FHHqf`vPUo|y?wsXt2d~)UBWnD5 zP4TC??EN_HUllj5*QbzDOb;TxPNgb!e35p1LOvnCkFCFlCKvJcBT5Ed^Bz{i)m2Mo z%l=@z+m7?%9;vrEGt8bZp^`Np?t1AsIWivoPDr^!^vAT%1yC&kkI;o2jV~42LtJx6 zKHId@wa;pv$<=zFIr8T1fNJDzzmbwWegYTrXa@@-K-ctGz{E25jRdKSooNvCX)?xK zkuEPUG%XA?+kZaE!t*Sr7t(@h1_GNqOwhj{{^Vrgy7UnJ`7C&VTLG65X+;Qf>5t6# 
zucbc@l_2}<&HkVujBRLFQI|iW1Mtv@K6gV#*6S^t05H&%D?8SrxY$p>oheLEN}cxo zpfK`jG?aJA0plkbNoc;2N4jqumH!5o6t>={DFCP}!)pj#n*2aRH{37DJdHFYZui1D zl*`~V?YyYiPz$QjPI*t-rpEDKM)-x-Fo_mL+OddFXc?v&^R2sBRlwAWbAtH>N{@0q z+|-sLnqTQM7|J_uS1@B#JtNsD7qX$a+eGyNt)-G{W^8vU3nDv%*#YCy9Uhz{BOdO9 zmtl^!36IhJNd5-kX{uZK?A@IU#pHX7*2qtvzMeVqOeveD_11q59vm&>bNO>lw)y(y z@rpJ7+Lh?WH_2L;QCzoG49@>f5elMPLOGoUF{Ij7uk{7|90L)8dIV_qi5iSdx=#`Jwzji@ zy9)dBnd+fF1`WppR&fcLy~cvkYB4XF_L@L?>-9> z8_MiQZ=D96BCZJnJ)(72SGR;-jG;}rGcwjb{%yt0Po${|m#gn$_PLfCN@+c|@C5KaVLC_3Rmzl)ncFC>dj_rIaHY#%e*Fk}a`J3D$=AUl z#8>#qIUnaAgKjgX19;**x*En1$^=LC=Qim56)>fjXa~TFb`VX6A<&eapD^H}J0}Pd zHQTZv?R6}O1e|yUDaMRhGhjh>P0;v9rP)7CK)WELh=+Ai?dTei99}M0lkSkk2t6+LJhXr}y$J9o5 zc9Jz2d!|E}oH$^jHL`u7dRF@ua5}KqRe04LAl7Zkl!N18k`+Xk3f!0kufVk9t+e9Kyj1ee=D5OQFHN*+K2v0H$q%808I5VoLfzm0R3a_ykCbr(Y=cm_{ z1L5-z2$UEU3N;hGB(@p=D8W16j|eprs}Ek%)qxHob~P!C&^vz^pV7BgIqTohMlmi8 z_oXd_n6^!S6+bMghWF#V{LUzwDJu2={q5!mo~(}U#r$oPApoo$TeY+nLe`xi_Vukv zE*z&^ctc*KJ_JfD=23M`seluUYg z34RThB!TI|?5AG^T4|ENfIYU-sjfG%C>99w;b%|(f^f_eB|nI9%_fq7`OpyWLh`8Ro=?~nN0e+ThGQsDL396`j33Ahvjg>p$?q(Uw7B6(f5w^#8tye^H0`n*y2Az> z4^J|bF1*qgp+t7-0%0MX38*~SMHrunX#Vtox1GpfisUmUBG7{Z|7qAxr+a^)zy3s9QDq1_Y7 z^{bsE(%18wZjKrIKWdRJMzqgzy=6hF);ly4GqLr3-Iy6zFT5rQv0EFq4-h^@{(_HG z8mnC?w{~f*dDEKn^B$#WK&S9#)tq9j%k>QNQ&+#7FvGbsUf?}oQ(V;CECR@RV^-_P zn|<3mHV-R(LgtT>{WFSno}UTPI<>v`c#`$)laG%MhNYXlLig(~G8^H{NN@J`e?0o# zl^M6?5+DSGu^?Ry*yVuLqWVmEz>OFCaiRlDw)VGOH(^xLLx}2D^>cm*f!djYr!{ov z@YyUR{YeSkZ)+Fcn8|a|f?UJo6A8|LbXphx5{R{pmqQA2#X2ON*!qGYx_j!a9>YO3 z(kp>icH^oUaGhaiT+-IU!G-?C)JL3=PH-<7rt^lySC-!DxUfh2Y+WT*)@swP$ z{UeVH?0HwR0;bgAOVe=L!ucm>IndW?K@0^1QP&?>{J;79)$viO1n&-MfsQj{@XwMU zXypn6(}H!3{@D@#AHqA-h|uXMn^+>AE|N{u>oyU08rO00=7@SBb~i+qAAW2R7KdwI z0n@Dxv855I<>V3S{h9`K<>;S4z%@_T6sq-rU%JdMdmZGjf}lVo|7qI(cq=gB24^gT z&?NLSElM+sTsV`v@C~uACV+zCKNISe zd-rfrY2@wI;KiY-S3{@C>xnJLNqby6-3>8)2Lp{ZeAHjQCfOLNTRg4^R-slks?+M< z38=7t(^u=lFTB9kgDZYz!|FF<{TZa*sqxQgJ=3C5()dRxH%4@^i*8Y=$yJRCT-)Fs 
zsvq(U^9JUN?p}#Eeh5ItynqOsOzTF}2lNnjBcb+nBe?up@^}u}iS(2bTpXv=m^=UB z-MrfOLpSV|vv!F7)En2%ls!AHng_G_E8eL^lj_gFwJFg}f$zA80eo%4D$S3)*f;GI zeS>cQa>_^Mhy*pK%zImS;D%@X%4+@l8dF1e*N?TP0olUx{kX<^FH8R-RLO%v{#((- zt~M|5^OZ<;rZ9d=mvWAGJnXq=k^B#Z;kP&MtTkPO-^aaF`+)Ytf(~Kl6`D43!ffxl z=f|R^q`meENpk(WsMeG|>b zUnmYghmOwlfaDyaCgt3d(g7Ak=_Qk#AFuXg^$!`*%`1=4J&4|Nqb`^Hc}Q;> zB3CZNFPK58K`XMU3aGWo3IV^h(<X+BM5Mn1JggH%7tPjP|#FdwL7hp9ET7@rjf>wfcirwWDYTLDkWYBiX4* zpfeh{0;})q!`#?4b6&|!;)50mv=LFcAQ3DX3eXZTKMf#`>LH( z?-Yztt^3Y`)OxLA$TifYT!K6BfP{_PLG5@xtejDf1T%>fx^6X9&7P_9(HkHBF!k+| z+F>d7WVNwZd(JUU8el&`v}8;8gbdhuf5h|trXT+(TtC5iVO+b}Ex@vAIKb)fyo;eQ z)G=5UZF<4G^ohr@#J-2S#Nmo9D}8}emv;64nRd$+NJf#ca09a1QyzN!&ODE$^zTi&@g{(9VR zf6YgzzTRb{vMt-j+v2B-c|B{=nv*m=G$~hJo$tcul#ZZ}mK{&@3AFXJ)jxQF3QkA? zrF|5>FJb9o{UElzC2;tEYWv-vM;-iEn7ry=j^w$~f05S*r@gTvqcwtS873T|fHEN? zVlSK;uS3U*wyBU?$bl+jTV;veV5im(E2KnCS|iRd-=U?zm1NqK8bprmZRWzn zg!d^YADx`wq3s@mwqvFZNb9Sv8?U}+?7RX-+Erik>ZYL^29q^+y!d+r!)lJ;^d2tp zJ~f;<6hP>@D(zj-Hndmbc1HG0?hhIwyZpnv>cf)7jn{*kcZ6flE9~4mWBAHkVlBe% z?#xVP5$5%8RY;co|BEWLp|YvnKt~>v5Fx`zZV9F7T>9=VP05TkG$}(`_oV<|G?d$H z6l=$xTNM|TY>0d)k{Q?I#NVAV>4AiEnFV5{QCBDh9qxP>2Y`?4KGf$fo5egPY&qI; zAxr1=k;7Lfgk5WoYUQu7-?-KPxv_IW$T$$>{6+M7V0`H7a12%?LM3AGu}_2%Blm~; z3ko9g3Z-+_r=fRweEi3_i~o6N_RlS1mfdfCQ_UTtERtTIFz)!P>47Ay+S6=+ZE6uY z7q^HI_91|TTlLz_#fDFH@F;?WOK(P|)|JG!+JvbL>zm(cuT?g!xfE8X_3kPCOD+Jo ztnkVxlku@FK6!3Pm!2@Awim_o-5yP~ZxXdEi$}w@Ic{@v+*TvvruC?yz!XiIjv(XL z?8y!!w2-&EV2mGpb>KUnm1z3W>u$!E4qrUQceG1WO)7TD{dUCGOFIs+vqp9A4=UU~ za7jMJ28<|oI?z%~86ScWoik$OqVL{o zeOhZ-)An5*s-IU=Bvrd+WFMOt8KFuk>=x)rcG6fw503NoTXI*A6A3EnopK=PQw6ud zO*#7vc1Yz}=-yg%k07EnXwAIKX=GunwrsM(|K^KJsw1g-Ys{G$w@vf28A+YKKMdbmIooHpiq3s~=6?uUT^3uqrZQ&JX>)sE z4pp7F`089jb$WI}izn=^)xnbdcq&xkzp`$B*0n#~O8%WA1>mS@ct)iLe&Hp7m#GDC zF?oJ2;)RiF#D?e=r#u1@Ye&IE=Ox(td)6w)f1T$mcUF6_4?Z>WI?^&_v*Bvvaa^5B z9yAB;vz~-+peE3aks}U_EcDf1I&fIWqQ%D+Nl(p$tw$bM%L7M9y4DG07vs)eQ@gxr zp{6faBX=bDGu{);ja*u2Sj{H2%0L0-rS5Rk9-$R`-!}P?HbtPgGtCdSx6a_%Rk5bD 
zS!dr6T|YUwT~%ku=4JecyZ7$B8y*PyBUqXYT?lv^O1puwWK_{TXnJO34-<;r18qA2 z7eLP6#kOg*u6f2Fdy3$MmvF6dHeZqckc;#*Xq3JpVk2vYcV4>x?TLCbg?#j#e3<-mXF9&0k%Go<-evJ{#pM}Ab_;py@9RoeoT(KEa+ZbFdwsE>gW+6JobKt+t5ejTk0K z{Z5^4+H959yWmpk4-23YkNCFK7C;-FJ;2AnUZJdmQoGN*z=z*#$PRW|575!f7(Q~bYPC$a2gmU*IWVu%0wJT{ds3> zyNpk*Y?{fkAiIkIYqc{--hg*VgLct6?!z|_a*FR2Ny?Njl-(fgXpNP3?EikqO z6T&ZKy9-rPz&(aycn^G>eryLp!fJC+Irgk>O3CBfLdzZapaJZU`)rEwRTU#7<#t58 zF$}eX_zSFh0Sdg1Q$fY{Ml~=W=PgZ&U%0~sJjHdO!eIFsls0vDwbWUgo zCuu;GrZLQ~TDr=*6n506S-#9H2(M&7pJabY@>{yCq1Jn=?%3s{m)<=o_@P?kRN#DY zTEJmDuUKVc^W>|jrLTc$O)EhRUy&&&ZPr-Rcq-jA0WO11p<1{q#)h$O5-d zUCs+A8A~74KP;OV33dL;RHPpU5R^^g+Qj`f1;ERW0*$Tn=F0ZJ#ooJ+_&RR+_G|sJ z7yMA!_+PmLM6b0z8rKQ+Hg*Iik-&fG?S`Rng=?G+Lxx~z;d3XlL8T!7@CV3wlX1K` z70ExXdfwH?k$GXwHsV%VQxtkY`5p6pM&DAz$dR|77u`jUdEm>j!snlu-W8tI z{B7gb{I-iVVDMFs+S6QbnYobJGd}vnzjc^M<`^=1T@dq#dAa$)!oox_8QiYUL(^q` zG?q_mm#~Y|Y&lBm_ncCWYVo@qG-qvOJ~^BIIOW6W^~=X)os4dz>AkQSEj?+aK9|;z zo75aSJ?~{EwXM{B{`~iegtRHQvbO1xvS02@i~3{ z`j_bYTZlTlt|1w^E90rL0!jsNB4JkH`;gdn=iW48tKSe3U$g$>#><%b#W2*{YA1`? 
zmZUdQ6P<0M06S%o8U;)gF^A_5l5;Sz5r+T+l3k=Wb`PL=8mRFrnMKNXWCejK0amxw ze{eX2w|b6V@us#UCwVpQJ}FPrqfSxGWBqA*zy$F*yjD5ZSPEIz>&`tMG{or#CV_A` zqo;y-4$Z4n6nI_kdz<0y`h){FFdI+*3LJs04lJ@DGAKU4hISm?0dIkl4MuFa0I$lCP-+c|E@0429C3OVM)t?PK0W%qP zKS#R~F)GM>hvly&?E5~?)XGozNLk#A^O>&41RB=&UaV*vxG|u8wCBqp-znW+Ra%DewAiM9A>Hpkt@Q6#HF%GV zFu51g7I#>Fa5qxC$w|$TltDH;-Px%s zpq-NEng%~cF3`<)5A(0@-qbMtoEPs1)Ee9eLgk26-xBn~A6fsiOS zvWYE&{IzA5mHy&^1$494&+kzk$Jz%kFavLI#+0_Yk;D=}GDpN%SmnkZVnHNTXQm9m|h6d`w?6++#oS*m_Ki4U^TXSZ6l!e4bVM4a{1|`;IS_* z5UrG+h({KBonOyd?=MgpaCG2RwXVfUninF@|Ma`~)(FgWDNiF+zioJMJF%dk__$Ta zHl{W7Trp>ew4wqF)Fd7# zdc3vUAr9*P3(Og~j+yBruJ^(AT9ae@n6Op=P0_QH47OtfGPm#4u%%L~1x1&&v`k!e zhnsKC9{Esz&fY$_ax>|o`fLEQVk7FbRp=B>+sEQq;fi^Tl2F0%8^wNB)Wng%fOnM4 zYeRQIK>D50t^f3o{;b!(S_S?TKkGkd@*!UYV`)CQR#zW(s{ zCOym~eJ%{{tHcG`Hg!`89Dm>3Xfn0ch#b@Xb$m^UoO-R zAcS@{;vKaR{`Zh_$*3C{ z`{|vpXt9s!ry22hKg4tc7%+jqSHAu+Ep+2|?09rM+ffs;*Ai??C^*3wHl)6!H7mBGk*L#M$}^+E0pXBX0HL8M{Kf zId7$*pv8V2#eG>07w+xR%^6wG@z738J zco{0pXy;ZSV|2zl$DsZLE=g{dkwAcdyM5=a z)e&d1k2^Jl%*Y<7yeoMAu~y*&%!|*-#|w^FOH@~Osqsgu#ZmfJC&)08Vp0z_-gw`) z+pXO<%0qYONICJPjCr1rlKD00W5KxhZdrBG)ls54d3JQ^Gy)Jg_(6Z|wR%NqIe%FN z*Wf|@W3Sbn12Zrgg{pmEI&&$vwar$})^Ks@W?KTdZhwSVnwJHTH>qFrj1Q67`(-~G zYy0SG%D3w)=)dN9lS1js(h~*cylAb(0DmWn6w-xD$Q6(6Mf1@EidJ?&@^yKAD|h1^um(+j z9>U!S2ZNm)CL||&aeY;Ck&QgjyJ4Og7CaHF$@?v=AEwn6?ECD|i@;v{6XLUul=z(+ z1#!1+m7kPel_fvzcbPURdY3(ItF~%ga)~V@dXABJYF#G)nw0Jg+~S?*5PL$G_fuk(>9I*$~=n3$Q=f+flZJ{evAE zqyM|zglxNuUVfkcfHh7ln=}pfyCZB7=pVQr`Ne#{>O16b{F5jK=WEl5c-J&Le}Iu& z`>e%op{&%|&AR=Z8!6lfqvWD|<8xTwTjNAawe?Y_r0i=?Z;W-~juty(J1&TxOWJmR zStLe%p{Qd0%I2V^hw6u_LE zK9m6yTF%`AJR{btzwoV0D_+@ASj@>2GW^!@<*_KcGUxuURV{j8zVU4aMyELd%ylK7 zU4yRaE7b6X^DSD$zP?mTd;eUH&>f{)BlCGr;S%K4Gw4US7}_h~Azre46jJcID2j-gxDJZ|%9I*zBTpTO&eEuutpUu^(RX#OIjfuPLFT z%FB(Jt@?wv?FOdrRc2mWR&^9RojSRV5J0AN%^&_)jf-5okN67WK3%2y)%-5@_D zH!=BP8T;}cH{6Gr@RQJ~dp(=<6O7nkH>a-D@(yedp{5U0bkD`_Cp>&>;z@eUVx%Mc z|JB`hM>Vyr`=X-QAX}<{5EW1m5Cs82Ai7nm7)9wqRHO?5r38qH(u)wKDlIBqN~9|# 
zk*?CDOOTQvJ&{BpEJ*RKbITZezvsU7ocrE*|0FAO3|PsU?VI2CE6#8D_HRAHmSf%G zzUGPWhGn-S_oKB2^I{i!;Bt-(-m7d&-qU;PQqs}5W9Rm6doWD)K^+W9J~m^UFF0M1 zdBImyq|%dGpf*zsi4~@!a@T5V&Dx6h4yd-P2zf=N%GO((EbV^E^SB1M%CfvV%of~X zrmtd;35NdfCF}pn=RXbdzdPh-qlFa%M!OkkS#B2QAgbW~2x%8l4plQonT_)*=3HYQ z4Q+sJpTJMmEA`DJ*iS38T70;$b3{Rri(zD2zGaQxD5?VB!yfLF7YmGADvo`-hsyM0|fm2cDg z&yNN?_EsS2AyhSTtw|;GCKtz31h%!1Yool@vi7uMbTe8eT?VJJoKUIq1OA>L`^Y-{ z&r%iN^M#(j1%LB*I_CXrp!lozCgwVL9*x3^RuO23XpG>%T8NKOTVw6szPUQs#QE>J zQXAV?$zW#$gOS-pF3b)8rJyanCx$9>GSK0i*THO;M`t&-g69mg@a@ts(t-|dEjYNF z9T3Z8jvki~bGSXvn-hUXRtaBogy$LS=Usl4DPNXpm0(Z9j>gIF&Y-{TgwQ*9B!*LieIl&0&;Dx>ueZAl+Jf_trqeg_6lHx?^QqPF|j5 zE+0MkEKkmRW}~9XXr%n(fxeP*-!gb(t3R0anX$Law$du;xuKGqBx; zUUl~ZEPQ;DR_@=e7fYg$s(i|9GPecXt6rjvvBjpv-1AtT=M+vCzb#T~d$4fP7qJEF zh7E^Sbv!KP$#JBZDK(`9U3|~?)%3a_(TPRjx#9QY~guaS6 zo?{Dh^oK`S2%v>+hZ(?cN1Lfd6Lmf9dSf`|Q%b8!3Dpu&)3sWi5oorDN2XV&wJGWuP3fO+PIsw) zt>68q-)ig_Y}*tc8VuG@H#-xjz3(v<^5@DqurktmcMUc@u$GU`WgBnNKDve-ALL{^LAE$^0mtEgx47N zg4saljclhBD}Nu$yvydcZ)?ix&$x{wnFm1+0BO8k+TxF<(|}S?R{+*d{n3o1^@a~$wS7Eu!OqPA0mpY%!%VmpdQzH=`?79_b?{^^e6g9j?Z5kJ zq{{ofCGt~rTZrbG*tys4!7jB0@SG((KZ(?|tEa5BYOj<$?UvrFuKwmwa>7nMsTJ2J z*YjV!L48_%7jW6TN|^r9VQ}D+rreAZQF2`Aot~9Yypbw7G*64Y^7;DON{UX<*BPQX z`T+oAzUp!SIZ87Luh`lqF7iLV%R%WE03UrO?z}?;T7rEU+oY#ScsQY$G~G)prYJYw zZ*k*cqcM_SCiNr&JcR7#jGo&JK>a8MZr{#$aa>D4-+1c7wOX}VXq4Lw@-9+h9v~Kb{*b+--#j=2> zsQWpjz)O6;+c%wd1mSQdFgWMbg5sr5R zh2a_<;Bn=S<9u*vkVH+F0Rxqh6xO7i(i{BUle7)jWPZBDA9pLGb zbej40fQv(PDpc(9_1Rhrb67{t{LsL-78p#FRVM#xWBK>m{_&OXe}xMEqalC)Ui{SH z#ERfP`pM@CA!}P8Bv82KN;gN9M-827Qo!YJ3!9}4VL2OpkByCxCk)QqZp@CIZ@FE5 zT9?=#{?&Q>47P=EB>jgD)8^7~t)#r@(994`EIw()T=Oe{pmuOXdRdfE)Ea3kZ$MUt zIX;vi4XPF7KPGzZ{Qe;=>;1gTr_kWta}5wS3S{eSX>br|#*=UF*Mk+;Cs$Si zv7yRlYXZ>u^w#~6=f5`0Sl*9s9ho=3%{dGBPC;UM5^ovSt-U+k>sU#CFl+%RKc;sn zL~B@J@0k*Z-oW$owv{YP4zo zc3myMF{JQD$jH?K-YBesRe-BR-|drttuQst|K!WzBgl?mr=HDGlD;lYfr`n+}^Uk zYEtTWk|awm3mS=9~MU>4Wy^I^~7N=mm0Vsay`!YS-1}98g=wOO6Pp^dWF8js2;31 
zaXnHHGoBSQzcs1{yL;&z|D6M2pJP|9v9&Ru5c^Ka53WvzPe2;q5aax`$(oN9D_-*C zQzxP6>9Y`87!?^+K!;W0IzUwkhilJP&CT2M3b1wMou6FuY2^f2n+dh|^;DnaB_l$4 zkzzM?ET!LK)$(w!_h|!>_TpPif5py%ns8?j!G!$mMX=r__tlL6L4fJJ8i|(kEMyFV zkEZ9~k%_&Ff?g$m6PhI>v!3C;Pq~!1|KzKUdjG|H;1Upqq^j+h(0O0Ygh$OjoEuP- zF&cb&0o8dh>}uO!s;oot?|vX_PBe!$`6&nfBClDJHjpa5XP@k)E-jK?N zRH%b~oglzy?kuC1|55)%eMuXib8BKm+iYjaPice2hTdoL{1wQ|#K(ybq+{VZE;^}Z z=`%vG8##(fcgHgkA*RQ#dik50mlX+vRmR2azmN&Kzd6}Ph;Wn~L>M!nt-TQUb>E+S zCDp{?7qJ&Ni%%aeC&aYey47vx8@jBdNFbW4wUQH{`(n~m_nX42t|hTwWpIQ-V`l_9 zkdNjH+TpvQ>}RDUpzbc13dQh`9cSa?9fcnUlcMQHD~a!9k;w{yF$JJ{kJ3-RA1y`B zI+X!7!Fb>}*mTgp%6iv*_7z@_IB5x^3taliH=-X54wBC{Bj!`R2LKpVHkv5UfJM_a z?RTW)*w-taZ@0a?|4yKo4SM8QyA{g@eNpGrNR{iZGl5mkN0DblqCc(ZDX4imhErq+ zikFXZ35snuhW@Pt`0{BunDzS=r1vivxLbd@=qk@a$VdN{ln!C}{|#}x&5?5ih^LMr zqtaDt2q5MsdpCn45sVF`6&R)JARnjgGduK~+>KLKnHLtBvJHDE!_`Y+#Y3&!AD_fnxV6%z` zHL+V!EmUL&ZnU^#a)%#u`-0K6L4R0`ww*|aex|RvYHJ_XX)zG65tiu>A|iy3bpPbj z`V6uE)3ZUT(Z<1~tSEqFUql=A^TZv|k^qjuwqj~W++k`zN}?VQL*B_&>_3dM^32fK z-Zf(fs~Xc7`?}M6pR3`?Es6R|U}Ev_$Ljbq$@ec+gl!G97$CG;9=JE%^po$m2D$vD zbj#0epa#0FLaVS-{?#V_gpLDN)nZqZ!3}-SXTnjqziEJ4^hL{hu@aL|jG##8P5Uf5&Ib&&$9oxft&&`mRI!aGtt9ZyB z9Jnp*1UzCSj92fmJ{EoW?sKf`@vy$}_RF!(hN|?{A}Dxj*g9wQ z_=o9rsMc2RFuKf5u6yVBM#s}^Wd?$``w;s8CuC@f(ecng+r7BttI3Sf%cYYaZw2&& z3RnsWlxtnsJLCP0xGMvGbjLoLrv1+2FP_|*dK;~)J^0(XvOUuz8lJ7k>#q|fL)+mI z=~l1%%j5<#LRCo;}^kx+@#LuMK5~*5fvUf(mk^DIQ6!0~Te))l;V0H$kFT zKNvB_#+|(SANAR|J}e9?g=|lm-RIi^PlVGfJ$gT0{sdDhoZNWiRC%|?)TZ!$(}Oxw zYg}g0c<031T6nENDIa9T38HFt>1BW@e4rUh<62C3Goy1) zC*w9<><_pK#Ioubus1%VT8B+;IT<3YJ$&OZfsZW*zGqz8+>XDkk*VjrR%8|0=a`$D zmXev3PV0Beqe6t{WmWo*eAa`%A}?xbE7ydWeG#|$CL3Y%n`b-7rFJICq7OBuwc;zR zTebT7GC#t?H0MIHj!mr{Qa5`KFO<*TaoxJ^s-w62lbPoYITlrRMbOAx=?G)+xZqn= zI-!=XRwB9J{Cw6qx&8Y5N8Nis(MP3$G~lIjF)MKJO?T*eMA;EaW7Y+Bb`b@cf7HZi?g zuiqRS9I*?QaajTrU`-2#Xj8}uE1ic)Esj;zA2rXYDc1+;Ax*dqFjvovp?=QQvc+u4 z2Jcj&eBV)!pcGnxYXc?3!$Fdf4wH5p!4Iq$4K+!&XBAyM3B6Kc%<(j{?mOjx>8_%O z7*{veA{)1l*L{duEZIrWW+G{}Ao;fW{QV$C8oUC3CUM%(wc-wPdCRtAbBlrpGCjk| 
zekb>IU(Zn^9xs9MAK=KuPKeFJj$W?9EF8PWv7I+K$czeaP!J%aEvZ>&F5x6sH22I? zZalaVyFwDdjFBpOBW|>WFvH+=NvC(^8L7@r({33ehF>tis*x&N%}s<`1CPAAXe~c$ zmgTY;l7D~E%1HSqU&6>Mlq+TKp`K_cOaKKn>VLR@oB6=)FdVQoHgNhb{F46mC95hD z#EGwV$E{7OGR*Y4^)&sCPcVf3j;)NsA-`ZW+@Ffmu?K42Zf<<29tBI@5qDa7!6GOAJ=q|qe)$3EQtVxBU{RU24L5=0B6#nr z=5mwjuAMq?6RTSW& zSx0m}aP9~?7$h;4D6B9XO>OJ^usvpZ^wis&4(^Xv369#u4Q~dya*jD1XY)N7p$iW( z&EZ)))tWV2Q=wy^az#+XX-0Q%v}Pqf+1*j<-GfQ<#j1~>NVTLX4KP@~tYDGpAG8i=(|{Ys!+g zp*6sL{~37uECE_6Tg$#?p0@)9Lib2H7bd&S16Qp*C=;@mmjsN!9q}3{Hs~rx-ODY0 z;7#BO7JMMo<2qZ_+4W$TdEq2TrQ57yhH`I@FWI6P*m3L-pkciDmBAe zMSE{%7kKcgd(4Q9Ms@z+$iFMR8V*G@@57><#C=CPrkPh~2zK`!@^n5v=uZy)Q@-U- zId0E4-~RK`l>d4h{o^}l9aw|4Lq^9fseLr!#zxCRvO1^!5tlHt}V_ly*5idor2y~{)kRCQjvLzO=<;(P4Y+ck! z`zG{nd0pauMsM89Du#Z~OZVxW6U}5DdvoK;BY6id!m|tOlZ%fJ>p`EZlAO4%(<2ci z548w9NWI*kE{9L% zd%5+zh&~2?^(^!^v*~rea&zrWG{cpF;5H$Bl!z@97nuALi7y zAYyBe{22d??L1@O6r;*e7sFV7>^!-=7B9ANtMYDu6ux41wt$m#qmP|EQSOUzO^f}= z*#A0vEG@BMrRhiSvi+usNf7in;KbXV!suuwCp!vo-(U`)lBU*UYB%7JXupNyr^cdq zYmGET3lNL@%nWJ z(6_bOT_;0gR?q00wo?Lnow0Y5E*@OH<8vNTNXRxg1j>37w@ob3nGjasusu`%OP*mL1Nc7z4ny~iNF9YA}?hF2aC?yBFd}*vOzf^JQ&a0Uy-`8mz zc4AU3#^ER^GgmKmV12u!B2~1owqW@%l`w|42@YAD8xXxNWz+t+eEmnS zJD8I#lUvbO7+-l105?U~aOf{HT3+SaeLQ{$d%n6!5`-F(YZPBU&fBGQakQ-1@DgSB zQ{Tk8kb>9j=Rex>!1%MH9B{>I$G~nQ$Oy7&p|V0+G_aMwLAJg;g*cGnHw#;v4sUk7VX8zt&sag7qRz0{5M^hn-4(_Y5sLH8q0&)sec!V`cFGr;%emGRktief70{5?A+FZHADR&&ptlY&H?MnplTP zgV!0kKs1`1GUAokohVX(=kW1<#~RN{mqt(``6%j-sOZ04{{xZH|4acvoW}IQ7}th9q$F*O-HMm_K z;D7OKPnVd9to0NYP>gCJ?HffKmNVCzaM6xim_{v-9XK%{?mbUqKqbinZ-n}8r`%~> zRsXi?oss96=*Y3hxldD`7#e>N`21&%2?5GY!g@~b!wl7#*!;ge+UDfPYE4wfsm%vE${0z7O)P<%WFx!qU4R{4wMG-&b|`51#-39U-yp4xShK z&?b5Cm#8@7YEnhr)>L!t@}gvA#_nAyj{KwG0sI00P9K+MrBY0qQB`R zT>(N~SC=+S#3>YYUDRPm|0cV(S}Hag-ZAZ?%HExzJ?@7o4~*N$_oe=z3F#|D=jVCh zEf*Br)smn7$QlU;(Za$72574#}AC{e= zf{MS4Gt=YYcF1;Qgp@>#TrI!7L1}?}#kp+ajc_DRpY_y%MBabEZl?XJ$xlB0Q!mvr zF}oi0VO;UtEL;=uu2{gx4@+;~4Kv^=d$u8=qMg@qy$Vb&sNH@WYB?^-FqSzG9D4e2 
z9wF5q4=6yIu)A--VnFwnXyQr0h)}$dx{i?^nR{F1x|%*r`C=yd{ZaA(rvkC<1W>6; z2~L~89LG#!-5=ncSojS$jv`UUf7l_^g;s2g4T2!K=dSpDTry(0NDC9f%D{qKYCwTn z%2+t@8vFy6{H#Q|Mq7wVN|z{tW+i4~d_220YhpJFnn?`H$x+i4+qhem)-FRl)HemMyt6yr6IWIv{bQO$t=r@y&Yqx6$S*qOlHCPm=pFwDYNiTu;+yem8#1pXx(4x{C zJcGDd{HAp}k2M^(KiQtujG3NQIg#Jc%TgX`ILVX)Lw5<183I#PRp#i#yC(!`0GG zoL5f0Vvq~EjNZXU;HyB1&UJ7t9|M|yp_&eqeU#T}^F6ZHQUDU#b!)M{N8kWpIsOc4t{H{Q$9&%M;f9PEk6? zNtuJDl=xy@W=~mvKbbh-Y;hue)%!EPXI)1v9CZ0@f+T?cr-Y`-STZDaeW%efZkzj2`Rp4p# zD40O>b8^Hn_E5BH8P_O-23tmmswiB0#n`wh(=+?nI}ykI&xyua@1K(HBY;cw!SO>nFW z7JMckGQSf42+J-%+v7V(hK1%kDU8Z9WV4G~ed*!?Z^QOIA{7Bbh zEf^(JGwJbTu<|o@7HZsw(7`r?iW;Gnpg!jO-_)Fe_)YLtv+6IJx2$@Jt)y|3p@1@s zD~qH3?Hfl=mi**fu{Zie23c>ibH3&=nF0{!wo!T!-VSi#2=)xoZ;;79{0j3ml5 zW{||EDZ+$ztjkK;m$MC)PGVj5SIO90vkk%ZD5IyBg-EJY!oe`pm7aVL3NE5&Kym*L z-fpn}jdIv9&=f>_3C0=j`(^8Afxas19t5KVI|cUsYA_5OK|DDO_}c9S{beiP1L^;f zTa5cNrokWK<)zTU(ZbSr6E~c}5x$pF@_5RX7fGIi-NA{2(*o;(9Yugv;-~P{u=j$y zFoysU6XwA_IE`KMHDk*ISFzrwbJ*tV+1|Yx-rTr)?|{dkD)Q%l@@Ssc_$dpE5HD67$x8DkbeB)(ehz2JWL2^iBl67vD;I_@&vrK{D5&6SRy zWTYI?TYj@Kb|N6CUfW>2bsJsxkLS}7(EQ?83x)bL#g!wLN)@QZj+ssI2}PJdDh^Kx}SB2MrIpZ@SbmO+6FQ`V8@>l$zn33N0|Grdj#g2T>0% z&n*8~_nP?9qaHkx^A}o%e?IjJr1J*AnoAV8qn|+?9|Fl{<3ZQpO!G0f&44h0DDSn5 zMsMqwar%n^8YP>$ylx_=n|pbyvUqc-=L_H{vyuX*gG5_1poIZ%52yntP_^HaFCJF0 z^LvcsF3Ni+yk?plh-Nlie$A{?uP9d9Hj5pr?;r|-4vk2{=rpSkSS~yF3Vo^%tuc*+ zcx0;#{zwl9D_-1_Yv!yv*m}3t_Vyd%BjToLNv2;1qb^+rsJVNSXSdkulk zK5`rk6CSI#W5f#QjXV`I8+)qJ_}lJIv98Zw9S+^xlh^zahnL^B`1+42y&p(f%vVGu zv_*|9-GZ;EjVB&-v<0TyYJJIWy0-x*i}VR9Wh$F;$n!%fd00|P+Wo71VzF4vo6`ha zLKt?c3{hiE%VM~)qPS_Gm$@e^p5Yp~h7;m_7RShqRCRl{p0MvNSP-NTgzZ&1A8306 zW`mrfjeY9Jt#uAR)Mrm0rLThu)16pRq=6Lr8t~Hf#nQU6dm)f&4-=gDI=0a18|Ud+ zYWHJ!oW45q*;^{)U8vp2(e>d47J!Q87|W2Cs_CHE&Dp3(iovpbE&;j0kM@X@qpt!!ZXJwhU@PyNXI&2|ztxW+Sp-<*@7M^#}eBemAF-CrnA7JD4B>G!mwu=olR!Js7 zLV&6!JPHVOOum7G1qu8P-e=jppf$NP7{?nmfr1)t2;jP1W^j9!t84yZo$?L$)bcTj zpo10ttAPD_Sa+tqiH4#&{}sq*$Y?LCk&DNAEIn}N?F9N4HZ&omh~Y 
z$dgCgyaAxEP)l}|Oz}!zU*t=BAnm=~FwU!_d#hg3Mdv`Qt6<|&iQ9LpUGiu4J=^*s zoxnKH;&G#RifH31Zl=LbjzwjX1!yaxr|;}V=^Rtlxay#{5qM%DFwwrl70dF;)wMHI zOD#0vQ<9sWBx_HDX+Ux%s{=48!h9mtFoA)46M9*tJRuvhoj##u6kBP%UC90IV zs>Hz$lSc|ylr^Vw-4cT?>MW1q|A{7u*a5pa`Gr8yQ-ijVLMN2^T!B)Gd^u{lK|~fu z=?0aslCCvK*^elsy zq4lz4v6fJyZ07c)A@StW!xx!*lRj_tbRjx%i5LmY7nmsV5Lz2IMQBEB1Loe#$_(ZW zbL=TlG0ypS4CaugXJE`KLEX5jlcHZDyzBTSg==W0UAb4e`sp3?5soOFCDD%&28>t* zVN|)V?l2K5TP>`t3eWFR8{9*ZDbMQib^mN<@bYjAd;R9E)^np7ZfLw6C>jQuW_5r| zV=$~87Up=Wf(eQAUqZl4hiLns&7E!Af9v5LfB!pvHzj2xZFX1)e5>V%T$|p$PFs3` z`7~J#qnGBpft*QCFGf-a5%O8@#>An-74w0$J*BS3jiDh6->M&-TJUzN9Q5il@kdrD zt^{6^L)3wS)ae0v+O#?9PrjI4uO{3lWWj@5`=k_#@Sc^%LV6wQQ(G&JDfQNS&gYg$ zF>^TlM5r_3^hV*fMUOu`Y7LGpHA~?AgxDa*i<+@qv5;an#u z!ouf8Qorgs)~5(1TW0Pv=ty-Vq(xBI;A z63V3D6%JpE8mig+_vk0%+xK(cv~F9?VIjFh(w$}UNNXVGH(($0cL(_O>VCPQ{uT?y zY)+n8Nd5<&oricF76}>il4QJ^o$QKI$-#ELV;uuPgJ?PgESyY&U*IuXfVob)U)oaY zh7}}=qTT?!b@M6|mB9(du`XEYs*WDx*LJaYm)G)&krTh$_4*{f{?AkTBjyb6E4I>z zHaV>fkAjJCTwjx-34Y6Tez3|M(IE4BY5lV;(yi?)Ytx~$HFbEVZ_xZ$?wy>6t;M&a z+Ly-yS^C^35~h6yM}W`sm8j59%Qq_fI53LLd5d@KgHKHSi{M@b_T~zWZxg-xzjhVv z4-%UDYK|6U+qO|~q1f6Hy8U!78N#Xu(Rj6%OzZFl?P6YUYW8o;?ZcUR=k9yH(kyO6 zitaY7zAwX3*^!}`a@`^A2Rz%c5#9`~D#Nqod87LwWl!{9&}(O&Q@MJ|HCgfcwAku{UwFm**A%mUsmNtLyBtlLs)WgE zszqlXVZxi?5f&h?ic#CT)VsT5%1dL)HC|KI$)UP@NGKzu@En1BmV8M_Ryii8=FSt* z7z-21)8|ej3?xT1iM56Z4aC+Iw5sjkD1wj1`UTo&5ELpxVNC|s#74ZU3OUwC9?6z& zZyB%JXC|`$IbBL(C0xC(8}jVwL4kP(F$S((4Nx4R2T~Bb(azkmhHX>JQAmczKlHi%zh>}u4~JKqlVm^`yv}&%YRD!a$9BLrN@kH)J5lon>rL?;2#=Qo@jHPk% zXy;yL+fC+}q0Et#;};T5f9u%)JDrY~AR&7b9EG?wWyBqe6Q=<$MrG60NjD$B>>7|Y zmF044dZr;#=_p%>AU4z(k^E5w=a%|IW#v?Pq@wr*{%!PQe^}3U|37kVfWq;1jZg?e zXp>=%CnJiL1=LQ-h_p^9rnNfCF(BdoOZoF4Of%|{jH#b|2Y~F!R!0~B z&t0nN1CXuh$&Qy@-^G-s&+z9@i0ECpaZz+ee(*SQ2oVNJ_dF__(Sv! 
z_5foPSjTc$9U+EuRrWdC(8IcxsGU8*`T}=S=)t+>eo(O)ONqH4eo}p-_@uB#qUn=3 zqiGyRq`n1FrS&e*ovF-v4iJ2&yiAwO@z%gKq4I*`Vd%TW9;f)2a_w@%Uatd*MuLNG Tv>Q)X_94#y{gmmUKS%!;Ayrr4 diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-4/acc-2D-lr1e-4.jpg b/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/2d-nproc4-lr1e-4/acc-2D-lr1e-4.jpg deleted file mode 100644 index d5546561a225aaaaea4632eb7cb975dab870f62f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 29143 zcmeFZ2Ut_v);1bMx)|w7i;94VfJn0dQ4vrPQBaDMsE9NnA|fDxAianY6i^f*BA_Bf zx&;z?M5Id#N(v$hL=s9^kam~*D|hd+J?Go!+xwj7`S1PXvzBBfS!>NT$C%?C?|A3r z5A!LIbw{iYTR{W_1R&?Ze-QpC!~(Jkd@CUEW54RhYqj8yz0m5_f~$pvgoS_EM8rge zMZ`peg+(Pq#l(NSz`xc=h_Cta=Eq6?4o5X>B9EZGYF-|6yQoXn2G~rHy}{_%iwR+Z1zd zeqoWd1T3%oXqNy)@IRUb?*F4Vfs*kaq#u7nM1n1yYHkhw<8wu~Bvn$eq8p9qdPabtc*C?wLENw|K_{c}(yjLP4;K z3|_m3DjyPm!E~*;G`jh4m;p}_!CQx^h{Ui(=u+GQw$luHt^++seaK?s@d@v zlg5Wo9(gEoZ*g3~2j9+pyZ{^KLyiJop|ssRfvGPM$i(z-o#DE9S;2MUj@^?!mn*{0 za0)9FawuqDK7{m!6%4d9?Ra8r%4{4TvYul`GqZ63UU`-^`abe2cQ?H%Mvb&r+y2Pd z{J7qw6hTV4li*$@2<(y@Z;)J3h)oUP2!ybYa^M(`H%!w!rRrW-o8#uWfD1;+kNX{A zCK2wBx+M%D;VJ3|P6}^8#>4%)NA?o2vfcoUf+WRK$12BNJvY8$B-|c%mB_VMTd%%! z+FC2~YIJbsxQNl50!KQLf^2+hZps@$RGHT@YHbGUT%q){Wh1WBGT(wedRPSb7W;;- zYYyxZyOA8Ij=poFQa9==u^FvQ@Znn1_>h@Sh8pW-k2m*D6l^~Zb(xV@Oo1lC)}mnA zK)=7MQF#sw-LeU(e3(sdgvq_Fyr;06W9Md;pbNKypLkNY62kq_}fkdzw61Qko} z^mX6hxOx|+wyl5H%DN}>UQKON@CQd?qPZ?-D-)Can1P6k(gY$-4W8{S4GFKxH8~v< zai_n`Ce^=LdUw$3qTQyi4h^%Fe0Lc!LL0jQuL97{!nn_dSX4eFZY$6|Go}EV+eLG= z$ZT)UChUk9fo(v?>fPq%vu3N75pzST-i*OS`ZCjiLgdJd)4CgQz&6Zv8Jkq94koq? 
zuM7;@i4tXov1>d|a|!$NhAr5*%H~LaS$)~%1AfkkJ3S(w%oTt%#u8DntU4diCSEfx zkD!*?u=loHz2@;$uI2Tq$vYX`ya}~)XRr$9s(QBlOY(141@^ zp6G0Qd(^>lP2TyQc*l7s^UCKa3x-Hy9ijkbHpB@o7@p>--JG(6`h?~@IN{-!eDTSW zBvN8Wc#F+u!+3#V6&ugZ_$_=$Oh;5NdX>38AJVuBultoCir4vSzU?&-OtV`xHMT3Y zIVS}nsW_-7=Uba$NEh?6a*f!V>>~E@sp=QV&J3&y!G~}iyBU*@zNZ}9@7&M zg<*O$`GTi>NR!#K)^L^5`ax`Q>V?qcd)D_qzdz}%?x~r-Iy!Ri^QbY{i`m4-2|5w1 zcfhIhLsEXy4YlZX3&Zczhc{#`w|#4K{5aaBVYYe4A(*cfF@)Bg0$;-s%pVTJ5)o2{ zItsP}MiMrX2FC}g!vY@Gy?&kUSrenRPnWdVre3j4#`mW5bO% z*KiN(pcQmjhSCKKGdJoOoS%tfJ=`WT(I;qfpG!~Y!!(ooP(|Q$ys~-F)1vYhHcSa>Wh*C z)&^MW5UO!8F-r>4uU3(oJ{`)UQIk&Yj}F}vBhc!KZgk2=3@47`=(}m~k^YlC93wrSEs*ybqxN^Sw+ zA{x*bVN5nv`Uiy4pc_!T0h~yR27`Nt9TXmNjwSCNrN3PAsxjWM=n zN7k)T#NFo(wz;~f@(ouyo5G)8oe=3nNbrXD2w_s;lpHMMI9kFlU~A0>LSUryAf;K$ zT1QXC+?R~NTzpDAMcuYcen)-ceT%O%&3TwQPxSu>eI-CDm@X@UPkMq<^V-k3;)8Aq5(1K z^B0fO1+2np1UQ90r8H$(Y-Zl00k7Qcg0{h3E7_97kzZ30GNK>{UB}&H5 zBJk_Anr*oVR-T$nZ4U(mk5-XqNT%(fEW%Gu?b z^mi99$jEp4V{b?tS8gWU+vdj@Q%RWVput3VqvUdO+!ewaptvcA5Ep2=i9vHWXic=g z=_Bv1{=IPU)93ov-Wh#gOya8gj!?-!huUTVEqKCt;2ye=*TNJ2V2FVNXg6>i z4}Q#X%9^%ks(comJX)L6W@{`;Piq(Zf`0rmsr-2N(i6e^OfpB#Vjp9PBPY{f>+_* z-Np2cv)WCw2{|Y|4I+EO@=tEZr|h;|L(N%ivUYYZh~8N&B9gatm7g}1JdB`V=n?Fr zyjFxL!H2924k5fiK{rc9^$`}bVI)C5M2sweo;TPBv+70c z{>cZ{KRdzyKTbWU*-S{lt~Fm5c7)sj6T`#4<|*`)s^_@6%QU|}Y3z8buCK1I!b1F+ zR`~f0tBqc{BR3Cg)S2+sr?N8nkjF21Juu)HAi;;+%Yjpq2)>9F8w_0ljJLDVgynk= zp{McO2WanHhh`5A#)V>rQe(CvJT1if1IO!eH#Ne#W(eo*>GZljiC8mrLF(X|jD&M) z`|RHBAgm>LBSw~}BF$`E99b+%D9ogZCx@Sp8l_^_0_XaOb}#1@&flj~(>x>Jt*SZs zd5v<)oE>mZkL1M57-#Q8Q}prS~aBw^P`Fqa6mnpk5KW0=|m5F+- zz658qAgXX&bs{So#D$>^2+sRnK4gt0`v@P>=Z5xyax_swe8~Gw$hG1&)%m;ts ztDqA{%L*Nb2u5q)1*#b~6e6)(0w#zG0W7H)#jgn`w0b$N4_*Bni}r~^C*0JEC}NWv zzSMdDVT(P(P-xJ*4j0F(b~A2M(TxbPFl1A!yrlcLP(DNyegT>oC5yGlnG29dN&?%OS{Iged!@fob{s6yxVJg& zymsuyG{N&*iT8ifzMqdp-}6=j!HgwpVBDBI;6<@jYR>T-`{-eu`|w+kOG99wx^MAa zqV%)nj3cvZ_?_QpS~3nl^N 
zFBlr6D5iVV@BNeW7f9=`NBg(^#ElDn_V6GJ*i={rY$lgE-NN!P1}51%P}Xy0!p+;Kd%s7(PwgG{`|I|IJOmV2I^3I43LI0WNRwUywgTAcSz{zAeyu-~*@VmpFEDbJ}V};)VNU~Kp+Zo5oo}CJ@ z6XlrqUm3k6+PKO!OzOpcwbDz3-(UFecx0rK+tp(BUS9VKIk6fvvuXG$j!I}@7!HAP zV_Mw{p{5?zSxmDxzCMnBomQY7bh*^uugO<%mvPT6@*B5s*dKK;)edF^B@s-4p^Z4k z5~IKzUqi+hU?x!~@|cIVzF zcp($`5GbyCZ%C|t9Rq%Sfve&AGP8F3<@)Da&maF7e{P@YQ0^bL{jXWDa@&Rh}U9z#E%nX zz>)VuNVh~skX}N_eXg!n*+OET%xb#7!PAJV85%wx^X3T?SohEdnn>joqy%K`jOWA&Y4Ik1|xes>ugohr; zaMFPMaWY7HGVE~I$s2gv!jc!!GGVn;&BPGwx~U4+R;f4julvvB_#Qd-^f6Mui({|x=sc%1hV&WrK87JHHpxfMX<+XM;Xs&84OdQDa$h$VX2=CPnDrSc)-{m4uc&K4Ns1&{oIrl$r%{_}iD&FB(H ziQHVQQN;{ydxq*wTWKFbeZqeBOe$TxrdzzdFfiQ-+3Ow^p)$nL{siTrLl7KgH#GAK zx~aa051Cv#1t!IgFC80)K+$_N*xJS- zn&{8j1|ImSo|z_3e61$T%U+%rHdp*@_Q*`^|8(&*0_7Q52JYhm_r+W%S86sZ;mQGj zqGD*WUiDc%L?UIW!S?x-e)p{1aIH8wlpL_3@GGgi6~1BWwR>%n>8YYHxH)O0o(d)9k+KBpT^%Vw!G5os(H`oQ+tzssOEc~3g-$?%nW9ov0KG)V8-}o zpmu^rn+7N}z1ATCa|qeH7jK{H(KCNfsvBg@O=)bH)!i&(v;MZHFLMXbjG!Psq;&{`ZnTLq^uY?jy;4i)r$<}*-NLe46=4Zg zVRf3cdFpgAp75cRPH`L%5V4DXBa}sr^1AWz=&KV5As<)`jPA>DrA*SZ83o6HY~B08 zj9iqM`TZxDlp*-KKvNn2(vR)P=Wb}fDdS7s`c;)NHdMK$cN;G)U$x8J2R}vl8!IgM z8$$s~SY=gxXiv;=KY|IJ8De_^JeHnJ7`E{=@xOK(mS$@RKHMJ zde6940s3X5vqHOzoa_PrZ?hwbl-Xuz?4GM7`6vTmd|YpXWhl%4e*QQ-xpqu5=fj-< zoN{et=N7H49T(TA?cJHO!$+`&VTTPcFDDx7-3 zZX8s8eotNHm3uKA1LVwtx^Cv6wXb>zx!ifzC z&Na-;nhB1f8=<*FeW}f!a}hQze+a*4O0Vk9Bi=VFczpNMt+*^cggSxW$V9X=f>~>T zQQA86fCmC_xiWp&OyBTkGwk?Qu)4wq^BZgCq^bxv0i{I&u-sGRG!9y2m{?-zBRBMI z5HqtKg-&gUGEc^Ec0cArAUtwMAxHNbmQrkwz-)J-MIN?9{qUDio;U#<5|@?X{)EIl zp~LV&OA2On&7+gogz+Tnq1}&>BicN1A&LRFmOC3cT~}!2UVZ z1#W7Uo{Ecg3u&mSTbh1GB&71Su&>yHl68M!1h&EM27?n0ge$a&;+4aSo8rG0)@EBV zZ)UQzzvdZKr&_*x@v3WWIB~xl@VrtczL@TMBfollS9{dj=}FluRD+|&y10rXCvtHe zyN>8F-Eb2cS2Kmd>chrS%nF4MSrL0WCF#K1Ukh7o;%(fmPKHF2?=}-Fhl0~F@NKNOD21n_gEfO5 zrUM1Z!o$6Bx~Hu7XWh6B^ayk1z|bd}2RehL2pB^Nj(i9w5Ca@s>|WWr2ghBh0D5+Sz=tRm@TAOwc`0j4_>iblkn;TmaRN#iNGg_c@CDv#WmhXLF5Qf~@j!Z~feQ;>iJYMqayGu?<}Hw;qr__&k`tAq zp8!>;#AywTsqSTF{asAEmuI{EU9)M=zr7lakvxrEI39vvFCxK!|5ea{h^?G$uuDNp 
z2JqIPjsa>^s1&egaF<#&Bd?xX;@Y`WHvT}JMI>%K;ZDh{oYm;1A z6(=4StU@zEz#uaO|N3x>urx&mD)jji$+E3WJ5_unC8izMXs@S z$KJPdO6Om^f1aVcZ`T>Cea~TF!cplxGtEHS*7xGqaReDo@fR0Wj8)*rs~P0@fbE{I zE92gutxZ3hX3>&BqdX7DM3}n?)K^yifu8F^8E|j&Azs)R(%M&4L@c(_DcV%M5Xij> zWZN>jV=rFRSv*@kq@dp@{e1jl2K0f++U8d!1mRyOEY{`T?;JC>8AqIfj9y@~u~mp+ zz@S+^$IZmRQ}ujufTA{j*T?-vVE_M5CIrco0b%B35_)DEVZ}}^Z~zMZD$j#eXRQBT z^fD`)Ieq3w-(op7S~E)WohjB=obKDaD9dI#GDb(q^d!v)8Hab_h^Z>8Rb zrdx=7NL1uQP_3IT1kIG*CSd?DyL8k`MFB+Cb&vt6dFN5XTf_LAQ%%xba+zJ*K&Goj zq)hQzOYOL{(mhd?wjko%F^NuZ=f1MSv8%UbqML=Yc|r~&xc4h}ZwVi6P_YfKZNZMp zupEK?U#yuhRxBec+th{ZrBN&KkqPP`PUKhsI^pZUA;>9Ascn)DtIDRUGo5HQgcY`m zcWZ|0x5*DWbxKDBzpB=2QgJg-_6C&N9Jzs9=^ei@XM4C)DCykTM~%J7WvF(6rBBx7 zZ@+Rvicjz%m&LVb?}Lt}4n<;zfUQkAJkCcfPtX<|_HL_&n{ec{S)%iKW_xK-rBNxZ z2M0Z%A6#InaSaAwn-UwNTD$oWRZ?y01wN$ALkth^+OrZhMjGo{U0rp3bBd9Ka?q1g z$M--)AP!$w+71%FsG?wTL&@O3?iuF=p9!8>l#bV z?e2${cJ5hc#_)wx!NKYxTsDY7;Za1aODIU@IeFHwqS#KVq_|)(jfz-=Yy8TtpV`i3Z1yO zA%G7#(=FQ6a#j_Yob_FnwMADS;YlVRV#}>X`|Z7NrW$n~2w_xS#hw9kua4c{?=KCBUz0l%gvn$Ny+N(x-is38 zb*|{K6dLv7Pg^IBpBFP~EyPdXkA_b^pS|bBhg>f&n@VYx$r%qi$-SMe1OJ?I0>fKX zigwuzYB?nCoVk=~_6%cWH+laumM5e=+P6*nEr`PAF^B#Zi*5ULSabNGSj>qHgipuv zAzwA3O$B*kAXMM_j0;AYnbrcAwLhA zKXSQ7HS~lWq7uQqg%m@7%NxWeJY1Es^`F?pAm<+le8^euAYvJ(pd-*mye^)h8c)*+ zZe6L#)lc|7)aaptJ$~v|$o_2tGU+|^%K~q^_R9^#H;n7|1vU2v`$uSx4$R;Kl<(F> zNeIYox3yf&dQ;7nFXh}ePkV$1LqOt9jwREYlYs>^6WcnEo2)!t{Itq!+Xl0ijJ-D^ z`t926fNg}9{)sRs%ZdSOev62!XD;37denf9=qMMs!`=Z;3q_wDFW@+dqEj9#Pzp5x zW!NYnh88+j4ETOPr`}MYMcRUtAxIRKak1zNlI#2tO6D5hR`wo>FEylnaqYklr;P15 z9~Ji2{bAlAoC=s>9pU168d3A(P8?a?S*sfKVh+3+Az|TJIpj5t-8=8#PG~>?n#7q= zh71A;JME(|rp=72SgutyVzz$!uxN6dO*;&69Zv+Cl<8%L4t@3db{T8wwonQRX!B-;R`ZWosa#TH}mYNy4F z_9k^RN#8!k?`jlwf7`%l7?=e>xWRD5CAa~0tW>~@sWGOv>^%;buqm|Lydrq+$oHiS z#4`UWzpdddXTRGw_aC1b?;_uSfHjD^sa+wI@h*yc(bma)3Y8qb)$+3X)MIw#1P<`; z9_^>juE5)%fTnKYQ>!{j2XO32EyWEw&@Wowjs28^0erE5mL{f)oIqgM)x(>!=)4kp zCK+>vLn1*caQzzi!bjPxJZMO4ap$x>8GxD*l)|BGJtan6z%j&;P(GyLl^%e2zwE>L 
zk#yFe4#JqHz>=K=4`!)%is2pi<-PlwFr{0wnh&`->A(rmRA16u&xd?2 zjzyV+S34Kg2qNR$Kttrnn*)RgDxk`4lSWY&d2BQNv)g1I1n0{j$JJ?5KJC7O77g>Z z>nm0GPWo1JqH+DoHi-6t6l%+O`Ag!pMK3t>Frjmlv6nHtX04bj^vDj7p|R(He)o)o-JE1KeT2&Qw18y}9 zH9ei1>ps_Z#EInX86>t~L4hG&GHf%LxOunoAxWoU@unt)Ruz7E8<`rVl4)r<2G6u@ zOlQj5Br2Y4Z&r_s_4Tvmfs{$3XE#yi}3`&lv-7tW?yOjYLxoJ!$uDbEI--L9uGJ+Tk|BY6YbIN$nycI^e)*}M@`N03b%X$8rpme&;>xI`$&J3`1 zZP(srdegR?@Hg>MBsS-Gy&3aU*g(D7xNdtsMgZ$~yJv71Wvr%f%06F2%RJc_OrHUy z!5S+~$=JtJLpgCyj(rFW2cAAH*UdQrlrUx~xEMXW8s{W4BrEZeZ}04D0zTUhV|jdc zsAjyS&|QXr(L9!8BtoBOTSSes&T%u;7*=U;sj%j8$@ZGaV&Jx2;?7Cs66vY1ZhT@4 zfs(fecY_mD#Bt=-f|)o>zB*FtJ?Yr>kYa1^hu`LvQQt1yb8&`1yc^DTXze0Csd&BK zsq^OZ0%W;-vB^Xs73N}dtlRTlRzOl~nH&B_A-dL(Y8%cTd{D0@hyLRFAk`zP55{n4 zU$e*6A0`ncc=XcJ`S3kUH}V(ucaH=(hfapU=X&(T7WJC!V##ZGtHY!_sJJGC$jH}g zIx*)Y_8PW*`Tm`#qgJ!CtEE!)@z9~9^oKW%#GmbWCJd3S?o4~{og7ne%)BduBtZx_ zo6wtuA9_-{_Q6`0H9FQF&vTFTjUgk@+Z}r_OHR)p8Jkz@OQU`(fuK zUxL67%7;A6$$K~e|Ev_mhq!^pwBJwwONz(8@UoR&KnJsbl)R=9^K@fEa}!8y4GaFJ`*C3`lJn z%&yIGNElNj`4)Ibe_yVM3+@c`C9LJazK(4gYJ|KR`R)Z z!h&^A&cdz&o*tS-B5ZA-)ogvZhOiodH6kQU)rvT9Ac(@%YhGE4xsa7(mHfz;Npcw; zeb{8VMt|GmLj}>#Ac7A|UjGVD6+ij4*FwTXIMOD(6Q|flph8NRk&n;LmdUW$87?j*0muRx=g~=EUT{J@ic*OWlHWdu?EeDTimyu z(mr|eTk9(`=_7Q)mK#$8g$M3GwLhYNy5>`c`>40OGtUf&rJ&>SvUSYv7Np#;PmWn! z#P>y&BNOLR^oPALTk;^v8T%l&5eDey)4UNvwZaMv6iAxX#&h6s38oUQqWXDYBJZ@T zbJ6*1wanmGP^(j=cEadJ1C9<2Ax+{O;a1>v{K@M%hXXd|NJK_*(&W325nlF>n>(xQ zUvtF7J*FN#^Lo8)(BethoK-)zpmHLSTdy%Rw_u8<`vILhS9JZCC&C;So!me)T6^v` zFR)06OgCbmi268WALhw|^{6wPqI0ZLK>k8ArRs~cyT&E*NERk`({wTnav6H3L1r3T zX)F zJp+{2S*$;FHh$G(KjBqRl)iEodc4lNord+Dcn zCAHd+23(Cx!5AK+_+(|;WY8Uiu(+*@P5%^-{Rg1;H%?___A5HcN3a1-Nhi#u;6`w~ zwHX@g;O{DCvNxTA5Kj9jrrRB)~g{5=uaGWV7~ijHvSLD;Fsi^WEftC*NKG& z@-$KBm?8Mvg`^mn_ZK@tbj6&{-az%e9o}_nNfRP%Sa_uLT1xJ&SBJl54Ag%9#fWeJ zFybTh+Vk1>6~?=Xm;`jCceA85jogY|r-6O^t(Vno6Rg$scGJ->hm~&#seZmCpf*V! 
znWmVubdx|~ZX&$|q|qa1&Qxe1;<0N2@jh49uaBv%J!GXyZ*WvfaDdzrwY;eZ%1>7F zw(v&CvWByx+N{@%&#wdHiw;kswgP9vcII5M779`NU~1M8z2PJ#JNA0K;B*S>*@xoW zeOaoh$Eq@mjh>}MG(ic6JHwlFKXw9Fd{1^gBTBg&9nfXO_I8#&v%NYk3fF2!WF7=@ zSfhFL-zIH;_Wl2dl0NNbI1){=Xn_mxWJrK3)txXJr548GUV%Zrl~Z25$rc~lqPprp zXEPF(bQ#s%7@pSDbedAbikfg(#ZJw-TR{P$B0)Os$`oDqa8Sekf}G5; z8D}F&{;Wx+ZEYHFqcyuz!6F&T(u}`z>)_QFwbF-PZpgg88*<~tJT#QOkw+vz(OxjY z(z5%I0aHZD!OF0XHj}b`k7{h*s@GF-&R(pb4E*ST984kWk^wyiq`OY0CB*1+@OJPE<{8@YX+3){7 zHl#fXC28`+0~isk5CGI(3wm%gfHc}DGBHP|2FJ8($rERA6t1CTW{Rz|b|Y_OOWAB% zr{-qpSpM+X|}| zGOmt2Toga=aBJNFcC@1vznS}xw*`pciS;rH9J9tITTe*Z(E>|-$j0f^;g5kBweRhh zhWA0w2})Lc!9QRm#?|en=!t*Bi-HrG;Xj9sO&CoLs#g{-*RqUR( z38%iSYF5!5QEK&^@JfM0jMW29b?!z_42a--Pt~M$Ivks2&mrFV^jgYM^B8{z-u(>n z{Zq$I{+ibM*TAr-2+lR^xX8=|(}4Acp^lxYXF`ci@GAApq5?+caYkai=7)=VdgmFf zW$_xh=+O20zUQ=NvK(~!Efvy(GB0>}l_E*atWbc!tmKL5jL!lZ3M0JD+L;M z(dtkC27&xZSnmX5vd5j`;NFPXyiGv=_*2%mF<5^db$qR$+u^3xRo=koTaWbKq|9i4 zMuMo9c^_NTmCV+01~t8aAobv_FbvHBr;qTvC|2I@S_v^wd-_)Gy~vuFdgV$nJGta~ zcdf?VATJ$q#pCkT8>orT;jhYe7RgBX?HRa#Jcc(v#d(C&3b%egji%%xR-;_0NXQhS zDzWXQ1jiaE4|`2c$~(2`d4Mzek@Hs5N0w5E+t0E{!7rS8@0@V<3<}X7e(IE$w&1#@ z>V(JPE!p{a5fM9qxOh>&N3{_(i+deks93mAX zb5qRmY+7608@rc@EGD;Z$hqd02TBi+xZ*f?YPoa~Be!ef`qyc{hi5>$f1;+X6B+I? 
zX26qqA=QYxc{SHg?{i|idAQz}I}xh#5pwsL;P200koyg1{@a4|K2?Fgf|`+AZX>`50mj_8ZjQ_{+?z}i<@J_OX8|iT^+HW{ zn4!Zn-j|$e z9dx;>GScpG*|jN;WS)^%jyd_NxL37+ZORd5D9m&-(3XG!gGPMUUlnmCZSVCjpWcUO z(d_rUd9^)a4do0(#NSeG;#7%!z01ibvs%WZ5j9KRIIFbnW?TD;O4mQm0LPRc6`c1y zKSN}H)v@qoM|$3>BB_-EtbYY+}r;ugPlW+CbDATm73se zEM$t}rFo-tzwIc4i%(xoekg-x20kMY69Oiz)jj>ll8Gg$EeGO_!Vf>LcdcFgd@n&; z#zrI|_gYzROu&jd8bBIjR2j#zu(dGyI1a2bZ7-Uqk{fbuuVBS|fzEo6(`F^SeiPL- zIO&HwgQ{`!y1w$g4$D&GVnx7Ndw;FmY-w-$%@N zQavpF=79Zd=r`{-UFqbb}qgnl&5OHe0dw>o*WSu3^y99E9 zPFdF){|p=WM}GUSi!J{J8wD7(L_!+5rn^-y8CF%BMDT`_c0yPA!Yc#PN^9b&x2f6^ zY@~wU_<>lrmx>Q+^{bett3t1`p6RHH$v)3eKxg_7MZr1#+(+g*=tkW|I7xxEy$>0H z-vZ#Mr^@OXytb}B()TcA8U0+|`T^ba{PgVi#97(m4Qg^-yj~dNm|ikk2EQ3)H-s1E z4F+?}7_&qj#i3)st9*>Gx@>zMHCQIb);r_qU3=|x{_VB!m-IDzJHXl&`s!xXJYx4D~5K4gq{z$0lsv1gSw z8bQJd;1v3CL{gmv6WRurw9WNV7Xa!LVD@l-72TGj#2jjhQs~#)er3diW{5Ap-JF%~ z>apK2iTP$jk$(N_r&98p?HsxJgp0EyBCNAHh^GoX{I44K}k3ePW3JFM|#(MPZNp{P@e`~1Jort%@OrbY}0 zDl*AjC#)$l2bp-~-t2eAK5yrq7na?Qp7AF>-4$or0UCBqe27d~1jTZQXqS8o-Ri|e zI%l_DW53w@=;meH=YpnjhmO2(SUUQ=zJ%0)9@&vu-hHM10ld}lJ0C(H4u-p^HG^WW zQT;oYBrb446<(w8D+f@haO~7K`A8Me^HL(N{}J!`v)}pGd*3f!_%BGxP=%mLi)PG@ z&$gly;4iGw)ws9Y6l+zqU9-%!@V>GEvpn@h+_O?Tovzc@H@G@w>#Z$8 zZ4b9?T@MlH5g_4RxObS$N){WSvaM~{qfqhth(w;!{U?Y@n*X8*4{t1WvwNBA;L&xF zhdY)a8JSeXhY(L_b$mZ2wXCXf3}0@K2TA>o_LoGC>}XuT)Q5?Au$EfqIr@hys=uzT z{+Ft2r&LipjYNPj%lHrP;iX|S&r*K&0W*o58eGFl^%qr$sV1C4crw{ozZ@Z z%Z5{)U5j73VJ-)lu$+4W-Sv1n03UL!*L&_hR28dSV)mhV#Mtu9NLuJbnDe-S_(^X@ z?QFPZL?0Sh<1+c><E; zemPuN)jxI8pkxP+k^IQvd-Z~lMX@hyj}=xpv*`3=`N0>@9T{yq${$6`?VEu>P7Y+5 z$ZZeYS+v=;%a%K9Ip>H_b6YFFOIgm;^lF^_V#NUG6Zt)e&?ff$Wt{&%svZ25ee>w= zELim$mh4a01GFdnF#w*xaC8O%x%*?zIsnyY5+q;wj3M%{vj|$|Ft?Jyn(IGSnSGq_ z^3a(WjSS!O7q@M`k)`d!gG--&=IgMH%L@X0Qqd|gMC0Er48fl%fPwaKc1nD zSvAs1RSNJhbFfW!j?4%pynLNmeAIxO>v^tY!|4@7E24U+6&Z(S*nlFGsDYg9W);uY zC{qFuLS;WSRMFg5SATV2mQrn%;UBU7@k_R$!!a*xd)jB`bGOeK2-yoPRR4;c@SjKW zLbZpzaFMNUc8ol(tA*S^=%FdI0ALswj?)AqR>HIjImeY`UMVy)dJ 
z(^MxdcKoU99$Uo_)!-Txa=Z~$)x0t2$bAZ;86*E~Zum%uD^>}09N(i+n+@OAs#22{ zrD0!}<$WWzt$#3j7bwd6eS!HO)vFkGfL3wqUQkSrC`aBX?lGw;r)i&Lu##6>OWOpY zXk;;7Puk}o-o!OzgI;_VOpqrE*iqL$SVyFGLrKGiNxQNR(N1O9m6(4qpYaC;JONdR zWNeM*%s4}cnClHD)u%v>l94m?(}|){WbFl&#;fVp1eNJ2KbAh?WjkZRasKiX9;okz zHSI(jv^Jruyq$Bp{rqHyKG8LSciNwS5D=49zw7Zoy&%HR35kCL83FQMs@lwHp4cb0 zGOr)K4o>w(`2%}i<8eCi^*5jN_>O{+t>#u9ZJ3~$$ z7tyA|W6*0*P$s;UumPn<#Z~~>M4Mtt2l-><6O-~CS+Ju{AFAhuI5PKn z+PGwLl>!}H8ej(#)_QLtRSUzZsqEjpu8G2jgi61FSs?6(%JU-bz6B@AVJ{WKa#BDM zuf}~-w?uN|5dw$v)E=#AvL4iyz1R;T%$pJ5{vY#xf7R#z=b=owc&0^L4OTKrF)Wp) zvy4#N*HgBrRhwtLU*cUbpc6O*l1YqHXeIYBZPr>Q$@hK^+pW5`_hP=q{L!IM{f4h_ z`u(?P@tKYnWH(}8K4&gv*%$}7(cYKC?K?`&!83NE4RZ-fpe*Emqf^uxZY)PaRwIa7y_Qu?nL=bC=B$g@wJP4t(DHiZ(h%l zuYK}92=4Y?+ld!`m~;OsXZwagr74#XW%%3y?JpUG@k?M~`3C*p0!M$q*Z9ZdM1c z$WgB!NL4-Pl)VJ=(wb?LE?wx1*Z_NVBNM&6B%ZPS#E>xOThQ8F56TPC-S6&D=Ys9C zc{ak4HveA#`{Bp`4j)A9it2?k9FVc(N?Z$67EoNGnzSJ0M^UD|0oCY~s4WbqM9Q0f zj)K=7)AA0~`Ijj!qSqCvkE(CdPR5CCkaIcAA}FAaR5HN%M{G<=lpLe0WLzSo-+t*> zKT1>CMB((~K-o5(xD0EhtTm$Br(S-3q1+>K__s})gN{tI`Wa**$i~e!5TsEG%*ke? 
z@>fOmvg%+eJW1YID=4_g&1Sp4N24(|G&I04>Y}t&%G(o3i@4(INJrv1+Cw_zdslHIR34=cq?fp%q#5u8LtCd)y{1Y*SM$_@i${rD zNQY8)`mK_NK2UT2dMeuSZ0v%%EJ!7CWJ;*jUy|8@oPEGO2C@OMo(3RFIeYs9LZC9d z9{C&si{AA{=yTg%*A3Mm^1N@9;%cL28jLP@URJbj!xznOaFPKb*jtN=d)~}5`JHg zbs~Z^YViy>T~o-63u?Z>oI#M+6}6gM8ztiQHE$IoE0;>&JbsFsTkRWISY26jb4(+% zzx71PTYIDJ?~=sY#S2nq!0EzZnY_^joZz#72G78;;)44e5@f|V1t73$ncf_^Y?d=O z6V$i$+p!%ukRd1P$y=zp!8@f=Ng4$ir~2Cw8pcxqALzB2&E_}NiNq|(3T5-tpzHUz^FaRV!j*Ws8V zNZAzO2T}52JaV(nRkZh~3kbS_E+|L`SJ52>^E&(GUAzx-SH0#xBR`tZ5msOd>G|VP z@gW7hTr4PGP+G+D{Cxz!lC&rF07r1jAbQ0N29{a*5GexKQD15>KRp`#WAYK0DE#zn zf9LRDdJ+GwC}W_5C*Hw8(wv&?64TiB9LNW|Rygcqm{%ZihW)&id-q&VNC1Magr;so z?uCy;9eZD7%4Of|d@jJs{jvC+HVn{)b5${*VS;Ht8SLqKSL>f1{m%vPn#w_{2Q-4- zuRPFDQ) z8Kz)Y3VCrsDB*=pAXoK8fif+Kp*l#u*lM=KTEZ*8S6;p-wD`cu+&cG%)r7w>?R>-1 zrN1H>q(ImQQkFY-au|*rmWQ!sp+O9Q97Hddf{Q}@?$|?Ufe-Eh;pcu1|M}q`?*A{p z08rU%z7y=d3IM|@XBN-AhWjC+*VV?E3Q)UcRDyXMmR!zWHS+)DH#Ebu~0)W>o}4hz$rTvZx5kl3*iZj5Lj?Xi(Cq2mxb6wjc=+ zkX;DSC_`x36=aJ-Ac7$gLO|IfAX_kmBqAyx321m4lAhO9^TVm>nIBVAGgb5N)%)|R z?z`vQbH4k1-zgyeKgA6(1ZF^NKSR7kN#|6v6A=NtZ?y#DXS!G9{En`U7Rv$vSjwM;67 zg+T~6$Gv1K*#%VMKwlPr1KAJq;>bdh?OX0AMfMd3V#kTUEHhZMm)j@a%)gRFYiUy7 zf30i%Lq_`N>(xt2Di)veOEujInk>zIW~jI19clY``6X_ ztyis2oG$L-B*Ca&EF&9{W;@J)JEzz@GUOKU0BKobwIy?7POD=L+w7WYm&CEA>upa@ zZG~n5h-NJsgZC&nIfEHQ;+BgTEo;-n!|q~ECf%hP*XYxO=4w!?taGgLXg=BvyukF1 zFKcf1R0XL%HgI)laNMzH@MWSMC&CD=_t!1#eP> zitYCmS{^FCGh^UgI#+Kxs6!1lnIXt$_n$WcDd`Y9pb49ZjIbw#+jL@VeijEN=U+hI zQ}L04dfnJm?+ubgfd4o)h&G?w`yQd3&(n>Jr+$#se!*p6rmQz3Hs3>%~-JQHx zO6n!=Sg02w3Y|)~$eb%P0|OtQTz{x!9||mKq)UJ$c=tXpFAfBjB{kk@#}`i}x{JH= ztxMJaGV&=lDIANI7EA;*|7e;s6;p5_eS1aJ{^`#l^10Uuy@+7}c1qJe3}e<;@=V(m<_ZJJ z8Y@zQyr%9`C15H-)XY&d&a%EswIc+vXkVcBD(wb8XBW&j>2GaG+vbWqu-P_u&H1F} z->#p;qh;GyPXLiJPv|0B2;a}D#gfQDRCuLFyP%1GQmR{q7OVGp|_prr=m3uq2r1$bN_3#Xy#tDOHe2#9)3FL<2NS=S)Hb=J#Yf z6_ec}&tBY86b(3V0Ua!5YW9(hKGopT-aMowzaK*QnBvLheYIm*Ic4yfjkC;4#5BGJ z!wRo2Pb8ja-W1b`n-preO7~(R?Pe}p#1J354olf*sP6CS9+eo<_^M~?EykMXt*1Pv 
zbCqBxzutzHhC)cE%^JM9*!YnICiFBA#tEw;Bkm)Rlg=FVr+hBTDe8#a!D0|)_xTcnvv3aeQ4%9v+pv<~65t9f&dt5IDb zNMCs{_{+@<%|%{(iuQMxFr`a`(CL*fn`9V7Vel55{-IFE4fS?H4aVy5a!cQb=P%q< z-WZVlP0q+j&ftxU5rMxL9D(HW#hzZb^!SbS@iNOliniwFf)bpbppMqzWemVvCE>Va zM8AXAOY5e)CQ{YK%G^1|TLt$o~5mSka z?FLcx?RV>j8LU!uM6);_-Vh&SQhwepAat=bGr`}`iT92GQzjL zZW)~*!(_?<@DLh(=MlP{yhGTQzH)kaVIh!vEc^$!K>~%$+_}j(U(!bZmbqrtpRXHf z(I9>-W|ZLJF~$ooP7}yp)Fbwm_NT07XH+ClEz(!-0=%+ds55X<#2KGetotm3<2c~d znn=<_G`lrA1MiFnt?>z#2U}^)pb=A31LH9o`aJsD{iU&a?Otk^gZ_mjmX_uaX@V*J z0{WqbqO^H{+q4H;HPYqy*b$O1LLI>(XWh2?P-yxYU4^ug(_t8DP)9}~anF_pkVm*@ zcDzQQnarfZo^mFMiuJq2kZ%5?0h&fr>8d^8Eo6ygJo`aW221iDHlcm`E$w2J7OwzP zkv-9DcI!;zo~o|6#lH{gZNQ=|OwkW%B;KlnZdvt`6jdus+J>3b4J^ebaa0Y3e=xAE zTrYg`T@+2n&oAG@+pan%H%nQkE*)aX#PG*gq^{>I4*`WDx3Yernq}KHjt7We-i%Z~ zIy;|maNFoTm*%?|yE{@LoQ4SBQxWjmsW@z(*kK_^FBz(;;G>$bBv>$vyC{T#;|SRP zI^rpc42s0Lb?PCnO4jt(r;bu|=|At-Jkl2}ez^H^*-7sx6i86M{t31&Zm5+w|Euhx zy9_Lt+Y$Q;VaikZBl0}DZYZ`kaUff^{S%(bV&@K@_njESGMzjuda-c}`o zTmabwpt=){xz%PFfm z5CK0cPM6rekD-M#71|^%+PNI(ca(*fhV0h=VAmV-=1y!ZTx;2ot@7y5VA0Q|KDT@a zKmJFNn}0xwP=yRo@}JPRihId0!4VG1zYT$Ad0~;-6Xs zatp7QzUNwp$|YyXD%Yucno%#@cO5$)L=8p_X!Xq5LC+t$0V$2}?SScnmkFpQeiD9% zBp(zDY9=y_nC+x^f2Xz=&hCPKpVY_|QYV&;GO z{8>4^-fQkUkr2t`SYP*L6qM{1)`YZ;VCB|B>)3T3A#HO4h+B@l72Zz{N_P8*w%YW zODj-6<*?1hm)w&;weZZ=J%{V;)}r>pu6w&ecCUgqQwb+QH-92c#wM|8w$UfAFLkCk+Fb-2*JC*{yD?bN z3QLn~D8#Y|ZtUSR{h~U^Q*K)VT46>WHA~p>!GzxMIQY_%Y5H37$aIbwvYqTh8D(2`C7VJf6_Su7*-nv=kdUEiM)rg$LMbNuBqsZ82V)m$ zvfqW7QOPbdVN7QCp4aodzxTPW_r9L{y`SHG-S6-B$1^_1`Ekz7dCswpV;yUK*LSVs z!+F-km%Dt?}{i z+R4Yyw{zz%!Cm|UzrBF}gaieIemnW?CjWlv4qk3r+I-M1K++6 zd=BCfmWffJmgNJnW z^bL+3H#~jD%-rIv_qyTj<9jnWB=l|=JUk*gCN?fUAu%Z}{Yge< z*3)O%`2~eV#U-ywUsqMv)YjF%YiR7icXoC6^uF&K8Xg%P8=sh*B9Z66eEqiYeQ}BU zbA4lzwnhK7{hM4|Antz`3poC}VE>z3qJUgGewzcn-{j)j5e77FQ6Ap?M|O&t+VNcr z55`PGzlrv5lKo?XMg6~$?0*UNA978Bgt)nY z$>SCUK|w5Lev%sKzuW))F~G{Q3ZZ@|x;G(pj>kj*)q!J=b3n2E^Db*(4#<}Sy2-*Y zAS>GrV|fd|koK&Cut`MA^MY>HbgDka3!4NbzRFdl%tfR8!4oB&t=d%K)QsXOrq9^a 
z@`aI#ps~D2do!ox=yGdrACaq0cbfG8?&?FNnR!(y`2_wKX8R39DPdJayb3lnq z!TWyxMLz46h|*$r*8zShD$MLg=t0RZ`!i>B<`kSkj2=?`>V1X^;s$kK0S}G_eg=%k zO474z91Y{+oAu`X1eceu#*18!;&(nX1wU?=x{)otief0!3&~*SQJxpDb_ikXLQP3u!DOn~(3F%g=8PlYaZ2n@jDk{ku{S zOOsUq_0f)nRbko)639V}9P@0WZLBe*(DK!?W4F#qWeJxkpSEJ6(H3*12XrVZO_gV6y_uYwna&q+OP2as z^^&Jr;ew7?MD7FNYGqaw!kQ9qL0>LJj+=0$;|;f6IH2pZ5OJ48E#()ldu}Prd}Fg8(C)?4i}OIi_BGK zPfI+0U)fRjUa~TdXFUs2YrE3!OTvKRL(4QYMZ($O;hTU!gld`Pyr6R3Sa~_o?|I^? zvd&6Fs+#E6u;i-3n{EjgtPw#BD0>jbjk}>e6-c8DvyBlKKQWJl`BTBR)I&~wvQKW? zh87b@Cp7nG@A@3~!uYL~SOQ-Bz>3T*fnQm)+jI-c+B~`gLRX5%6Ov{ns>XE>hG0c! z-Fyu*8v6r-l(Df3jWK6aGD5{q%Sm1}osQb2%e6g9LDL?wa-lw4AxItsoZ_8=?D7i? zzQ+OW%1T_-gB;-hP{xC$g+p$hz4}yfFW*m$uaS+WYL9KyML4Q6R|7o{iFG+g2`DhZ zkXh&9aTC+u!*0Je9@|O_oc7o7Q@Ztj|K$ryXSre#P+$%O=ord0oBue&lMZGJfP?d# zeda`rttkW;g%5peV>zU*&T3jsOK(*;LlG1{)Gt!vQ?j>~EOO__I%ai0MS336hN7!t zR^m$#PE_gW5T9mWY7bi+HpT5me@^;U3jte@t#8gU?CK3iYr>?ySzXx_OekMhDi!M& zE<1kpq5g6iT66tD znvNMFg#!XBvr0_F>5MihSY13*d~A(`iEdSff9&aq04M1J)@n?4v-^43+!3oidg@m2 z3oeC2VC{i8zZeTIp_5nrmh(yl_lYxLo#;T%FH_;;E8~iN^&LO%T)$dybA5gHF&Q!r zYAi}EYr|0-wWqKh9FVj!Jw4wSR*h@N$!Qf(zI88Lj#HmY%+;**PTJ?ve?uhO>&2V7 z8?HMWUWAu?IpYl4geJo3IiS@b=xPC#k9LZc!2w;viNKjmeag27Wqwn%7Al&-RVe#< zM)%0+fRM6BUC16arM`lU#q2edCuACL$$E{jpAU?hHKaqah@b?)s zIxhH-6y@^O^3f(;$GY@GCX?wrIyzPRyAggw+fG>g92iQh!PTaWqnTj^K0(%$VNzb4 zwdbXz$?N>`4ev@H#|1rp5z@O^p-!yt5=PN7IiRR&CW2LkyVfFpnz|L=EJ{&5O%L;Y zZHaxZDcH$YwZ6D)m%XQUPhO6zOS^8+dktSTb1&E6lMq|>45k*0=74TM==LZ|~_EiX^8sby6LYUJ8rP&BmR72^i`Tl0NxZ>B$8|0Gm;(s%RW>Z8{UpB5(&uhevl~?o~)7_yFUxMA32kIh*fBOjR>85 zeu?#pEduv$V;_J`+qMr*e`iZj3mZ3UG9PH2J0GA_U32K`ha}IRy<8GMjNVqE+Kt#C zrX1CRDN#lnrH_-baohT3WIt%K^S-#}Wj7;2!o;tf2FHg?$4ad#sEB#pUa{v*wiC0> zdTZl)6}?gvwjw6@0up1aTv-HmNM;NKNo`=OW3l`emR)}|?q7yYk`hArZ59non zJ9BY@XO&NnI=<>p#S!b$fdyE~%JLdTYQxY7X{SI-YqL~SMI7EIj>V? 
zp>e&ebZ^{6(+WAjAQUz0F_f4*W377h=ravt0k@dXp(1o|+uK@9)3VQfbu>e-R#Vny z*N4YvN_ez>_I$mp_+@S4Gzq&3Whf&1+3J+7l|(E;n9d3>;Uiaq?o`U?!}E1g#b^DWo)SEaW<@ZV{dP`K^KgB=a{>ghzMJ zQ7q@;okc1s4?B@^(Rix+^p&OTxk)6ULUBp@;IzkUDDF8}Wv|9*@s zROiE1Mj6ME!-kreI3Vl*N6Pclqf30p%@=InQgN}jHhna`4_)e)bu(W=*JONtxmWC{ zyB!B4WX3qh0S&pLe85aq1TP2lVE{tugwoO^|`8U1P@$wEMyi_7bW8m97;d z_OsGZUbd54mqzpmk+^DY{2Gt@67^R^`9h4bjAwCw#Iui+?z2idLhEs=+3_UoN*F^C zv4`41=$wn5t0Lgh{PZ;HJ}n@A!E#Srf!ScUq6s|eirk;27Fxmy%p=mP>Ku?|5bGtB zl0%W^fR5^-@S?~e+(s5)ggGGoIWE-t(UTDK@n?&WO7E2*K&~WfYQHHf-9%=BKKa8M z5APtfMIfB>p1yb{Jk_D{+19B3Ml*xG=OGPn>4opupCEJ_I$#=}WkX2`IA7Sd6?&D6 z1BzLT#Ql2q66{E=Um0W^V_MM@3z;x_3Ij@4?UdL2bU`JSoD$fP85WWHBJUv=?}y5o zw~?JM6h$mgEZrBRA?W-hI6^Uygo%>hJzPLXNet43H4u7!yC_pu{80{X`S&pPz@751 ziL~{0*_d{Pga1)kk{ zpu6Zs9Z)dyxG!;yi|%MWsQYkNbJITcvz4h+Q*lNK1z3-Mv;VSGSS+ z@vVYmn>aDLQCqft`YU=iOS{?fm2bA|1>4s%8kPctJKyt49+~Y8iof$gR5Cb{7Y_Tg zU;mR{p44Tltr_Y1QG%v)J-wlt9hHrl#e1qg>zO%$ZzA z4@-^CG#kr`buxBklfpM&AEU{ZX9m*u5jh|kAE{KVUmhKc*KbUDp;17{}T#+hz7^SQ&+aL}BEml;?$%EgsO6vPK>NGiFA#|v8kJg7yH5m=t zi+(r$3u^w~-uC|^;Bix(Fi}9us;|x@L5UR*$osRCktX{nQaU?P-<@893twRYFB27n zxIkU&=78v`<85on9FU~(nQdHRvkN61-$+OZbGjb_ z5Z34KW5k!E&@Xa99FQvn;DGc_n7&5!>d&*^LD^A0+-*?eW8BXq>?D@mYY>CmAo#&f zI})Ijvva@999PBx+DEPfw(VVu&3gsPZ@mw6GMBzgw(Y=8th)h`0=9E&%Dsd%V1K?P z&Dz8Rt%sZTNWGE>>9~CA?1to8#Ek;qvcQH56=P32je#VQNP?PK5Gds z+XYT_NU_83)Gz*M5!vf*w3KE?b5KQnLF>4;Zrmc|;YjY^QR|G?0M$nZ2-AlJY{?(- z91ye-%NEgs(i?v*unmkjpv%BaWGq4d*R63Y=^wi<1~~Tj`@$TM zz}9IF2s7iz0r5#gm~Qq|a6FTbv~LHaD_4VR{V}V^_wJpUCFjfn^1-Y1HT5Zi?{%w; zA9$KMXng1daVbTrXRw~5NHt;0P|7%rQm#*)V~YYh>4*vZhzeg|6|ACYJvRun^{Gk} zu}EGrx*JEawM~_md-vSV6xxw;|I?!36Iyz8&o$kI{ImODyiaysxV_}GN2mJWdXNMX z>oEK+xqM7(`O6AUaRfiM-6d(CX@|2^*ITOKfKH$-2AegBIMMI4-2IfT1g!Ro_G-bq z%GRp!Z<%rlZl{$VH}*OVK8e~Rwo4(Y`fr3q26s_1=kb_bxNF#G2sg@iEooamk!sO3 
z*gwDTR1f(Z78M-sW6b}`&C*k1E6iT6wp#Aeq>!>0-+?P+c_)!JVkD$E4ad*XL;e;+&?d7t3ymz=}a($+?eDBKQt;Wwujn-uJs+w*>?Lp=1BeBn-EUzb?nw4Z6wQAJ=AhQ=AZJ%_PC4IZIUAhY#kWGVm(u9TR<5yY+LF(%gr=CtO&{C5JffkMdGLZ^VtGYO@bvpQx%>>VGsiDGTs=7q z)s~qzjtJFl@$)|NcD<-|mUe4b+mlq zfEKoZz_q;#U;(6Ev3Ofagd`pR8eVTs51v2aXxBi|PAa6#B^5BdHlQ9O^L2HN_YnOx zRtW*Ca|o|Tzn%&=Tmb`-Zq!l4NvbruRf6gH@Cxf?=vrc?JcO0cw9kk8b(!oROJP>| z-YuRof~VbD^%)J0OXj)b)b{HmGJ>kzX)8_FWFMuUwxE~1rc2{zje*-GI?+9QTJ;9? zz?E)`h9`PrYkcMUnwD>jOCO$c3MafWuzXxEL1A*cw_R6A5uKN^Np4k>DkLD)l|=K zpHBw%1vs>yeX|KLp^&}E9X*r+S{N1GsVV3{52bc2SktkTIlw#B)P8pU*jkpa!~soi z+dlNjULxhEU^N1ml^VCsSqYT~_B{d*ic_}E^z@@%M^B`9%%hpeXC%L-e(cWAnw!%f zDVAZN?2Cdrg{iNvq?Uk!XPe8|{CH|u8%~A;@)B0Gn1q_piLHv11czd3Em5gs5uAJ zcL;X_$55CB@*u+}%I^!qfRS!jM17TnRl``10{Kykxnr$yf9^fs34*8aG^bId)_Vrj z7gdABGcfXa#U6twO{SN}cBA1AH|wWaA9+tIMZLJXP!zUF*;K&Y^)pg#N5X;8p; zh?1VGYsX`xbkVX*Px^Dul6HabjkTG1QpJr9PbwpCj0Jo(lh!I>$ADcqvwzLgHl^_M=v81kgmiT0>QH#l zGK}&V)xrODg%LJ|VFF+ofASN#h9<3{-sTatQB7WR?sy&e*d}>mi%%9MFb1j1RsS`H`DO zV?9QZ9>&K*0V_{3hhrEpwkSk3ka+-lV+h$!!>F>I?l<-R4C8%2}wt!5ZXbAGpr~t^4l)Kr|kR&p#0&0kW~@dyv2iW!*70ER(@Ci zpzlZA!OMG2cgLtFcnXP(_Pk)Kuy9Z>SPDuapRP>|lwV3DTlHH#ELXvI$_tKegp86R zu{p{yQ?YN3z+UgX*gg4$%VxeuPns#dMoXdMR^1rd69_+gVv+A<=koD~2)m;9f%)wk zcggzLTvkMUho}l}k!nHJv!mP(@6>Nkc1RBJHO{+Kyz74& zk6f92b_xiuPSXys9*00DaKz?0At1#l^`0zjb(sqED8bmOqAp7~%6-Wo?N{1U|EsK- z!d>~tJRayMChZvy-hG8O72igPil~oASdsQQ61QS2LNGFOC)Sd6o6umF6Rk~ms9oX% z;WF0s7jm6z zFTBv_E;Fc<_%__a!OM8}t}f7>BgkG%RS1L+y|T`?(oR`52HjXkKS*!C%x$6LeU}5O z40ktQSazGZAC*_X=m>I|dKXE_~plGJe${bMf zJ?CxNR={2S8CE#SD`EGy4gL{`x_G_F7<(BhRq1U&}v~AAj;psHi^6s5;kg^3_WrD_9&#M58tb)p-{N-Z&cyku0<%Kcv^Z zy6k_=D2b_H&P zzub6TTKsXedQpu}1${n#7Wk46XTX(><+p~}g@5GnK-6bfs z#W^ZPYD$S(va`BJ^_@}|-v2_Ehc=clqXVGJ(Hzism6BI_w!e0MMve1vK%cBZa6YD( z0o%@jp$Fi-os=y0vsJ(EH-Wg6UycJ(d(Fz`fL5a6RvgfITJ44&C>SGwJP5Z1c(zy^ z=?L}BaMv|juYc|p1?|eB+pk>)?}V!sAy^DxPP*h6X8=xpCz3eKRx3&HjAxX+p8nPL zU@;-~p7-iOBAD*P0dX}u5T6p)RhkCB-aFGB9JFWMg9GYKsQo#<;te?ORp>OEaV7H> 
z25=)gLUKHo&QtYhIl-(PH4X^A=L^)->8n%f{)Ig&uZtQ~<5F%$_(eFRy?Vavb4ZOe? z8CW#Y%WG_?1=EN?JIw**Uj(9mlU9J(*j$2A-Zlfy@bujr4(Nd&TNl;QLD6OjPld60 zvGnmAcHwPrmY@%c_zLy-JE{T!1HLK%T&<7#b`7Dx%3<$D?DRKnm4shdjCT+fWX=(w z&%UlrvTLUUjC&T77jmO2asGjkjXkTj40+~7RyN`UId(9Rd7r`oIZ{}e6fc$t+m@?YV+z8k*n7MW+P&WSkSI^) zZC(WWbg6Z@%}i=a&bF4sCwI>u-Fv2T&+D|(i?b`oG7E2Q}v%y z>)YKA(@6|zrstp@f!8OjZoUh(8|vGSmin&0{@M9{CP{btV69)NQqpy0)7+_v*SyMM zYLZ~4I0r<7u*#6TnW3ZpY#n&Suq_4MHphFjm6!Ew)2RHi;NUCY#?(^1x#^;~gEFgA zv)$8YhUp#8}`1Ez*H z5D0&oDM1mPIiM8C|JGwJ3iJWglQ6YL^vB}S=qZD?U~+jdsb`g+YVX!e4-X_;8ww8C zo}#LhzSgT{<#Z00zy4f!bo$)>>??^$;g7QoHj zWB}bA0u@B4(@RQx<@eJOJLCNeOoR3@+^3xhp}JC4a!-dydDjrV2x1`yQ0u>^AuqvE zwU|gq-Q6aNBWWuEn=7^z*2fXX%G6sqDLj zKV44y(V`NKRjkegYMIMlW604=hd3ag7IB&f#Uz8_>NjRs()r5x#11+TG~0;&x;X3k z?RcV?{%!qupFz79N6Z`)L~v{==?AFq0mC?oCaN1vzv9lz0p&H>02r{A1Cj+z}b-BIwv_x~Z}}e=hl~`;lPH&s2@1 z)3?I+J$V#yeK$%LL?zIb0-5<`G(QfgN|>b0IyyD}X>^_NXGR~SmY?*WwnM_d3*ziY zWPj&TB8|EK+*=^l|J)}>3NTkF*_bFO#hwt)7QXFSUq=Ne>S{d|otSkl{~Ck*^`sYN z$P&jA0ON9~tBfhZ%17>H4_{cb(mVhfMEsOKhpS6j_&5eYm7pJp4 z4Gq)r{mf&p?-aH!^+j*`tR*9B@sp|IuVan?5JF0-@FuSH;hPq5=t;F=Q)B51%A+;? 
zN_P}|zL@uVDJn#N_Q1I3wMp2Sf&Qwdf41O>$Cc5@?|W`Mr&-zlqM_MzNo5Hx$fPE8 zVG7D1+Wr6S-95FndGc3Yp(XMU;x}oGjkY$;4JmnJ&$TwGR~}%#%w~tSMDbih8^nE~VSqv`ME#+J_QIq%Kl&7CYt{qCtx~a0s)AN|# z^dBbjmle%`-az%n&()20VkdCj5XddIman`lMJti)hmxs?ExEEJt2z30V6|ycHlB7p z$hmhVvox4r)J(y0ug=?vg>jq@L6SWwFP}RLo_tP(;n7lyKPS$Gx?toG7g|55)Mr5t zb?vK5L8@GA&h$=>?j9bCJO(W6f7~vqe|=)Y$4%h@?6`G7T2$Eb|{^kzFT`)B$AB>fsbu4hZM|6()dcmq!E!lL)d*JD@DqD_fk#KuIiK z_-x|F{)F>xQMfA@Bp`&U^ZU_ATS(4(b}H8-s@C|-erNTnvRxnNXvc7NaY(#pu_+x9YxY!gL#&e)1ovXwn?2I zk30^Gf$l)=LK1TK1oAL>=msQP?(f*+&obO1$To!9&M{YU*dos z`S-AHqgXs6n>$N6Acbl4HV`pw^S=ZGfzh8uqko6F27g?2NT*!^uB-Z{Xl?RCH1WX5 zt?Ka`|L%dqYdx&S-X zgDkmcCgE6kJNC~a?0?03ze{=FjEMf-@PFcqMEz|-0Go^YTUPk*C!zo7{Br-0a@XY| zLjEM7UbcYw6B9QAvNMz5TENsAbd+Hp&hduAM81=Jh&ToBqUbz|d zcAP2Egh$DWn)H)w#Xr(yKfFHtcw$M6*JI#?#%=f5bRS4)SNNql8cNh0_H#{6@|3$n z-hjQZwi9w2fS8M4q&){KT3hgIY~%8_N|%0*K~Ks9vGhOB_yjcuuF5b@EJLLOpx!LVJT5wb8SZ^^%K*t=M>Q+V zZr-gjGt`pZd*hPau!cQ%#FwAyRoGOM<4#NsN<34_h!#Fl;|BvThE2uep@~&~^-`rg zVxK?jwn|P6R}q^YzmR^fc&neop7z#AS*r3qWa%VXi%nz?3 zT&%3hJeKLCn0+r?{@A2v$lc3VPc>E&quuW@t|PSRMhtJ}UfWm;T1GXcCR??0K=zv^ z>Qw9Sj{sfPy|b;+7n42|^`=2;_#Vi-%-FN~@r%NE7pi8VdoFSI5bF`nmoSMXqq|z5 zWXBjF5o$#d#wM_pe5X#7r^Dz0TbG7@gw8-OSV_8E&K~Mt-9gGfJ2L&TY2IA;5qJm^ zJqKy!L4KOz?MY&T5Pp=b)Q~2ty4uE(W(e8J&1P1nji>Xc%h{z0E}qik3q>JQzY8c@ zP3dP$_JZje*u?l17AuAI5XexC7Xn21JzzTyE`w%UkGe%*p%;EZR58HG3W4xtg)N)` zC~1v%5(jh(4Ym#BTN$8elX@P8{JN!VSWZZ3RfGA6yYyo+JW{94apDKvt&Vn^&igK( z+>83NVCX+(z<$$N$@O@qBsUFN36KYRb4xo2ADSsYErS{^9A#yU-(m!JU*gF+k*v3I z-|W7pcj=^c@siYgy_QaE2}-X{Me^Z3i`%9LnpdVl>&A`6@;YDA9g{ODFPt-uV~^P! 
zPTTM+J0XOXty;Nn_!DjHFp^Juq4dI=@9VL1LCCU75{hC?nA)*Li(+NexSj4C&xeW8 zZ&&~fN>Z@OH^2T1yU*F8^c5u<7N3b2IyR3l0Q_A+&?|N~hGI8^qJ@vCK1M*P0xD76 zmX_1BWuGa(8bK(>hJOG(EBoDDy zUTzm8weEDftW&}De&FKWb_YL z-BEcfGKq`mh_ySpSM?#(i!k}TXI5;?m}?$8Io_5%7E9h2v{{ZiGuo`^73^Dj(+1=> zop3#&aGd8*w0hJRwfE#&gfIn0T*HIo@g!|LEQT#Z*@``*(`kG(@4G%N8hX%~B+wTm7|0QeGK_tsAE&Q~)A9Nw(Z~nAnD-SIv0HEA4BEgaPTN50d@A_gP-vm-N^< z`$u9CJj<+^@}tQkOW%<;m3Q*c5HFHkO z8zy}+-S(@VY-(HzCj8U4Mr1%t@i6eAgY;S7jUov<*f|6RaM!@+@9j5*mTar%hUzN4N7597H+e3;neB5G7X?&qmf1z*JqU*cQB63ID3H)xg zGG)QqNy57GK;RhhV#FJAY;sX0Yju6a}5JC&Y-&6M(~S@k(OcS#|* z=kIfqZKE~_@^JU%k8ekB4Us1qk@$>2bNdX%N}kIt2m=5>EYKIr5x~|8vi3yue0S;B zrEIBp{vnJ>Q60)-Eow+OTi}Q9XeK6kdXiDP}i;EtS-6b z_ZN+8eOupVk5Mxk*Uu{pNP4;me15x`b>iTO&n|Q?|Kp4A&*hj6-Hvd!YXE@poA3vJ zQN81TS~vOE!qq?OzN_@AeF@^)81x1>QPy^4J;J{k6-w0ffN;C=Up^}JcFbNg{Rb)| z=y>wicYEHGTeaV2(Z13Lsfu`9ZQSD$L@>oVGlir>GJt)5qfvOx^@q;<&DdK;u_JhheYW*Dr_JtyCl zd4jUWw5c5=`_(IXya@YVr6I_r~QvIw)#A zQ@p-o8?u6B*dtD}5;nn;7$Q14be&i(*Bu1M5N)GUA9z+Q4!gNeDmm8K{CqFmDQ>FO z;hmwBddKmaqx~A}{|>hKe=G@-IO(0Ezk3@|J?P`JlnDJo)K^7ma|f- z{tQ5-S(g{?AKBXT`R;zcah#X-6yJrh+-V}Ab0$J=)hnbI+pJAK_IyiTbtuQcEL0rQ z%s6}{l$hR&xaZ!K_noaesBNrA-ls3uFMMju%5T{!W@ay?x5#ae{Oe1PEb_210I_#8 zi-hh3l2UZluug5!&%TBg5APWyK)tp1+AM_B-RgE6{*dF8fAzHLMef5OjqooY?|g7K z65KnkkwSfUW$HppYh_$hz;nQ=KXJKJWI!oBhMQ@)v-af1mGihgKR@(cZucmH6HsO3 zTWPOw4Nl6kl`!$7G!_|cFsJMIjQtV1w-%hZ9SS7POcZ=QEvdM;>r%7ILr(;vW^KF0>{ScGMz*}_@8Vi{G%%V6D8t*#7f*ztk+0+_!fC# zrJbrsvgp7`psvviI@vqMbTPuq?)SP_7*w#AuKt$X@HgwUMw72!{CmH(N>b|EAZSPp zq`O6%mP)maSwR?4z1!!+5Tb>^HqQvhuIFy%^X>9N!@#E62&>0|w-2PRE^8GCTGW0C z@jKC`BMO|kzpCfZ=+@*~*KDkSyTGD)k9$Uy37AtGL3CF%vdkF^9v4H!TXQ5m5 z5y!`o;!HETf9dD%?#lEy$vl-*5p>8t7(8_&v4uGcraekO1uT=D${ zn;`doGf+5YWt{Zfe1wXdhDnyi1l9V;yydqqztEp$VHTKTGUavD4|1cgTNJ!cq@y<;YKKLS&7b%^Xm(W!w*m&pkH6 zem@TeyS}g@VnjF6j@xthC-xc>cN~Tbyhh$TfwvC3QI zG^pBA*1V>|`VcP(Gild|mR(E1cQqT&(vuGTxYWEfJZ9&Di#T%m#YKjGl9`v6P~mQK z(|wD6Q^qgz|G0Gj(nkyZp`zq3O%%oPA#EI|mM# z(9+aWyiFaDNCf%JIGt8M$g>;RZLn$hAI}>4*9oNl1qhb4Y5e1QtKN>Rmq589mv0HY 
z$FAlGy{RKiv-&(WUql;J_ZU&E)RC0?*USP>HNRRnD^jyQ_EF6B(+lUC z0=1UCK<(kMwMsDg@Tbl@P47MmpA&!h^@U1pa-Q<*wEIYF-6@lE0Bw8=IVs9KH~58x zK`M5I7sE053R+#AbKjFDhllYsPY?5bOb|IH_lWyJZP~K(FFdV>mH3sOgKcM=LGV2b z2~2d>=_X&EMD~yA8@ar$MpW#yQu!cduQGE`h4M4K>1%3L-oX=IhU&!hu_AdV zC^E;2Wul`qKmqGkId}|JvW4Nj!%&Irz5ON@_-b8@zDW6BV?_y1vnV*`5+aWB=>z-kY zGCNE4ExJ079yHuOz-g z*be5?S1HJ!TL7Ai*Z*8jOZsM1ns%h~aYp-5^9B%Vuh*l6uacX*k_+F$5l1x)s=wWS z;rv7ulJsK!4FLDuwF?@bxaKL%A5@*m@+`_CAOCfUq%?obiu6LcFi+iSvf}ke9lIya zPdvtZac_H=5~r5QDwW<06XbAM=4F#(X;69z^K^ie5k#UB^vyU(_!CcHbaHwIw~@uT z!@BKiJOB%3>3)nei}i+EQSf8EbY=1|~O3)YAu1M6-wf9CyuNxk|*Vf|`I`RBZ z?Nfv<)z}zbz-nz==YZlbx-{#V7u{pXjHculx7u%>H>O^ZO~b~fVxK#2C{EQlQV;fc zB^JN%5A+wf*o>Lgy&M?L^ddnMOk}F5mi1wy_6uf9=pm9 zZ4}9zveCKhLryL@R%$i_(1Qns{%m^wnfLzn%k~$rR}grl5SRv~_fVH0Q+_>D;M8x_ z^0J1?(GZ}bfPL_7(tFpF%Bizc;}=tBXAhbTaHXluUwYD6G>X>S;FaL|n!UAR2dtDb zQcns}44y^04m|&?bw3cPksAwP}W}gO%jb_d}AlN4?a4->+@(J z7rO&e6@~{MXQWOS%wsClw@`XgXT7Pl^@(~$EglPNujJJGQb@I@h0i zI0o%CV!qp|8+Ge3N&k<-FqV zNMeBGM#;(y5bGYK%TW6r>*!(gJ^@GwoqYXbt2FgY(!=nkSU;oO15M`8kW!(x>j(Fa zf65xucg}gyc(=$~va6Z;bRi0kFHA5OsLI`?7$uA-U|0aDyA!(BE;jFM<6F6go9#*c zrOs9a(9+fcaX-cTiyPBbVNU(S3Wx2JcQ5U5%(0bA?M*95j-kTM@}Jyaao`ohc^6DRo1r(e?|6sBoxAu~odqDT$7srmfCQ_Zf(9Oy zMo2}<+lnEMQ190jh@Kpt4&FCVc|yv^mH&V-qcYEQzEU{q-lq)oeT=*-Y=f4h_ z{oVro>n+Ofsv#7ctE)#ebNkq1$4MD?-wO@KIm*Y#@51h&16s-Riyk^ZU}o$`0b(pM5?6f@YF{A!nRy}%O#@A;rE(JZ=Z>QOiUleFQI=r5o zKRl|tX>>b0*DK4iSvh<-5b1PmZdBEKQSH56fwd3fK8j(EoPtzlV4pK=>0)!;^1JJ= zqUSZm{Mo0(BZrTj93*A5;x&CzB{VU}jt(%pLH1Mr-hg*^N<+OAr$T5r@p7}4x zYzWv>lXH4j;28Em?OmpRpw*J$pAdspOmGsjXDf#p3M3>}Hu3~NoOVQZ? 
zYulgW7@z%lvC|uFeCpeW!>P9A=b=i6BKnfc%}X$fc_w`>kn4(t2}&t=b)m9@y$3RJ z#{naqI48*7;c5_v^TqB&Tw!5qr|<3=HP$Shxrv0d$n-pu7<}{QQto;0AlJw%FOoLb zVqi5{$;M`IRTe-V`Pm90ZpQ67KW-v2GIYY|%5D*?=&{GWKiqB~-LgDT_CLD&(x|4k zE?X28u?wPrh!7PJkxr#knnW%F0wU4~3Q812IwOJt0YV}uAZ-Lu6wpZD6X_Kp5kjLN zAdNIZ2niy+Bs3wA#@bfBs(0_J`_-tg>bv#ccz;618H{sA&faV9x!0O=E-}X#-{@N< zE=j#C^PnV!ZKqYZebEmnGg*$+gOkc~D0A(+eAmHZxF;*yNtE16S|GX|R+#SQuh`7A zDz#%oF}fYk2(f455{c&IS-;$T++obuPBAAublGNK4ZyqPy zFDNkd$}M-ysqpbCx}GWjWgnYc4ShnF-gZS>c>YH9mAygv3f)$|mYF~#_AY+^e>VdE z>Zl{>^YsmwkpLNj(=_`>8!LkyIckm{{L$KTj(sT?W!Q9Qs!u$il&giM@F(0ZM3d+^b3r!k{jYGHnNavLSr#vClWBRt5k*I$r&IiE=1=t1!y5pQU ztJ^%)UG@3?+Oia-RJ1O#^-NZYZKANyIAqXrbiNTT!8PI)qYXJaXdj@YrM-|c!_$%* zIF@_RH~N>~3^hJ|>T6oyS!!C~^i@Mtez~bG(OmLMpI z!)F*h)E$;HxUn>I2br!G1CwkI&Sc~5d0Bv}CgYBgL!O*{Y@_I1aHcS4L^)Y&5qW0I zvvEJuhjungopX}Rc{;v9a&CzPkYQ~Xr-j6e5B5*$iUPEWcmLrSB&sZ!_zxjs2)8#Z=*6I~j% z^MhB+DMNKN>kmV)%kNIM`yt8+Ef8t6S0z*2Gl!Z#?l7S_VhjFwOQHeV#T z&$qx>EzuJv2!z6BAT=M5VqNEDdIS2k0y7{q{t4Zk`l*r`89t$d$={Hf_42OEN~~^R z==)iv7-%A+IWQVkWOS0|qh{i`UBRM_Pc#TFM=0Eiy6JcI4(9L&M%&iN!fkevo_TcV zg2O)r`fG3_G0UDj$FCsCw_So`gi`y)&gIC?>^hCd0B`{EzYl%BJ~ZAJvh_loI%}Di zcBp1=Kor*?pM8t5T+ZAP=_8o`LwHLoT%4eZ*nq9KfV(E(BS&<)hJRr3rAHpb z{qe`R!U3P|RO!@amCjanagT#3_Yl)T-OUBUjxE7s_((Or6#G5xIwQI9g`T7E$I(59 z9tGMc>ZzKQvurMFo-m1FeD*v3we>`b7Xp*yCP%899o}2m{>C)0yaq49czP%!`?+fw z*)}NniRTvrEgxi$*xF225AleJ5BDwtdzMDn!q33@4nxFLIYrd?JPw z7EHrve&cz?f^uSGLs!wep5h~5jMf#NGyf@|ppPqHM98vH1-gjeZlxd%06v-kEd z`Co0o&2fi=%UTN;1I>2;_GM_WEos4)d!Msa5wmp1OpoYOg69j)g7yC+(XGnL=@4)w zi%uT%e+O(^7ffdFvZ8z4Cf|-xGC?*FVX;|(XE2sUW zO0}l>P1S=iHU38@)i8tiH4I#Gq2Yn$)h)9^pA$Vmgj04|faL>GhHnBtQ)d4yAgZ&z#HS+46Thn9qAOpxkdwE+ukNpf-`VB%`H8V06Fh}c zJg0a08edb#>)W(V4rwQBJ3Yo}&m~Ll4l(HPxh(2vu#s3J8_P=M;kniYcj2-8y-!lJ zEQ^fx;aAA7y}N7XbnUt;QJ9{rY5x@~;3i=n_@sZ)+Usk>cS($nHrX!Rh&R}bSUdZi z?tFU>cURTH*wLQ?b&>#Bn?);I8Ho+}{;7WN-`Is0EZh*EYfs+rgNV%7Jr$qn3m$qB zCBzkeD5jpK*wl)YtEo!l3@6iV59lOXMI(tZ_A5$d!(!tYB_2>_+<*j&n115QXE(DxYmbUbWtiW)uWm50H3rhR2PW4 
z%#mvulabgS6>b)#H72a6k&JP=PV}`bbIXnXP+qiSr*r8EkDi?2q-=@ZLv!t|)K<0j z%#8nNmtDpk{_ivHC!iDszVH(UmKDtorQ`RFpdHw8G%4?gK`K=VOiMCm3v+{b3N}vp zl}Ypo!h@#^ZIVZI9V@EJhFTePE&mJWVeuLFPLKr9Vz1fWbiyta$W6>gIkhy3uc7y1 zBW}%)g{YBzE>U8xTWC?+)(>75SWr}zY`!fd)`1q~J*pK3IgnNn%t!b(G?HGZqP6`H zzn?E!pMP4jM?NVjz&ETlvTXQxTxqUX9;b6P+Ha1D%MZaDfEj*XJVlxrRU#FWz zA|HO~#(`r75aqZzxbrR8Hm+6AveKL0)Z)E%S}Ux9neUEf;-}V)`EN-)aj5QitsCH{ zg_3S~Q*^kf%qHSyq(UR}L}E`fOd@S+ElYGqVqh0;Jb%qL+6T*8a^^(# ztKXD)mLk)I)l}@IDvny#*z zINzlTNQF7}RnsQ=w`~jVx2+H|^@N#HG@xFU-UQ#x)nXDK6Q;^|ZF5rRXGe0ek~b>2whS`yEIo}Ka7%&h&&c5Mm2w~eU#EH54&eZKurHN%B${ec?f z+C`5yT)uvuTI}n9o88J$ak6B!!)Z$t*A|bZ_8ZfOpa$Q?pihG3#c1?5juAd3%M9D= zoSNIDkqrFUoguQ1_Ri5Sc`lw!{udOy-+m;#2YaM*oUYkw?Jh8p&UN{$T&rf9qD;O> zp5%hUTLhYyyJVT#fT+S`wALNy<0>Ho-VMCdGU*qatFVWzJtu$WZ$AkHa0I_XLq^;U z5+tFY|Hf^RuR*in6Rk7hP2HBv)t3qYc92-3Bp}It$IH{$x?!L2ELv)J3As1wgwT`; zS6!_o#D;H-A2MuGcbmsvpR!}YpEme74H`kGF*a)SjtcPVAC~lgEie#T8%k+mpU;Fa zXp;&nHNI{D%#+29{1(07zy7JMET|CG;)!v()G}ehJ&J3X`YG^Di7C`lJwozgUeAR; z5}Ql1;%6}m*{@5J+1b{JwF@3wd3WgH40<+{(VT`|O0m+ju&~cJOYk@L~ z&s-MUg;CBAn7mTWM#!V?B?#Qf0XugurQ(YGFs-#vI3guqL$ND9QB}EXycFS=!pG$&}=< z<-2dU`GvLG+!$tM9>BqP3K@W&qFnE!F2v07Mj$X5o$Oou#@F?x)zsEeT&|5)lInaVEz-!n^PH)?;W&$MB4Ws zdNZ`V0D*^t-|$o5%h%>yX8BnV7?9Fj;8xn@&6HTHPwwIGa5TWiPTjZ8S(MOYJ>+Vsz(>B!|x zCO5%sP5A3&;T<^kReV??dWA9(^CJr$64_5-=B*SkyJ)x+!Gs=c=!Q&mp8?4(nS_o!rN;pxt78ybmBU4Qxj!#?zG z+<{XqT=TmF=?c?U#$KMF9F_MuJt8N+D=gNoRZ)ILE_oEq^IR?!lQt{!eLOwnJ&z92sypQO zRMDPO6ubG_tPy&CjKG%9o$b%}0^h;{UZJrWVnzj+02f>NBNmq^We1@+C;qLoF#`fxtb0XI#MD%jGLV zqllZ?UowqWZq`~}2O3gx_^u~%9jNYHRLZzTCn{fK} z9_KHqokOgujPh@r2(j$cApEJQBZI3IpvOV(7+kblrX1 z7pL}yS&Z-EsutXl!j`>%NFVugB8CJft+OeW*6yiRVU*|Lt~YL zU*P2P_?_?R%k_Qzt4AC}SIxMIoKbE#FB4Zk6_@@4Sn}z1z)uWOOn#&+!d_<(gV{i2 zyb%VB-1bweBWf+j25UFc_gs{ph~OV|kUlg$+{q|6=VXN3e>{6!X`}*q$A~GDDqc#P zbME7JxPH>Mv)`)h3%LNwKMik=UEFtT%jGxGzPJJpxJmh{R?vyF7$IqyoB9_!yK?#q zhSA+B0Lr)e{ilRNf1$`nY;~SF4T*(UxB>m;e2YJ#Qlb&Q{xZMQ-!3JfpJ0PKFtoz;sT47htAA)5{`@TDKcIozVLDl+);c( 
z;fDA+Zrih$BUIP7mogeX&+l5Y1j3lhXk}hnZuMu(ev~5}mJZ9krrg&#ihlq*$do%= z;GdveV-ff1wpNgGY>~-+mz2NDaqAz&K@oh$e!{RBU1>I7mxI2@z5?Kz=x+sfnte+T zZ=H4F-9MNZIebQQlrqm1#gE(qUE|o##9ca(VY&Vw8@w`ILPpn5iP#y-b5l9Y9WEJvN^x zXVyFR^kO7WBUYM;zah}?;_t2D!f6NJvW>iH?MTF@`%aAObJ^7|SKqZ-q{Kd^-@?tBOe?7C!Fpi$74$ z$_Fni+vYn0eV$+a;@VPi#07~UU&NZxIXj}YTW_!Z#~w^CwW?({SILlbjk_VH`K5SV z8FeR2tf4y<;EFoGt872PLY=u&n>H;v6*KTG1`mhPl9Q!T=ODDy)TsBzrdCqI28Tt1 zH>+x$vF!3XvI>L6>SwuhLcT|mps8jv-Db-DkQc7Y1;L8ugS~uHPU5Pfu4!PiwMlW; znfSz4iQ3zWju++bk9Jsu*#5rm^ZW1r!KVFRFg*CPJ$to<1u$)xD;gL}68EEWTeR`w zL|K&a5M3q)nT56i60ur?jXC%+^TRvo@k%uL+Op=eJNo8SimDcBPsF`=VhhR4dj&c2 z1?&3oMi1H}i)}MD-{`$3GyTnWLAs>fX@$L(AVqmfuSFD=MpJ8fF#M6*BwF+(tOg&gbj_R&-K&gBS%& z=ZJXkk3*f$vAm>5!~1~YWaBSkW=mm@dvPB9*dC9?uGE+xsg?})W}*}%I#S+UQMKEY z(4;;-2p@d}E$v8Y;zJF_;G0WN+A*l{YA%OEOi~swl~x##+%?ADXck4U=VfTjM~}!n z;Jy@R=So@_$?|7|9$Y+5Jof3GEjFNknxfQwiRG$)O!R^aCBug6VC__3`qGicE!j(> zPqx)*(b$LR{f}cz+8|&HU&nie*_#iqz;buJqaTXfCd*T>VO(hN7r?H1MLs zbbnBPK6(?cisaf!v9hMppv}@pR?c5roDA@la$J&~7epStv;(psPw9u(7q5?eT?`pt zmOCdeivW3}V_759%D$TKG`!3i z2~%e4ni|Hm2k9Ts+^Tzwx9W6oOYR$?NiPaUnBPk(Z<&+KDamHo(^*l3ac%wV;sM7z zu#i@Vl(m2j!0Mv2Z(7S%ONwpID)zYpNM^pz_?FzBnYlg9^-T@2^@UL4Y$zYx^?FI(-uQVQ}n>O}t?xs>0GlROj87yiPLM*H=qHUfX9W7ph@ zB^D*FFlKQ%Bf{{B^kCmGla)uB!AP;Lh$i}Lzmt=;w?lfo7N`?O28H{CK_kl(8+8`&;*{h zzPuw*x^42d`jjJ406aGr)5zE9Yl%?fD`|&Us>eNb(&#zVy(0K|^>N7uMK{iaKV zXMUD!Is9y)Tb_9lcYklK4}-F1uX?LiXiPWN8rrLzV}y7m5ae7(7*%U1#*X%r2Wla| zc98?j%0_ML4>j{q6QU!>kFC16K~abIblQ9@L0{Nbqp`|ZPQx#>?ojKSwvMU(FxAli z31Hb>21btb*xuyqG#7d7^T9vR4gV;I3!KP6{e6aY|0-wsv%mkRAWmsOX1c@C{oCg5 z)2&7J`$dnX1-ydbN&ns~z#asCdG2f^SZDQl&wmQIfz&C#1G9lw4IumE%k_+>FKBZI z$1d<*RC?UXt}L&LA35}-r^%}DgSDn|>qFs>TQZ|(L8=0{I@AN1&Eygm+yU5^g3lC+ zz?xPxo5ZSSx^cylSr>Uv_;UOnOO6d!px>T05vu7eD!m)iS<_xh64&zmx+)XEmA4%C zUSQ1+3kZkPbI7PW8H>tAm-Fn?`KT|8~z5catl^vX6hh5W$4*s&3+(`lh^}Pzm z{eq}{6lmUjWoD=Btw1ul4 zQKlY>Uz75>$4k@$L%GqFmk-$Y*a&Jx_b>X5fQCHaoLDF{-Zz4m|0wspxp6to9%cfJ z5QVJ+-|4l#W2w%)rKm>>aDIzkZR3mKxMFxd&V+>pg=+6U>}ql5pImzd$#f^Ak^Vf+ 
z;eUSlSNQ+s4`7#*VZX=%Wo#U)l$kqQ1NTJS6RCcnw~p4iOZpC9a^=!Be@(Ap&;D7y zC0E|i?Fld55yPUgVv$##WB-4O8>}Z>5?g->dxH=`0Ter4K1RSx)`|&&7Ol=Q;J{|I z_?PNzMTt-Y*q*bds{ne{@S5+hF@_kw~7nSwHxF_A4hq1X3~md)Jm< zMn`|XZnWXT@eDi1hcbZmh*Y~Lpd%)S>Db$G+ZaBHe6P*+IxtO;Aayw%J)W>>Ay$ zw(&`8o}2I+$y2&ehN=5RKTHCBf!&lnRc$vS#>O!@O|ZiZsVffpZ)>|1QVw@7%RW+4 zwpTawxf^~@HF@hH82S;UAn$jEk4hhF135mJPuOiFckdmT))6gsXrS8|PJFn<+ZuwOYgT>@6W)M*pP>-iTS(h0mO$3T| z{Y$AnJzdVJOKhA|k0wE+-W!I`(vGM#y;;2|`})|>^YQot2O8%6E`N&_SecK-S5s%< z92KrrE!%@>Ne_Q{pYXLfCzF2ZCRaQ7j(4s~vr2JErB|C{Ky}YkgDuU8Bc>v*53kNy z;qGoY3bbw}8|=sXn?c%3V$`i+D|{%1mfY0tEN$2bhib4Zz3)5R1y*`$jYzfg4A+Qc z7mw@>HmvWCZNU#6PArZ`ZHTLc2J%FCX|)RU&UL^SOOT<4yS0t-BWAQ+mF$r8kreV( zE5Stl7GJrhv@=LxcJg;`kH&{#y0F6V8;H#m6m@LJTdbRFo@Z3f)aNOf4DrjYdDW4V z@)xgFzK-zt?)#{9;7rH*WBN)-T_mL1KtEmR{hT8!7pytinIJr5N4IhkN@jHUsrpY^ zBIeU^Z7GU#C3N#t-SmANz1%og*EbiooPMQ=0FZ{Yzy=4L4YNGFD|?YyXcc8WZ+G8_ z(VVQBzWv#k<>2r;-X5&ygV(BllT8>2eC`r`ILrUUW$43NbGK0_6jKjV=H;NJIHyqR zV~B^|pwT3B1KNz2j-^?7`poS0d+{>0V6xaJ$D!rsm2Y8f<~=m;5r~!#Qts2M4?9o6 zPr=>bloeJ8udsMImn}PHUN}S8ahWXiP$R^RT)hU%&eiBzee`^`a?k#^QSzr8PVPOL zxY0ShDlO;&+JkeLYs>cI*Z~ZD9wv+F0b8|YG*Zu;F-;!DMWPk+P(W6>H0^UwLF^ctvp;HDs=?b_J__MkX;NYceggnoSndQCXZsG#)>^V8y(u4nuy`v zbi_I@2mPvYjm--dVo!#7X<92KrI~Lp$~~GjzPcsMlT;5G;;^9XbmqYx&_P_5Y1Uz3 zCRba*SbkC+sdlrX-RSP(_*ZiFh^e6~|g-U*TNkTC>9$Bk(YiFd8z37eEDACp$8|AE51-J0{32wqJ@z6uN6a z9_&$jzE3z7ec&1#$GJ~4&_Tu{uJ25-;U~TYlKPm7o&Mf z_OG2?UI^~K7O^hj%a??%d)M!}UX?rpO{6j`EMq08_n2QQ-`6(cU_MnplDTu+kipbNXzLGakwC}Lj@syU+ zsWuEpc1FnhHp3WD8X6kWW}DEZg-pB-rzq0vqE{~V%+0w{&fYm-Dj7ydAsrEyNoLm7 zt|x8eI#I>^6v`NH7{pg-(?FTI&lf{dEGy1vDQVfMOpI?}zE_CmzF-e$xWn|=7sfgV zCit6IIi{%2F~~hX++aXcO+XA<4~-0KiW3q{rMnOViyr5`tG*pU&qzd|$SH)eEG?Qx@H{+WgM(tZUzN@jjXi?KwAz2$(QnK~{R> z;O=nBJRROJY~SQ0$`~PVU=Aq5MhS=8%#H0O5BmsWwdWQ0e)$x@c)j;##sw>X#79_Q z&@y~8rn?3^`_;7%^OIHh@*U-17He}pM_dK@eheliA_oueXQ`YjP3K7PxI7B^(VQo3|1tUDnr{G0u zr5ND2m>CR+cPT3GtAp!CKY@~eF*{#AI(IL`0qXBb1eGi&` z*S-P1hcudrsHBFY6`9Gb8V2M(QJPeb3w6@1`>ba~Yj@~gtu-YKc_bRj5JHdv(hWln 
zdxmpG9P4VEuX#UBR6DwH>4g+4l2-wDr&84Ei2Lvxo7k>2>IU>dW;h|dvanNnG9^AU z4;5ph6Y@0kZWTCTzN@l;?V*vX<6Q5)Ri$;je|24O+Cgb?lO;uh^$tu1L(W}H4}Mbx zBC;QF7QC3npy64aLSt2R*>up-QMNgm5a8vetllDazXiGHZi?+&QPabQ`>t)Ru0lKU zNU)PAT^6N=Emepbqg6L_ZZru=C(~3c=>cJW&m(b97xrY~XLV}r?O5VAa%K7%jvvZD zbvN&Mbx8yu^WU##hkplV{%?L?rGs9`h=|&gOa(_Kp#e$>pi}SNwcLfuKt!cQmq-XT1O*O0KJ5`;!eWhESyF-sBL&rk+dfW5v+-l)5kvzFbP5T$tjIgSBOE z+;1$%MsG^_GMX?JC3x(}=9PlOg7sXGo!YR(RvYsMl@us($23uac{<2#PBPzVggp`* zJakPzHd#mi<>UROF1PIDMJ_hBeJ^S}(_wn#Hx+0+oN;b`JFs9JAI@zgP~7NsEuoc4 x#abZ<+F70Ew3_YAGS10(twW6?O;Y+5zNY)Mtujk}O+FcgZ2!mkpJRUx{u8!thN=Jn diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/acc-vanilla-lr1e-3.jpg b/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/acc-vanilla-lr1e-3.jpg deleted file mode 100644 index c06130523b46a90a8bfaa33bf946fb97a8f2f4b3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 30033 zcmeFZ2UJsQx;MV*QWR+-HPS>xM5#)P9;AtgihzI+6_q9;A`m1b3L;I2f`XzDq=le!=i}%9(-08c!Y?2w zz|X%$c#ELWuP^8?5n&;bUqAeMlmEDsn~#TwPe_1Y;7_Ogx868SfY=tUDegNwT*?5q z7#EKi7pDz?0RR`@pQgtJ{O5~{n}?T=UjUkhFmyuIc4+!MJkX^1pczAFM?l8`UNJs# zrGrQLB~D)yP!5#Tx%=ql7L{Mhnx)S4Q&n{@-Ha3z+9ADDMpkXN`W}tFdin;2hYlN= zAG5HuvbH(yaMsbu`J9WZ=jAI`y}W&VgKh*K)S=b_<|(Xnw7dHUPT?A-Ss^R(ra)iwG$ zxUu=`y0`$I|5z6E{vQkbi*<=X>*D_P83_EkE-vm+Xz+;f@+lqU7e9Jh;9{VJvd-Nt zlD|B9S=KD5qI-rab?Iim&<<5Sk{a#TrTuATe{5lq|Io_*|AqaZb&UfeJY3Mj;}HYk z0E?L&zX$k#bF$Ng5`wh;kE%yKIET>Kohn_*+HR>!l~* z^6iRy@5_kL@;dT>OSWA zrIu$qQSw{ATGha6?)#WCe%8~2C8_6TKbP)*Vb^fe|5DQi&vwzzSQzRTsz6Jrpa;kS z$T3NJn62N9u3fUTavNDY^oF8!>LHJf-lx}5sv>3bkIvz@VZ2Mn)~7}9>~GwMQU!lb z5hX#_&QFYhIMiWsxGlwu*zLX?@b124qCtVk*u^_Hk)C+;?;`;K%)sbThIC>J8JjKQnVsYx9Z>^qAoQe1Wj?WOkQ_ zA_s`RYO>8t9Mfup)Md*e*xRw?5m<%*MU0iha9+SHcf-df1D>sf$ptNw@0fj{VkiX+ zIi@)^G#0G{^3W^6DT^?_V$A_tht z_K;;gWV%5c-^I#Zg^hB66X2Cl@?kdD{A_qY%%dMYVf!;vgX)Ex2y@;y%fl`(^U9^u zr!YPoV7!hV1a{Hv*@BFzrF$G;C)1Q{YU=>L_bh4l$@VRCHwh6WMVwLDmu+BnO8Z7U 
z&s3Q+kFgv8yK$a9Of1jCCHOPBf*B{6aI8li%_LK_-X zo8Z?femIc#)nSwO)}stLpF{f8&}IU7B?po-B&%?QPT~Of_JD+iNoh#ju9>Msqa5$~EyaSAs2X7G@ocVGC}h z`gLH0IKU+Zfeze`g?d~pOlr+|DV8Y4!z~jW;p>U)J7BuCUNpd0;E2{8d7)*|dkAq8 z0fsWB*&pvsQK3JiUC$s{v(1oR&9~=TiUSb!wr-}Ev{X)}ox~54Gta!fuJkN5V3)yi zni_pNnQPU#88hBWHffZm?Ji^7U~*N1V(RyoZh_m1i;?u!Q+@ge!?hn>_YYS3Hq+Db z;e>;QNap3fXr~otvx+w;bE-g0Jt7BXI>HRf8U4vtNt(A8^$tycaoWQ-?%M11tpUOZ z!rE+=^`f~(6>L3~8&x?#WH-7W!)x zM@|6;XfZ8p4^t>^7{=u%Tn&wTX7&8rr?Xyap6V}o??f2CK~KWoF2&^D*9xb91RXDr zi1~^))nK--j(&PHDwDd=@uS1(%S7*PQ{@BJFdxg#U@{>dF2dw_If}$}BEAR$o*Jq``e)VG);{vAj?_@SP;R4Jp6i`kQ2}HnYffo{D$)x+36ZAxWl#=RrYB%Y@U8g`G_A%bDwKf|&-#>e)f1O$!>auZ zts^ZZ_SzPuv#zop)N2+XrRm2d-fny<7?Gf|zye&juAQtsc6jJ5I--=CG(u)gmj z@5`EM%0^eOH_yP&ohVE2Hd+aGFRJslAyduMQuaNylb(wh=dss$YB4uf6P9U}E`q-P z^?Apax%d^_7@jOZc|Q#k_Z`Mm87}lsA$WX3==0-R>-X4u>`QIK_A@1uHT1fVZV`@p zr~8d|ceq}iEtEjsDo3|ry=(*^fwQghTOCBfr#OJ*d88=0B$v1^5b4z>^vEjc#&oXJ zc&_ZN7gXM)@IHYdS$rqqt}74tN-KQY{@$DCO)X!*q3cX5KN%|c?tv{=eMei&3HiTDDZ}0AGxuf}GXYWIa7#=r5vR$Ino;HtSdl6J6iF* z%{NyM*xs+SInnf*_eJT>!=^tCq&nug{`*?$?7`vmRiFz*=Wehh5IC$ zPunkqK~mgI-C#XI%O5Rb`cWtIsT_brh%`~q1edJq5aMoc8n5RleDmS^@Zk7$d>>s* zrS0SosqZ}NuNu8sP?QniZ#a!z2qM{!Ytpl5*hmwZLS#`VdN*}4L+rEj$;$e7gKIsm zAIB%Zxc5@jTE~4}Ia!44JLBbe*1hP`!AUX37)vlI=@fa5c1>4@?9lMtjZ{e8vz%9RyRd+~&vmOChhzKTIB~EUFac zTi`iQYpoQ2Nh05J9>{fP8jdMvE)TE3CoWVOQ=rHo z(!bvImht_DaIHOT;b00d_#1<}T2d7~am7rt(*i;W2C;jHF+x(X~Mz zGkNTp_?{}w=rf)*5=7tRnCsM1r;o9)Q}!Yl@BC;z z*c~>P(f*g@hTYT0U)bhrut(oZ@92^h&WhZCc0t#rLG8xE*ZO^7^{Bm44mNZHud`2VA@8m&2qYZR8|1>*c-X zYnh*hQA!LgCO>3@>QMkKX5lL>6-rTv4;q!ff>?VRe+(3$k0|a-X-QZ4G3eg)#)n|C zT>UT;bz|Z!uZ5mNrFB+`-)iq2p*`9EgYi08J=xSOOzC0Hn|-I7WE`TR2$Ed-Qq-jd zZK@T$f%OV=sq`MAG9U>)Rm1`0udS6TGWGhH=0&bP93O5jZIp?}OO1#ZtH+&%1O7 zzwsxTF1LrZ` zqD*xZ9|!o<1Kb{3Uzlc?%xB8d45RencP2&N^)vkn&GM(Uc4LIcO{*Jj|CDVQTCg;& zaX1ja>+)k=F1Z&RVZ6 zAmS&tcuJ}~f7gX?&dHcoG4ud_H4Qe-!vO?|T$mMIE9OCIiY5oJ31nr#sTowr?da-p z&0_@MSC+qX#xxdX(q3HEk^Y z-k1N*B#$WmfWny=xF}k&=goPfKvTLwjCySFv*3t`)rkF?o(1zd%BpE;{Q8Qw79Uv_ 
zYtD>X-bt49jI$q2qw8Q1!zNC+a>G`>ArtfPt+V(7QQ$|*w8Ojmr2X0HWP9~n18Qxj zKb3rYlB|Eje2>aq4sc(Ym52GUA6_ZImT2VW0JB%I;3e&GXLj;5;|PXiJf+RP>diA` zri=artFWiYu`=P_h_UIWNfUP0%}oSzH(I=6$gHDN#=CJ)V&n4EjplvH!@_o{|MbyD8{K{S#I9WKB17 z*^cJ4nG4R3nm@e3-Y1sB7BZ;5OSX9MDB}Rin3}sOKa%2IfedJN+M%7_6%w0f&lRb5 zP_IkUPwBi2ueY3%rOCwa*^=KiMz^Tyt+*Jx@U7j{npSOY?@9SVMLL;QNfEG(@8t;! z)U-3{IP}E%Q?hfI0xd1{s%o;`8ESrWKvX6#jTqagfrOC^@81fLKsHk*+8b2{D&@k% zS^imP9?I3l_SxL@bh=`$V$jzn!s7wubibp=u)p5EK7m~3Io3lCaCr&qp4c9( zk%)=<83qT^LrTd6zH`2YPKAO>o~dN@TR`%S+?#&*va+&)g^PoF-G_>ci^=DTDN_%W zkHsG-EIf1yf~%Xj8$9<0=JthHrx@_9-pvwa0XR z^mxZh`S(<4#I{MD<(;Sm_z%5-MRS1ixrJ`(FPj7hP@7?dlxUGQ3Q042mj4BbfXoUi zwc4Rae#OnQ?^vEv^119s)8{o!tlPx1uv&j~)g#KR47}HOGch+dp${cYy@?d+QL?q? zRll#r|A@QY=*uy2Lih4n)sN+`1($2@@7yI@_ZlS!niu&_DvhimqCMnl8)+d}#T0V) zMB=uuv%{6(uw&8y<)cIp_nVdI_+LKo4gh8$goNL*QonCXDMpxP{5_+S#Uu*mIOs^K zb6eW-ec8uMqM@dbGT<+5|LBs7mX`c3gFc;Rz&mH}J@cI^-_GAmLAT+{5uj&i9^s^GM}^PUIojU)!nQ+qyeu zi6ute^iURV^()U&>Z=T@cJuHiBD&suH=dMe026UlYkX8>Oi-EMQ5H*|_)_upd|Bp! z@*VFVr+!Mrsr=q8f7e2y4cVV?6#*3eymQ|XmDq)iDNL+^k$bECt1)o8CFFLcQ*t@Q zN7?rE7t40!zAW)Wxu6E)0J;ZJ;e>sS45bILGEKHlZG}5!Tpiknk|1fC?v?A}riku! 
zZA(z$o7%xU9lwnKgG)A*BjqmyF_cl#wBpV-Tok?>--;0g;}_o7jWBhm!i9y$YFMn5 zW}y!gHiVd!6BdZU;SmfIbdLu>l^jow{(O~+h`M&Gx)zudm-Bx3jsQ0ZD`7X@ z{G&JiKCU^Z{Rj8KEHV54z8m{$5=L|AX*Ct8B=}Sb4Ax~mcE?PCx=Yru!|Ns_Uq(VpSN~;dCHE*zN(as z#~13}0ZP5Ixa0Ty?U6ME|0Wm60nS6&%u#*QqlK6c2Gi_!aCW3O&nyvCqrVK!!!*P2 zUSE%Q*rgn0n1C*1l$YPu9TNiS>(3E@kB*y_T5my-t;#NK72lP}PK6sgxj-SRD7YKL zeAzlB&3aw~+1Ax=e~%OfltH$7vv5Zt(XoI)WS^Rk&)MRl;5ke{Z=m!eI;I>m;m?2) zqR!hBqEBtSeF+XuM?IJwmEG!NIRsSsj)7q`CfuIP5Q}}T8!ZU@Y+)Lw*7Fh z7Ebc`j#*(8LWBc(ngb-1W@4s;@N52<4UZhQp)m(=8sPv>=2s!qa(*|KEy4jPqS_2E z_DetL3OxIJ#8wU_tlvL2uQ=2?l5ZUE%*it!W#mi9cmvTz-jM^o7H$`sdIlL zFXU4u*xNH;>>bW@FQzJD>>(xqb14kx$?!^T41F|+_%K#XcY9JwNm8tONqu@zx{Al@44vJd&+~96Sxiyg^3MOF+_VUz~`ZFzkVJ z{DW5OHLGRbv5yj zpoLM_28oDyCpuNr_fAxPlY3A71%%*ytC!r$J+$1CZtt}nDwLrF2s8ODx%jWWzYTH1r`I);8v9}U+wu_V#sd)#0o z$!%;lS@Q9fYJRbdXBIp4xF(S5gle2q|KxHQ2@<0^&p?gPvj;61p^DagNxE)TF(mlL zHe?LD1=`nKn$rv*dD`NCFwUA-k#Qm*{6vIbd!zE(0Vp$Fznw@vD$a8Gz_aoyN5fbx zMC15D=j$Kv7GYN#7=h@HM%fR;a9>Q!_n+(83GLO%(9^t=8;w6ctq)OBM5YAs2xdQn zo;HwsxN~zyM)*zmp$ooC`HyF-8bwTqxoRBXLuSPJ={-5j6BhbE4`G(A`aV1NAgG`y zs19ezqu}5PUpzCxzWP@f2GeX*lJADZ`v_m?y&;mR@VOte!|#Fs*)L#0euaJ47c(W! z0aRdXQA(Ha1#u=8ZD$h2mP7N4_T#P?RxXAMKJ%0a(RZcdl2In zCU<$iTU#1V-UPPe1rE)&7axD4xfZmWIQ(w@Lg44TZlg2yy*dbn&zSvzCP%EE=Fw6O z$huNGFIu3MrUSseoeOGH-Gqhbn56B5lPsqBL{;L2 zh`MIXTkXys=aT_zh8%!;Z%b+b{jrRo=6B5Th5HsSj+^v8$i_E#M|@e1?$wyv*r)cf zsKf47;#`1-buUqO{FS3leBdOx;vFoN?wU2dJ^4xKDSmdqE|L1RkfY zoX=GJN#wxvp8p`jUR`9z^z5?#fapok2w?)zb=f- zM%Y@Q$~eFlX{Hp8JZH8SoTg5Y93SA9J6p_PDb5WRt08q}9!s|ZUTJ<|b2AUnLR++B z%8>9**v{Zm`FYLYeJg&O^U9wXy&-%jrKB@*)Pdq$o*1*=9A!jDx@L4vW8Fd~0Gl~Oz-(A1b zZ^FYCl;Z$<3R&q8r5lB`g-C9C)v7*f1zIsgfmi|f74Etz`?p~?&1v|! 
zFnyR+@tkbfxto>%02XBvDjZZ7=}8@U>nW+ABBT&Fh0~ zQ)LcN;>RqS!-A?kdDW1$=EbpI?9)~jH#cn37R-J9Rpw-(?-pj!gc8*w-$#7H1-ass zq$=lWx&6JP0>7({{qzyJRVNrz`Su)3Qo)(cf~pZ{P{<%L0{@;hk6)iBa)87>{uwDK zqS%u8oHIC22I69-^k$AZV!2%CG1N?DsV9ANqu7lK>^WleAeCQQ4Pm;@85H=H)* z4s)_ug-d-8*-U(q^nSc&Ym1S%&JF7iJzk^i9(+Z08Z~m79zKZXf(%cSVJaaYbP8#r z<(+Oqec#k?yVZHb-zYNo&UV`aAI)qso!gBz*ED?etntRhd~5U;>z9#Mv{YqT(tNMP z!TXaI_w_}^~7d< zwzceS&Cvak^XEHxk5?p?TbteZV!Um(gel9)z<3eIx~=M{>kI3@!jwFNON$UY?m0ye zarL?JrRPCdvRb8g2{;|dJ6!X3Cc%xgXPak27Va&yKR-+y>&}|Mv)n-Ru(fAxI@x8; zr)2;2ozhQw`A-5ocL3l9P_zo1wvhb${D8P5Y z`)1pwZy@&sUD|6Y%owm;@uR+9lyvUafao0=FoVWdPk^Wq9x#Vo~OC_$a zb01^9w2_OR4`If0ki#nhYuFaVQ9_fVgy_0`JPM^<9f9tQ#0agz-_rWnn)l%CX9QVa z1~7N$FacBQOf@4MDgSgSK2?)umXqD+in))VIxWSjEKKfW<+7!aLV{+@NkM9HN1l%8 z5pC-Gb;7g_2iVp)$P8mFjXTh5z%m*SW|N|W5trMl&kFPhHi8ybF>ukY(#x~}GBz3OVDr>G1m|8Z`CzeXEaf7=2eP=t`tI;Uu95N0@#F^dz|xL)qd1 zvNjPFKj}KUp!4Tjr&;;;VyM(xBSi(;y*_9zR@ws`?cfs_=y(WyDu8Bo4$9$tAUVLI z+>F`!>@ThXQE$F+0QYQ^CbiF;aJk`q+Yju^BDOqqjvcz|zu!7q{Ge04QFL@DV*a=TQ+31n zAqBj zl^T~46ZNZ?nXv9JaGvw`VAKqNO2Z)W?g3qfV11qe+a2I#i;B&HbT+ zKVqLiE3yde4Jz30Q%ww+z$EsL{vt`v;Z1fIw~6@pVNZ5^a^$Z`|9_R-H^HYK;g{n} z$4c2B4DaDr34RE3X9ApRP28|>UKoL(VYwE>rdL(z%h!La_LXPaeq4aA!K!eELKQVu z3iAqeexemtdHtCmEFqx&N??TIOOn>nTd!Bc)`YicO(kntMLgc|>8-8WTbGynE(hPt zc?I?}B$<9-K0^VyPM3lxtOS%gi39K-g%XxuK-&Uk_b*?&0zCQf<63L4lRiue=GJW2 zX(F0d4)AXzY0kUu;hDjVrO{8ba`cE3+c4ulr-{*MDNwE0Ou8m&pNSe3cdEW2#=*HU za*0w?BW)0ZYSCiWA$jd*w1zsDPBNVbYlrfRdoyl~?CAe+Of>TQwnJuB*Z*wa^th&A zzq2bvb2v~nfSFxHhjgxdD@mKBJ3i9)`R|+a-!=JO$chd{394dLAh}c?iL>ut?LPK3 z`1O2RTmY~6P{s#Grrn{AGs&l&ubp{(QCh=T)%&w-X2C1(^jn)96|b9peEcPBgMx!~ zI_0hTZFLWF9g$B)O07u8Vy0fXI=5+Eoo*tQ2{gmSP#(hGEfCqyJ$kum zp8bp3iu( zddEO&9eo>U6GlUvuTJ~;+2p7^{tA`;H2hh>&PxtSzvPRU{S-fV3i2?|p&8^e&u=uU zqKD9ep#K$Z9UtO0=KhhD1@@23)%%tlz(;T{U*35DEljnFLd%1`Bh8ZlwHlZG!@#C| z@ZnIcF`I~jyfXm=MBr`9{a+`i%HNz+pX?y3PB0E! 
ze<7Wk$yj?l7?Il9U5%jHEZsFztHnC9veD{Lom4?ci9*Z+rl+x65mX8$$!}W3?z&`r zi`-B|3RoCqn4!cIrAgY&0U{v!*iMGi^PF~h3VXE<)oHfSQ5UcITq1q;+`F5O#p;S9 zl^E*I4+JRrK92*W@JXKb2>Tu_dHM+S*B=|diQ0W{M?uE_ zp?@dq28PAkCvXSTtb&=5LR27F4+8`-KQf0K??brQUn~A8E(^)OJx=-JmbXI5bBQPi z$V$)rw`qq<;{jSsG#dtbK;h@4!AS9~26fx2@I>=d4Ubh%9m`RBid()dv=P(_tHw~? zGmf~OM;&3M!Y?cAUTLauulInB)O(Tz#bhQ*u)vL}Gv2w9?*( zERWa9;1N2~y*S;s_$ z+~Ye!mZs#;91vhT`CRNH|Fww}{zNaOd98&@O5AHZuN4K4YJSJ_AcdO|BK4YHQ`neH zski;N61FhwOpWcT&s~a(cydHWJ^6OB&Eefu1q6f5%7{z^?22OfWNw33KrkjmzNjZ& z20na+5+oDuax1CuMT8P2j04<8_0W^rwMCIZt&`&Y#mvJbJO1tygA|o9zl-_fxd&SC zUymJ;vU)2O60h0e-}SY@-d|1X@*Bvp)UVef5D6haDYBL`#+ay0K%*P z+L#Zm5SZeI?9+}6XivxZscGzI6usRDQ?))HbfqTQjJ^AsKcscfvqR8_m+!np=Ck_4Ropv{f z<3`NFqw)UsJC~;wj>#)#VW$He1X61&*=4cl-gt)lm4FI9dfwoMXMK7VE+w%?YA?>I zRolm%^`tU@UZ_v)ENNV=e&7~*mT62PLfA?3S(Lv&O_;3s9P<)Qs8s-;w|dCF+q$(g zu|y>xb@2yM(|gN?1K9XpW#PH7u)_Wh1>F-B@#Ev;YQL&ui>h^wP+?9q)Pf`r?D-Q_ z{qj##HRboH>by5Ja6KXLo=VqoSxe3Q5Tc9ejCxT-vW;Kb%uoKj!`+#0)?Q3fR&64l zc6zsarw^~tPRFP1#G?RCGHdeC&W=*vvtmc4CaANIww&Ye(HQE(m`aMP^* znrs`od(|cs9Z$*KIQaC788^1_&3cH9=6l2Pc-Lx%w)5rZ9yBse3Pdz*@)6?eeaOZN z>LJKB{pz?Ho|t- zE`QQj)zzy`7i8xx7UE5J3i4aHQt?cQ=T2Z*={VDdg>O9Q!8|#<>!F<&$@bDPT7D2V zV0?MpOXbzNaQ%(>uYNxogpc&qYvoLU;jKnmuQKw-+LSBH%Bs&@yenVvbR9i+O7pWhRAu#|#7s z%VyDN+cCgG(Rp~8QUMH zb~*Wbvh^zU>AwBk*ov2mj&<`B^An-|jl~ciy(>Fj^XdHmti10(O9Ou!QG8zGFMw`7 zf!%~ko;C%uL;)yHgJQ9dd+}HCj9rru#64(Q9I+K)0;%&~MIL{XgcK!Uu%}QLSvh9A z(IaT%a@OlcB~W~h9$sEK*to-Iay#w(W^i=I0(xiF$mP!g6cuHyQ?{n_ZI^0Yl$W)r zg>OC$tzlb@Ut_M}Mmw7)Rl%tJHQAV!7OEn)XdXw615+6m4VB5WpCxXa zxS6b>`9wAHC>idJ19v+^sP4EXSs)%26h`wS8{YG8RQg^$Zq*Lo3Tn1(clBZGmETp| zLv9pz>)v;{iE@2lygNX%HFj*w@yJ>!fRcI(29Qf7kTFMSOK~2ZZ96tmvY7kVm!>1M zTLWFj6E52s5WBg@&I8LD`(-+A2uN}*L;Qm?>1!LlYL3E1Q*Xjav-kKy24~Fs1qZK0Mb_j4Ewk$3jvp$D?8s_^mDOXX`f$q#oXYw$VnEZVZ_I1Bz zGjB2NrRu`?tGYn>k)b!!#W$dGD1TlYx&5ud^y+g!qBQwcj{T04eZ$esN>& z_c8FlL;4`J(EYe7!#3grgm?)D@I;JWpVg>8y>o0D7t0qqiZuHgC-O2z+(Z7rSy=F0 z81f*tg70o|QXxrL^~sES?xouIX2gUL8-w0`A!hfv<6kUJggkI6e_U&+S;Q2t^H;k$ 
zLOIHCLRr#c+Co_naegT;5@PkXenQaBBJ)NfVxqLy z$F<_dj7bcnZF5J2`Pt7`8HhOM=>moI@u8oW>sJn|X4?9nc$k%$TcM8~!_eKqD~vs; zL-qjEq%zfKQH^o3u7qg7`07DvWVI~^UsZjvEqGqBRrYW}Hlp>4?O6U)ZW zb!zmBm7$?2{x>h&(9c8sd^%xN?4tI@C!g;56<=dIQ(k%I1f^Y>FA@sv)$uNxTPuaTk^17ZvDGJa1$wX-0twnll}3NXBO}VlaFj_vWJLr z7;j=ct}4GR!kli+idiF;xsKX=Ls^3rTeSRNcHC4P=(@}8RxKmrupAV*#S1$L_d<+y zlMoBzqIFE!jquay}#HL9#9EL`i*!4z~Q37Y%p$D8#Am+BYGg?yZ{a@+`e(A&#Mp44`{Zhgj^FrS@cm zJTe_qjU4HyjqT|zxD~cbBKcu~s%qGDLD%kv$hyFHwUVzR?D;uBC_S&DKy9Qpk|73B z{CP|8&={VmQdj&)S<(7)OXI@82;;-}8f`|1+5CF^p6vRlv42WH`j@%yub= z)ZvwMNN`J)4V2dGJkAYz{&uL;LQ=2R*{y~L08$;HX9Cz{XEpfZx=6&^%$4$4H z79N6vUqz^OUzZA>qs&!xK!`a2f^aVfG9KL1&w67OgRg{>#0D|t5OtxnTn|+RT4G3I zQ40vq6|)HZIDXvCq!B%ofj1lf37^V>i8X53La2}w?NxLhvSsooIX{Lig7iV}{?UWF zTt9e>cCN9A{+%4Cx0il(ZNk_kXR>KMOSIff0{t24A?WP!a-uFZm5#%j8udQw?GP?G z2imq*-+nwEBz{(KVA9#!W*7Fwwxg>hgx{c#BOUNEY&_};HHJQ3#d=6FGXed``O(^J zeyr5k&vf;BdWl+4jkd4B@LLxvR*L)$R^+^KoN7(dk%2vLQ`XBPtszf7uG@~Owu;HG ziK1TxrK!{HwWwHTY%t=zI&&9n3E@uRHVka_cDam%b_enTZS71KdK# zjvn57JmW)hD7+l8kT?}E=Fq(ElpgXjumsF^$5GG+Xqm4=Cx04VPVQZ>(cHOj!n2D< zHTjbZRDLp+Kz9J`7<-V)_JSyb!KQ{bb%^07rQxY&|5kdKdd+SLvb178&WrJ z5dRr+5+DA#LF!*RmVd3YV0H~R8nTDbGOSo4TQHCe^*fmjHtMOz2t*S)wwkxQUNeKO z2vZdIvIoOyL6Nv0c&}x2|15j1o4uE1P})IAfKe^rakmjs`?8_pnY2i>2I?1DXN#S> z^$2>`yz{0>DQQV@Y`$jQS#kGg*r|(JYgMC-Nj9FbCzq`}72Z6O9F?Y@ymJU2i>nA- zpwPlQjMUrg6JT5>(q!y*r0-Gn7=5A`(sh8oa{R_btDBwPHL3#9;8apraS`-`5u$nx z;qJ7P&!$XwOq`|t(qdv)a5S$aWuCmV{%Q2*yP@Z=C9M0^R}XDi3P@F10D*V`JQSg> z(~)KP0XBdVq6XY8uCm2d`i8JYsqL?xDSY|DUUsZb5Y^r_$n-OdKMv{EnJZfRGk0@<)EgFw$$)NVT}^>~A=FEEh^pO! 
z;6I0g3>C@GRL){hNzBEEe(+k7{$hknT-UyvpKMf(XLCHsr+<~f@PhTh7As*=H!heeg`lEc&Z_oQ*$sIX& z;daC@^*fo8R;@4|2#K<$VwyFPOb7DR;rNJ~7Zc6K>aHh>zA!!d#4bXZ_H$bLKHh67 z%0m?b+U8&vsEbduzC2nQ_quUVey=^vM(G$LRB8rBwZlYBs!;`+ph^pxkmtq2Q^+pnUo7;bUuQRcjFo= zwqMPmC&8(ynC6pkuTnu=I$ZR7=I-G|6^OM=E#kd~PI}?|Ie4jO>cAlrxS7I9hlG^t z4sN|RnihGL7>T*83i(#K*l%q{H}chZA77x~bMx;qtjF_^ad9=o$~2~=KnI&2RGTJ@ z>9)f}8(}WsBmGiE5Agn?wB2!GN8;9pfwr})uX2^q~w+eASpn4Ha-hDurY@+GfC|Pefe@XQ(ywo1igS6pNQJIirUaL?Ub> ztj-T5e+)0J7|-gqR<5y8ONr{2id)zU1V8yRfVxCrN|a(tEv2W)R-kkS>Vy1P@fR2Y z%-uL)lcS`J$^e3}Q>qws1kUceJ4wjbhaUDua5XL-ymdwv6!C>AcXpXqOgfJ36B-IzF(_1O>Gu;t+OYPYKYX)~b04A^nCuk1VG($=pji(7;PN1lei3xbFViaEd`Jap_yhDsfwZo|Q*gc5t^_NsfZ{c};YploE zdnjrbv8cQ5LnsbJrSaL(teDC$sy#$z)@G&nJU^E7A!vkRcwho*Xj2{uak2U-^F#>I zb4a`K^da5jM5k`eA@(^tR(~3k$Bix6L&AW^;1>g;)9ur0dZ@v9KVWOi5J$we=m~cW z!U&jXLp81N7HuJLUuOja)*52qc5oe4bPEduDYu?v8r2LR9D*n)yKJWAKD=AyP6MGj0k9D5X7@b9&I zzuM4$i|sQz&B}&PX+oug3c@ICfL2NSb4(%DrE;x zRQiQk<^(GwSl#Haul{(fKfBJ@A7+EZknQi*+fj!*r}@*!Zl_7b=H?0Kpf*P}v7>Yu zC5bl+ggbFF55H9(Fd{eVfNNA+`Y7l|Q=z9qzaI31akzQ9L7kplR`tHG{shYH;zfUw z!uju~yh8;X;%6-~-JoARnEYXvZLhK;Sto9}Y}MV5Zc4*+*pC@P0Ayq!vK~xnVhdAo z_feWzU?}-yRN;>r2e4$gcYJK|^qUf;45gR|{Mf6=Swq{{Zx1izY(Jupxrhb#Xt1gq zRhjnG3~aM42N3q2+>&PXj@nUHFdsfg+nrY6)?-?dHT+7_z*aR-@Qgv2e7oK?vVoQ? 
zOYHx&P5XTl9rdGYvLNB}f;=XE8ry+*fT+YgKwTkWB!8I6SM>#;j^-n;^Nr-yR9;RF zQyJWsmHd^m?Sh4{kab0c&ZmzhnIwiK`U5lrxc3-kZ7Eh!1H|SNUY)buT8>mFugBIK z3KuD~b?-=t@Rz4#CMs8OH60?~IRphhf4qndvRPxo?7g^&b$U8@hp2bM*F&LSORhX3 zv%L-S_YoAh>t<3TRJjs)5z~HD6iU+KmIzRrv!ecR6=uoJVn+2m<4PyVql4#5cic_4e*Zijd?Ir5K{Cu;o_Cj6_2=x)PE~ zavf7jrsndMLWWFfGvzjwu!~4$*dbF4ifTg0h#8kqOlG1nnM=RbKIc5A-#I$Ze$Lr@ zKj)l3X8rO0@mqwC9XEJ*d zj4_4HYcpD?j*TS#r3pW!+oSKQyTmGu@8;A@ynep#sj$o#0TfbpW5rn z?<2lk?ke+q`*a<0xM&{8@alGh{d#5KwRB`rTfnm&7!WHF;ae~52V%YnGX#8nt&fPO*{jryFJuF-|;H;qLdwXL|h#%dy!O0Fs{u^w7b8ds|y zf>3o*w#w1&e7a_QMf@I+ghfGst2B5+_8NG8VjX}JktW>BHaFx*KKV4Jv=ef7tw}n6 zf_`!7M8jF@v(^?`x%)iTbSBom^c=pqME3J!=;5OEeI+a0dXHZS{PfE^^M-GI+n4qR zv*kX2mF>^VslM%kAN~HrLFj$oz9YM$^{M*pr1bqI4#t5SCmi6&*|6_q5R^T;)nv*z zv89Y9Q

)l*Eyeg%4YSQmd(5J=?jlNNg9@*6S7?jMl4hHz_3Ld0)OA=99iiGi@y?xh(pTGQmF(2E5S4>JG~L37XoHUWsj0;;f3Bv(>e&zcj~ zXS{ihlp38CNL(NB?%}5`6QUi|X3jevr>x@R%k~s+D4ahiQ)Zn+>orMipmAC4(ZgV# z1^va@M}uVvY}m~9Z}NAG$$Rk1RnHmhWB!V+ z9kSL+nMVGn6Vf|F<1HgkIlp1pwV=+Y+ywrjuP{`e7zq$uwqr<1=PSiln)k~9elhSo zK+Hai1S=ymB7*S+8a}HPMp)*Ie>dyLT4%NXsZ(@iOV9#}WHE)yVU}K?sx1@di{yT# zl8DIP_|3iK?rEQ7@lOpmK7bak4cKdY`%Xg-BstfAJ|mvi%i1r`+x$&S#|z2#f@bXD zU^jk+sWZSD4_A<9wE``r-vxEz1HtVQtbbmslnP1$%RWM-h zPL7ui?K*a0xusdX;WZ0l@p3iPVS zL7VZP>1E#8vL=pP;Wus;pbbOi)?K7M2d?%{#L~#B6?DBT6*>TxTTbd>sp~-I5T!a& z*m&Y<=cE5ZCF8rE*Oo3J6=*+$bG_%u=~D6bvF1#O6xoAnr!CoH7gO9j7FFXhfmWaf zFPPSWs$7?LtT?W|zd}pXAMhbB0rBJzYQpA#MtB_2lHMh;F1npiznkXm;JZn8te4{8_odeKbRQSErL4Elo{t$9hzUd%oQ`TDM5$`_7bN#nmtk>x1;+}1X{YH0 z?DSvZw>GlCD5$Dsar~5uN&3#0n#VW9o~Lj;G0?P&Ef?rNDhNMl7}rNYx?ohXY2BD%)SdW90l4a zT7Nj;BKzP?Zi}i}WBZQzg}mpL8PkhsCq`N{l$M1dCTCWPH(@;+a_2~MYZSQw`fr!g zdv%^<`X|uWUL}fuL$63Z)+_Oe4$PgB>u&w`Mwwi_1TVcM(;z;tRi_CUje1eCmn}Ox zG~L^8w&CpwcYFK%Yod1CuMa=Gp#Awf!}R-CvgaN->%BIQ$POqgWvZQHSMoeTe(r_> z?t^-)Gi~JG87;NJnUGaE+u#(dJw5(RicO5;CgB-R-R73#ceRon!WfmHQ2vhZEmP_i zBU%I_gI;+W`F-~klKU^2`8zJNDEct1JlxmLvwTn=aRFsiGr6HjYnRLT@!yc@*bLTV z1VS2z_Ynhydtod;xRJ4lBWpv<7p03GL5gte^#;LoacHu739sZc`9*Dg+@Y|(@K>VGm#l}+lI})G37mD`m z6lA2tysxUL$Sk>o6Jl4(2eq(!JaP{^_W4xrHMZGHpnQk-*yW(VnhQu}m~o20XrLkS zu~f6kw}X0-o|T+TQ06de=XBC==wl)3LzyNf)#8ol3I9M;)O}aOHz|)7&+UNX;FZ(k zNWr!N=fd%RK58I6JM$OUGok%sCYXf>yF)Wc??+l5Cc4$6_1W10)=j)&TbmD_?&u=t z3fGslGL|Fsz>7c?$U1j7;XY|4xiKq4FvP>~qZ->1RC&=S>3g^b>$hQ0cFM<`FeS@FoO||1saJa)2{m zR63x?N=&^|f~C(v`=ykqquF=zi^0KBW#x-vU?H zA9NK_b-M9-(YHlL`>H#mMZA=cfyo|mt)G>&;hbvug)0_5@7Xpq)AeK`1*n@NC6=u+ z3yH2EpK=jzS?c>`#;f!YvuU%!n)(IuMny$Nd9bIW&5t)+My_OVGpS3=8NvSYLa%Ry zFf`iIfp1>#hq;hz&nKQ67#1v`WJXed^$*|dJg&*{Tv@DC=yp;sH!qUpRz$dh%Bni7 zD+Ss3+?00uT1GHZN8;kHJ?X)pCd+I@?F~dl8lxN+x-bd4cfdXT#L!cDCyKJ~d#%{h z2i7*daDeamE?UY_2UuF82nBvg;UGqfL!kWJI`yTLzKpt-`Yg_PgNt~lcEq}?@pH3} zE0rz>3wv7j3`tK-vMJ;okvs=J*r*z!854;L_fhnl=jKK=f-`p6*c5?gjYndAFEkuFxGSN 
z7dt_<(<}TLeo+&(j3rT@6s8HNjd0i){J})>sR?ZnR>YoQ&Ul@V#-PKslf#1u$J+>L zAK$sZD}7@`2~COosp(yU0X!dG4)H#VR}(vHjjH*HkI5vB4`=gBuIQI!=u#4`_15|I z+MqtQ@1*&OvBaJbR=kdl?C7STWwIbvnRgvr6$*jQQ2|#>q7}$Rk{N+Hk$U0VL}-3q zLt@i(6*{j;CvosK^k79mm2JF8rmA{%H(i zARIi=^wN9*UQ=4j2;=AzUqN&Z6N;}YCDp9#e-t3Z)A)%O&1n4tDp~&O_YRt1Z|t@% zx;D{&$g11v?yYe~8vfJg55z^%{8IoSxmx%%SqP4#xQuWcqGFBc3Xt|#*U8!6Vd}Gf zuxm=3<3+$YolAGIUFGk{cB(i~lXt;;jRQ<9u0xpe8Dz=Y0_@lmeZ%q&nt!-wg4m_w z)L49G-zWB74QH#!+eLYKCMNQ6^HS z@arf6d?>zl^+Bu`Qa+R_za{iexGwKr>aVAIdL31XZ3=CNjtdUB>y;bF-rkYsxge!Q zpK%`I-lkUGm1tH==?FutZ9ht~?veap!cA_@$@&=&>aQi$(fO!>hgS1HhnVT}Zjkn0 zW3obCF|rNQJ45iyBpA|hwDbbe0%$lg!Wqh1 z*sB2*?ZyfA2WQwH?cytF7S$dGmAEr0bsSUUgNIb!Z?@&Bi9?933|cUgp+J1D%5SN! zoiB18R0&WC_s+Fzwbn!J6lfQm5}R&+kCwH>L}cPE>LHA7^ZO|O`kZqPPZeL&yoVQ1 zY395buo)7CqtZfzY6wHE6tB+j8uY^^H=OlMLSuqne;z%|rz9Ty*!gmQaH@%7D*LDR zR(GO|({?6otkc0nh#rZF;9;*A9zB|G>m_Ka$HeC7Ou!Q z(b$rEUKBuct#aMX9?*)3LEp?t5}3c~ zk|~+vLis$Xzpy4ou&}O2rbAA`V+ec_QtJ(?|HbYRnMM|Gn@equ5y`b|C|QO4Aivq+ z=q2Bf1+~YBQ@3IffueM9O57Fc2yB~bS9Mvdk-#HS97WLxQz%;jEuIe|;csU`^rL>*_6tkI9Wd^DrSAN@a7JCKBK@z(Y)!NlY#%&ua2T zo}=UDwo6>Ci(TjLdFy;8==iG_Lntj(>!K${O&SVE_Nq_p-Klt9)Fe}S&ktziD9Mm` zy$-zRz{M#R+j94#M}ivRbVVI*so5!<{i>};8s8T0Fepo+yzserB|B~J1z+THh?~== zE4@i?srF0?ps^ZoWce^}--sDT-L>dTn@%ko3=*DP9^sz!h-8Bc8A{t2nTI+uM7~Cv R(fx6C^hZ_GAKAZl{{#BA@ht!V diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/alignment.o3476018 b/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/alignment.o3476018 deleted file mode 100644 index 6b027b5abade..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/alignment.o3476018 +++ /dev/null @@ -1,165 +0,0 @@ -Wed Sep 1 01:07:01 CDT 2021 -TACC: Starting up job 3476018 -TACC: Starting parallel tasks... 
-warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -epoch: 0, train loss: 1.9497510997616514 -epoch: 0, eval loss: 1.754234939813614, correct: 3521, total: 10000, acc = 0.3520999848842621 -epoch: 1, train loss: 1.6049139609142227 -epoch: 2, train loss: 1.3857794501343552 -epoch: 2, eval loss: 1.2831632316112518, correct: 5410, total: 10000, acc = 0.5410000085830688 -epoch: 3, train loss: 1.3016913873808724 -epoch: 4, train loss: 1.2616293649284207 -epoch: 4, eval loss: 1.2658930838108062, correct: 5409, total: 10000, acc = 0.5408999919891357 -epoch: 5, train loss: 1.2320433721250417 -epoch: 6, train loss: 1.181612290898148 -epoch: 6, eval loss: 1.1402096092700957, correct: 5881, total: 10000, acc = 0.5880999565124512 -epoch: 7, train loss: 1.1643818397911228 -epoch: 8, train loss: 1.128499301112428 -epoch: 8, eval loss: 1.0965303361415863, correct: 6053, total: 10000, acc = 0.6053000092506409 -epoch: 9, train loss: 1.114193707704544 -epoch: 10, train loss: 1.0830892950904614 -epoch: 10, eval loss: 1.0390974164009095, correct: 6258, total: 10000, acc = 0.6258000135421753 -epoch: 11, train loss: 1.0508871960396668 -epoch: 12, train loss: 1.0322130365031106 -epoch: 12, eval loss: 0.9689173698425293, correct: 6482, total: 10000, acc = 0.6481999754905701 -epoch: 13, train loss: 1.0006194637746226 -epoch: 14, train loss: 0.9652800906677635 -epoch: 14, eval loss: 0.9150958389043808, correct: 6713, total: 10000, acc = 0.6712999939918518 -epoch: 15, train loss: 0.9430981692002744 -epoch: 16, train loss: 0.9156872307767674 -epoch: 16, eval loss: 0.8703682094812393, correct: 6913, total: 10000, acc = 0.6912999749183655 -epoch: 17, train loss: 0.8822251515729087 -epoch: 
18, train loss: 0.8485424190151448 -epoch: 18, eval loss: 0.8234190821647644, correct: 7120, total: 10000, acc = 0.7119999527931213 -epoch: 19, train loss: 0.8285953049757042 -epoch: 20, train loss: 0.8009484337300671 -epoch: 20, eval loss: 0.7808267176151276, correct: 7228, total: 10000, acc = 0.7227999567985535 -epoch: 21, train loss: 0.7774611741912608 -epoch: 22, train loss: 0.7435575358721674 -epoch: 22, eval loss: 0.7523189872503281, correct: 7367, total: 10000, acc = 0.7366999983787537 -epoch: 23, train loss: 0.7315681789602552 -epoch: 24, train loss: 0.70117900627 -epoch: 24, eval loss: 0.6928718358278274, correct: 7580, total: 10000, acc = 0.7579999566078186 -epoch: 25, train loss: 0.677533069435431 -epoch: 26, train loss: 0.6627033298112908 -epoch: 26, eval loss: 0.6921748876571655, correct: 7586, total: 10000, acc = 0.7585999965667725 -epoch: 27, train loss: 0.6410714266251545 -epoch: 28, train loss: 0.6192339707394036 -epoch: 28, eval loss: 0.6416671514511109, correct: 7719, total: 10000, acc = 0.7718999981880188 -epoch: 29, train loss: 0.6093639281331277 -epoch: 30, train loss: 0.582532714520182 -epoch: 30, eval loss: 0.6166591048240662, correct: 7809, total: 10000, acc = 0.7809000015258789 -epoch: 31, train loss: 0.572193189847226 -epoch: 32, train loss: 0.5541256200902316 -epoch: 32, eval loss: 0.5951347410678863, correct: 7922, total: 10000, acc = 0.792199969291687 -epoch: 33, train loss: 0.5345369838938421 -epoch: 34, train loss: 0.5273816007740644 -epoch: 34, eval loss: 0.5837202191352844, correct: 7972, total: 10000, acc = 0.7971999645233154 -epoch: 35, train loss: 0.5059237045292951 -epoch: 36, train loss: 0.48622317095192114 -epoch: 36, eval loss: 0.5698897138237953, correct: 8024, total: 10000, acc = 0.8023999929428101 -epoch: 37, train loss: 0.47362951143663756 -epoch: 38, train loss: 0.46030426907296085 -epoch: 38, eval loss: 0.5610475659370422, correct: 8049, total: 10000, acc = 0.8048999905586243 -epoch: 39, train loss: 0.44165324921510657 
-epoch: 40, train loss: 0.4327346086502075 -epoch: 40, eval loss: 0.5642214670777321, correct: 8095, total: 10000, acc = 0.809499979019165 -epoch: 41, train loss: 0.41423581935921494 -epoch: 42, train loss: 0.40917488780556893 -epoch: 42, eval loss: 0.5602998435497284, correct: 8131, total: 10000, acc = 0.8130999803543091 -epoch: 43, train loss: 0.39171184477757437 -epoch: 44, train loss: 0.3744060835059808 -epoch: 44, eval loss: 0.5633655220270157, correct: 8134, total: 10000, acc = 0.8133999705314636 -epoch: 45, train loss: 0.36267226934432983 -epoch: 46, train loss: 0.3420030690577565 -epoch: 46, eval loss: 0.5533872425556183, correct: 8157, total: 10000, acc = 0.8156999945640564 -epoch: 47, train loss: 0.3287143409252167 -epoch: 48, train loss: 0.316296321396925 -epoch: 48, eval loss: 0.5576229721307755, correct: 8209, total: 10000, acc = 0.8208999633789062 -epoch: 49, train loss: 0.3068045072105466 -epoch: 50, train loss: 0.2929732614025778 -epoch: 50, eval loss: 0.5654072970151901, correct: 8227, total: 10000, acc = 0.8226999640464783 -epoch: 51, train loss: 0.2795026940958841 -epoch: 52, train loss: 0.26673941375041493 -epoch: 52, eval loss: 0.5736668109893799, correct: 8227, total: 10000, acc = 0.8226999640464783 -epoch: 53, train loss: 0.2506744866164363 -epoch: 54, train loss: 0.24351145980917677 -epoch: 54, eval loss: 0.5846156671643257, correct: 8204, total: 10000, acc = 0.8203999996185303 -epoch: 55, train loss: 0.2253616195248098 -epoch: 56, train loss: 0.2177750574690955 -epoch: 56, eval loss: 0.5943332687020302, correct: 8246, total: 10000, acc = 0.8245999813079834 -epoch: 57, train loss: 0.20670234989755007 -epoch: 58, train loss: 0.1973607996288611 -epoch: 58, eval loss: 0.6195310011506081, correct: 8245, total: 10000, acc = 0.8244999647140503 -epoch: 59, train loss: 0.19024320448539694 -epoch: 60, train loss: 0.17597664877468225 -epoch: 60, eval loss: 0.6139472931623459, correct: 8294, total: 10000, acc = 0.8294000029563904 -epoch: 61, train 
loss: 0.1674150490791214 -epoch: 62, train loss: 0.15718420511301684 -epoch: 62, eval loss: 0.6285309329628944, correct: 8261, total: 10000, acc = 0.8260999917984009 -epoch: 63, train loss: 0.1480691913439303 -epoch: 64, train loss: 0.1384550367234921 -epoch: 64, eval loss: 0.6587671056389809, correct: 8263, total: 10000, acc = 0.8262999653816223 -epoch: 65, train loss: 0.13241269834795777 -epoch: 66, train loss: 0.12871786830376605 -epoch: 66, eval loss: 0.6718123883008957, correct: 8303, total: 10000, acc = 0.830299973487854 -epoch: 67, train loss: 0.11577517866176001 -epoch: 68, train loss: 0.11130036151378739 -epoch: 68, eval loss: 0.6887702852487564, correct: 8332, total: 10000, acc = 0.8331999778747559 -epoch: 69, train loss: 0.09883711646710124 -epoch: 70, train loss: 0.09635799735480426 -epoch: 70, eval loss: 0.7159708231687546, correct: 8307, total: 10000, acc = 0.8306999802589417 -epoch: 71, train loss: 0.09449125119313902 -epoch: 72, train loss: 0.08857650914210446 -epoch: 72, eval loss: 0.7160102307796479, correct: 8351, total: 10000, acc = 0.835099995136261 -epoch: 73, train loss: 0.08085554241373831 -epoch: 74, train loss: 0.07873564483407809 -epoch: 74, eval loss: 0.7119918942451477, correct: 8393, total: 10000, acc = 0.8392999768257141 -epoch: 75, train loss: 0.07206312137446841 -epoch: 76, train loss: 0.06772394200824962 -epoch: 76, eval loss: 0.7328802436590195, correct: 8351, total: 10000, acc = 0.835099995136261 -epoch: 77, train loss: 0.061777200397788265 -epoch: 78, train loss: 0.05721901174710722 -epoch: 78, eval loss: 0.7407010316848754, correct: 8385, total: 10000, acc = 0.8384999632835388 -epoch: 79, train loss: 0.056560877406475495 -epoch: 80, train loss: 0.0528045150318316 -epoch: 80, eval loss: 0.7767532706260681, correct: 8354, total: 10000, acc = 0.8353999853134155 -epoch: 81, train loss: 0.050682742870887934 -epoch: 82, train loss: 0.04895328068915678 -epoch: 82, eval loss: 0.7942879348993301, correct: 8368, total: 10000, acc = 
0.8367999792098999 -epoch: 83, train loss: 0.04686643050185272 -epoch: 84, train loss: 0.04325723648071289 -epoch: 84, eval loss: 0.7906839996576309, correct: 8356, total: 10000, acc = 0.835599958896637 -epoch: 85, train loss: 0.040166335769605876 -epoch: 86, train loss: 0.039296497894945194 -epoch: 86, eval loss: 0.8033982694149018, correct: 8376, total: 10000, acc = 0.8375999927520752 -epoch: 87, train loss: 0.038185219698566565 -epoch: 88, train loss: 0.03735689769441984 -epoch: 88, eval loss: 0.8039661139249802, correct: 8377, total: 10000, acc = 0.8376999497413635 -epoch: 89, train loss: 0.03383794939145446 -epoch: 90, train loss: 0.03318257091034736 -epoch: 90, eval loss: 0.8097118645906448, correct: 8389, total: 10000, acc = 0.8388999700546265 -epoch: 91, train loss: 0.03290939923109753 -epoch: 92, train loss: 0.030776230903456405 -epoch: 92, eval loss: 0.8237936168909072, correct: 8401, total: 10000, acc = 0.8400999903678894 -epoch: 93, train loss: 0.033349379108344415 -epoch: 94, train loss: 0.031906195783189366 -epoch: 94, eval loss: 0.8250258564949036, correct: 8401, total: 10000, acc = 0.8400999903678894 -epoch: 95, train loss: 0.03031293043334569 -epoch: 96, train loss: 0.029958056238460904 -epoch: 96, eval loss: 0.8200247555971145, correct: 8402, total: 10000, acc = 0.8402000069618225 -epoch: 97, train loss: 0.029532150564981357 -epoch: 98, train loss: 0.029668816346295025 -epoch: 98, eval loss: 0.821219089627266, correct: 8399, total: 10000, acc = 0.8398999571800232 -epoch: 99, train loss: 0.02980129667842875 -finish training -TACC: Shutdown complete. Exiting. 
diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/loss-vanilla-lr1e-3.jpg b/tests/test_models/test_vision_transformer/test_vit_2d/exp_logs/vanilla-nproc1-lr1e-3/loss-vanilla-lr1e-3.jpg deleted file mode 100644 index 3f47b07b80585fa3f78979279938a1f188ae6858..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 37624 zcmeFZ2UJttx;7fRfDq|Yg9-xDRGLx}6=@I$XL;xIKJPQb`plYx z>^Wmzt7*E9RC2Pem`XD&`o4o+?^Ztj08JiNQOd3bra zxp(pJ;^q7G0ROj}pKtfC7r!d`_m8r3ad2?)@o@9}+b#dp6RQm(w2N(y{XPfVeh9k| z8;1}Zs|x~yK-jqct$A#a|9G&mb8vET^MK9Z2XCm`12&(718g)G*kbVRQ1Cj0Q;17= z-;tBtA~x4}_WOw*eUS2O*MU=2?P3>($O>B5?}YL4iA(I2lu|sXq^xpCTSr&#*l~U1 z(Ed9i3g>J-zSV_YIGXj*U-zp2QP~q_6Ye78aM5DL>aYHmO^{ z_RcT6*dQGLZWj3b?}q&^b_s!YvHuzao?mvcu?K;LLx_`W-w|%%lQulp{6zL2eXvXP zRLZlecHRS87sz7Q?+o#YD`*oGDZfnnH_QGp!@~Ycmi^y`{Xcf$A-g%)z`^4Xf*>Fa z`tvwt$p0z-d;6eLiqVMh!U8VVQO?ppA`8ODf-r@x0du$_;M;+8VUE#YlPpMGo$@Wr z7_Ki3o4yd7>&(q)4mubFb*+qNLFStDlky6feK_c*xBZT^g98$eAdF7v=^c$o+B?w| zckL^J#{g(zT=ZKqm$bBTgAvtz1oI|DayvFQgz8)oYAY#FDY9MCZQL(uurYCn||0!-=t&liE%MDyrH4D;y5$93~ z1U)1}sd-e*I4nVf1&MT`TGy~39Z*IBIMCs2l_6` zuiL+$p>$7Ib$iWyvFsV&<5^?)_~)3qoQ;+kMqhLYymPQAF+bgft&eO16AH*< z2d&d@2_|8a;Ns&f$jwFY{A}Ke~Fw)E!r=K;8}f zX~s!0<2Xomc?V%!rP#(98q6i{_H@>!U}}7eJ@kztCYoM63z9c4_a+)Eq&CZmT^4(s z+a?996Ic+sXeD(B04IYx)*U$+&gAtdL*)_96-hRed)`#gFT5VonccHQUwKzWszXjJxvuc z6FqLocHr=-dptVdV6XM)qQ+E}F~nS+GlcT7hIRtb>Nt!tYs(7EB*{v2<;vHO{Af9X zb@h^wxFYfJ>@xLdx7xXEJJY%R3S$*{m<39UxOSiJ{R+za83-Gm=o)g@wQnt*``-WA z)^@nuCwyBoz_Vg-*Uh9$yG@75@5GWuE)_8fSP-|z+U6}*h~7NzgV~6 zuTHyxLyr25b=~ZC{aU0O&%nCqi!W_)+pu!;MqBneHnID`5 z#?eZzzVAE=)ZIzUsV^!IeQrK;=lKEH!8OOkXNYr4 ztrC>TbUO2_u$k!3;M->gs*fLx)4O#|?$Qt{EDG-@P*!sb2oW z@M2+SX~%@Vp;p0r&P*HbS)WveJ4dx27{59q$>&9?1XT3GF|wF0rX~QcTrLO@O0bgd zXq5$QC_-oiMc{4@Tvhd$p$btk{%lj@lk1nI)qhM_HZJ`v+a0nSmWnyPlqU*5fNkH~ zdWbob!dVTg#rAq@_BiXj8QbWOFRTLYX1y(ne57}~jWS@A>YzncvvRm0}kI zVf+CY(9SUju(fBK9wBPBR{7_AW#NhP$+p%!sQST+x*B&tXHke%ekvy|D-fi!-9S@g 
zehf)pSluLtQ=_w~!;G{UCy|bwnH)C);=c1PX7^i4V6y)WzK*Y+Tc?P4ynFeNi$duM z1%D)NZt<)t5!diR?X3dUqgYpD7)G2j-hA*y)$9^Ro${fHt3^E5G4h24sk-^-F+Zto zSJ6EyQ&H1cA+))GY-YwBFrL@5J z)qBfa?cp~XYJq;;J|q^KhWR9#iYjG61nM+l-{FvxARRk^}m`Rro(Lz$k)Xaln|^hAoM<5I3NMTsum?5=PXZA&}r;2Gld^p97)>FjjUJ+Ci^7m{c9uRq+yv#i;bHwB}f2DE|-mspU!2+zDIrf8mXeGc4A$>ID%61)tdf3s@euKC@RE7Q=$nBYF@IuHem9^5z6I33>N7tbwz1H`*yc zr!LXBD^Q|-WV$TywOPPTOs7u~QL!*{Ygqr4iXnZStlBXn#{k#ETrPrq7RE)tHB6$& zrT9}`Er0xOHJ0xdMI7(wL|S@zTz^!*c*Vfo<;d|rYyx%X%;jNa8Q9u}4s84^hv7wv z1Jwy&kSDqiFYY*z8z>5Lhr+gut6QVE10*KIQxC@}7}z@=c1>1FQhmRQxRsoc6|^c% zQ>F8bYry5c2hK{iiUVm#N@WaMeoI1Ec-EudUVEs&U|RC^_|f+-&-D$LB~W&myBfr! zX%w?I=5D}~Nj0`&Qh#0Jktdi5%##_1f>g5ZgHZ5lE8fX3Pw_Xc7_No5p zDUFcJ){}!w%OJWijX8kmhV5J<=RiWWXkyHnCg1M7&)-PYB1Wm9%Um-1ls!P})|dTQ z?Eg|g>uj8WfphCfdAvoi0apsK&h!p)_k7%|1m!JQ1PdYs7o>-fBd~ig+~}+1i9~JQ zo9@rY=}Hw-%eub%Nq2b5zkGWA?%hXI6O1ig4ghewaC6x>J~+%CtqnMpd18fdUe7xA9R&sCq z+lzC>T?kU!!1taDLZtFlqM9vr9Li(&u@mW*gjA zjY%LJh-Qk;(&hC{lY>|gekiFox<#ZMd&{9;DXn;xQSdPXs&Q%R*w0Ka-3`g(rX<-` z9L)jK*D6Z64)AxwYI{2w22NPvlZ3BBzAdgUVpE>$YSw~J_jwp+7u(IbhReC|7VI;3 z9ejsuaErrvK=-1c)SF`nPegdGBpDHtNgW~I4ctLQ8hU8F^;XQcOih!HRa@nI+%CyoZ9?JlCH)8;E6Mw%x;?;b_RL^JtR@|j%-aqP8b95-;UM|?)<4cWR&buA{t z_Kw#TyjEM~?WTa*bDvC(PIoLVoK1z$dmoyFa#7+*TW!gcvz;lZUg~{df}qL1Br?5g z-x%73Xz6qmZ(;r}(e+J=Xg!<3$pLf#%>nHTsDn$W-4p9N8GwwRk*IePX@d!`2E1sb za8>j3e$hUWVx7qtsWM}8;9NZ>BlQzI6*5Rn!h!bH)Ypw-4xx9G#~!gDkh0{CdwI#Z zyxI1{Kc<&$5%~8nyhqjd$FU&X@A~`usLU%6lUw&OIVjuiTye@o)Jh9DV}+M?KBkm7 zl=tMkTC$Dx^9~T%U6#@>d1%>e-=l|K{aBCWcwD_w=YYTEFxGYL^XwTQlWU+X&9f=D zfpQ8e*nXXG|LP@a%O^JjC2vJs+S`6>;sXovI=T~y$5KrhnRHvi+N6jxAWhl%ZVQZ+ zF4uU#MJbN0V&vKIxu%9NkBxrCSe=V<{pIVS*@yNxHymKIZla@`0Yyqb!8wI2O~nCg zJRLa6FN^gd%m%_N2i+^r1XrWznn*AGEH_O2_Th|@)v8qQz8LWnwh)edMl|l0ktX#M zaIq8^r5_um%MxL{sKJZm-ti9vuOR7B( z&)j8DirSBuOeVrQ5fOVPJ>l$RrCv>;MK~wrOI9>i%7aoXB5Qvya@)5;pleHfEig}^ zXX5q`o~c@hjfv=gv7p~m(x}sn0-Rg&-YHZ#NpHo^rI@bv7KkrKotf>?59(^H*bLB* zDw4}~c?o}1?-_=-TWCGp@!!>5)9@v 
znK#`z7V)YJnt%O13lg=TQG{L6K-BUuMOs0){_W;{c;fTcJHNY<{MQxy|Gwf1H{xu& zZ+IT8MMAIBH;Dz|ZP6n63Qk+eOi1y>*Gh+;>9afF{ce@3!|+yLr+mu8U@;Pm+=Z;I zWGI4e5Dejh+K}`QLoCQ{6WSRTWY`hw4y7xhxj+Ck015gPY+gftj;DrD6bBQ)V6zv- z*8(k_@fh~l%Bl=!$}KgyexRlv)y`N`uiZ5uqV0Z0r1sGTGxmBWl=%TxW4*E-RDz_l zF>(zB<~%k_IKGRZ5FJ-STxJg|L_MB59d&_MNj>*POC@t}JT;vKDQskZf&u3Meir0O z7J?9i^FZ!cCa_T$S63KWQ!2f|A#Ztaj@(^6C+KS8YZdaA*q& z{rIAy5#O;*C!}Rz1;5@o06A z65X65JvZ)8n=&nBOV^I+Q_Ph46Ex~wkH=vW6b4sxqbXT$_2u}m%-a0d`B0XD0$W~D~{2cNh6?6zOzRB0F(RCDuTL+l(k>%HLs@>W(hQVep;Hx8{ce*G@QTR ztQCB+-siess#ni%2JrV2X45wYQG-|^P`r9}5M8rzxSFjnLo1@SU^*N|;fX=3xZMkV zA?Cg8y^y7wDsyC7_i+0{xm6+a<7^Ezyby3Z@u`&y(C_PP4@qc>4lgS=JmMjvFE$-! zqu8CQcFo)Mnjs-qzU=hyjhxlI1OKRZ#BU-Z5WID`J$)x@7=E`rH?x+;Gcj(!E*H(@&Pv%ZEOQy60&h4SnJdHep+uKI#TlGoc{d$1rQ zMu3xblPj>-zVGqGYf^ijhz(LP&ji;F{(v5_PS^MGuC9;`k`YR?AXG*pplFHm?-)gL z&;xC6B^!#)4ic4ienh)5KkstW)l{+E`5|L#`M6^w!THDg4=q6W$hg1W#eUbu`}#0x zY`YF(?&+*>i&F{u1dvE$LyJsouHO7<`4;Ig&J^>I2nw1I3^E+CQg1e_=t{MVRd|}_ zw9t*5{5}Wm_1)z_Hv(qK2JOtFpVVcYCyXh`DDMF2V|KQlkD`crCi1s;$q1^x`-ffP z7Bm{c&y+&Tk#lhOnL>1fPc7=t@WS4Ak%At3KI4sZoA*UtmVkG>8BBy#2dz}m%$@9J zF`rnF8tYCLL~gM7ML%zg$J~^x&1$AWmE@b)zTK&}kGvlU&itJ#_|0rOA;%zK`2d%8 zIk_5gav1&iwL%a5yZqdFMN5GmzP{eVjM1cHZI^65Lm;J_>;G`AFw-IA@!x#i?>*Fp zx7h!w`Y*N=w!hUOI${eeHZ;qr$^P4UWLs$R*TP$EFhPJ{k^`}!_K9csk}b%Yw;yX& zMs1O>bdm3KK|R>G-2I`*t{HW}g7C$yOTR3Ga@uFjr}p!~tP3fJp8KSHUNyNa*#@z# zJ^(;9p}7YvghdMo`ygR&jESG@F${sjyM}otf_Fu3&dOjTp6eVxn~&H zP2h+Z5jt6H3)m854>(g~qe_C+sU`Z;Upz0$>Ut0?uNKx#I2YXJ^oVJu72^f zP;Z&l+{FDgY;DJgZ|(EP5MQPJSP(}T3vyDMZ4O2@pWQaGT>%;V8`2%f9k;s5XQc~y zfPydmc`4Nk?P4;-EDi9rPA z&FoG+j$Uc=m$e6g!-O5Rg3%>e5WEa_3bDzL1PKckWXpKb7qD%D(V5FQ;8-IILR`*^ zK_li6yA~y8)F^IoK6l@$TGnK&T^Ox{30ZQzEoyOhwkS&Z$}e@OC)U0A`*#T4)%5+8|;T2HIF}po?wD9Rt@lmom9bP6v!1L(sR$ zU%K#dyi8HAAJG=0#kD!P2E2tP!Y7siwfv*u(HGver$4dIjw#p-smRlI^{EO%D!FdU zy^w1dSAO-hI&sIsuJUrw_DTpmk>JY(B%9SzpwTTR!Psk9Z`tm|%?zCO>*-U=k_w6A zrz#Q(_q07c$$2^N&jx>A&((}*kdpySXaeZHfHch-ZTo4%Iu5RpgxtFZ1jbJLUZ^2hiW@2)97JTBaEApOe8kJ$Dz7!6b=Aq?d723Mv?$Qb*Z1+-&m 
zt=AdrJ(xp`S9MO}(gyEuxwcfhZ;VH6F@-IrgFP3{?`?dz&9>!pEw*aMLbmw9Jmu@G zz_70<*5hQMNr{E(!9ukU!j=kW#L1PDSx!Usf7tDQRpj(gwNzGhkP{WyG3>CkrW5wC zXZTqB#io=P4kf-ayX1UG+9y6bh4B*M2}=t3m_nKAwO8tf?s2~Z0}h4(Q)g<10{Y|K z@8$%$PTbIsd{yv(t8d$nP5=6deF;9>-1@lPc=RR4V=&tYjeyI(W)y+UK{Ygfwq_zh z$6BhBWW9HK`EZV#W%%i=S)1xOQXC91NdEY&dJ9GB%7txog`}(R&*}3!Rs$ZZ?Yygrj9&51) zB_sxeG;>GjxglbXv*qbGO%F{Mx1cvH6nHo}HcFXdKcZO>L{lYGPy+!p&10lcrE`@% zvKPCX^g8n-mzxp?AD}NQMbR(ArZk_?#EWI?>PDu%IX!nY4)URq)S^GMdH1!UByScp8`-wC%25{RqGtv><%}p5u`fg8)+~m87^)YT&s?5$&~1DZ zHQYbQc{H?U-t|ry1YwCFI4@$?X<%Ywj!0rb;tT#4ljuKNz%?5_23-}P?PGqx)qN{1 z0fO4)49J!!HwQ{(x5LXi-ht7Z-yO(L;fmM%f_4vZ3K{ytbcr08i+3vcOx!Bz<^YK- zj);WS;1V$V-0dqnE0ZMjeY>$yOg%v`beAxX=}Fj3ak* z9!Wk2iwA?EL7K7$TpqxJ5o&bnnNL=|7CpBYn7q;~ zi1I5&77MZxj9t>b1cXXZ^^R2Mdrx{*k(HmvLSLp z4kR-;LH)p^%a{Qy^<;}83-aQg{f<;?Fz5zN!42RL?k0{VKO@Y7j2M-N3Y?Gw@8H@; zZlE88z0T}bSL**Vk1{20?FF6xEyo4&&lW{Mcj{531gl#nU+}b>9 ze|mxPz8sne2fc6vX>K9jP;B0p`I$VxNcs@Eru&7|%QSMf2HR6lvHn#(v{T*o!pXr9Ls1mr&HCnnY)7pyp+R&EPx*s_UqMM&*;y0eGjoO%?7Ha=qU;MMz zt^w9e<6JN-dks$2CBw^at~!g#dtvq|%n8`sv1Z)scEF%v>ywDJXJ?kU9xto3{>Lk` zJRUmLnzm>YFdgVpQqwN*LDSox@4Zai74t7AuUt5MlILZSoe+~0o@{*H6JMX1zZ9hZ z?M;cpq@A5Z%T{Klznb4kVHLNOFEM4C^AZcgKc**M^RsIMg6q?KwL#%X@*E_(u zf#1M?%L5_TTG-01*8Y+K&+*h*v#(vv8qSkh+>fu}4=P_R*PV=Byk++MURPnU0e zUXAa4^hKhy6Wo&DDX0r0Sl#+IYS_f}UO=w1#_lNf)NS;;w)X*p7#5^@^~cnp8u=X>`G5v z@rP5B4)NKg&BgWA>rU59-`hFvm+zo%!N@j)pNFAgA;?t@Xb(>_`9}j$#Sl?s`~vX0 z&@|=~fTE9RWHnvV1wBXYp0k6#UxQy()f|oF{KOr-VnW-$h^@l52eZV!z>cp&32uX%N#sMtkF#TD-fZI7fZ0y0 zFkPaEev%wIJ+fp!kYJmIp3v2DFsGP5d(@)!+(P7!pGmL!Q1U`?kgwMnDTo`3a42l@ z8+0<9fJZ?F7GNAoVP)J&H$JYiAlHAkBu*|EXl0eYVs^10Rk?4-^}TaG4KxMxkrwi~ z%sUfOZs9*#xQZ=`?9WbqNU_&GxOe*NeEP}bI!mMHxqZW1BtcgljghD#n}J*epGT8; z{0oN|?JqF1*Gc2!!~2GhJx@4fT2TFxy{6>=3$i~F+%vEsZeCncEvWGCK>Sn&UrcO* zlCY%I`-c9%-!stbzxVuqBTT8xJIJVlA9hDH%tnI>EIG`;l@Qx#P}nrM$8$!Q@)dL3 zGIlM^!du5p*kj4lFw(!otS+(L_e=eGzZ0;NLypNkZ-Ypg(P+^C*yI`^GYaiMgw^#F zSWw&&ia(VbhUfeYeZ|keFavi~N{f2Gohd`loh4s^COQiVM=lbtG%Q zzH@G%An3K1ZD2hH`7Q=` 
z0VETVoVjCL@}T@og+oGq;nQa;FUH1(zdgF>6>xWBtvp!?cBWirgv+V><>1sw1NT`4 z*BjJ8zlr+j(YrP#!ER;$?1uRF?!WUI@HeF-$Q>qU#+1W{ikE`~JVGHUX&<62uMSPz zdDwZ#{GEJKPa_lv%3A>e?CByf8Ve0&L3+P`QZqbRfO6@=j}^xR>@CaswyL3?HD9mx z@|sCsT6?DoXJ9!Dxa!Qsf;?*m6ZdU(pf2L4mRk7!kB3ZAbanmVxN8518^bR=!huo2_U8D0ODFsY%i3iw4pAj5c9oy5gws{~NvFHHCx?INeWMhUu z5d>9x!Kk+#6d-Ex6L?nE$l-HZo&x03j_9-cFJqA3n8EZxP0ZkC#gyMS#924P4Y5R; z7n*Av;fejcMa*0oA*_jN#A8Wo(L1>4l{ybL6)*KKPhYDxI-g-D-IhszCo|y<@*@0g zxD7pzSwI{|GNY?$va=wcz7%8U>{*b+d!N+=`PDnkpb^DVcS-k5RZI{4`0?JoVsq2b z@TQ*RW1?bA_oehRsde)Q=apPt12+ThLtrA9Ze+DyZ0&4fD<37g7y21Tl8dcCsT#j+ zkW&R7(9Suc8csIk5=T}VcwbCfl{#4{T-Cyy;*VLwQ}N^XO7iO-PWNOA@biB&Uf@z-Ed2cm0zr$F(IJeQy4f%sw8fZqip~bS zAz8<^%8sJX=1Nd-orrjS?nEMvZ*j{uC0kYYoLgh1Jk$!LfoMeNC3FR zU$=lr%{UOGlpc67wXq!?WKD(uK8VRx34G%276bX1zc;xdw``9SVB4aaEXemA7K8;c zv$&E?RC*w036t;uI1HUsn02-tyvpRWTf(O(@lSjfVsBiKc6ql_7(OB&4t(NC`o~@M zZ)sXF=5P?m5^+t?MW+bJ9wgX=c6dt=(U%LYOKynUkX{p6Vjdya3VEuR)kjbqr`iKb zO>Xaxrs)Sfv=D{|uGsSEP>V#_7DUiz047LCVGf{@V3SSmXcI#7z!ew-g#w=UT_ghe z4N%wHiZdyb{7)>B`*TKn8~h*c(wWEAxV1rH%w90U%n>1UZ?(??_A^EfHf6w8C&g3z zumw?bkEdS8mhDq3)OVi`8X6j|);eQ$*Ah82IYYHfAm}`r99j&D_f)2obZ-;YDqjk+w2HBIdwm2!r8hg|4vcnE0^&U5Fm}*?&wBKCAOH+?9 z9%4O~%IPLFb7nshYKX-aezxS!3?(7BCq4b`x_bQ;to=kDk2WZ%a2QUMT#_oLhx;JD zhl8l(;C(}me-eWjW>ZPzuepkhj21hA#{FKZ=4*t2kQ zJF(TaF<%=P4R8^!pH-$5i8gPu&YGSt^CNY33gGb89ym=u;_PEX%hl?>uoTnQzo2I1 zZpng}zx+5&+M0$%mQFyEqWg};X+<;Btze-xCSpr6Cx172{)_K%+=x4Fs+f3H@IjOZ z?gpr|&I3Kq+Jg%=CxjTOBh1hO7NnI6_Wy+o@eb$js26bFHQ>_v3oJIe`6o^y9}0H* z@7M;UA-LB53TNfB$0`5q@~4b|j?u|i*mt@lW+Q?b=FZWE_-lY>JBavA&fq1a^5b`F zpnkR?x})?~u*Wkxk`fr3^rU|D)^*)i*tt_z!=7u~n1q<^`L9dG{&|VnpVn*fOC9TN?rH1<4+t`>ASj-H0T+eT zTAr=dTXmg70xGs)xSM&hnw@W$()82hKjUUBuf1^Q5ew1DZ@eBVr{#rbAAbQRqOdypPdDHJegGGo!Sv4s_&l zjiF^SntGlExfFk1!lx&d1P2FuY2qudXP9EuwuM!$HqOk!-2c=`t$~be7Q{Ud@6$;w z1&?dEXil-i*q7z@At4T^!HyYaAn1kjRXgXab)-oryp>n5zn_A?!@yUe{e2fMy}4L@ z(DdSlMqQAXcd?^-_Xoc;jyZz|xzKV5O6i~KZ87p1U7V+l$9echqqzlh5(qAB`%#dC z_c~Z2#jrHFZ}4?u`tlrtd|p#vX(um?exP#Miy7}qGVt=Ys2c8DlH=_q^2Kd46}t>^ 
zY}SVz2&LD)qe~`UfFVcf8LVpV0+f((Eul2is7l*wXnVB4#*76~N`MC%8GtG*04OI* zv|~jK*N%XQoNsCCXTAS=zfaN}ElL6A=M;$^0|S>d%E4!)n9dn#az*1TMT?wNP~t9? zy70roH&|ng+GVwuuDWEuvoa$1`boN{s{2Qa5T_}!`?g`JFy)C8-&*h~^_@v1<*f4a zID*mtwu5GWS^?JQJ@bAFfJvwdC<~IG1tCL!lTRF5AN<|GLBMwZRZ;y4#q-bQ(M8?5 zL40%`3kE~AgB^oWU{s-DC`%@tqzFd8=p{m9v);ZaE^+WuG7m1!*n8LFd6>Dw**{Xs zhSkE(TlR*je&12%cyDMGN(=f77cyTR)RJp%euPT3V5C>0 zr9*V|`Ps&X%-s)HA%f@32Ju^T5nIw42#%u>vMo-O8lqB!p%KM^$b3r(bvqT_v1fW_ zaYnLjSeZAXQb!E%^0nInVzuC7$3IK>6$qqRCe;ZrK|K;x0|wCAjiL# z`3_OLG0xo2D8#x}M&}}M!?zT}En6vh;kW9Jml0jwq$)MWe(-ci6@8HFHTvqm$6<2? z+pabcUQoPY;@9&Kkdh0L1vj|sGt967*bk6y35TsA0$3el868C)S3&IO0$J~1lz zu@rmY24Y7nICzGKX06yY2PRnOkM}MpTWI$57NwG~HU0^@n#|DAh*q0N4s;DP$hvdB zr5tiVn*mGY(PNi0gl$FdxZH zcNpz-aG&7>X3G|vzRbZCE3zvb_B*t+rWBYR@LA*!NkJBf{aol843m=r-cv{<*b0f* zeRTz+fJRh}UVHcg9lJW1BG!?#lIEQ@U8yq+!j@x9Jya&KxJ_sd8T(-f_)h34(u{0z z^;P|5?B^M1c3de#*i@`x!ws60zCg~}>E-T;172VD`jpCCXeiP5{rc+8wzCpa6jAE; zXc-qReI^lwTcb(ce$|Q@%>FoRv=gUXdE)E8N>cFuTF+lHY5ybWO?6&V1Q?_~1;F{u zWe50@akh*iO&wfxo`|6U$?7~x(3)?lp6nsKWc^INDrsq--kmDuP@|f&Q}O&Zw=%~v zUNr&uK8J{#6{1VPqA+{Vfh|D5%eVA9PN;Y*G9$O(uKIm%%Bjo}1)*l)=ntrQh>En} zN^)XtUh-_iU?TR0Cf@%v13dob2@^qEfoafgGE z!N{RN%IC`Kj8Ki<9?a3|k%KL<=f+W~#7jq&eR$8CqVfeb|H&Zh*Pt7k5vQQM81Ss{ zZbSPn1$%1REBDL`Zjf6iwa;KGrfnZHKV^CkY8+Bpdk4G&Wtmvhg*MI6w6MCzN>(K%M9v*uIoKl_2attsMq>`Z{4 zuRv^PZ^qGdm2=sCPv^O(6+dLEoAFkd{a9IZnZE1fWloVx1~pF5<&3RyiSZ?)ocY2I z^WxaiqfJLsCC%l-j>(T@0sRDKJjIvE>tBoq?se&{qiP7U-BVEAk9xryDvqh>#xL>v zk8HQR{umK8RW$|O%T!je&5~rUj3rw$DaAdgr*$HD^Cx; zI!HQn=T7*%rCvUsM-D?@R>8n?QD)zNG06O_DD7|6^=AZ>`zLSz&&sZT za{|V;4=01YUOA;l^j)nYXyQlOVfCO!RRGbJb+fVv<eA~m!K{h*et}IZ z`#mf@s`h@p>LL`oS2Qggc^MR;>0?G_tCLs#DO*%DrMTUiA2Uq>>}lN??R(AEd%DS6 z=q%r;rw3QrOv)K$We#4d$ikt)YFqxQ`lc_S1nf9w9La~cNy9zZQCbxu<4kcQ>MP!G zxmiGIE@H>$Nxr>9lSD)2xjg$O+66;Tmp&`D6JNs|c7whcR25D3f+sO!$?GIsEY33z zuSpuibwOj``^f{zQL8_+k-P%@7Q@={b_;JD?s;kX2kZ-0YD~3I%pjWH+F#M_2)i|2 zuI_X9dsJh6(kXeg#0kIl&C-BB`IY}_n7lhr7NyjQ69R)IK%zTiKk3|9TLe*&z}KB03t0CzWWY;mr^L>4B32Y7MUya 
zKv~8`Qa-p0ai+8v;guX2GG0~qneGkv7n>pC(6X&JYTdUqN{e6b|JG;1)VZM^xlqd1 z|ElQ>Vj%Xue*Zx!tfnf^B`ibul3oz+-yYqonmatPge#PSQ+^ zOpEE;U5f4?-~ZkQvs{fopXxhj72$x09 z!V6;K{T?Y~owMXCJ!{sE*EG6(fuO!aO#vybm1>$Ii29PtuxJvs?r*tQjU{L$e-Lar*2#*6hqCc=ab z4QN+Aq=I@8oRjXe!iG?SCNElh;^sY6e->2dR))U50&D0Ht2u{`Mo4k zTLQum_)o0(uf>J`8L*N0*Ldj;j7N0u3RD9@?*W#eLY8X|rZS8>?a@=dwWX;?)H2?g zDwxi#$dDm?5`GJ<>fn)ESsKP810meXgEq(V(yx${T*+H8t0V75Yu}4LopS2HczE{< zm%&}7vpwXS8b-wyU=ldj*(j{|+DNy$z{g0Umw!`#w-o2BTT5))T#cr%;H>VsW1O}h zuH6QgBOei7m2&AqaUdgNCH0&4`wJy*{o>QOfD2khr55c*@HC0Icc@~4P4v&~MAK9% zx!E^e5&S6y$Kfl;mSW{gL%J`^2q05yh!wMN={kN*yh6Uw@@B? z)h!?O3Y)8k8ox$$RGw;^w_U>Vpk~|i_H67$U!H@BFE!0Pxvj-rP&jMm&zWbIkdDYf zU8$q+_kzfcfPf-yDr5sD=w;~#vYTS8=v>IJ8~5^k3jYM$Ur>|;SLKUzQ=bc-pOW@Si- z!Q#>_6~(DdV?9TM=ASy9vG#GT#az^R- zd0)HGAd_Itm2%p3`ccNKyOxc2l{Gdr#oUv>0;TOz_+a~_lw$^VPJ7uYKjzH2p{K;E z3&u;&no6|_*A&@n9CavWc;+sZzNoXx@4S8+=;N#7RWVta*er0+9`;xqzGfnNg`4oV zcFzCq`~P!0FB`aG5m3W>6#rjKmRvwU#vxwQZHQPlFDdJ&%}OI8a}f8YI!pRq;?nxT z_fD`oKXt$QBl!Ra@W4hg9!he~q?w_LI|AXdAm~|(h3^_h>&?AYB@R>+_{}AAdyJf9 zyj6VH&?sa<-ORL!EWrQN*U6oe+?BjCJLT3bCEKtvOF~B7W`4?*0sM%SlBffABZgv? 
zKzNss2Mgjls!)S^-jtXV4m)V~?9O>%oaPj3zJ(#RX&|3jnDo!RD(FW?p zxhc#CK^63K4V2ypc7BIo5-l__edfu_ftOz1C8rf@C$&{@tC(AjciHbF92cLrUZ5Zn zKsD)L6q0;74`g2WH?jB&>kdFLMXPN_dyMD)S8_$RFL#Q@qaER?y+2Y#f>dFv(qY~6 zcntF4_51Mh=dDhv*_jK-T&>%kEF4CXs0; z=kk_3h2f&|qTWZT=v*&T$D?7z3(2v_`pQ*rnlgYTfr36tCH+br@POpYIfF(-6Ec(0 zW`ysSb~%REO6pU50{HK7hT*T&G)EB~yJhxYZK*pXt#$%&@=t<3xyDoJxht1+GyH+;g#qa29vXYk@{YU6{TcnQ9rqM-aLFh#A&D)GZ@-4L~eGBbIX zEC|^8f1FwWPq+Oaf7xZt&W&u|36L;oTj?WiX`=?2R8k<>0Wx83bjkB(I%G`Q@%1$3)Cf_ZH;)HN)?v-3Dx`jRfjLog#Ma)K-l4x`CT`6qL%Jsvj zg7&Ow{zK_=5|?jZad(gp4ke~f6v%BD2Esb84 z5_Ivbz}Or~V!p-ZhbQJF>(MN~g*zYZ1YM(#(o#xESUODlC){`{Dx z|K543MDtrj>+p}-M}eu6t-qL%!<(o6m1Nd`*7c__=f9Cl`d?-|M>0@JGeWZi6e<*O zMsUz!B*aRk9!--e49eV>5iaY2!MV2h16WOLrrc`dmn#39=Whm~7;wT>X+{LlU}AtZ z)c~+9TF!F)Y8&EHRE#zn`J=J6F|o5(`+@>nL4fP-vcf$lneu(6S*j%>%BLEM-#}04JV=2NlDioQkj}`jXtAVlkVJR1@x@7KJwoFK>aOmP z)6F5%yc_F|+g=`b%l%_-K6rXY5_=iD(h>0#vxnO^nwo1ZPN zRh|D9zO6m}%Et5K(fIMB@m4x_Y1H}cB&|c;y&{o$Y7+z68y>|0%I`gtYe&uv1X6sX zk#*w5`9RVvL1`6{uM}x`j^vv0!hxxn_3_y3?%oKO-gmuWC*CQ!ld$fUlN>~e2)LpZ z%?Z<)S>L4B#m+|2i>#0_D*}2TcZNQ`;g@Y+xb;R7Je-h9T74!<}1&sZari{ifbp1HfZN(Ef_flT0W$tYd9Xg zioX1#Wc^Ni|MtsEX^d%5X+37vx7IINyUmMpTLYA>JCO0MAlV&hOgUga3YSKkMnp7` z%@QuIte;tUR+oO?vdD0CD(a7;A~F-Lw(&r7+ibN2T~w1$^_n971l)%43h+yF!TK3# z2ZBtA4uRk?vw*61*PKrq2GK5I+x4#6p^=%FfjY81yEt@jJM2j%>bhLVCJ#OIL;=%2 z=>%sgPR;Gd>v(0NjG>>Tl}WJb%XJk~{mY*>slF97~p*{^;^Tf3FHHx;sb$(r_@r}PzX@43VG zRO}Z!bY&p8CgPcrq^eGV_1%!yPY<0uqp`G{HKq*)ffgtKyKwR^KmR`mE&ucrCx1$t z{!iqV|NZPkFtXcv645rJKVjVk<6O4I2vE>BYT*jx?VmHIM#TwxYE;&q9+46675kvs zyA}`Qu%;hC)-{A(pq)DWhTj96xm}c$re&}V)n$PxLUJHl%eglPIvU{z6 zMeje_w~RO)MS5@1Vj`GuEzrF`0GYqB0N)8RUQoc#VaoEDKcFU0$ufh@?HnlLR z4>Yem0w=$pU!2LdWRDrW1g16&N=(sdCPT&WU(w}35v`7pVM(>LQ-Fc9U;U&HBOv(d z^=)Oyx$r|i<#Lb`TpzIv3m&COv>d>Xy#*VNIw9y^IBB{0YJq3bw3==95m|or96Nd6 zcN3Z2)Otx~f(kK$Fx1|{HKx|ONV;t7vaW7&^9B;@0jeA?9n6UQ@X7lQPly_%&hj*Gj=lH!x7_VhN@#?x6#`;|xk!uS0F<#Cga zU0>cz$eR9Oe%C{OoWcUqX+@N|78>FJ+}rH&7W%+uXGgezn6wdWfK+#teq^Qb#4o{1 zFH#?`)@yb%kH1w)KvS1ylb9UV^g~%4rdgn7DJ6!P 
zbz3i=!==vVxwVOeqh39~Wpc_x_yuX4G~nXLwH!}B0%*2i4?7Qox@|U&Y<-)d7A$T! z57D)%GVc1gj&H}F@2~oj|phY4t`k*2dslTy<5-ilw^}ob3F%D>g|tJ zZ`{vcYnRKvbfFZRgLds9Op{!ezH8S%rm{5=xi>jBH9787qIWZnQw1ZB>sgdXI``l8 zatbT!yX}1I9NW=eb|{!C{xd|UKNPqAY;Om{s7Nbtl7e0SW;;6C zHsf6z!|H!wP~_QXYn`sktADLP&zQ%`Eo;Cb#Vx1qYLW2fhao4c=Wm?_eFxlfC(aI- z^?rmIo%xE)z+`({DyxD%2_Xs zyvdBI7%NwLh#;VjQ=WX9y9$U-FAjJXyWV(k>S_H&-gP+I+oi<64BJdYc)_V*w=lmYFRlP^4)2 z1G(Ud?LoGFZ1-Weha=zcO}b)UPPL}3b6D&>kw=RkmD`r6j(G@428gVCWE=Q^#Y0z$ z0^j}<(lTKN(hvJShl(w%+uz|t8G~021TZY~ zQ`uT!=W2lpsz{S6@$iPdr#Kn=l)+Elr*Ia8*xR1R@cAG(6uu{(x#3^s=ILCOKx(aVNR1e~Vf&g+iDa2OClhi|D(?p&+%rk`z0DK?k1hH)g@swOvA@DiH6Q8)Jw0hT! z`E}*zQ-|c<0+RIOfxI*%^VKnC9O?)#-fCrcSNJj$$B=w%{>%dt5F{~hec=x2mJMrk zi?{RKi;IF^?=H1>I^w`2z`GGXYS9ec&r)uuUqoI2)~SNz+?c{zI4t7bBA&ZKj-Ayy zFqbIfb-cW4pm13s(=Bb1`1N(mfcg`@j`G8yIB@IwjS%vew6XsLm{^K*nu70U==R4( zOyfD+2(8KI1a=ZqF69#^uQV**9DU9`I$Xh*eWY9QjJ2754H)llpIsWcrvBw>+$)Ea zp&*rfk1iG89c8l8b7Bdo!8wL#S7!AQKlD6Pyy@&fXn!hs&_{Got|#HFZP%d9XU=_T zu>7uWZ8YsVxURyH$C>ZWmSg}mWa!?&rXk26QoC>>Um}7PY}Y+*MON-%J5;UtG8l^j z&xvL_AF7)uT(l{qWiS>a8YVqh*UD{8`$GBHcbut##IUxzBKU%p+Ax=G?whuCmi&#Z3&YYZvx>c=a zOv`v$Df_<<=%fmg)|ow`uI)eaTETX^RbpzsTT0!Bx_-YSis=$z3bkJrUk*AO@l!6( zEg8^FSY2usM!85Kidj2;zk!TvMs;#If?HQ`D?si*P-QIOa$lWIy$BwT7C`a3Pin_s zWF~Ji?Waass{IwSDbZxh+QebPN47BG&n?QThlceh&vy^*uJF5cis2oWK{ewr;MLdY zTo`hZjSQgq9pAe=tk}g&cI;TYNk#I<6n?8ly@NF@BITl9kZEDB`(g z$v;Efk9RNPZDE~&9$?7>!RS4KL#K)|7K(RUN953L>zDG)bl#@@&e>a=83;+0XG#WsQOe37Yy=pB zG7^1Sl)tMa{D<*ye@%M*DFh}e?s__bHtS29%XwRt8>))-Br83B6}{&YuQpl3Mu{`v(F29 zGb6KpSVg^Z{E;=VtxQ^tB9_6I9VoobMI_*E7Bjv6h~4*hy%ilTF!^2!`fxxLyx;#J z?7H&@?kk)rO`zVBf1r2s7B>*k0Ij^JDrmJ(AoU2QIj`(CBJdVn;4!5tAPNJnVvOlK zAGAQyJt?BqL5&V22Q(LPwOCajvtJCpq-F?vq^Hi1!=CO84AE{01=GW8RMo1g=AO|( zH_z@k-v#?zEhyP7$q9%!(af~o<*3{|$!g~2^1KYIjj1qC-C65d)52V2mN5+8{ZQR$ zZ!-%Eu1iezIUL8u2bhwzolH`c`w^X4}LFNn z;qA&Qi{%=*IYb0;7ZOf}9@Lpek1o=hhnLGE@zX;$2Fb(at>VCL(|cLI$0Nn7m+kQs z42qa0G>8n0C6Eg1`JN2Bss 
ztkEl{6S-ku-+xi^cFi?nuBaSew{WEq=rD<`Twuy~SWDiT>D-n{stoa2k$4{0O!QRR>^YXpT9@eqH_r0>0R+I8 z2ZBpyXzOxJvmST!r#624;*gysl=Vx#40@(&O}jE`UFjgG6;!$v)eB3e=ITdiSIF3X z#n4$ipR@(Z2F-XLCl)tRY`%(KzdBk0O5-xu+Ul;K42YZM&{C951tTm4LwF(IKJY6bf~j^#9djx5EZ4lD3f)nHHcSzq2QR*TAtg< zw6C5j)gTAahFoAt1pNKKj6EG{-bjRCm%mEX#=r+7+ElqXH! zO6M3%lDF{gK&*ZAiXv}f32+*^_e@Py8}7ix&f(L==eyxkvMsEHFYZ4fRY^V}197n3`&=grT+?WjIlX3c@{>1o&Y;UaivD!zes>YG}NP0!Sgh0gxBKOx(E zc;1ouUx(r8x84d&d;@eBOCs?i{0EKRNy8yxXHGs^spO? zvw!-7%O=$IJ9T`wuvz5kgz~2E)se8b)F($66UH&%=X}9Tw$072h zWw~s0{2=MPP%)z(!`HutIA>SxAshI&B@beT~xHsB2R%90ZC zBfOOiEz0{n`Qwk-`AB78`~Q^uJOy@Y0@X*XL@;=QJ-unj;ao};NE?qtvo9$zv_o<| z9=LhuxLa4=>v~o5d^?dYtS>US0`ZAQXkrN#ZP*VH4Mc)Vb4qR#Q4|36S0AC+uLu5w zG!KIk|I8wWLBEQe`mMUC$%XDY>-f-;sAc1CjWZXnA2B-f-uxjn^N&m%TqH{%(1dKT z^x1J)DCM72T1#Be09n`5Fweoj=-U zw76g4-7s4gqOOv$O+w2Xco5fKlGK8Zq&SY6NOosVrJ6N=(7n|#WgtCNVJ~xld!0+c z!DsMsD+Ej%|Ad6|?+SPQ34-iDLYBFv5&I_BL*UGh;EHfHtATW*<&|4pN27wDTJ7du za(9=$oa=is-pU13rap|GN=6@2&@zF8BQd7jUenWv6YN-5841T~IFBE?@XhA*TBrnOOk#tAmnsnANOZJ;koi{B@{sQcQsZp?Yb=A8F(j?EG-QQB(RN!uG0T zi*i)LjlAg_1$uYCXN_d*ts=9#YhO;AN#^Q%WN!iqOOVs6s!h{5>K{zLoC|+#`U^T+;zng)gOpT`$3#NHpvEa#eXu+!7XuW~ zH5eQKDeK!jUZ9?5USq@h>Z>>trmAYPLB260PIbfihJh188=NT$tWju5R>W0iWW6Lt zBv4iSV?8fJryi8eq^E>Z8>9QKetEahrhDzfn5l|A*VTTkdyG2Xs8tS5Z6i)ztTpus z3uqazyvZ0GFn27UA%gqPZX9zsREcaB4z|^iw`UQ8NVQ%+125crx5HnFZ}L>yJP;*d z#|t;NOx&|ky#-2)n1B@L3Bkrbv0a9`({|{4;B4+fRXBwu6Sclr~48}870rR#iYPAot+|RrdccUfQ&w*cBfD4^1W~*T8eHd;>EDBMjc_5Y%7Wt zXM>+18k=V8Dx=_o#ex{hy)Rg1NP;n9$1BU4Wenf!jLc?SHv#z$2yxR~*4te0v*Z8v z03uz_;V0xXf;Kcl)JCP;fvSu&*gxIeSQH;64mYVQ^pOAZH;Bq8|vqZqL`aD8aBOm&N#WRb@W6*$+ zTSfN$KOr|TW4b12O&0&9fHS;_aa-4FUYBegl!0c49OS<;Rm>g!F>UWFY>+`JbNvDO zS#Fz_we(t{r(tnl(h2&zcX+l4FebZ$9-G!_>}PQVItZbb;#^;Ro@VV{Fk5u6iYH+X zx*jOIK-r9W!3MFjlR)q6;}4FaeHgoA4~BXH>HE-64`W#I{M%`&Qm4QAU|0Eb;Z)Q6 zjG3k=>EeOszvW54UjO=BX}g^k3C>+-B$Sy^e=IP!T@5QxFCM5rP8N&y%tBgFh>vP@ z_T*rvO{BTWv2rBI+S0a5hlZCy8uMDMdQ57@#(9X~93?SXI0E_9>7H`gR 
zp!DELQCZ^o;iBp2CXSnYJ{do_O}wb%@)@V@7ztc^k+|o)da(;Bm%-ak5`3b>@FAN; zr)g3#t@y)8J<4m4G+nDpRk~7<60~vnB)dtJh=_;1hljr!uX5q3Ov8Qw|677$TOe_n zNmm1N4h_~(uqwg5fsYEn3$er*IzzR}BwM7)lnW4DJnksB(65*;lKd$R`X@vmP9wKtb&ti zELtM1NaWO4Pq1Pfy4c{2o=U0*wW|G2ex!?l0{!OzlfdhrYn-3v5E5fog1#mO$Oo)1 z076>UgM-#6!Lwga-J8k2`u-6POWrNv$KnsjNjl1VD(kXDK@`-#% z5TDl{R(l|@+FtS%y7Sl;;{xSWQO+aUCrmT03X%>`Ox398Ja?e6f zzAG-RUJ#s&m%;5pNen1NYEkRD>nI~Fn5vFmn3U_BVcM0zJ+@;RGZen+TfuCI3kDJo z^h;Yv2@hqz+~23YPo*)c@fOA6YKw)Bw?`3EERXRLp905U9#X5ZwrnJ9q_7O$Pe=gqhz&e|?_ z{L1&pg(>S{KI`-+U8k$nHSQ%$ZH33Zf--AXaI+r~$#b{X3JLAx*qst$mg zlIt=dwYMC3T~`%T%d0k~D5l#%zvvNOZSM2w9%pmRl(l(5ywlmd%e_hKdLMP?I6^wm zPfaJ1KhMtvD!2_=lIy?rhqEj&HAEl-HzBsIhs@FX^?RE$Qy(ID0@WISLPXCV(U_cn z5W{OPtx?u8)i8H7{JApkWJN95G23&EnY^O%o*_c5=>(77{aWW4qEh6OVG1v9sY|-- z${d^j+VQemRx-{hTTouRz@Sd;E6bCM9iEl&yAQSl#XOt&=f%dVbtJW+|#S?fJhOp9z0^yKuxHcg|@foMbM0HNY%? zD0BgAtXzBb_mkx>MzsH}>1REgw9HE}AVHVUQI%=mfpogT#eN`Sstx1fM!{Wwb0tm= z^UGK(NyO7p+v4&|o6xube05Di?j>brs?)7bAv*|DA*0rsM0-QR6QXIqQ2g4Ly|eAV z*#;JnDx(Vm*M?a?mPlvbn1p-^Q`YY&JuMrzV0a-tA3PS#z((BH(xbeLuCvEZ>K<3G z?C+b>BMnfB8UCTL3xn5%R!Fg43MH?O%}rg@z{l3}B0AF&_E_lj#;Xutz8Q(Q)|FSh$#$w|L|&|3Z@FAhrYY) zt%$OW0<~%wm$HN8dxY3ya*O^&`Q2}O*S~kQLPRsTr|{1-n>xo-i=K3G=2_F?gbBr>KfqlM)o~FhA%o9yT!3 ziYaZcPz_hE{1YWlensY6+c-^u5~%GoKbT8n4}t9hP?xT|1$ye8>?JI?V<%#F#OWj8 zIdhA>BGgtdxVJ}{oqzwDr4u2;g8pVJ>5J={)Qj)=roJcUJL^624rh+^U41tv^Z=?i} zoDRhHN-HU1^4j4^Y2Eoox=RgE_jz35T|dopzRSkla?wFAvU<&q_`Wo|@+DAwbObDD z@!_Xl!%JOS4%)MLq3pngRnltY^etN2Ngx18u>8u!oO+3xj|+}_gL&U&4Ks)WGG))K z3ifa@_FAQE8(*Yr(K#rlD$@j$G4R8FiJ2l6-U>aShs^UN!2#KbrHwfg3g$Vun^c|G zVS#B0hrVx&yQJvhFA*K_v1%XhNTq3*gKyj;3dB%i4d<98=v!D}P#3N0?nf#t!B5~* z4gtG*S+ex8F!V&HauCaQgl0W%z_Qn_SD^6eL+g9DSid@3yU( zcrjP|?jONW%d6K%n()u@`|H&M9pOUY0TPxS7=0-oGT{~wEgkc$yZ!3AJ}bZB)QVg- z?qs&cs~pnV0sEZ7k&aQC%wVqFeqVyy(!gc7e(|Fy>G$&gni!snz<|0xtD|$YHvo8U z9MGIbG!6}I2HjOy^3#9pI4EDVSzsvPcC&J@sK`tFvBeY7Z-v%wXB26>?iu3TL4N0B za?g?|=rnT=j8#DH=_SDPL1+#GQ8n-mi173#dm0bW&E<~tYq~buf72iSk|$gszxVWs 
z8KYX|J5|lzqg=FmpbESg4zoIRFtlRz9Q8oV{vidE;`e&|}6Qzq!xJpGj zh#J>u%$(BJ5pC4~Rr%-01`M7>8k`E&@n)&R(NjgbjVVivj)FignnkR2b@`x6g*Y#VbDtbm%5X2tfDRg6^$Xnio6aI^sLLb?<>P5W-4=j)N$^aUjGYvT8gqzOhtPmL{ZWD9kDMVd=JYF@1`w$-!eP zsrfZ>S25C#3XQp4^)Z1w%&@$Fvi7tx@}<_pgLFYQpP9F5%3(cGxrrf4Zcp>Ta*BKg zZXsLn%SisE`KD`KmI#-qToL=}@YK4a5+CC??i+agqsUriF74$%XD;v$YLtH$TjOqo z6rA(Sd?9YcD_e5?p<$cP@$2J`8xEGq9+Q%gjg>0{F*#1)9CP^z$?Ride?q7t zn^=~|-#pL2E(n3xiltZ~#++!@Ks^02eA$e`=75Mu+dWik=zEn8TU2dh*o<{Pmeshz=}HcQaN0C&NfDHHj**@-38Y^(mchk3Ak*G9~*P9$dK$m-hQr~&C;BY%{Yo2+%zO|yM`hlI`Yu$~Y6r!iV@!trBgTu=bM}e-bGIQ|BibU0`Tp|%KT41?vmKe^k>-dFywctqC-;t+hQkTwh& z)it}YdX~Sp*+VJa!k@eMqUSw}*=UAT7Y2V4Eti?V5@1e&ARnuz zrmN$F;IPm2ZLt7`q|xlsy6Woar^-3kZfUmV{l9t;TEQt+=pF4}z1)97xbfiBn61S8 z>D}Lq2>*0-|AR&UFL?;LhZE>z?e8SR3GHpz(s@wM0&-#UFoiur>ylp&;4qoar>$z<*$bBR?#xV7Lqc=^XU_irt~EUJmerXCc8wmD zg04ZJarQGex;X>V_cV3-Cxk!HWhsIdI_#y@oar&Lqf-sm@i@$#y6If0W$Z3*${s9# zI}aQ91G?!~Ve3EH69yDv0^pYIbu1)+SzZr;%wKy+SNAyAcm&iGu6(HQXJGAfOiVqS z8C@RL0VY>q;m-rdm-A7onYhbMOvKY&Q~a(HtJQ_SD(!?K%Mp|_@Sj+IJ}p%g{EzJz zek)=G;g;caLsU1mtm@G+f)+E903s*(CMq$xuBQV1S8YEHe-BKZBpifw3$1`nCEqyZ z{EJg3O72RZtl@fu7J>1x`rw?t_~#(B(8U7>cvu@9e46cA{ILA~S#$mQ1svbY&lr)G z1)NBxoNVa}m<#h$ZO9aHY?`*iOoO{)Vs~bQrJ64bHp5gYv^;W`nM|&$M*X{*ilq*Xr)6-9f#W6sAg+62Hbaw0pTEs)t7$s2h3^C(TBz zKZ7EoLCNB!DysfEDYq3KB@lcFf4?#X6X{$=nON?se_M5=va)6fr{FnBG_~C=Cu|ZV zp_6$0afQBi@8O0sFLaJ_Ba#px;nWoohTSI*rt?Zq5rQhu{J0J?kOSnQ`h#x01nsYQ z?(#a_#zbLg93;W%w(jv+cf&Q#&>3FVtW3`H@Ui5 zKrr3%Q|+g~nS`59hns|QlfUT7-4s{f*RoM;q0cPGJ0YGy*($sVSe8H{26T(SJXBiI z|5UiRbC+fqjXm1ez(K&@<@Rrd+lcsyx ziu5H->g)~*HVa>`30HNp9$Mk=s>>y~I6ocR7i$nS>l`bc@p6=O4|Rg7fPIJzrSAp{ zSIHFbY@ z|B#gE+QDE3O*GPGMpcamBUg9&m~s>)XU0BF^t5z8x@_z2H(hc;nHm zM8YoD8P_I5{wM*0&ksv#?ouYzf;6%Wb{F^j>PTqD>VnCXQ|0Y#8wo* zxjobb1$OxHk;pwoAT;c2uXx8_v$E`9MKYO zHr70aX_Y&tA|1Cj&^&B4d9Yh70|t*F=`6a9R%Xk_n7QV0t0@=HDI`iES1tUDNXk(rGnplY)Bp)?PPBu}C>C!Mm0=GkkW&TFQIk9-g0b|*)AOvOgLhSeRQL%8 z^LrjhKRN>lp8R2}_Q(*zwEo>BgcN!|Lm#f?r@UzhZU-yEkzx`ReH&a{Kg!&Ny9E;N4o!O+~b5 
zV!1>utf~UB#_JN%m+E;5jmP3+F*4Sd^Xk(hdkw6u4D@%s>`Qp)d<&+aAyC~l?4VqP1&ctq z6E=l-T1n*43YlWEwRVw(l+I4Tuhb0tMvqF>r%y7+(qRCA-nN?LJxt;#{MB>i= zdUwWqn_xs6jk2F1NDV<+4yJWBO*id?Q6*!wGE%c@?EY?i$ysYVKmb z=cKz<(J0KyKJ0!)PsZo_U(^_#*ju=8m@ums6wrl(xdkBI-hhUp+BLpu3`bDLNe0ne zm{s3#gIFc5cCaOSThA$DBJQ28zjI}r?w!%N(|5jp$rPg3|8ZF-P2-o_>1R=0tTW}H zr!c%NxB!USGG+CeaRK&?gkpT?oYT=v(nv36721h7 zw#&DOm+T#LC|d2AeDP+dHnf?=J*XmwE#=sqWcP|j@GI}PACEjaq44#~%ZL{%p)Df~ z#EQ3$M*t~$C~Bx)i}?=R*@^IJIRyfk0goXkZm#8C;v~*nxTx*802WcMf}vS@FUY&N+D|R0BmpIfcdb>xV`9=A5}Hs!py9LSq&s|b&BR4qg{px5@|w0 ziP~r|=592Cds_lh0d#ld1&VhQ2CxifZ$GHps4*9@kqW=+;J;{$X}#~tz0iAIdt8UF z*$ zsCbynQcvPNbBs8H>ozOC1sw2nb@G3!<3(E+dQxv^Slb}j@9Rx~(t4ShAlWSv#mRgv zLIUkmxGB+X0`&sJIIVyTca7L|vcjbwG`pDoac1Jm*jd2oDQi+Cp?~I6gyt_{FSznt zabJbtt_Z>^&78`$+($P>3iqL)j(|7`*H%dtjM~8R5GFhG7JHjXN{8wuir#+cIcmiv zT7I+N&3wj4BAwGGTyb)Y8bV_L-=|i`-5004il#A7i3ha!MQIUS;T7;m&7;gS2d)d9M)iEAt14acQT@u{c1(46-){dAZmf%2$8Ft)7e86S>UL1M{Eo9n(*q(5DQe zJiVFOkk=_=#aE0yNy@EzTCSQ}oDok9oe+EuVj~coAaYr>0*@hb!d<;1@dr1#1TVX& z#g?Pl+-V%SK%9q7+`ioEsI@Po^f#YlrG-7L9~s0-aJIY`p9CR3>{JYccVG%10|Nh0 zfMz~`ZRP%fEx(f;5;7gMyy_WFJ9B-W7JTVZYs^QRS2|9-N-1UB;^!pt_LZa5!6l;N7rMTVi_$i_6D0(N93#xcl>JZJkTA>Z5OXL-_8}`Z^@;P3XtAc0f<_ QZ>ao+B>yj8cl{juUustNoB#j- diff --git a/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py b/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py deleted file mode 100644 index 487ba335bafc..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2d/test_vit_2d.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.builder import build_lr_scheduler -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather - -CONFIG_PATH = 
Path(__file__).parent.parent.joinpath('configs/vit_2d.py') - - -def eval(engine, test_dataloader): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - num_steps = len(test_dataloader) - data_iter = iter(test_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - - output = _gather( - output[0], - ParallelMode.PARALLEL_2D_ROW, - 1 - ) - output = _gather( - output, - ParallelMode.PARALLEL_2D_COL, - 0, - ) - output = torch.argmax(output, dim=-1) - correct = torch.sum(label[0] == output) - correct_sum += correct - total_sum += label[0].size(0) - avg_loss = accumulated_loss / num_steps - return correct_sum, total_sum, avg_loss - - -def train(engine, train_dataloader, lr_scheduler): - engine.train() - accumulated_loss = 0 - num_steps = len(train_dataloader) - data_iter = iter(train_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / num_steps - lr_scheduler.step() - return avg_loss - - -@pytest.mark.dist -@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_2d_parallel_vision_transformer(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) - lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, engine.optimizer) - logger = get_global_dist_logger() - - logger.info('start training') - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine, train_dataloader, lr_scheduler) - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: {correct_sum / total_sum}') - - -if __name__ == '__main__': - 
test_2d_parallel_vision_transformer() diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3.txt deleted file mode 100644 index 54ecbf86960b..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3.txt +++ /dev/null @@ -1,103 +0,0 @@ -TACC: Starting up job 3498212 -TACC: Starting parallel tasks... -warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -epoch: 0, train loss: 1.9590576728995965 -epoch: 1, train loss: 1.6275222167676808 -epoch: 1, eval loss: 1.5277319371700286, correct: 4435, total: 10000, acc = 0.44349998235702515 -epoch: 2, train loss: 1.4355541419009774 -epoch: 3, train loss: 1.3253967445723864 -epoch: 3, eval loss: 1.309086227416992, correct: 5283, total: 10000, acc = 0.5282999873161316 -epoch: 4, train loss: 1.2578775298838714 -epoch: 5, train loss: 1.2231916554120121 -epoch: 5, eval loss: 1.1699816286563873, correct: 5695, total: 10000, acc = 0.5694999694824219 -epoch: 6, train loss: 1.1872552669778162 -epoch: 7, train loss: 1.1616783823285783 -epoch: 7, eval loss: 1.069484794139862, correct: 6183, total: 10000, acc = 0.6182999610900879 -epoch: 8, train loss: 1.1155579333402672 -epoch: 9, train loss: 1.0878059365311448 -epoch: 9, eval loss: 1.0522838592529298, correct: 6202, total: 10000, acc = 0.620199978351593 -epoch: 10, train loss: 1.0780728623575093 -epoch: 11, train loss: 1.0522098152004942 -epoch: 11, eval loss: 1.0902862310409547, correct: 6148, total: 10000, acc = 0.614799976348877 -epoch: 12, train loss: 1.0366473337825464 -epoch: 13, train loss: 1.0067467458394108 -epoch: 13, eval loss: 
0.9696728616952897, correct: 6531, total: 10000, acc = 0.6530999541282654 -epoch: 14, train loss: 0.9676224273078295 -epoch: 15, train loss: 0.9494374029490412 -epoch: 15, eval loss: 0.9511896312236786, correct: 6646, total: 10000, acc = 0.6645999550819397 -epoch: 16, train loss: 0.9231320935852674 -epoch: 17, train loss: 0.9023846679804276 -epoch: 17, eval loss: 0.8728409796953202, correct: 6866, total: 10000, acc = 0.6865999698638916 -epoch: 18, train loss: 0.8684309854799387 -epoch: 19, train loss: 0.836099565637355 -epoch: 19, eval loss: 0.8208363801240921, correct: 7091, total: 10000, acc = 0.7091000080108643 -epoch: 20, train loss: 0.8285067890371595 -epoch: 21, train loss: 0.7930980793067387 -epoch: 21, eval loss: 0.7793890535831451, correct: 7235, total: 10000, acc = 0.7234999537467957 -epoch: 22, train loss: 0.762698369366782 -epoch: 23, train loss: 0.7376812471418964 -epoch: 23, eval loss: 0.746866625547409, correct: 7340, total: 10000, acc = 0.7339999675750732 -epoch: 24, train loss: 0.7071484223920472 -epoch: 25, train loss: 0.6905171658311572 -epoch: 25, eval loss: 0.6909466415643692, correct: 7526, total: 10000, acc = 0.7525999546051025 -epoch: 26, train loss: 0.6608500091397033 -epoch: 27, train loss: 0.65504517907999 -epoch: 27, eval loss: 0.6612646311521531, correct: 7697, total: 10000, acc = 0.7696999907493591 -epoch: 28, train loss: 0.6234641969203949 -epoch: 29, train loss: 0.6107665622720913 -epoch: 29, eval loss: 0.666494044661522, correct: 7704, total: 10000, acc = 0.7703999876976013 -epoch: 30, train loss: 0.5875011883219894 -epoch: 31, train loss: 0.5739485697478665 -epoch: 31, eval loss: 0.6217960953712464, correct: 7828, total: 10000, acc = 0.7827999591827393 -epoch: 32, train loss: 0.548510205684876 -epoch: 33, train loss: 0.5237194764979032 -epoch: 33, eval loss: 0.6254391580820083, correct: 7842, total: 10000, acc = 0.7841999530792236 -epoch: 34, train loss: 0.5154265892140719 -epoch: 35, train loss: 0.494700480176478 -epoch: 35, eval 
loss: 0.5981663644313813, correct: 7963, total: 10000, acc = 0.7962999939918518 -epoch: 36, train loss: 0.4785171020395902 -epoch: 37, train loss: 0.46277919259606576 -epoch: 37, eval loss: 0.6061880439519882, correct: 7958, total: 10000, acc = 0.795799970626831 -epoch: 38, train loss: 0.4398626606075131 -epoch: 39, train loss: 0.4206806777083144 -epoch: 39, eval loss: 0.6158866941928863, correct: 7959, total: 10000, acc = 0.7958999872207642 -epoch: 40, train loss: 0.40768756550185536 -epoch: 41, train loss: 0.39494050035671313 -epoch: 41, eval loss: 0.5725498422980309, correct: 8132, total: 10000, acc = 0.8131999969482422 -epoch: 42, train loss: 0.3742571521778496 -epoch: 43, train loss: 0.3583034301290707 -epoch: 43, eval loss: 0.5765605017542839, correct: 8155, total: 10000, acc = 0.8154999613761902 -epoch: 44, train loss: 0.3342630756752832 -epoch: 45, train loss: 0.31316718063792404 -epoch: 45, eval loss: 0.583588008582592, correct: 8199, total: 10000, acc = 0.8198999762535095 -epoch: 46, train loss: 0.30922748148441315 -epoch: 47, train loss: 0.2906164434187266 -epoch: 47, eval loss: 0.5934860140085221, correct: 8143, total: 10000, acc = 0.814300000667572 -epoch: 48, train loss: 0.2741488078419043 -epoch: 49, train loss: 0.2597196321098172 -epoch: 49, eval loss: 0.5978868633508683, correct: 8195, total: 10000, acc = 0.8194999694824219 -epoch: 50, train loss: 0.2440016470393356 -epoch: 51, train loss: 0.2293997729311184 -epoch: 51, eval loss: 0.5915440261363983, correct: 8232, total: 10000, acc = 0.823199987411499 -epoch: 52, train loss: 0.2132072006257213 -epoch: 53, train loss: 0.19785404767917128 -epoch: 53, eval loss: 0.6171442106366157, correct: 8258, total: 10000, acc = 0.8258000016212463 -epoch: 54, train loss: 0.1838149410121295 -epoch: 55, train loss: 0.17691133977199086 -epoch: 55, eval loss: 0.623777586221695, correct: 8275, total: 10000, acc = 0.8274999856948853 -epoch: 56, train loss: 0.16595362697024735 -epoch: 57, train loss: 0.1531825682946614 
-epoch: 57, eval loss: 0.6466041743755341, correct: 8243, total: 10000, acc = 0.8242999911308289 -epoch: 58, train loss: 0.14334788979316243 -epoch: 59, train loss: 0.13799503377201605 -epoch: 59, eval loss: 0.6496601745486259, correct: 8249, total: 10000, acc = 0.8248999714851379 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3hxmodel.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3hxmodel.txt deleted file mode 100644 index 9bb1bf4bb9ed..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-3hxmodel.txt +++ /dev/null @@ -1,196 +0,0 @@ - -c196-011[rtx](1013)$ bash ./test.sh 1 1 1 0.001 -TACC: Starting up job 3503164 -TACC: Starting parallel tasks... -warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -USE_VANILLA model -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -epoch: 0, train loss: 1.9408839624755236 -epoch: 0, eval loss: 1.7896566271781922, correct: 3488, total: 10000, acc = 0.34880000352859497 -epoch time: 40.82966494560242 -epoch: 1, train loss: 1.6500030257263962 -epoch: 1, eval loss: 1.5464953780174255, correct: 4545, total: 10000, acc = 0.4544999897480011 -epoch time: 40.01254224777222 -epoch: 2, train loss: 1.422887429899099 -epoch: 2, eval loss: 1.37536381483078, correct: 5074, total: 10000, acc = 0.5073999762535095 -epoch time: 40.107905864715576 -epoch: 3, train loss: 1.3217590207956276 -epoch: 3, eval loss: 1.3036327004432677, correct: 5377, total: 10000, acc = 0.5376999974250793 -epoch time: 40.12306189537048 -epoch: 4, train loss: 1.262234352072891 -epoch: 4, eval loss: 1.2568134129047395, correct: 5475, total: 10000, acc = 0.5475000143051147 -epoch time: 40.10755228996277 
-epoch: 5, train loss: 1.2381379117771072 -epoch: 5, eval loss: 1.1941023647785187, correct: 5676, total: 10000, acc = 0.5676000118255615 -epoch time: 40.119303464889526 -epoch: 6, train loss: 1.2061052650821453 -epoch: 6, eval loss: 1.1313925206661224, correct: 5938, total: 10000, acc = 0.5938000082969666 -epoch time: 40.07719683647156 -epoch: 7, train loss: 1.1659562563409611 -epoch: 7, eval loss: 1.125486546754837, correct: 5958, total: 10000, acc = 0.59579998254776 -epoch time: 40.1702299118042 -epoch: 8, train loss: 1.1378972846634534 -epoch: 8, eval loss: 1.082760637998581, correct: 6102, total: 10000, acc = 0.6101999878883362 -epoch time: 40.22099733352661 -epoch: 9, train loss: 1.1073276430976635 -epoch: 9, eval loss: 1.1077564001083373, correct: 6038, total: 10000, acc = 0.6037999987602234 -epoch time: 40.1106858253479 -epoch: 10, train loss: 1.087894769347444 -epoch: 10, eval loss: 1.0400531351566316, correct: 6311, total: 10000, acc = 0.6310999989509583 -epoch time: 40.20973324775696 -epoch: 11, train loss: 1.0556547295074075 -epoch: 11, eval loss: 1.0295817345380782, correct: 6359, total: 10000, acc = 0.6358999609947205 -epoch time: 40.23791980743408 -epoch: 12, train loss: 1.0299884901971232 -epoch: 12, eval loss: 1.003737959265709, correct: 6380, total: 10000, acc = 0.6380000114440918 -epoch time: 40.08779859542847 -epoch: 13, train loss: 0.9972386627781148 -epoch: 13, eval loss: 0.9707699298858643, correct: 6499, total: 10000, acc = 0.649899959564209 -epoch time: 40.10878801345825 -epoch: 14, train loss: 0.9784559072280417 -epoch: 14, eval loss: 0.9253897607326508, correct: 6641, total: 10000, acc = 0.6640999913215637 -epoch time: 40.13168978691101 -epoch: 15, train loss: 0.9409253481699495 -epoch: 15, eval loss: 0.9120320588350296, correct: 6759, total: 10000, acc = 0.6758999824523926 -epoch time: 40.162830114364624 -epoch: 16, train loss: 0.925923115136672 -epoch: 16, eval loss: 0.8850776582956315, correct: 6870, total: 10000, acc = 
0.6869999766349792 -epoch time: 40.145774602890015 -epoch: 17, train loss: 0.8923340841215484 -epoch: 17, eval loss: 0.8570599347352982, correct: 6950, total: 10000, acc = 0.6949999928474426 -epoch time: 40.18058943748474 -epoch: 18, train loss: 0.8638542884466599 -epoch: 18, eval loss: 0.838410159945488, correct: 6971, total: 10000, acc = 0.6970999836921692 -epoch time: 40.110822439193726 -epoch: 19, train loss: 0.8400422529298432 -epoch: 19, eval loss: 0.8189669162034988, correct: 7097, total: 10000, acc = 0.7096999883651733 -epoch time: 40.066970109939575 -epoch: 20, train loss: 0.8072922752828015 -epoch: 20, eval loss: 0.7772788077592849, correct: 7240, total: 10000, acc = 0.7239999771118164 -epoch time: 40.045086145401 -epoch: 21, train loss: 0.788195074821005 -epoch: 21, eval loss: 0.7793144911527634, correct: 7261, total: 10000, acc = 0.726099967956543 -epoch time: 40.05983781814575 -epoch: 22, train loss: 0.7574447350842612 -epoch: 22, eval loss: 0.7660320281982422, correct: 7272, total: 10000, acc = 0.7271999716758728 -epoch time: 40.11693739891052 -epoch: 23, train loss: 0.7402738150285215 -epoch: 23, eval loss: 0.7264292597770691, correct: 7418, total: 10000, acc = 0.7418000102043152 -epoch time: 40.18724513053894 -epoch: 24, train loss: 0.7125097580102026 -epoch: 24, eval loss: 0.7105035990476608, correct: 7506, total: 10000, acc = 0.7505999803543091 -epoch time: 40.1254940032959 -epoch: 25, train loss: 0.6900304744438249 -epoch: 25, eval loss: 0.6911167114973068, correct: 7562, total: 10000, acc = 0.7561999559402466 -epoch time: 40.103896617889404 -epoch: 26, train loss: 0.6648721482072558 -epoch: 26, eval loss: 0.6780407190322876, correct: 7624, total: 10000, acc = 0.7623999714851379 -epoch time: 40.18161463737488 -epoch: 27, train loss: 0.6446310062797702 -epoch: 27, eval loss: 0.6820667266845704, correct: 7612, total: 10000, acc = 0.761199951171875 -epoch time: 40.19018864631653 -epoch: 28, train loss: 0.6262476389505425 -epoch: 28, eval loss: 
0.6506347745656967, correct: 7704, total: 10000, acc = 0.7703999876976013 -epoch time: 40.23526978492737 -epoch: 29, train loss: 0.5968854001590184 -epoch: 29, eval loss: 0.6507940381765366, correct: 7727, total: 10000, acc = 0.7726999521255493 -epoch time: 40.26889181137085 -epoch: 30, train loss: 0.587430303194085 -epoch: 30, eval loss: 0.6333519726991653, correct: 7788, total: 10000, acc = 0.7787999510765076 -epoch time: 40.28285789489746 -epoch: 31, train loss: 0.5701514035463333 -epoch: 31, eval loss: 0.6348810195922852, correct: 7799, total: 10000, acc = 0.7798999547958374 -epoch time: 40.199995040893555 -epoch: 32, train loss: 0.5482188679125845 -epoch: 32, eval loss: 0.6192457497119903, correct: 7833, total: 10000, acc = 0.78329998254776 -epoch time: 40.270729780197144 -epoch: 33, train loss: 0.534268391375639 -epoch: 33, eval loss: 0.6381673783063888, correct: 7790, total: 10000, acc = 0.7789999842643738 -epoch time: 40.36342120170593 -epoch: 34, train loss: 0.5104483384258893 -epoch: 34, eval loss: 0.6173199415206909, correct: 7867, total: 10000, acc = 0.7866999506950378 -epoch time: 40.34266257286072 -epoch: 35, train loss: 0.4968841674984718 -epoch: 35, eval loss: 0.604002220928669, correct: 7916, total: 10000, acc = 0.7915999889373779 -epoch time: 40.39444589614868 -epoch: 36, train loss: 0.4773432207959039 -epoch: 36, eval loss: 0.5884111285209656, correct: 7965, total: 10000, acc = 0.7964999675750732 -epoch time: 40.40647268295288 -epoch: 37, train loss: 0.4621481445370888 -epoch: 37, eval loss: 0.5748852327466011, correct: 8047, total: 10000, acc = 0.8046999573707581 -epoch time: 40.29281520843506 -epoch: 38, train loss: 0.4431859048045411 -epoch: 38, eval loss: 0.5874941781163215, correct: 7995, total: 10000, acc = 0.7994999885559082 -epoch time: 40.40029954910278 -epoch: 39, train loss: 0.4305852785402415 -epoch: 39, eval loss: 0.5991648495197296, correct: 7972, total: 10000, acc = 0.7971999645233154 -epoch time: 40.399904012680054 -epoch: 40, 
train loss: 0.4092241589512144 -epoch: 40, eval loss: 0.5725525215268135, correct: 8069, total: 10000, acc = 0.8068999648094177 -epoch time: 40.32663059234619 -epoch: 41, train loss: 0.39218547179990887 -epoch: 41, eval loss: 0.5886161357164383, correct: 8068, total: 10000, acc = 0.8068000078201294 -epoch time: 40.32424521446228 -epoch: 42, train loss: 0.3773612398274091 -epoch: 42, eval loss: 0.5762413635849952, correct: 8126, total: 10000, acc = 0.8125999569892883 -epoch time: 40.44430422782898 -epoch: 43, train loss: 0.3593267098981507 -epoch: 43, eval loss: 0.5729024946689606, correct: 8107, total: 10000, acc = 0.810699999332428 -epoch time: 40.488121032714844 -epoch: 44, train loss: 0.3396431426612698 -epoch: 44, eval loss: 0.5944831907749176, correct: 8072, total: 10000, acc = 0.8071999549865723 -epoch time: 40.41803979873657 -epoch: 45, train loss: 0.32412939716358574 -epoch: 45, eval loss: 0.5849291861057282, correct: 8171, total: 10000, acc = 0.8170999884605408 -epoch time: 40.428131341934204 -epoch: 46, train loss: 0.3099915471916296 -epoch: 46, eval loss: 0.5797522723674774, correct: 8121, total: 10000, acc = 0.8120999932289124 -epoch time: 40.623990058898926 -epoch: 47, train loss: 0.29422828676749246 -epoch: 47, eval loss: 0.5898703813552857, correct: 8175, total: 10000, acc = 0.8174999952316284 -epoch time: 40.71224045753479 -epoch: 48, train loss: 0.27581544600579205 -epoch: 48, eval loss: 0.5950756087899208, correct: 8170, total: 10000, acc = 0.8169999718666077 -epoch time: 40.53409385681152 -epoch: 49, train loss: 0.26118586242807157 -epoch: 49, eval loss: 0.5998703584074974, correct: 8213, total: 10000, acc = 0.8212999701499939 -epoch time: 40.564385175704956 -epoch: 50, train loss: 0.2513351797753451 -epoch: 50, eval loss: 0.6011391341686249, correct: 8226, total: 10000, acc = 0.8226000070571899 -epoch time: 40.55033254623413 -epoch: 51, train loss: 0.22965944299892505 -epoch: 51, eval loss: 0.5979882061481476, correct: 8233, total: 10000, acc = 
0.8233000040054321 -epoch time: 40.54532980918884 -epoch: 52, train loss: 0.21661002188920975 -epoch: 52, eval loss: 0.6121026620268821, correct: 8220, total: 10000, acc = 0.8219999670982361 -epoch time: 40.649473667144775 -epoch: 53, train loss: 0.20266114950788264 -epoch: 53, eval loss: 0.6016955643892288, correct: 8260, total: 10000, acc = 0.8259999752044678 -epoch time: 40.752054929733276 -epoch: 54, train loss: 0.19287180794136866 -epoch: 54, eval loss: 0.6043265879154205, correct: 8284, total: 10000, acc = 0.8283999562263489 -epoch time: 40.68043255805969 -epoch: 55, train loss: 0.175087109208107 -epoch: 55, eval loss: 0.6146622076630592, correct: 8316, total: 10000, acc = 0.8315999507904053 -epoch time: 40.58446717262268 -epoch: 56, train loss: 0.16749868762432313 -epoch: 56, eval loss: 0.6235148012638092, correct: 8313, total: 10000, acc = 0.8312999606132507 -epoch time: 40.62826180458069 -epoch: 57, train loss: 0.15567801619062618 -epoch: 57, eval loss: 0.6325852945446968, correct: 8308, total: 10000, acc = 0.8307999968528748 -epoch time: 40.72224497795105 -epoch: 58, train loss: 0.1484297229623308 -epoch: 58, eval loss: 0.6329193383455276, correct: 8325, total: 10000, acc = 0.8324999809265137 -epoch time: 40.750558614730835 -epoch: 59, train loss: 0.14238623818572688 -epoch: 59, eval loss: 0.6318104699254036, correct: 8329, total: 10000, acc = 0.8328999876976013 -epoch time: 40.77172636985779 -finish training \ No newline at end of file diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4.txt deleted file mode 100644 index d7404eea68b3..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4.txt +++ /dev/null @@ -1,103 +0,0 @@ -TACC: Starting up job 3498663 -TACC: Starting parallel tasks... 
-warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -epoch: 0, train loss: 2.095031557034473 -epoch: 1, train loss: 1.8454539605549403 -epoch: 1, eval loss: 1.7768513083457946, correct: 3564, total: 10000, acc = 0.3563999831676483 -epoch: 2, train loss: 1.7044833728245325 -epoch: 3, train loss: 1.5999061124665397 -epoch: 3, eval loss: 1.5574450254440309, correct: 4389, total: 10000, acc = 0.4388999938964844 -epoch: 4, train loss: 1.4929670217085858 -epoch: 5, train loss: 1.401450170546162 -epoch: 5, eval loss: 1.4644017696380616, correct: 4857, total: 10000, acc = 0.48569998145103455 -epoch: 6, train loss: 1.319102376091237 -epoch: 7, train loss: 1.2555806539496597 -epoch: 7, eval loss: 1.2475590467453004, correct: 5486, total: 10000, acc = 0.5485999584197998 -epoch: 8, train loss: 1.1992503173497258 -epoch: 9, train loss: 1.1600336493278036 -epoch: 9, eval loss: 1.1786625683307648, correct: 5834, total: 10000, acc = 0.5834000110626221 -epoch: 10, train loss: 1.1214540807568296 -epoch: 11, train loss: 1.0808329728184913 -epoch: 11, eval loss: 1.096825110912323, correct: 6072, total: 10000, acc = 0.6071999669075012 -epoch: 12, train loss: 1.0521019423494533 -epoch: 13, train loss: 1.0262362957000732 -epoch: 13, eval loss: 1.056444275379181, correct: 6268, total: 10000, acc = 0.626800000667572 -epoch: 14, train loss: 0.9932536555796253 -epoch: 15, train loss: 0.9653559442685575 -epoch: 15, eval loss: 0.9576991081237793, correct: 6582, total: 10000, acc = 0.6581999659538269 -epoch: 16, train loss: 0.9465620943478176 -epoch: 17, train loss: 0.9181081974992946 -epoch: 17, eval loss: 0.9245584070682525, correct: 6747, total: 10000, acc = 0.6746999621391296 -epoch: 18, 
train loss: 0.8987109752333894 -epoch: 19, train loss: 0.8840238646585115 -epoch: 19, eval loss: 0.8989996433258056, correct: 6787, total: 10000, acc = 0.6786999702453613 -epoch: 20, train loss: 0.8591911811001447 -epoch: 21, train loss: 0.843510093129411 -epoch: 21, eval loss: 0.8595858901739121, correct: 6969, total: 10000, acc = 0.6969000101089478 -epoch: 22, train loss: 0.8306782276046519 -epoch: 23, train loss: 0.8181647640101763 -epoch: 23, eval loss: 0.8600298583507537, correct: 7005, total: 10000, acc = 0.7005000114440918 -epoch: 24, train loss: 0.7964763343334198 -epoch: 25, train loss: 0.7840689718723297 -epoch: 25, eval loss: 0.824479615688324, correct: 7073, total: 10000, acc = 0.7073000073432922 -epoch: 26, train loss: 0.7709570752114666 -epoch: 27, train loss: 0.7591698108887186 -epoch: 27, eval loss: 0.7967212647199631, correct: 7196, total: 10000, acc = 0.7195999622344971 -epoch: 28, train loss: 0.7438001352913526 -epoch: 29, train loss: 0.7341659853653032 -epoch: 29, eval loss: 0.8041222035884857, correct: 7168, total: 10000, acc = 0.7167999744415283 -epoch: 30, train loss: 0.7254330929444761 -epoch: 31, train loss: 0.710246913895315 -epoch: 31, eval loss: 0.7848481118679047, correct: 7287, total: 10000, acc = 0.7286999821662903 -epoch: 32, train loss: 0.6976562008565786 -epoch: 33, train loss: 0.6906438475968887 -epoch: 33, eval loss: 0.7644171923398971, correct: 7370, total: 10000, acc = 0.7369999885559082 -epoch: 34, train loss: 0.6795850834067987 -epoch: 35, train loss: 0.6724951656497254 -epoch: 35, eval loss: 0.7515032321214676, correct: 7368, total: 10000, acc = 0.736799955368042 -epoch: 36, train loss: 0.6527298372619006 -epoch: 37, train loss: 0.651018523440069 -epoch: 37, eval loss: 0.7381327033042908, correct: 7449, total: 10000, acc = 0.7448999881744385 -epoch: 38, train loss: 0.6365304406808348 -epoch: 39, train loss: 0.6372388047831399 -epoch: 39, eval loss: 0.7342826008796692, correct: 7453, total: 10000, acc = 0.7452999949455261 
-epoch: 40, train loss: 0.6199644664112403 -epoch: 41, train loss: 0.6101092303894005 -epoch: 41, eval loss: 0.7353240340948105, correct: 7466, total: 10000, acc = 0.7465999722480774 -epoch: 42, train loss: 0.6093496211937496 -epoch: 43, train loss: 0.6019633388032719 -epoch: 43, eval loss: 0.7350291252136231, correct: 7479, total: 10000, acc = 0.7479000091552734 -epoch: 44, train loss: 0.5928211437196148 -epoch: 45, train loss: 0.5840530048827736 -epoch: 45, eval loss: 0.7301350146532058, correct: 7525, total: 10000, acc = 0.7524999976158142 -epoch: 46, train loss: 0.578370426078232 -epoch: 47, train loss: 0.5703256440405943 -epoch: 47, eval loss: 0.7226948082447052, correct: 7526, total: 10000, acc = 0.7525999546051025 -epoch: 48, train loss: 0.5622531275968162 -epoch: 49, train loss: 0.5543749076979501 -epoch: 49, eval loss: 0.7278151929378509, correct: 7536, total: 10000, acc = 0.753600001335144 -epoch: 50, train loss: 0.5494355583677486 -epoch: 51, train loss: 0.5427058047177841 -epoch: 51, eval loss: 0.7180711388587951, correct: 7608, total: 10000, acc = 0.7608000040054321 -epoch: 52, train loss: 0.5323820530760045 -epoch: 53, train loss: 0.5341374232452742 -epoch: 53, eval loss: 0.7136827558279037, correct: 7618, total: 10000, acc = 0.7617999911308289 -epoch: 54, train loss: 0.5295403867351766 -epoch: 55, train loss: 0.5226148692320804 -epoch: 55, eval loss: 0.7158426463603973, correct: 7624, total: 10000, acc = 0.7623999714851379 -epoch: 56, train loss: 0.5206544593888887 -epoch: 57, train loss: 0.5186455438331682 -epoch: 57, eval loss: 0.7141193479299546, correct: 7611, total: 10000, acc = 0.7610999941825867 -epoch: 58, train loss: 0.5130856335163116 -epoch: 59, train loss: 0.5103850683995655 -epoch: 59, eval loss: 0.7077989399433136, correct: 7628, total: 10000, acc = 0.7627999782562256 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4hxmodel.txt 
b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4hxmodel.txt deleted file mode 100644 index 72889a455791..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/111log1e-4hxmodel.txt +++ /dev/null @@ -1,195 +0,0 @@ -c196-012[rtx](1006)$ bash ./test.sh 1 1 1 0.0001 -TACC: Starting up job 3503177 -TACC: Starting parallel tasks... -warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -USE_VANILLA model -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -epoch: 0, train loss: 2.07912605757616 -epoch: 0, eval loss: 1.9337591707706452, correct: 2845, total: 10000, acc = 0.28450000286102295 -epoch time: 48.79993748664856 -epoch: 1, train loss: 1.8506990890113675 -epoch: 1, eval loss: 1.7832269430160523, correct: 3506, total: 10000, acc = 0.350600004196167 -epoch time: 39.10968255996704 -epoch: 2, train loss: 1.707400695401795 -epoch: 2, eval loss: 1.6983122050762176, correct: 3935, total: 10000, acc = 0.3935000002384186 -epoch time: 39.205119609832764 -epoch: 3, train loss: 1.5925798574272467 -epoch: 3, eval loss: 1.6361137092113496, correct: 4276, total: 10000, acc = 0.4275999963283539 -epoch time: 39.220152378082275 -epoch: 4, train loss: 1.4817699790000916 -epoch: 4, eval loss: 1.4869949519634247, correct: 4706, total: 10000, acc = 0.4705999791622162 -epoch time: 39.297648191452026 -epoch: 5, train loss: 1.3685331247290786 -epoch: 5, eval loss: 1.4110832333564758, correct: 5043, total: 10000, acc = 0.5042999982833862 -epoch time: 39.31484127044678 -epoch: 6, train loss: 1.283743022655954 -epoch: 6, eval loss: 1.317776972055435, correct: 5320, total: 10000, acc = 0.5320000052452087 -epoch time: 39.31891870498657 -epoch: 7, train loss: 1.2292176107971036 -epoch: 7, eval 
loss: 1.2397323846817017, correct: 5619, total: 10000, acc = 0.5618999600410461 -epoch time: 39.31014013290405 -epoch: 8, train loss: 1.1705418606193698 -epoch: 8, eval loss: 1.2041720151901245, correct: 5696, total: 10000, acc = 0.569599986076355 -epoch time: 39.29190945625305 -epoch: 9, train loss: 1.1253369718181843 -epoch: 9, eval loss: 1.1219275832176208, correct: 6039, total: 10000, acc = 0.6038999557495117 -epoch time: 39.314892053604126 -epoch: 10, train loss: 1.0875617825255102 -epoch: 10, eval loss: 1.1398449420928956, correct: 5921, total: 10000, acc = 0.5920999646186829 -epoch time: 39.29768466949463 -epoch: 11, train loss: 1.055325626110544 -epoch: 11, eval loss: 1.0739773243665696, correct: 6212, total: 10000, acc = 0.6211999654769897 -epoch time: 39.26834416389465 -epoch: 12, train loss: 1.0238730627663282 -epoch: 12, eval loss: 1.0526267528533935, correct: 6244, total: 10000, acc = 0.6243999600410461 -epoch time: 39.30522894859314 -epoch: 13, train loss: 0.9906492087305808 -epoch: 13, eval loss: 1.0342225402593612, correct: 6295, total: 10000, acc = 0.6294999718666077 -epoch time: 39.28985071182251 -epoch: 14, train loss: 0.968360669758855 -epoch: 14, eval loss: 0.9747557610273361, correct: 6498, total: 10000, acc = 0.6498000025749207 -epoch time: 39.33563685417175 -epoch: 15, train loss: 0.9413909072778663 -epoch: 15, eval loss: 0.9359912216663361, correct: 6659, total: 10000, acc = 0.6658999919891357 -epoch time: 39.332377672195435 -epoch: 16, train loss: 0.9215109226654987 -epoch: 16, eval loss: 0.9215879321098328, correct: 6693, total: 10000, acc = 0.6692999601364136 -epoch time: 39.35148882865906 -epoch: 17, train loss: 0.9036085179873875 -epoch: 17, eval loss: 0.8947311192750931, correct: 6787, total: 10000, acc = 0.6786999702453613 -epoch time: 39.31995511054993 -epoch: 18, train loss: 0.8774841433885147 -epoch: 18, eval loss: 0.8880111247301101, correct: 6844, total: 10000, acc = 0.6843999624252319 -epoch time: 39.32100558280945 -epoch: 19, 
train loss: 0.8607137598553483 -epoch: 19, eval loss: 0.8770220369100571, correct: 6883, total: 10000, acc = 0.6882999539375305 -epoch time: 39.3321533203125 -epoch: 20, train loss: 0.8482279163234088 -epoch: 20, eval loss: 0.8661656975746155, correct: 6926, total: 10000, acc = 0.6926000118255615 -epoch time: 39.319167613983154 -epoch: 21, train loss: 0.8280732814146547 -epoch: 21, eval loss: 0.8369802534580231, correct: 7041, total: 10000, acc = 0.7040999531745911 -epoch time: 39.32543706893921 -epoch: 22, train loss: 0.8162973212952517 -epoch: 22, eval loss: 0.8281545102596283, correct: 7096, total: 10000, acc = 0.7095999717712402 -epoch time: 39.344929695129395 -epoch: 23, train loss: 0.8043988426120914 -epoch: 23, eval loss: 0.8369941651821137, correct: 7070, total: 10000, acc = 0.7069999575614929 -epoch time: 39.342397928237915 -epoch: 24, train loss: 0.788704516328111 -epoch: 24, eval loss: 0.8305304765701294, correct: 7040, total: 10000, acc = 0.7039999961853027 -epoch time: 39.349589347839355 -epoch: 25, train loss: 0.7747861517935383 -epoch: 25, eval loss: 0.8025588423013688, correct: 7164, total: 10000, acc = 0.7163999676704407 -epoch time: 39.35692596435547 -epoch: 26, train loss: 0.7557641073149077 -epoch: 26, eval loss: 0.7929455429315567, correct: 7204, total: 10000, acc = 0.7203999757766724 -epoch time: 39.36091661453247 -epoch: 27, train loss: 0.7422851062550837 -epoch: 27, eval loss: 0.7790816932916641, correct: 7249, total: 10000, acc = 0.7249000072479248 -epoch time: 39.355828046798706 -epoch: 28, train loss: 0.7305653861590794 -epoch: 28, eval loss: 0.7937072366476059, correct: 7204, total: 10000, acc = 0.7203999757766724 -epoch time: 39.3598473072052 -epoch: 29, train loss: 0.719313730998915 -epoch: 29, eval loss: 0.7657937437295914, correct: 7320, total: 10000, acc = 0.7319999933242798 -epoch time: 39.353551626205444 -epoch: 30, train loss: 0.7127084263733455 -epoch: 30, eval loss: 0.7556168884038925, correct: 7341, total: 10000, acc = 
0.7340999841690063 -epoch time: 39.37097501754761 -epoch: 31, train loss: 0.7044506967067719 -epoch: 31, eval loss: 0.7438590109348298, correct: 7359, total: 10000, acc = 0.7358999848365784 -epoch time: 39.37364745140076 -epoch: 32, train loss: 0.6920064693810989 -epoch: 32, eval loss: 0.7408553540706635, correct: 7419, total: 10000, acc = 0.7418999671936035 -epoch time: 39.372353076934814 -epoch: 33, train loss: 0.6790882920732304 -epoch: 33, eval loss: 0.7541307628154754, correct: 7332, total: 10000, acc = 0.733199954032898 -epoch time: 39.310251235961914 -epoch: 34, train loss: 0.6666433202977083 -epoch: 34, eval loss: 0.7413494348526001, correct: 7401, total: 10000, acc = 0.7400999665260315 -epoch time: 39.394805908203125 -epoch: 35, train loss: 0.6561720742254841 -epoch: 35, eval loss: 0.7245241671800613, correct: 7483, total: 10000, acc = 0.7482999563217163 -epoch time: 39.34455704689026 -epoch: 36, train loss: 0.6433814526820669 -epoch: 36, eval loss: 0.7294039458036423, correct: 7483, total: 10000, acc = 0.7482999563217163 -epoch time: 39.337549924850464 -epoch: 37, train loss: 0.6366085136423305 -epoch: 37, eval loss: 0.7336494833230972, correct: 7462, total: 10000, acc = 0.7461999654769897 -epoch time: 39.338196754455566 -epoch: 38, train loss: 0.6294400272320728 -epoch: 38, eval loss: 0.719609409570694, correct: 7532, total: 10000, acc = 0.7531999945640564 -epoch time: 39.33430027961731 -epoch: 39, train loss: 0.6179663903859197 -epoch: 39, eval loss: 0.7210630685091018, correct: 7507, total: 10000, acc = 0.7506999969482422 -epoch time: 39.33643341064453 -epoch: 40, train loss: 0.6102935781284254 -epoch: 40, eval loss: 0.6994094282388688, correct: 7569, total: 10000, acc = 0.7568999528884888 -epoch time: 39.38672637939453 -epoch: 41, train loss: 0.5990810029360712 -epoch: 41, eval loss: 0.7133035778999328, correct: 7550, total: 10000, acc = 0.7549999952316284 -epoch time: 39.374757528305054 -epoch: 42, train loss: 0.5964441865074391 -epoch: 42, eval 
loss: 0.7060712993144989, correct: 7577, total: 10000, acc = 0.7576999664306641 -epoch time: 39.4019033908844 -epoch: 43, train loss: 0.5878602710305428 -epoch: 43, eval loss: 0.7106044471263886, correct: 7580, total: 10000, acc = 0.7579999566078186 -epoch time: 39.408252477645874 -epoch: 44, train loss: 0.5797601254010687 -epoch: 44, eval loss: 0.7093768745660782, correct: 7568, total: 10000, acc = 0.7567999958992004 -epoch time: 39.40289378166199 -epoch: 45, train loss: 0.5684604742089097 -epoch: 45, eval loss: 0.7075642883777619, correct: 7612, total: 10000, acc = 0.761199951171875 -epoch time: 39.35792422294617 -epoch: 46, train loss: 0.5617077308041709 -epoch: 46, eval loss: 0.707081851363182, correct: 7576, total: 10000, acc = 0.7576000094413757 -epoch time: 39.37784481048584 -epoch: 47, train loss: 0.5572127462649832 -epoch: 47, eval loss: 0.7069586098194123, correct: 7606, total: 10000, acc = 0.7605999708175659 -epoch time: 39.33794188499451 -epoch: 48, train loss: 0.5519619742218329 -epoch: 48, eval loss: 0.6923990368843078, correct: 7679, total: 10000, acc = 0.7678999900817871 -epoch time: 39.39500594139099 -epoch: 49, train loss: 0.5454421751961416 -epoch: 49, eval loss: 0.7032370567321777, correct: 7626, total: 10000, acc = 0.7626000046730042 -epoch time: 39.38570594787598 -epoch: 50, train loss: 0.5419908360559114 -epoch: 50, eval loss: 0.6949253618717194, correct: 7669, total: 10000, acc = 0.7669000029563904 -epoch time: 39.334325551986694 -epoch: 51, train loss: 0.5299993215166793 -epoch: 51, eval loss: 0.6966427147388459, correct: 7654, total: 10000, acc = 0.7653999924659729 -epoch time: 39.337984561920166 -epoch: 52, train loss: 0.5282451452649369 -epoch: 52, eval loss: 0.6932955116033555, correct: 7664, total: 10000, acc = 0.7663999795913696 -epoch time: 39.34237813949585 -epoch: 53, train loss: 0.5234840703862054 -epoch: 53, eval loss: 0.6988086104393005, correct: 7654, total: 10000, acc = 0.7653999924659729 -epoch time: 39.364726066589355 
-epoch: 54, train loss: 0.5139317989957576 -epoch: 54, eval loss: 0.6950253814458847, correct: 7643, total: 10000, acc = 0.7642999887466431 -epoch time: 39.40451097488403 -epoch: 55, train loss: 0.5158528734226616 -epoch: 55, eval loss: 0.6978882610797882, correct: 7672, total: 10000, acc = 0.7671999931335449 -epoch time: 39.38926696777344 -epoch: 56, train loss: 0.5082419429506574 -epoch: 56, eval loss: 0.6909049898386002, correct: 7692, total: 10000, acc = 0.7691999673843384 -epoch time: 39.42493271827698 -epoch: 57, train loss: 0.5027476120360044 -epoch: 57, eval loss: 0.6897687911987305, correct: 7695, total: 10000, acc = 0.7694999575614929 -epoch time: 39.35954570770264 -epoch: 58, train loss: 0.5053188776483342 -epoch: 58, eval loss: 0.6899506479501725, correct: 7667, total: 10000, acc = 0.7666999697685242 -epoch time: 39.44884634017944 -epoch: 59, train loss: 0.4997740634241883 -epoch: 59, eval loss: 0.687486720085144, correct: 7678, total: 10000, acc = 0.767799973487854 -epoch time: 39.391881465911865 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-3.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-3.txt deleted file mode 100644 index 213cc80fe3a0..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-3.txt +++ /dev/null @@ -1,115 +0,0 @@ -TACC: Starting up job 3497142 -TACC: Starting parallel tasks... 
-warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -warning: variables which starts with __, is a module or class declaration are omitted -process rank 2 is bound to device 2 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 3 is bound to device 3 -Files already downloaded and verified -Files already downloaded and verified -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -warning: variables which starts with __, is a module or class declaration are omitted -process rank 1 is bound to device 1 -Files already downloaded and verified -Files already downloaded and verified -epoch: 0, train loss: 1.9320369898056498 -epoch: 1, train loss: 1.6352128605453335 -epoch: 1, eval loss: 1.5123237550258637, correct: 4542, total: 10000, acc = 0.45419999957084656 -epoch: 2, train loss: 1.4457968728882926 -epoch: 3, train loss: 1.3382204977833494 -epoch: 3, eval loss: 1.2539702713489533, correct: 5451, total: 10000, acc = 0.5450999736785889 -epoch: 4, train loss: 1.2739947474732691 -epoch: 5, train loss: 1.2285400483073021 -epoch: 5, eval loss: 1.1386113047599793, correct: 5908, total: 10000, acc = 0.5907999873161316 -epoch: 6, train loss: 1.1903334296479517 -epoch: 7, train loss: 1.1711674235305007 -epoch: 7, eval loss: 1.1258068561553956, correct: 5967, total: 10000, acc = 0.5967000126838684 -epoch: 8, train loss: 1.1419668745021432 -epoch: 9, train loss: 1.1143895728247506 -epoch: 9, eval loss: 1.040754759311676, correct: 6224, total: 10000, acc = 0.6223999857902527 -epoch: 10, train loss: 1.1041023871120141 -epoch: 11, train loss: 1.089750115968743 -epoch: 11, eval loss: 1.0472844064235687, 
correct: 6265, total: 10000, acc = 0.6265000104904175 -epoch: 12, train loss: 1.064698440687997 -epoch: 13, train loss: 1.038266262229608 -epoch: 13, eval loss: 1.0117274671792984, correct: 6415, total: 10000, acc = 0.6414999961853027 -epoch: 14, train loss: 1.029945282303557 -epoch: 15, train loss: 1.0171620620756734 -epoch: 15, eval loss: 0.9712629705667496, correct: 6519, total: 10000, acc = 0.6518999934196472 -epoch: 16, train loss: 0.9928132119227429 -epoch: 17, train loss: 0.9921575498824217 -epoch: 17, eval loss: 0.9429782271385193, correct: 6641, total: 10000, acc = 0.6640999913215637 -epoch: 18, train loss: 0.9607366293060536 -epoch: 19, train loss: 0.9427766927650997 -epoch: 19, eval loss: 0.9346068739891052, correct: 6623, total: 10000, acc = 0.6622999906539917 -epoch: 20, train loss: 0.9219280481338501 -epoch: 21, train loss: 0.8945026689646195 -epoch: 21, eval loss: 0.8710516095161438, correct: 6909, total: 10000, acc = 0.6908999681472778 -epoch: 22, train loss: 0.8807675826306246 -epoch: 23, train loss: 0.851514169756247 -epoch: 23, eval loss: 0.8239740908145905, correct: 7052, total: 10000, acc = 0.7051999568939209 -epoch: 24, train loss: 0.8388774534877466 -epoch: 25, train loss: 0.8265813291072845 -epoch: 25, eval loss: 0.8102335959672928, correct: 7137, total: 10000, acc = 0.713699996471405 -epoch: 26, train loss: 0.8057564490911912 -epoch: 27, train loss: 0.7816558753957554 -epoch: 27, eval loss: 0.7648743063211441, correct: 7292, total: 10000, acc = 0.729200005531311 -epoch: 28, train loss: 0.766656969883004 -epoch: 29, train loss: 0.7515677390049915 -epoch: 29, eval loss: 0.7517296761274338, correct: 7360, total: 10000, acc = 0.7360000014305115 -epoch: 30, train loss: 0.7300611174836451 -epoch: 31, train loss: 0.7038229193006244 -epoch: 31, eval loss: 0.7385401755571366, correct: 7375, total: 10000, acc = 0.7374999523162842 -epoch: 32, train loss: 0.6928578931458143 -epoch: 33, train loss: 0.672958068093475 -epoch: 33, eval loss: 
0.6915913820266724, correct: 7596, total: 10000, acc = 0.7595999836921692 -epoch: 34, train loss: 0.6505378533382805 -epoch: 35, train loss: 0.6292881539889744 -epoch: 35, eval loss: 0.7068031072616577, correct: 7567, total: 10000, acc = 0.7566999793052673 -epoch: 36, train loss: 0.6092992303322773 -epoch: 37, train loss: 0.5922880838720166 -epoch: 37, eval loss: 0.6735526144504547, correct: 7662, total: 10000, acc = 0.7662000060081482 -epoch: 38, train loss: 0.5777627850065425 -epoch: 39, train loss: 0.562178050376931 -epoch: 39, eval loss: 0.6323211371898652, correct: 7799, total: 10000, acc = 0.7798999547958374 -epoch: 40, train loss: 0.5385949274106901 -epoch: 41, train loss: 0.5233490755971597 -epoch: 41, eval loss: 0.6360922038555146, correct: 7806, total: 10000, acc = 0.7805999517440796 -epoch: 42, train loss: 0.50960702373057 -epoch: 43, train loss: 0.48859657985823496 -epoch: 43, eval loss: 0.607847985625267, correct: 7914, total: 10000, acc = 0.7913999557495117 -epoch: 44, train loss: 0.47382923291654006 -epoch: 45, train loss: 0.45052725380780745 -epoch: 45, eval loss: 0.5986941397190094, correct: 8012, total: 10000, acc = 0.8011999726295471 -epoch: 46, train loss: 0.43711013392526277 -epoch: 47, train loss: 0.42507915229213483 -epoch: 47, eval loss: 0.5871582478284836, correct: 8002, total: 10000, acc = 0.8001999855041504 -epoch: 48, train loss: 0.40591827947266246 -epoch: 49, train loss: 0.3911267008100237 -epoch: 49, eval loss: 0.5832945287227631, correct: 8047, total: 10000, acc = 0.8046999573707581 -epoch: 50, train loss: 0.3770884950550235 -epoch: 51, train loss: 0.3587312725733738 -epoch: 51, eval loss: 0.5942261666059494, correct: 8073, total: 10000, acc = 0.8072999715805054 -epoch: 52, train loss: 0.34132662324272856 -epoch: 53, train loss: 0.3267737687850485 -epoch: 53, eval loss: 0.5920912757515907, correct: 8118, total: 10000, acc = 0.8118000030517578 -epoch: 54, train loss: 0.3116904997399875 -epoch: 55, train loss: 0.30321489380938665 
-epoch: 55, eval loss: 0.5957943320274353, correct: 8082, total: 10000, acc = 0.8082000017166138 -epoch: 56, train loss: 0.2874147834218278 -epoch: 57, train loss: 0.27991348140093747 -epoch: 57, eval loss: 0.5895262002944947, correct: 8165, total: 10000, acc = 0.8165000081062317 -epoch: 58, train loss: 0.274563160173747 -epoch: 59, train loss: 0.2600744918596988 -epoch: 59, eval loss: 0.5934095367789268, correct: 8150, total: 10000, acc = 0.8149999976158142 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-4.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-4.txt deleted file mode 100644 index 513037271877..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/421log1e-4.txt +++ /dev/null @@ -1,115 +0,0 @@ -TACC: Starting up job 3498509 -TACC: Starting parallel tasks... -warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -warning: variables which starts with __, is a module or class declaration are omitted -process rank 2 is bound to device 2 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 3 is bound to device 3 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 1 is bound to device 1 -Files already downloaded and verified -Files already downloaded and verified -epoch: 0, train loss: 2.107759721425115 -epoch: 1, train loss: 1.8388929500871776 -epoch: 1, eval loss: 1.7622965753078461, correct: 3535, total: 
10000, acc = 0.35349997878074646 -epoch: 2, train loss: 1.7141443588295762 -epoch: 3, train loss: 1.6003259931291853 -epoch: 3, eval loss: 1.608506625890732, correct: 4263, total: 10000, acc = 0.4262999892234802 -epoch: 4, train loss: 1.5016733225511045 -epoch: 5, train loss: 1.4050611877927974 -epoch: 5, eval loss: 1.386299443244934, correct: 4984, total: 10000, acc = 0.4983999729156494 -epoch: 6, train loss: 1.3264902623332278 -epoch: 7, train loss: 1.2681689250225923 -epoch: 7, eval loss: 1.3251740992069245, correct: 5295, total: 10000, acc = 0.5295000076293945 -epoch: 8, train loss: 1.2236176984650748 -epoch: 9, train loss: 1.172800781775494 -epoch: 9, eval loss: 1.1429427027702332, correct: 5966, total: 10000, acc = 0.5965999960899353 -epoch: 10, train loss: 1.1335287532027887 -epoch: 11, train loss: 1.0974334563527788 -epoch: 11, eval loss: 1.1024536848068238, correct: 6107, total: 10000, acc = 0.6107000112533569 -epoch: 12, train loss: 1.0638826300903244 -epoch: 13, train loss: 1.0406859383291127 -epoch: 13, eval loss: 1.0324654281139374, correct: 6282, total: 10000, acc = 0.6281999945640564 -epoch: 14, train loss: 1.0157714376644211 -epoch: 15, train loss: 0.990898135365272 -epoch: 15, eval loss: 0.9790050059556961, correct: 6539, total: 10000, acc = 0.6538999676704407 -epoch: 16, train loss: 0.963820260398242 -epoch: 17, train loss: 0.9404383374720203 -epoch: 17, eval loss: 0.9367435872554779, correct: 6691, total: 10000, acc = 0.6690999865531921 -epoch: 18, train loss: 0.9299906589546982 -epoch: 19, train loss: 0.9038882474510037 -epoch: 19, eval loss: 0.9210823565721512, correct: 6709, total: 10000, acc = 0.6708999872207642 -epoch: 20, train loss: 0.8825302799137271 -epoch: 21, train loss: 0.8686576388320144 -epoch: 21, eval loss: 0.8791542768478393, correct: 6913, total: 10000, acc = 0.6912999749183655 -epoch: 22, train loss: 0.8509396040926174 -epoch: 23, train loss: 0.8375457452268017 -epoch: 23, eval loss: 0.8651147484779358, correct: 6948, total: 
10000, acc = 0.6947999596595764 -epoch: 24, train loss: 0.8163802222329744 -epoch: 25, train loss: 0.8068491317787949 -epoch: 25, eval loss: 0.8353333532810211, correct: 7089, total: 10000, acc = 0.708899974822998 -epoch: 26, train loss: 0.7894753631280393 -epoch: 27, train loss: 0.7779296344640304 -epoch: 27, eval loss: 0.8161472469568253, correct: 7143, total: 10000, acc = 0.7142999768257141 -epoch: 28, train loss: 0.763744876092794 -epoch: 29, train loss: 0.7521962505214068 -epoch: 29, eval loss: 0.7903082758188248, correct: 7219, total: 10000, acc = 0.7218999862670898 -epoch: 30, train loss: 0.7443178624522929 -epoch: 31, train loss: 0.7280340212948468 -epoch: 31, eval loss: 0.7877005040645599, correct: 7233, total: 10000, acc = 0.7232999801635742 -epoch: 32, train loss: 0.7196985489251663 -epoch: 33, train loss: 0.7108793039711154 -epoch: 33, eval loss: 0.7838329076766968, correct: 7292, total: 10000, acc = 0.729200005531311 -epoch: 34, train loss: 0.6965019471791326 -epoch: 35, train loss: 0.6875918537986522 -epoch: 35, eval loss: 0.7513678789138794, correct: 7392, total: 10000, acc = 0.7391999959945679 -epoch: 36, train loss: 0.6793362346230721 -epoch: 37, train loss: 0.6741023343436572 -epoch: 37, eval loss: 0.7752945452928544, correct: 7316, total: 10000, acc = 0.7315999865531921 -epoch: 38, train loss: 0.6629589072295597 -epoch: 39, train loss: 0.6507086388918818 -epoch: 39, eval loss: 0.7758691757917404, correct: 7322, total: 10000, acc = 0.7321999669075012 -epoch: 40, train loss: 0.6381483582817778 -epoch: 41, train loss: 0.6374095179596726 -epoch: 41, eval loss: 0.7589699536561966, correct: 7386, total: 10000, acc = 0.738599956035614 -epoch: 42, train loss: 0.6251792050137812 -epoch: 43, train loss: 0.6148473596086308 -epoch: 43, eval loss: 0.7495014071464539, correct: 7478, total: 10000, acc = 0.7477999925613403 -epoch: 44, train loss: 0.6119371378908351 -epoch: 45, train loss: 0.6012086509441843 -epoch: 45, eval loss: 0.725347763299942, correct: 
7515, total: 10000, acc = 0.7515000104904175 -epoch: 46, train loss: 0.597867566103838 -epoch: 47, train loss: 0.5913592832429069 -epoch: 47, eval loss: 0.7254288077354432, correct: 7529, total: 10000, acc = 0.7529000043869019 -epoch: 48, train loss: 0.5801522807807339 -epoch: 49, train loss: 0.575563525666996 -epoch: 49, eval loss: 0.7291093468666077, correct: 7533, total: 10000, acc = 0.7532999515533447 -epoch: 50, train loss: 0.573031121674849 -epoch: 51, train loss: 0.5667383588698446 -epoch: 51, eval loss: 0.7240727603435516, correct: 7570, total: 10000, acc = 0.7569999694824219 -epoch: 52, train loss: 0.5578772419569443 -epoch: 53, train loss: 0.5526659309255834 -epoch: 53, eval loss: 0.7226850330829621, correct: 7576, total: 10000, acc = 0.7576000094413757 -epoch: 54, train loss: 0.5473246245968099 -epoch: 55, train loss: 0.5443006860358375 -epoch: 55, eval loss: 0.720612645149231, correct: 7596, total: 10000, acc = 0.7595999836921692 -epoch: 56, train loss: 0.5361242987671677 -epoch: 57, train loss: 0.5323515981435776 -epoch: 57, eval loss: 0.7203025311231613, correct: 7580, total: 10000, acc = 0.7579999566078186 -epoch: 58, train loss: 0.5297852404871766 -epoch: 59, train loss: 0.5288004583241989 -epoch: 59, eval loss: 0.7189624041318894, correct: 7605, total: 10000, acc = 0.7604999542236328 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-3.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-3.txt deleted file mode 100644 index cda0d59efd20..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-3.txt +++ /dev/null @@ -1,131 +0,0 @@ -TACC: Starting up job 3496458 -TACC: Starting parallel tasks... 
-warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -warning: variables which starts with __, is a module or class declaration are omitted -process rank 3 is bound to device 3 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 2 is bound to device 2 -Files already downloaded and verified -Files already downloaded and verified -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -warning: variables which starts with __, is a module or class declaration are omitted -process rank 7 is bound to device 3 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 6 is bound to device 2 -Files already downloaded and verified -Files already downloaded and verified -optimizer is created -start training -warning: variables which starts with __, is a module or class declaration are omitted -process rank 4 is bound to device 0 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 5 is bound to device 1 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 1 is bound to device 1 -Files already downloaded and verified -Files already downloaded and verified -epoch: 0, train loss: 1.936693473738067 -epoch: 1, train loss: 1.627108974116189 -epoch: 1, eval loss: 1.5279120564460755, correct: 4576, total: 10000, acc = 0.4575999975204468 -epoch: 2, train loss: 1.438910031805233 
-epoch: 3, train loss: 1.3184991053172521 -epoch: 3, eval loss: 1.3557079970836639, correct: 5129, total: 10000, acc = 0.5128999948501587 -epoch: 4, train loss: 1.271946340191121 -epoch: 5, train loss: 1.2340542175331894 -epoch: 5, eval loss: 1.207822185754776, correct: 5703, total: 10000, acc = 0.5702999830245972 -epoch: 6, train loss: 1.187913371592152 -epoch: 7, train loss: 1.154962458172623 -epoch: 7, eval loss: 1.0685692846775054, correct: 6100, total: 10000, acc = 0.6100000143051147 -epoch: 8, train loss: 1.1158924905621275 -epoch: 9, train loss: 1.0909727805731249 -epoch: 9, eval loss: 1.0345157146453858, correct: 6328, total: 10000, acc = 0.6327999830245972 -epoch: 10, train loss: 1.0725988399009316 -epoch: 11, train loss: 1.0453423085261364 -epoch: 11, eval loss: 0.9778846323490142, correct: 6543, total: 10000, acc = 0.6542999744415283 -epoch: 12, train loss: 1.0397504823548454 -epoch: 13, train loss: 1.011059400986652 -epoch: 13, eval loss: 0.9668682873249054, correct: 6446, total: 10000, acc = 0.644599974155426 -epoch: 14, train loss: 0.9938353963044225 -epoch: 15, train loss: 0.9691349967401854 -epoch: 15, eval loss: 0.9465512812137604, correct: 6657, total: 10000, acc = 0.6656999588012695 -epoch: 16, train loss: 0.9470896617490419 -epoch: 17, train loss: 0.927201622602891 -epoch: 17, eval loss: 0.8875106543302536, correct: 6837, total: 10000, acc = 0.6836999654769897 -epoch: 18, train loss: 0.8975223132542202 -epoch: 19, train loss: 0.8810242603019792 -epoch: 19, eval loss: 0.8688296616077423, correct: 6832, total: 10000, acc = 0.6832000017166138 -epoch: 20, train loss: 0.8482622784011218 -epoch: 21, train loss: 0.8266285700457436 -epoch: 21, eval loss: 0.7801274597644806, correct: 7205, total: 10000, acc = 0.7204999923706055 -epoch: 22, train loss: 0.8038581859092323 -epoch: 23, train loss: 0.7879118153027126 -epoch: 23, eval loss: 0.7779350578784943, correct: 7203, total: 10000, acc = 0.7202999591827393 -epoch: 24, train loss: 0.7542270896386127 
-epoch: 25, train loss: 0.7369782894241567 -epoch: 25, eval loss: 0.7534965008497239, correct: 7362, total: 10000, acc = 0.7361999750137329 -epoch: 26, train loss: 0.7095995545387268 -epoch: 27, train loss: 0.6873777825005201 -epoch: 27, eval loss: 0.7344318777322769, correct: 7381, total: 10000, acc = 0.738099992275238 -epoch: 28, train loss: 0.6713967414534822 -epoch: 29, train loss: 0.650338428969286 -epoch: 29, eval loss: 0.677948921918869, correct: 7653, total: 10000, acc = 0.7652999758720398 -epoch: 30, train loss: 0.6301205882004329 -epoch: 31, train loss: 0.5990057824825754 -epoch: 31, eval loss: 0.6719370454549789, correct: 7643, total: 10000, acc = 0.7642999887466431 -epoch: 32, train loss: 0.590088236696866 -epoch: 33, train loss: 0.5689327443132595 -epoch: 33, eval loss: 0.6191721886396409, correct: 7807, total: 10000, acc = 0.7806999683380127 -epoch: 34, train loss: 0.5426055670392756 -epoch: 35, train loss: 0.5270413601276825 -epoch: 35, eval loss: 0.6150132775306701, correct: 7879, total: 10000, acc = 0.7878999710083008 -epoch: 36, train loss: 0.5215025428606539 -epoch: 37, train loss: 0.4952395400222467 -epoch: 37, eval loss: 0.628344652056694, correct: 7868, total: 10000, acc = 0.786799967288971 -epoch: 38, train loss: 0.47989121687655545 -epoch: 39, train loss: 0.46510300618045186 -epoch: 39, eval loss: 0.5977057978510857, correct: 7944, total: 10000, acc = 0.7943999767303467 -epoch: 40, train loss: 0.4441945254802704 -epoch: 41, train loss: 0.4285763985648447 -epoch: 41, eval loss: 0.5695438250899315, correct: 8023, total: 10000, acc = 0.802299976348877 -epoch: 42, train loss: 0.41337763776584546 -epoch: 43, train loss: 0.3940146170100387 -epoch: 43, eval loss: 0.5688270673155784, correct: 8091, total: 10000, acc = 0.8090999722480774 -epoch: 44, train loss: 0.37741332303504554 -epoch: 45, train loss: 0.36565779605690313 -epoch: 45, eval loss: 0.5831407308578491, correct: 8104, total: 10000, acc = 0.8104000091552734 -epoch: 46, train loss: 
0.3468657017362361 -epoch: 47, train loss: 0.32949359198005834 -epoch: 47, eval loss: 0.5751512110233307, correct: 8097, total: 10000, acc = 0.8096999526023865 -epoch: 48, train loss: 0.3140165246262842 -epoch: 49, train loss: 0.29480520498995877 -epoch: 49, eval loss: 0.5712087765336037, correct: 8184, total: 10000, acc = 0.818399965763092 -epoch: 50, train loss: 0.2766021394303867 -epoch: 51, train loss: 0.26527753776433516 -epoch: 51, eval loss: 0.5643855139613152, correct: 8218, total: 10000, acc = 0.8217999935150146 -epoch: 52, train loss: 0.2525861115784061 -epoch: 53, train loss: 0.23714738658496312 -epoch: 53, eval loss: 0.5732526823878288, correct: 8249, total: 10000, acc = 0.8248999714851379 -epoch: 54, train loss: 0.2238179413335664 -epoch: 55, train loss: 0.2119908875652722 -epoch: 55, eval loss: 0.5957901775836945, correct: 8261, total: 10000, acc = 0.8260999917984009 -epoch: 56, train loss: 0.19989302222217833 -epoch: 57, train loss: 0.1875186789096618 -epoch: 57, eval loss: 0.5905491337180138, correct: 8290, total: 10000, acc = 0.8289999961853027 -epoch: 58, train loss: 0.18436841180129926 -epoch: 59, train loss: 0.17459663231762088 -epoch: 59, eval loss: 0.589044263958931, correct: 8313, total: 10000, acc = 0.8312999606132507 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-4.txt b/tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-4.txt deleted file mode 100644 index 6f69c17cd6a7..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/log/822log1e-4.txt +++ /dev/null @@ -1,131 +0,0 @@ -TACC: Starting up job 3498327 -TACC: Starting parallel tasks... 
-warning: variables which starts with __, is a module or class declaration are omitted -process rank 0 is bound to device 0 -distributed environment is initialzied -model is created -Files already downloaded and verified -Files already downloaded and verified -training and testing dataloaders are created -loss is created -optimizer is created -start training -warning: variables which starts with __, is a module or class declaration are omitted -process rank 2 is bound to device 2 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 3 is bound to device 3 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 4 is bound to device 0 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 5 is bound to device 1 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 7 is bound to device 3 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 6 is bound to device 2 -Files already downloaded and verified -Files already downloaded and verified -warning: variables which starts with __, is a module or class declaration are omitted -process rank 1 is bound to device 1 -Files already downloaded and verified -Files already downloaded and verified -epoch: 0, train loss: 2.1005014667705613 -epoch: 1, train loss: 1.8539113086097094 -epoch: 1, eval loss: 1.7973519027233125, correct: 3362, total: 10000, acc = 0.3361999988555908 -epoch: 2, train loss: 1.7149482040989155 
-epoch: 3, train loss: 1.5927067617980801 -epoch: 3, eval loss: 1.5848429083824158, correct: 4344, total: 10000, acc = 0.4343999922275543 -epoch: 4, train loss: 1.4912729798531046 -epoch: 5, train loss: 1.3957378158763962 -epoch: 5, eval loss: 1.4951884388923644, correct: 4841, total: 10000, acc = 0.48409998416900635 -epoch: 6, train loss: 1.3090402642074896 -epoch: 7, train loss: 1.2566283296565621 -epoch: 7, eval loss: 1.2464738070964814, correct: 5562, total: 10000, acc = 0.5561999678611755 -epoch: 8, train loss: 1.2084139476017075 -epoch: 9, train loss: 1.1706127719003327 -epoch: 9, eval loss: 1.162048089504242, correct: 5876, total: 10000, acc = 0.5875999927520752 -epoch: 10, train loss: 1.120817175933293 -epoch: 11, train loss: 1.084984731309268 -epoch: 11, eval loss: 1.0764922022819519, correct: 6155, total: 10000, acc = 0.6154999732971191 -epoch: 12, train loss: 1.0559214432628787 -epoch: 13, train loss: 1.0261321286765896 -epoch: 13, eval loss: 1.0338306188583375, correct: 6334, total: 10000, acc = 0.6333999633789062 -epoch: 14, train loss: 0.992842432187528 -epoch: 15, train loss: 0.9660871296512837 -epoch: 15, eval loss: 1.0059030145406722, correct: 6458, total: 10000, acc = 0.645799994468689 -epoch: 16, train loss: 0.9467733100968965 -epoch: 17, train loss: 0.9243187673237859 -epoch: 17, eval loss: 0.9469569176435471, correct: 6610, total: 10000, acc = 0.6609999537467957 -epoch: 18, train loss: 0.9059403721167116 -epoch: 19, train loss: 0.8819177935318071 -epoch: 19, eval loss: 0.9196836709976196, correct: 6727, total: 10000, acc = 0.6726999878883362 -epoch: 20, train loss: 0.8721987532109631 -epoch: 21, train loss: 0.8469706013494608 -epoch: 21, eval loss: 0.8634845405817032, correct: 6976, total: 10000, acc = 0.6976000070571899 -epoch: 22, train loss: 0.8352831839298716 -epoch: 23, train loss: 0.8124590455269327 -epoch: 23, eval loss: 0.8418784946203232, correct: 7034, total: 10000, acc = 0.7033999562263489 -epoch: 24, train loss: 0.7961219853284408 
-epoch: 25, train loss: 0.7883704268202489 -epoch: 25, eval loss: 0.8191130340099335, correct: 7116, total: 10000, acc = 0.7116000056266785 -epoch: 26, train loss: 0.7733409623710477 -epoch: 27, train loss: 0.7561956893424598 -epoch: 27, eval loss: 0.8028618812561035, correct: 7200, total: 10000, acc = 0.7199999690055847 -epoch: 28, train loss: 0.7479740460308231 -epoch: 29, train loss: 0.7343520899208225 -epoch: 29, eval loss: 0.7829996794462204, correct: 7256, total: 10000, acc = 0.725600004196167 -epoch: 30, train loss: 0.7244430549290716 -epoch: 31, train loss: 0.7121965617549663 -epoch: 31, eval loss: 0.765428164601326, correct: 7299, total: 10000, acc = 0.7299000024795532 -epoch: 32, train loss: 0.6988190838268825 -epoch: 33, train loss: 0.6908610359746583 -epoch: 33, eval loss: 0.7602580636739731, correct: 7395, total: 10000, acc = 0.7394999861717224 -epoch: 34, train loss: 0.6785666395206841 -epoch: 35, train loss: 0.6664504153387887 -epoch: 35, eval loss: 0.7671193510293961, correct: 7345, total: 10000, acc = 0.734499990940094 -epoch: 36, train loss: 0.6639333245705585 -epoch: 37, train loss: 0.6509425913800999 -epoch: 37, eval loss: 0.7612941324710846, correct: 7382, total: 10000, acc = 0.7382000088691711 -epoch: 38, train loss: 0.6416311720196082 -epoch: 39, train loss: 0.6312643265237614 -epoch: 39, eval loss: 0.7380059510469437, correct: 7496, total: 10000, acc = 0.7495999932289124 -epoch: 40, train loss: 0.620578939209179 -epoch: 41, train loss: 0.6195461816933691 -epoch: 41, eval loss: 0.7172901630401611, correct: 7550, total: 10000, acc = 0.7549999952316284 -epoch: 42, train loss: 0.6013389248020795 -epoch: 43, train loss: 0.6049416010477104 -epoch: 43, eval loss: 0.7145429253578186, correct: 7569, total: 10000, acc = 0.7568999528884888 -epoch: 44, train loss: 0.5950779300563189 -epoch: 45, train loss: 0.5786038743598121 -epoch: 45, eval loss: 0.7171747118234635, correct: 7569, total: 10000, acc = 0.7568999528884888 -epoch: 46, train loss: 
0.5752052083915594 -epoch: 47, train loss: 0.5669339743195748 -epoch: 47, eval loss: 0.7040806382894516, correct: 7601, total: 10000, acc = 0.7601000070571899 -epoch: 48, train loss: 0.5596802952338238 -epoch: 49, train loss: 0.5521421706189915 -epoch: 49, eval loss: 0.7221358746290207, correct: 7592, total: 10000, acc = 0.7591999769210815 -epoch: 50, train loss: 0.5504364164508119 -epoch: 51, train loss: 0.5363630725412952 -epoch: 51, eval loss: 0.710089972615242, correct: 7650, total: 10000, acc = 0.7649999856948853 -epoch: 52, train loss: 0.5382009008709265 -epoch: 53, train loss: 0.5292040118757559 -epoch: 53, eval loss: 0.7044323921203614, correct: 7672, total: 10000, acc = 0.7671999931335449 -epoch: 54, train loss: 0.5289747638970005 -epoch: 55, train loss: 0.5239191630056926 -epoch: 55, eval loss: 0.6983724802732467, correct: 7694, total: 10000, acc = 0.7694000005722046 -epoch: 56, train loss: 0.5177402243930467 -epoch: 57, train loss: 0.5132759012738053 -epoch: 57, eval loss: 0.7066506981849671, correct: 7671, total: 10000, acc = 0.7670999765396118 -epoch: 58, train loss: 0.5119742675095188 -epoch: 59, train loss: 0.5074386891661858 -epoch: 59, eval loss: 0.7012903690338135, correct: 7693, total: 10000, acc = 0.7692999839782715 -finish training diff --git a/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py b/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py deleted file mode 100644 index a8361d2e6ec8..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_2p5d/test_vit_2p5d.py +++ /dev/null @@ -1,86 +0,0 @@ -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.builder import build_lr_scheduler -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather - -CONFIG_PATH = 
Path(__file__).parent.parent.joinpath('configs/vit_2p5d.py') - - -def eval(engine, test_dataloader): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - num_steps = len(test_dataloader) - data_iter = iter(test_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - - output = _gather( - output[0], - ParallelMode.PARALLEL_2P5D_ROW, - 1 - ) - output = _gather( - output, - ParallelMode.PARALLEL_2P5D_COL, - 0, - ) - output = _gather( - output, - ParallelMode.PARALLEL_2P5D_DEP, - 0, - ) - output = torch.argmax(output, dim=-1) - correct = torch.sum(label[0] == output) - correct_sum += correct - total_sum += label[0].size(0) - avg_loss = accumulated_loss / num_steps - return correct_sum, total_sum, avg_loss - - -def train(engine, train_dataloader, lr_scheduler): - engine.train() - accumulated_loss = 0 - num_steps = len(train_dataloader) - data_iter = iter(train_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / num_steps - lr_scheduler.step() - return avg_loss - - -@pytest.mark.dist -@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_2p5d_parallel_vision_transformer(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) - lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, engine.optimizer) - logger = get_global_dist_logger() - - logger.info('start training') - for epoch in range(gpc.config.num_epochs): - train_loss = train(engine, train_dataloader, lr_scheduler) - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval(engine, test_dataloader) - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: 
{correct_sum / total_sum}') - - -if __name__ == '__main__': - test_2p5d_parallel_vision_transformer() diff --git a/tests/test_models/test_vision_transformer/test_vit_3d/profiling_3d.py b/tests/test_models/test_vision_transformer/test_vit_3d/profiling_3d.py deleted file mode 100644 index 1044710986a3..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_3d/profiling_3d.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import time -import colossalai - -import torch -from tqdm import tqdm - -from colossalai import initialize -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.utils import print_rank_0, report_memory_usage -from colossalai.utils import empty_cache - -WAIT_STEPS = 3 -WARMUP_STEPS = 50 -ACTIVE_STEPS = 100 -PROFILE_CYCLE = WAIT_STEPS + WARMUP_STEPS + ACTIVE_STEPS - - -def _train_epoch(epoch, engine, dataloader, profiler=None): - logger = get_global_dist_logger() - print_rank_0('[Epoch %d] training start' % (epoch), logger) - engine.train() - data_iter = iter(dataloader) - - train_loss = 0 - batch_cnt = 0 - num_samples = 0 - now = time.time() - epoch_start = now - progress = range(PROFILE_CYCLE) - if gpc.get_global_rank() == 0: - progress = tqdm(progress, desc='[Epoch %d]' % epoch, miniters=1) - for step in progress: - cur_lr = engine.optimizer.param_groups[0]['lr'] - - _, targets, loss = engine.step(data_iter) - if profiler is not None: - profiler.step() - - batch_size = targets[0].size( - 0) * engine._grad_accum_size * gpc.data_parallel_size - train_loss += loss.item() - num_samples += batch_size - batch_cnt += 1 - - batch_time = time.time() - now - now = time.time() - if gpc.get_global_rank() == 0: - print_features = dict(lr='%g' % cur_lr, - loss='%.3f' % (train_loss / (step + 1)), - throughput='%.3f (images/sec)' % - (batch_size / (batch_time + 1e-12))) - 
progress.set_postfix(**print_features) - - epoch_end = time.time() - epoch_loss = train_loss / batch_cnt - epoch_throughput = num_samples / (epoch_end - epoch_start + 1e-12) - print_rank_0( - '[Epoch %d] Loss: %.3f | Throughput: %.3f (samples/sec)' % - (epoch, epoch_loss, epoch_throughput), logger) - if gpc.get_global_rank() == 0: - report_memory_usage('Memory usage') - - -def test_cifar(): - engine, train_dataloader, test_dataloader = initialize() - - logger = get_global_dist_logger() - logger.info("Train start", ranks=[0]) - data_iter = iter(train_dataloader) - output, targets, loss = engine.step(data_iter) - if gpc.get_global_rank() == 0: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule(wait=WAIT_STEPS, - warmup=WARMUP_STEPS, - active=ACTIVE_STEPS), - on_trace_ready=torch.profiler.tensorboard_trace_handler( - f'./log_cifar_{gpc.config.parallel.tensor.mode}_{gpc.get_world_size(ParallelMode.GLOBAL)}' - ), - record_shapes=True, - # profile_memory=True, - with_flops=True, - with_modules=True, - ) as prof: - _train_epoch(0, engine, train_dataloader, prof) - - torch.cuda.synchronize() - - print('Test complete. 
Generating profiling report ...') - print( - prof.key_averages(group_by_input_shape=True).table( - sort_by="cuda_time_total")) - - torch.distributed.barrier() - else: - _train_epoch(0, engine, train_dataloader) - torch.cuda.synchronize() - torch.distributed.barrier() - - -def test_imagenet(): - from test_vit_3d import build_dali_train, build_dali_test - engine, train_dataloader, test_dataloader = initialize( - train_dataloader=build_dali_train, test_dataloader=build_dali_test) - - logger = get_global_dist_logger() - logger.info("Train start", ranks=[0]) - if gpc.get_global_rank() == 0: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule(wait=WAIT_STEPS, - warmup=WARMUP_STEPS, - active=ACTIVE_STEPS), - on_trace_ready=torch.profiler.tensorboard_trace_handler( - f'./log_imagenet_{gpc.config.parallel.tensor.mode}_{gpc.get_world_size(ParallelMode.GLOBAL)}' - ), - record_shapes=True, - # profile_memory=True, - with_flops=True, - with_modules=True, - ) as prof: - _train_epoch(0, engine, train_dataloader, prof) - - torch.cuda.synchronize() - - print('Test complete. 
Generating profiling report ...') - print( - prof.key_averages(group_by_input_shape=True).table( - sort_by="cuda_time_total")) - - torch.distributed.barrier() - else: - _train_epoch(0, engine, train_dataloader) - torch.cuda.synchronize() - torch.distributed.barrier() - - -def test_allgather_n_broadcast(): - from colossalai.communication import all_gather - from colossalai.initialize import init_dist - from colossalai.utils import get_current_device - from tqdm import trange - - init_dist() - - logger = get_global_dist_logger() - - BATCH_SIZE = 4024 - HIDDEN_SIZE = 512 - DEPTH = torch.distributed.get_world_size() - SEQ_LENGTH = 128 - - logger.info("Test start", ranks=[0]) - if gpc.get_global_rank() == 0: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule(wait=1, - warmup=5, - active=10, - repeat=2), - on_trace_ready=torch.profiler.tensorboard_trace_handler( - f'./log_allgather_n_broadcast_{gpc.get_world_size(ParallelMode.GLOBAL)}' - ), - record_shapes=True, - # profile_memory=True, - with_flops=True, - with_modules=True, - ) as prof: - tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE // DEPTH) - for _ in trange(16): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = all_gather(x, -1, ParallelMode.GLOBAL) - prof.step() - - torch.cuda.synchronize() - torch.cuda.empty_cache() - - tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE) - for _ in trange(16): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = x.clone() - torch.distributed.broadcast(x, src=0) - prof.step() - - torch.cuda.synchronize() - torch.cuda.empty_cache() - - print('Test complete. 
Generating profiling report ...') - print( - prof.key_averages(group_by_input_shape=True).table( - sort_by="cuda_time_total")) - torch.distributed.barrier() - else: - tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE // DEPTH) - for _ in range(16): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = all_gather(x, -1, ParallelMode.GLOBAL) - - torch.cuda.synchronize() - torch.cuda.empty_cache() - - tensor_shape = (BATCH_SIZE, SEQ_LENGTH, HIDDEN_SIZE) - for _ in range(16): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = x.clone() - torch.distributed.broadcast(x, src=0) - - torch.cuda.synchronize() - torch.cuda.empty_cache() - torch.distributed.barrier() - - -def test_layer(): - from colossalai.initialize import init_dist - from colossalai.utils import get_current_device - from tqdm import trange - from colossalai.nn.layer.parallel_3d import Linear3D, LayerNorm3D - - CONFIG = dict(parallel=dict(pipeline=1, tensor=dict(mode='3d', size=8)), - seed=0) - - init_dist(config=CONFIG) - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - gpc.set_seed() - - logger = get_global_dist_logger() - - BATCH_SIZE = 512 - HIDDEN_SIZE = 4096 - DEPTH = colossalai.nn.layer.parallel_3d._utils.get_depth_from_env() - SEQ_LENGTH = 128 - linear1 = Linear3D(HIDDEN_SIZE, HIDDEN_SIZE * 4) - linear2 = Linear3D(HIDDEN_SIZE * 4, HIDDEN_SIZE) - dropout = torch.nn.Dropout(0.0) - norm = LayerNorm3D(HIDDEN_SIZE, eps=1e-5) - layer = torch.nn.Sequential(linear1, linear2, dropout, norm) - - logger.info("Test start", ranks=[0]) - tensor_shape = (BATCH_SIZE // DEPTH ** 2, SEQ_LENGTH, HIDDEN_SIZE // DEPTH) - - if gpc.get_global_rank() == 0: - for _ in trange(WARMUP_STEPS): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = layer(x) - grad = torch.randn(x.shape, - dtype=torch.float, - device=get_current_device()) - x.backward(grad) - empty_cache() - start = 
time.time() - for _ in trange(ACTIVE_STEPS): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = layer(x) - grad = torch.randn(x.shape, - dtype=torch.float, - device=get_current_device()) - x.backward(grad) - empty_cache() - torch.cuda.synchronize() - end = time.time() - avg_step_time = (end - start) / ACTIVE_STEPS - throughput = ACTIVE_STEPS * BATCH_SIZE / (end - start) - logger.info('Avg step time = {:.3f} s | Throughput = {:.3f} /s'.format(avg_step_time, throughput)) - else: - for _ in range(WARMUP_STEPS + ACTIVE_STEPS): - x = torch.randn(tensor_shape, - dtype=torch.float, - device=get_current_device()) - x = layer(x) - grad = torch.randn(x.shape, - dtype=torch.float, - device=get_current_device()) - x.backward(grad) - empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() - - # if gpc.get_global_rank() == 0: - # with torch.profiler.profile( - # activities=[ - # torch.profiler.ProfilerActivity.CPU, - # torch.profiler.ProfilerActivity.CUDA, - # ], - # schedule=torch.profiler.schedule(wait=WAIT_STEPS, - # warmup=WARMUP_STEPS, - # active=ACTIVE_STEPS), - # on_trace_ready=torch.profiler.tensorboard_trace_handler( - # f'./log_layer_3d_{gpc.get_world_size(ParallelMode.GLOBAL)}' - # ), - # record_shapes=True, - # # profile_memory=True, - # with_flops=True, - # with_modules=True, - # ) as prof: - # for _ in trange(PROFILE_CYCLE): - # x = torch.randn(tensor_shape, - # dtype=torch.float, - # device=get_current_device()) - # x = layer(x) - # grad = torch.randn(x.shape, - # dtype=torch.float, - # device=get_current_device()) - # x.backward(grad) - # prof.step() - - # torch.cuda.synchronize() - - # report_memory_usage('Memory usage') - # print('Test complete. 
Generating profiling report ...') - # print( - # prof.key_averages(group_by_input_shape=True).table( - # sort_by="cuda_time_total")) - # torch.distributed.barrier() - # else: - # for _ in range(PROFILE_CYCLE): - # x = torch.randn(tensor_shape, - # dtype=torch.float, - # device=get_current_device()) - # x = layer(x) - # grad = torch.randn(x.shape, - # dtype=torch.float, - # device=get_current_device()) - # x.backward(grad) - - # torch.cuda.synchronize() - # torch.distributed.barrier() - - -if __name__ == '__main__': - # test_cifar() - # test_imagenet() - # test_allgather_n_broadcast() - test_layer() diff --git a/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py b/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py deleted file mode 100644 index 8a450581ef33..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_3d/test_vit_3d.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import glob -import os - -import colossalai -import nvidia.dali.fn as fn -import nvidia.dali.tfrecord as tfrec -import torch -from colossalai.context import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.trainer import Trainer -from colossalai.utils import (get_global_multitimer, - set_global_multitimer_status) -from nvidia.dali import types -from nvidia.dali.pipeline import Pipeline -from nvidia.dali.plugin.pytorch import DALIClassificationIterator - -DATASET_PATH = str(os.environ['DATA']) -# imagenet 100 -# TRAIN_RECS = '/project/scratch/p200012/imagenet-100/train/*' -# VAL_RECS = '/project/scratch/p200012/imagenet-100/validation/*' -# TRAIN_IDX = '/project/scratch/p200012/imagenet-100/idx_files/train/*' -# VAL_IDX = '/project/scratch/p200012/imagenet-100/idx_files/validation/*' - -# imagenet 1000 -TRAIN_RECS = DATASET_PATH + '/train/*' -VAL_RECS = DATASET_PATH + '/validation/*' -TRAIN_IDX = DATASET_PATH + 
'/idx_files/train/*' -VAL_IDX = DATASET_PATH + '/idx_files/validation/*' - - -class DaliDataloader(DALIClassificationIterator): - def __init__(self, - tfrec_filenames, - tfrec_idx_filenames, - shard_id=0, - num_shards=1, - batch_size=128, - num_threads=4, - resize=256, - crop=224, - prefetch=2, - training=True, - gpu_aug=False, - cuda=True): - pipe = Pipeline( - batch_size=batch_size, - num_threads=num_threads, - device_id=torch.cuda.current_device() if cuda else None, - seed=1024) - with pipe: - inputs = fn.readers.tfrecord(path=tfrec_filenames, - index_path=tfrec_idx_filenames, - random_shuffle=training, - shard_id=shard_id, - num_shards=num_shards, - initial_fill=10000, - read_ahead=True, - prefetch_queue_depth=prefetch, - name='Reader', - features={ - 'image/encoded': - tfrec.FixedLenFeature( - (), tfrec.string, ""), - 'image/class/label': - tfrec.FixedLenFeature([1], - tfrec.int64, - -1), - }) - images = inputs["image/encoded"] - - if training: - images = fn.decoders.image( - images, - device='mixed' if gpu_aug else 'cpu', - output_type=types.RGB) - images = fn.random_resized_crop( - images, size=crop, device='gpu' if gpu_aug else 'cpu') - flip_lr = fn.random.coin_flip(probability=0.5) - else: - # decode jpeg and resize - images = fn.decoders.image( - images, - device='mixed' if gpu_aug else 'cpu', - output_type=types.RGB) - images = fn.resize(images, - device='gpu' if gpu_aug else 'cpu', - resize_x=resize, - resize_y=resize, - dtype=types.FLOAT, - interp_type=types.INTERP_TRIANGULAR) - flip_lr = False - - # center crop and normalise - images = fn.crop_mirror_normalize(images, - dtype=types.FLOAT, - crop=(crop, crop), - mean=[127.5], - std=[127.5], - mirror=flip_lr) - label = inputs["image/class/label"] - 1 # 0-999 - # LSG: element_extract will raise exception, let's flatten outside - # label = fn.element_extract(label, element_map=0) # Flatten - if cuda: # transfer data to gpu - pipe.set_outputs(images.gpu(), label.gpu()) - else: - pipe.set_outputs(images, 
label) - - pipe.build() - last_batch_policy = 'DROP' if training else 'PARTIAL' - super().__init__(pipe, - reader_name="Reader", - auto_reset=True, - last_batch_policy=last_batch_policy) - - def __iter__(self): - # if not reset (after an epoch), reset; if just initialize, ignore - if self._counter >= self._size or self._size < 0: - self.reset() - return self - - def __next__(self): - data = super().__next__() - img, label = data[0]['data'], data[0]['label'] - label = label.squeeze() - return (img, ), (label, ) - - -def build_dali_train(): - return DaliDataloader(sorted(glob.glob(TRAIN_RECS)), - sorted(glob.glob(TRAIN_IDX)), - batch_size=gpc.config.BATCH_SIZE // - gpc.data_parallel_size, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=True, - gpu_aug=True, - cuda=True) - - -def build_dali_test(): - return DaliDataloader(sorted(glob.glob(VAL_RECS)), - sorted(glob.glob(VAL_IDX)), - batch_size=gpc.config.BATCH_SIZE // - gpc.data_parallel_size, - shard_id=gpc.get_local_rank(ParallelMode.DATA), - num_shards=gpc.get_world_size(ParallelMode.DATA), - training=False, - gpu_aug=True, - cuda=True) - - -def train_cifar(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize() - logger = get_global_dist_logger() - set_global_multitimer_status(True) - - logger.info("Engine is built", ranks=[0]) - - trainer = Trainer(engine=engine, - timer=get_global_multitimer(), - verbose=True) - logger.info("Trainer is built", ranks=[0]) - - logger.info("Train start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - # epochs=gpc.config.num_epochs, - epochs=5, - hooks_cfg=gpc.config.hooks, - display_progress=True, - test_interval=1) - - -def train_imagenet(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize( - train_dataloader=build_dali_train, test_dataloader=build_dali_test) - logger = get_global_dist_logger() - 
set_global_multitimer_status(True) - - logger.info("Engine is built", ranks=[0]) - - trainer = Trainer(engine=engine, - timer=get_global_multitimer(), - verbose=True) - logger.info("Trainer is built", ranks=[0]) - - logger.info("Train start", ranks=[0]) - trainer.fit(train_dataloader=train_dataloader, - # test_dataloader=test_dataloader, - epochs=gpc.config.num_epochs, - max_steps=100, - hooks_cfg=gpc.config.hooks, - display_progress=True, - test_interval=1) - - -if __name__ == '__main__': - # train_cifar() - train_imagenet() diff --git a/tests/test_models/test_vision_transformer/test_vit_vanilla.py b/tests/test_models/test_vision_transformer/test_vit_vanilla.py deleted file mode 100644 index f5216174847b..000000000000 --- a/tests/test_models/test_vision_transformer/test_vit_vanilla.py +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -from pathlib import Path - -import pytest -import torch - -from colossalai.builder import build_model -from colossalai.context import Config - -CONFIG_PATH = Path(__file__).parent.joinpath('configs/vanilla_vit.py') - - -@pytest.mark.cpu -def test_with_vanilla_vit_config(): - config = Config.from_file(CONFIG_PATH) - model = build_model(config.model) - model.build_from_cfg() - - img = torch.randn(1, 3, config.IMG_SIZE, config.IMG_SIZE) - out = model(img) - loss = out.mean() - loss.backward() - - -if __name__ == '__main__': - test_with_vanilla_vit_config() diff --git a/tests/test_trainer/configs/test_trainer_resnet.py b/tests/test_trainer/configs/test_trainer_resnet.py index ff48d4e6c2cc..bd69dc475a43 100644 --- a/tests/test_trainer/configs/test_trainer_resnet.py +++ b/tests/test_trainer/configs/test_trainer_resnet.py @@ -1,77 +1,6 @@ import os from pathlib import Path -BATCH_SIZE = 128 -IMG_SIZE = 32 -num_epochs = 200 - -# resnet 50 -model = dict( - type='VanillaResNet', - block_type='ResNetBottleneck', - layers=[3, 4, 6, 3], - num_cls=10 -) - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', 
- root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -optimizer = dict( - type='SGD', - lr=0.2, - momentum=0.9, - weight_decay=5e-4 -) - -loss = dict( - type='CrossEntropyLoss', -) - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=1, mode=None), -) hooks = [ dict(type='LogMetricByEpochHook'), @@ -88,4 +17,3 @@ ), dict(type='SaveCheckpointHook', interval=5, checkpoint_dir='./ckpt'), ] - diff --git a/tests/test_trainer/test.sh b/tests/test_trainer/test.sh index 65c4fc4bd8cd..fa0ae78d5e3b 100644 --- a/tests/test_trainer/test.sh +++ b/tests/test_trainer/test.sh @@ -1,5 +1,4 @@ #!/usr/bin/env sh test_file=$1 -config_file=$2 -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 --config $config_file +python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 diff --git a/tests/test_engine/test_pipeline/debug_schedule.py b/tests/test_trainer/test_pipeline/debug_schedule.py similarity index 100% rename from tests/test_engine/test_pipeline/debug_schedule.py rename to tests/test_trainer/test_pipeline/debug_schedule.py diff --git a/tests/test_engine/test_pipeline/test_p2p.py b/tests/test_trainer/test_pipeline/test_p2p.py similarity index 
97% rename from tests/test_engine/test_pipeline/test_p2p.py rename to tests/test_trainer/test_pipeline/test_p2p.py index aa1a0f5e1135..39cfa100322e 100644 --- a/tests/test_engine/test_pipeline/test_p2p.py +++ b/tests/test_trainer/test_pipeline/test_p2p.py @@ -13,7 +13,7 @@ from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import init_dist, parse_args -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device BATCH_SIZE = 32 @@ -65,7 +65,7 @@ def check_forward_backward(output_tensor, output_grad, rank, logger): tensor = send_backward_recv_forward(output_grad, output_tensor.shape) logger.info( 'Rank {} sent backward received forward. Correct tensor: {}'. - format(rank, check_equal(tensor, output_tensor))) + format(rank, check_equal(tensor, output_tensor))) if not gpc.is_last_rank(ParallelMode.PIPELINE): grad = send_forward_recv_backward(output_tensor, output_grad.shape) logger.info( @@ -128,7 +128,7 @@ def test_main(): world_size = args.world_size init_dist(CONFIG) - logger = get_global_dist_logger() + logger = get_dist_logger() rank = gpc.get_global_rank() prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE) up_ranks = gpc.get_ranks_in_group(ParallelMode.PIPELINE_PREV) diff --git a/tests/test_engine/test_pipeline/test_partition.py b/tests/test_trainer/test_pipeline/test_partition.py similarity index 91% rename from tests/test_engine/test_pipeline/test_partition.py rename to tests/test_trainer/test_pipeline/test_partition.py index 65c1081622e5..d3c811657c0b 100644 --- a/tests/test_engine/test_pipeline/test_partition.py +++ b/tests/test_trainer/test_pipeline/test_partition.py @@ -7,7 +7,7 @@ from colossalai.builder import build_dataset, ModelInitializer from colossalai.core import global_context from colossalai.initialize import init_dist -from colossalai.logging import 
get_global_dist_logger +from colossalai.logging import get_dist_logger DIR_PATH = osp.dirname(osp.realpath(__file__)) CONFIG_PATH = osp.join(DIR_PATH, '../configs/pipeline_vanilla_resnet.py') @@ -17,7 +17,7 @@ @pytest.mark.dist def test_partition(): init_dist(CONFIG_PATH) - logger = get_global_dist_logger() + logger = get_dist_logger() logger.info('finished initialization') # build model diff --git a/tests/test_engine/test_pipeline/test_schedule.py b/tests/test_trainer/test_pipeline/test_schedule.py similarity index 92% rename from tests/test_engine/test_pipeline/test_schedule.py rename to tests/test_trainer/test_pipeline/test_schedule.py index 9125fb3eed84..7e2f320171ff 100644 --- a/tests/test_engine/test_pipeline/test_schedule.py +++ b/tests/test_trainer/test_pipeline/test_schedule.py @@ -8,7 +8,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import initialize -from colossalai.logging import get_global_dist_logger +from colossalai.logging import get_dist_logger NUM_BATCH = 128 @@ -24,7 +24,7 @@ @pytest.mark.dist def test_schedule(): engine, train_dataloader, test_dataloader = initialize(CONFIG_PATH) - logger = get_global_dist_logger() + logger = get_dist_logger() model = engine.model optimizer = engine.optimizer diff --git a/tests/test_trainer/test_trainer.py b/tests/test_trainer/test_trainer.py deleted file mode 100644 index 6a7681d00adb..000000000000 --- a/tests/test_trainer/test_trainer.py +++ /dev/null @@ -1,29 +0,0 @@ -import colossalai -from colossalai.core import global_context as gpc -from colossalai.logging import get_global_dist_logger -from colossalai.trainer import Trainer - - -def test_trainer(): - engine, train_dataloader, test_dataloader = colossalai.initialize() - logger = get_global_dist_logger() - - logger.info("engine is built", ranks=[0]) - - trainer = Trainer(engine=engine, - verbose=True) - logger.info("trainer is built", ranks=[0]) - - logger.info("start 
training", ranks=[0]) - trainer.fit( - train_dataloader=train_dataloader, - test_dataloader=test_dataloader, - hooks_cfg=gpc.config.hooks, - epochs=gpc.config.num_epochs, - display_progress=False, - test_interval=5 - ) - - -if __name__ == '__main__': - test_trainer() diff --git a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py b/tests/test_trainer/test_trainer_with_non_pipe_schedule.py new file mode 100644 index 000000000000..170f380878a6 --- /dev/null +++ b/tests/test_trainer/test_trainer_with_non_pipe_schedule.py @@ -0,0 +1,113 @@ +import colossalai +import os +from colossalai.amp.amp_type import AMP_TYPE +import torch.nn as nn + +from pathlib import Path +from torchvision import transforms +from torch.optim import Adam +from colossalai.initialize import get_default_parser +from colossalai.core import global_context as gpc +from colossalai.logging import get_dist_logger +from colossalai.trainer import Trainer +from colossalai.utils import get_dataloader +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + +BATCH_SIZE = 128 +IMG_SIZE = 32 +NUM_EPOCHS = 200 + +CONFIG = dict( + # Config + fp16=dict( + mode=AMP_TYPE.TORCH + ) +) + + +def test_trainer(): + parser = get_default_parser() + args = parser.parse_args() + colossalai.launch( + config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + + test_dataset = CIFAR10( + root=Path(os.environ['DATA']), + train=False, + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 
0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + test_dataloader = get_dataloader(dataset=test_dataset, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + + logger = get_dist_logger() + logger.info("engine is built", ranks=[0]) + + trainer = Trainer(engine=engine, + logger=logger) + logger.info("trainer is built", ranks=[0]) + + logger.info("start training", ranks=[0]) + trainer.fit( + train_dataloader=train_dataloader, + test_dataloader=test_dataloader, + epochs=NUM_EPOCHS, + max_steps=100, + display_progress=True, + test_interval=5 + ) + + +if __name__ == '__main__': + test_trainer() diff --git a/tests/test_trainer/test_trainer_with_pipe_schedule.py b/tests/test_trainer/test_trainer_with_pipe_schedule.py new file mode 100644 index 000000000000..63a22f6ec0cf --- /dev/null +++ b/tests/test_trainer/test_trainer_with_pipe_schedule.py @@ -0,0 +1,146 @@ +import colossalai +import os +import torch +from colossalai.amp.amp_type import AMP_TYPE +from colossalai.context.parallel_mode import ParallelMode +import torch.nn as nn + +from pathlib import Path +from torchvision import transforms +from torch.optim import Adam +from colossalai.initialize import get_default_parser +from colossalai.core import global_context as gpc +from colossalai.logging import get_dist_logger +from colossalai.trainer import Trainer +from colossalai.utils import get_dataloader +from colossalai.engine.schedule import PipelineSchedule +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + +BATCH_SIZE = 32 +IMG_SIZE = 32 
+NUM_EPOCHS = 200 + +CONFIG = dict( + parallel=dict( + pipeline=2, + ), + # Config + fp16=dict( + mode=AMP_TYPE.TORCH + ) +) + + +def test_trainer(): + parser = get_default_parser() + args = parser.parse_args() + colossalai.launch( + config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = resnet18(num_classes=10) + + if gpc.get_local_rank(ParallelMode.PIPELINE) == 0: + model = nn.Sequential( + model.conv1, + model.bn1, + model.relu, + model.maxpool, + model.layer1, + model.layer2 + ) + elif gpc.get_local_rank(ParallelMode.PIPELINE) == 1: + from functools import partial + + class Flatten(nn.Module): + + def forward(self, x): + return torch.flatten(x, 1) + + model = nn.Sequential( + model.layer3, + model.layer4, + model.avgpool, + Flatten(), + model.fc + ) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + + test_dataset = CIFAR10( + root=Path(os.environ['DATA']), + train=False, + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + test_dataloader = get_dataloader(dataset=test_dataset, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + + 
logger = get_dist_logger() + logger.info("engine is built", ranks=[0]) + pipe_schedule = PipelineSchedule(num_microbatches=4) + trainer = Trainer(engine=engine, + schedule=pipe_schedule, + logger=logger) + logger.info("trainer is built", ranks=[0]) + + logger.info("start training", ranks=[0]) + + trainer.fit( + train_dataloader=train_dataloader, + test_dataloader=test_dataloader, + epochs=NUM_EPOCHS, + max_steps=100, + display_progress=True, + test_interval=5 + ) + + +if __name__ == '__main__': + test_trainer() diff --git a/tests/test_utils/test_gradient_accumluation.py b/tests/test_utils/test_gradient_accumluation.py new file mode 100644 index 000000000000..4f7ccd09bc8a --- /dev/null +++ b/tests/test_utils/test_gradient_accumluation.py @@ -0,0 +1,117 @@ +import colossalai +import os +import pytest +import torch +import torch.multiprocessing as mp +import torch.nn as nn + +from functools import partial +from pathlib import Path +from torchvision import transforms +from torch.optim import Adam +from colossalai.core import global_context as gpc +from colossalai.logging import get_dist_logger +from colossalai.utils import report_memory_usage, get_dataloader +from colossalai.initialize import get_default_parser +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + + +# Config +BATCH_SIZE = 16 +IMG_SIZE = 224 +NUM_CLASSES = 10 + +CONFIG = dict( + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ), + clip_grad_norm=1.0, + gradient_accumulation=4 +) + + +def run_no_pipeline(rank, world_size): + + # init dist env + colossalai.launch( + config=CONFIG, + rank=rank, + world_size=world_size, + host='localhost', + port=29500, + backend='nccl' + ) + + # build model + model = resnet18(num_classes=10) + + # build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + 
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + pin_memory=True, + drop_last=True) + + # build optimizer + optimizer = Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + engine, train_dataloader, *args = colossalai.initialize( + model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader + ) + logger = get_dist_logger() + rank = torch.distributed.get_rank() + param_track = [] + grad_track = [] + next(model.parameters()).retain_grad() + + engine.train() + step = 0 + for img, label in train_dataloader: + engine.zero_grad() + img = img.cuda() + label = label.cuda() + output = engine(img) + loss = engine.criterion(output, label) + engine.backward(loss) + engine.step() + + # check + param_track.append(next(model.parameters())[0].clone()) + grad_track.append(next(model.parameters()).grad[0].clone()) + step += 1 + if step == CONFIG['gradient_accumulation']: + break + + assert not torch.all(grad_track[0] == grad_track[-1]), 'grad should be different in different iterations' + assert torch.all(param_track[0] == param_track[1]) and not torch.all(param_track[0] == param_track[-1]), \ + 'param should be the same in the first few iterations and only changed in the last iteration' + + gpc.destroy() + + +@pytest.mark.skip("This test should be invoked using the test.sh provided") +@pytest.mark.dist +def test_engine(): + func = partial(run_no_pipeline, world_size=4) + mp.spawn(func, nprocs=4) + + +if __name__ == '__main__': + test_engine() diff --git a/tests/test_zero_data_parallel/config.py b/tests/test_zero_data_parallel/config.py index 3e9d081d1d51..8e263505b243 100644 --- a/tests/test_zero_data_parallel/config.py +++ b/tests/test_zero_data_parallel/config.py @@ -2,90 +2,3 @@ # -*- encoding: utf-8 -*- import os from pathlib import Path - -BATCH_SIZE = 128 -IMG_SIZE = 224 -NUM_CLS = 1000 - -# 
resnet 18 -model = dict( - type='VanillaResNet', - block_type='ResNetBottleneck', - layers=[3, 4, 6, 3], - num_cls=NUM_CLS -) - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomResizedCrop', size=IMG_SIZE), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) - ] - ), - dataloader=dict( - batch_size=64, - pin_memory=True, - num_workers=4, - sampler=dict( - type='DataParallelSampler', - shuffle=True, - ) - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=(IMG_SIZE, IMG_SIZE)), - dict(type='ToTensor'), - dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - ) -) - -dist_initializer = [ - dict(type='DataParallelInitializer'), -] - -parallelization = dict( - pipeline=1, - tensor=1, - sequence=-1 -) - -optimizer = dict( - type='Adam', - lr=0.01 -) - -loss = dict( - type='CrossEntropyLoss' -) - -trainer = dict( - max_epochs=5, - max_iters=1000 -) - -amp = dict( - fp16=None, -) - -level = 2 - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=1, mode=None) -) diff --git a/tests/test_zero_data_parallel/test_zero.py b/tests/test_zero_data_parallel/test_zero.py index e47ca61a58f4..6331a9a2b100 100644 --- a/tests/test_zero_data_parallel/test_zero.py +++ b/tests/test_zero_data_parallel/test_zero.py @@ -1,146 +1,118 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- -import os.path as osp - +import os import pytest import torch -from torch.utils.data import DataLoader -import colossalai -from colossalai.builder import build_dataset, build_loss, build_data_sampler, build_model -from colossalai.core import global_context -from colossalai.engine.gradient_handler import 
DataParallelGradientHandler -from colossalai.nn.optimizer import ZeroRedundancyOptimizer_Level_1, ZeroRedundancyOptimizer_Level_3, \ - ZeroRedundancyOptimizer_Level_2 -from colossalai.utils import print_rank_0 +from pathlib import Path -DIR_PATH = osp.dirname(osp.abspath(__file__)) -CONFIG_PATH = osp.join(DIR_PATH, 'config.py') +import colossalai +from colossalai.initialize import get_default_parser +from colossalai.core import global_context as gpc +from colossalai.utils import get_dataloader +from torchvision import transforms +from torchvision.models import resnet18 +from torchvision.datasets import CIFAR10 + +BATCH_SIZE = 128 +IMG_SIZE = 224 +NUM_CLS = 1000 + +CONFIG = dict( + fp16=dict( + mode=None, + ), + zero=dict( + # ============== + # level 2 config + # ============== + # level=2, + # cpu_offload=True, + # verbose=False, + + # ============== + # level 3 config + # ============== + level=3, + verbose=False, + offload_optimizer_config=dict( + device='cpu', + pin_memory=True, + buffer_count=5, + fast_init=False + ), + offload_param_config=dict( + device='cpu', + pin_memory=True, + buffer_count=5, + buffer_size=1e8, + max_in_cpu=1e9 + ) + ), + parallel=dict( + pipeline=dict(size=1), + tensor=dict(size=1, mode=None) + ) +) def run_dist(): - colossalai.init_dist(CONFIG_PATH) - - # build resnet model - model = build_model(global_context.config.model) - model.build_from_cfg() - model = model.cuda() - - level = global_context.config.level - - if level > 1: - model = model.half() - - # test init cuda memory - _ = torch.rand(1).cuda() - torch.cuda.synchronize() - max_alloc = torch.cuda.max_memory_allocated() - max_reserved = torch.cuda.max_memory_reserved() - print(f'before run: max_allocation = {max_alloc}, max_reserved = {max_reserved}') - - # build dataloader - train_dataset = build_dataset(global_context.config.train_data.dataset) - - sampler_cfg = global_context.config.train_data.dataloader.pop('sampler', None) - if sampler_cfg is None: - train_dataloader = 
DataLoader(dataset=train_dataset, **global_context.config.train_data.dataloader) - else: - sampler = build_data_sampler(sampler_cfg, train_dataset) - train_dataloader = DataLoader(dataset=train_dataset, sampler=sampler, - **global_context.config.train_data.dataloader) - - test_dataset = build_dataset(global_context.config.test_data.dataset) - test_dataloader = DataLoader(dataset=test_dataset, **global_context.config.test_data.dataloader) + parser = get_default_parser() + args = parser.parse_args() + + colossalai.launch(config=CONFIG, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend) + + # build model + model = resnet18(num_classes=10) + + # build dataloader# build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) # build optimizer and loss # optimizer = build_optimizer(global_context.config.optimizer, model) optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - if level == 1: - zero_optim = ZeroRedundancyOptimizer_Level_1(init_optimizer=optimizer, verbose=False) - elif level == 2: - zero_optim = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, cpu_offload=True, verbose=False) - elif level == 3: - zero_optim = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer, - module=model, - verbose=False, - offload_optimizer_config=dict( - device='cpu', - pin_memory=True, - buffer_count=5, - fast_init=False - ), - offload_param_config=dict( - device='cpu', - pin_memory=True, - buffer_count=5, - buffer_size=1e8, - max_in_cpu=1e9 - ) - ) - - loss_fn = build_loss(global_context.config.loss) - gradient_handler = 
DataParallelGradientHandler(model, zero_optim) - - # train - for epoch in range(100): - model.train() - - # train - avg_train_loss = 0 - train_iter = 0 + criterion = torch.nn.CrossEntropyLoss() - for idx, (data, label) in enumerate(train_dataloader): - # model = model.half() - data = data[0].cuda() - label = label[0].cuda() + engine, train_dataloader, *args = colossalai.initialize(model=model, + optimizer=optimizer, + criterion=criterion, + train_dataloader=train_dataloader) - if level > 1: - data = data.half() - - output = model(data) - loss = loss_fn(output[0], label) - - if level > 1: - zero_optim.backward(loss) - zero_optim.overlapping_partition_gradients_reduce_epilogue() - else: - loss.backward() - gradient_handler.handle_gradient() - - zero_optim.step() - zero_optim.zero_grad() - - avg_train_loss += loss.detach().cpu().numpy() - train_iter += 1 - - print_rank_0(f'epoch: {epoch}, train loss: {avg_train_loss / train_iter}') - - if epoch % 2 == 0: - model.eval() - avg_eval_loss = 0 - correct = 0 - total = 0 - eval_iters = 0 - - for idx, (data, label) in enumerate(test_dataloader): - with torch.no_grad(): - data = data[0].cuda() - label = label[0].cuda() - - if level > 1: - data = data.half() - - output = model(data) - loss = loss_fn(output[0], label) - - avg_eval_loss += loss.detach().cpu().numpy() - preds = torch.argmax(output[0], dim=1) - total += data.size(0) - correct += sum(preds == label) - eval_iters += 1 - - print_rank_0(f'epoch: {epoch}, eval loss: {avg_eval_loss / eval_iters}, acc: {correct / total}') + # train + model.train() + for idx, (data, label) in enumerate(train_dataloader): + engine.zero_grad() + data = data.cuda() + label = label.cuda() + + output = engine(data) + loss = engine.criterion(output, label) + + engine.backward(loss) + engine.step() + break @pytest.mark.skip("This test should be invoked manually using the script provided") diff --git a/tests/test_zero_data_parallel/test_zero.sh b/tests/test_zero_data_parallel/test_zero.sh index 
b725f52aa4c5..c1effa2d1ca1 100644 --- a/tests/test_zero_data_parallel/test_zero.sh +++ b/tests/test_zero_data_parallel/test_zero.sh @@ -1,4 +1,4 @@ #!/bin/bash test_file="test_zero.py" -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file +python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file diff --git a/tests/test_zero_tensor_parallel/components.py b/tests/test_zero_tensor_parallel/components.py new file mode 100644 index 000000000000..8421f2c8f92f --- /dev/null +++ b/tests/test_zero_tensor_parallel/components.py @@ -0,0 +1,76 @@ + +import sys +from pathlib import Path +repo_path = Path(__file__).absolute().parents[2] +sys.path.append(str(repo_path)) + +try: + import model_zoo.vit.vision_transformer_from_config +except ImportError: + raise ImportError("model_zoo is not found, please check your path") + +BATCH_SIZE = 8 +IMG_SIZE = 32 +PATCH_SIZE = 4 +DIM = 512 +NUM_ATTENTION_HEADS = 8 +SUMMA_DIM = 2 +NUM_CLASSES = 10 +DEPTH = 6 + +model_cfg = dict( + type='VisionTransformerFromConfig', + tensor_splitting_cfg=dict( + type='ViTInputSplitter2D', + ), + embedding_cfg=dict( + type='ViTPatchEmbedding2D', + img_size=IMG_SIZE, + patch_size=PATCH_SIZE, + embed_dim=DIM, + ), + token_fusion_cfg=dict( + type='ViTTokenFuser2D', + img_size=IMG_SIZE, + patch_size=PATCH_SIZE, + embed_dim=DIM, + drop_rate=0.1 + ), + norm_cfg=dict( + type='LayerNorm2D', + normalized_shape=DIM, + eps=1e-6, + ), + block_cfg=dict( + type='ViTBlock', + attention_cfg=dict( + type='ViTSelfAttention2D', + hidden_size=DIM, + num_attention_heads=NUM_ATTENTION_HEADS, + attention_dropout_prob=0., + hidden_dropout_prob=0.1, + ), + droppath_cfg=dict( + type='VanillaViTDropPath', + ), + mlp_cfg=dict( + type='ViTMLP2D', + in_features=DIM, + dropout_prob=0.1, + mlp_ratio=1 + ), + norm_cfg=dict( + type='LayerNorm2D', + normalized_shape=DIM, + eps=1e-6, + ), + ), + head_cfg=dict( 
+ type='ViTHead2D', + hidden_size=DIM, + num_classes=NUM_CLASSES, + ), + embed_dim=DIM, + depth=DEPTH, + drop_path_rate=0., +) diff --git a/tests/test_zero_tensor_parallel/configs/vit_2d_zero1.py b/tests/test_zero_tensor_parallel/configs/vit_2d_zero1.py deleted file mode 100644 index 61efa61ed978..000000000000 --- a/tests/test_zero_tensor_parallel/configs/vit_2d_zero1.py +++ /dev/null @@ -1,159 +0,0 @@ -import os -from pathlib import Path - -import torch - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -optimizer = dict( - type='ZeroRedundancyOptimizer', - optimizer_class=torch.optim.Adam, - lr=0.001, - weight_decay=0 -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2D', - img_size=IMG_SIZE, - 
patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - -parallel = dict( - pipeline=dict(size=1), - tensor=dict(size=4, mode='2d'), -) - -from colossalai.engine import AMP_TYPE - -fp16 = dict( - mode=AMP_TYPE.PARALLEL, - initial_scale=2 ** 4 -) - -# -# fp16 = dict( -# mode=None, -# ) - -# both level 2 and 3 work -# zero = dict( -# type='ZeroRedundancyOptimizer_Level_1', -# ) - -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 -) - -num_epochs = 60 diff --git a/tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py b/tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py index 2ce42a88c12a..80c450a47966 100644 --- a/tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py +++ b/tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py @@ -1,149 +1,12 @@ -import os -from pathlib import Path - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, 
- pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - parallel = dict( pipeline=dict(size=1), tensor=dict(size=4, mode='2d'), ) -# from colossalai.engine import AMP_TYPE -# -# fp16 = dict( -# mode=AMP_TYPE.PARALLEL, -# initial_scale=2 ** 4 -# ) - fp16 = dict( mode=None, ) -# both level 2 and 3 work zero = dict( - type='ZeroRedundancyOptimizer_Level_2' -) - -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 + level=2 ) 
- -num_epochs = 60 diff --git a/tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py b/tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py index 61f2a46f3735..58e026347cc7 100644 --- a/tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py +++ b/tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py @@ -1,149 +1,12 @@ -import os -from pathlib import Path - -BATCH_SIZE = 512 -IMG_SIZE = 32 -PATCH_SIZE = 4 -DIM = 512 -NUM_ATTENTION_HEADS = 8 -SUMMA_DIM = 2 -NUM_CLASSES = 10 -DEPTH = 6 - -train_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - transform_pipeline=[ - dict(type='RandomCrop', size=IMG_SIZE, padding=4), - dict(type='RandomHorizontalFlip'), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010]), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -test_data = dict( - dataset=dict( - type='CIFAR10Dataset', - root=Path(os.environ['DATA']), - train=False, - transform_pipeline=[ - dict(type='Resize', size=IMG_SIZE), - dict(type='ToTensor'), - dict(type='Normalize', - mean=[0.4914, 0.4822, 0.4465], - std=[0.2023, 0.1994, 0.2010] - ), - ] - ), - dataloader=dict( - batch_size=BATCH_SIZE, - pin_memory=True, - num_workers=4, - shuffle=True - ) -) - -optimizer = dict( - type='Adam', - lr=0.001, - weight_decay=0 -) - -loss = dict( - type='CrossEntropyLoss2D', -) - -model = dict( - type='VisionTransformerFromConfig', - tensor_splitting_cfg=dict( - type='ViTInputSplitter2D', - ), - embedding_cfg=dict( - type='ViTPatchEmbedding2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - ), - token_fusion_cfg=dict( - type='ViTTokenFuser2D', - img_size=IMG_SIZE, - patch_size=PATCH_SIZE, - embed_dim=DIM, - drop_rate=0.1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - block_cfg=dict( - type='ViTBlock', - attention_cfg=dict( - type='ViTSelfAttention2D', - 
hidden_size=DIM, - num_attention_heads=NUM_ATTENTION_HEADS, - attention_dropout_prob=0., - hidden_dropout_prob=0.1, - ), - droppath_cfg=dict( - type='VanillaViTDropPath', - ), - mlp_cfg=dict( - type='ViTMLP2D', - in_features=DIM, - dropout_prob=0.1, - mlp_ratio=1 - ), - norm_cfg=dict( - type='LayerNorm2D', - normalized_shape=DIM, - eps=1e-6, - ), - ), - head_cfg=dict( - type='ViTHead2D', - hidden_size=DIM, - num_classes=NUM_CLASSES, - ), - embed_dim=DIM, - depth=DEPTH, - drop_path_rate=0., -) - parallel = dict( pipeline=dict(size=1), tensor=dict(size=4, mode='2d'), ) -# from colossalai.engine import AMP_TYPE - -# fp16 = dict( -# mode=AMP_TYPE.PARALLEL, -# initial_scale=2 ** 4 -# ) - fp16 = dict( mode=None, ) -# both level 2 and 3 work zero = dict( - type='ZeroRedundancyOptimizer_Level_3' -) - -lr_scheduler = dict( - type='LinearWarmupLR', - warmup_epochs=5 + level=3 ) - -num_epochs = 60 diff --git a/tests/test_zero_tensor_parallel/test.sh b/tests/test_zero_tensor_parallel/test.sh index 24d0c54231ee..da5afd5aede6 100644 --- a/tests/test_zero_tensor_parallel/test.sh +++ b/tests/test_zero_tensor_parallel/test.sh @@ -1,4 +1,4 @@ #!/usr/bin/env sh test_file=$1 -python $test_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file +python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 \ No newline at end of file diff --git a/tests/test_zero_tensor_parallel/test_vit_2d.py b/tests/test_zero_tensor_parallel/test_vit_2d.py new file mode 100644 index 000000000000..ef77e9f2e559 --- /dev/null +++ b/tests/test_zero_tensor_parallel/test_vit_2d.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +import os +from pathlib import Path + +import pytest +import torch.autograd + +import colossalai +import torch +from colossalai.initialize import get_default_parser +from colossalai.builder import build_model +from colossalai.context.parallel_mode import ParallelMode +from 
colossalai.core import global_context as gpc +from colossalai.logging import get_dist_logger +from colossalai.utils import get_dataloader +from colossalai.nn.layer._parallel_utilities import _gather +from colossalai.nn import CrossEntropyLoss2D +from torchvision import transforms +from torchvision.datasets import CIFAR10 +from components import * + +level = os.environ['LEVEL'] +CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py') + + +def train_epoch(engine, train_dataloader): + engine.train() + accumulated_loss = 0 + num_steps = len(train_dataloader) + data_iter = iter(train_dataloader) + for i in range(num_steps): + output, label, loss = engine.step(data_iter) + accumulated_loss += loss.detach().cpu().numpy() + avg_loss = accumulated_loss / num_steps + return avg_loss + + +@pytest.mark.dist +@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") +def test_2d_parallel_vision_transformer(): + parser = get_default_parser() + args = parser.parse_args() + colossalai.launch( + config=CONFIG_PATH, + rank=args.rank, + world_size=args.world_size, + host=args.host, + port=args.port, + backend=args.backend + ) + + # build model + model = build_model(model_cfg) + model.build_from_cfg() + + # build dataloader# build dataloaders + train_dataset = CIFAR10( + root=Path(os.environ['DATA']), + download=True, + transform=transforms.Compose( + [ + transforms.Resize(size=(IMG_SIZE, IMG_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) + ] + ) + ) + train_dataloader = get_dataloader(dataset=train_dataset, + shuffle=True, + batch_size=BATCH_SIZE, + num_workers=1, + pin_memory=True, + drop_last=True) + + # build optimizer and loss + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = CrossEntropyLoss2D() + + engine, train_dataloader, *args = colossalai.initialize(model=model, + optimizer=optimizer, + criterion=criterion, + 
train_dataloader=train_dataloader) + logger = get_dist_logger() + + logger.info('start training') + engine.train() + + for img, label in train_dataloader: + engine.zero_grad() + img = img.cuda() + label = label.cuda() + out = engine(img) + loss = engine.criterion(out, label) + engine.backward(loss) + engine.step() + break + + +if __name__ == '__main__': + test_2d_parallel_vision_transformer() diff --git a/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py b/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py deleted file mode 100644 index 5c78dfcc22bc..000000000000 --- a/tests/test_zero_tensor_parallel/test_vit_2d/test_vit_2d.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- - -import os -from pathlib import Path - -import pytest -import torch.autograd - -import colossalai -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from colossalai.engine import Engine -from colossalai.logging import get_global_dist_logger -from colossalai.nn.layer._parallel_utilities import _gather - -level = os.environ['LEVEL'] -CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py') - - -def eval_epoch(engine: Engine, test_dataloader): - engine.eval() - accumulated_loss = 0 - correct_sum = 0 - total_sum = 0 - num_steps = len(test_dataloader) - data_iter = iter(test_dataloader) - - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - - output = _gather( - output[0], - ParallelMode.PARALLEL_2D_ROW, - 1 - ) - output = _gather( - output, - ParallelMode.PARALLEL_2D_COL, - 0, - ) - output = torch.argmax(output, dim=-1) - correct = torch.sum(label[0] == output) - correct_sum += correct - total_sum += label[0].size(0) - avg_loss = accumulated_loss / num_steps - return correct_sum, total_sum, avg_loss - - -def train_epoch(engine, train_dataloader): - engine.train() - accumulated_loss = 0 - 
num_steps = len(train_dataloader) - data_iter = iter(train_dataloader) - for i in range(num_steps): - output, label, loss = engine.step(data_iter) - accumulated_loss += loss.detach().cpu().numpy() - avg_loss = accumulated_loss / num_steps - return avg_loss - - -@pytest.mark.dist -@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus") -def test_2d_parallel_vision_transformer(): - # init dist - engine, train_dataloader, test_dataloader = colossalai.initialize(CONFIG_PATH) - logger = get_global_dist_logger() - - logger.info('start training') - for epoch in range(gpc.config.num_epochs): - train_loss = train_epoch(engine, train_dataloader) - - logger.info(f'epoch {epoch} - train loss: {train_loss}') - - if epoch % 2 == 0: - correct_sum, total_sum, eval_loss = eval_epoch(engine, test_dataloader) - logger.info( - f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, ' - f'correct: {correct_sum}, acc: {correct_sum / total_sum}') - - -if __name__ == '__main__': - test_2d_parallel_vision_transformer()