From 86be7440076de238bacdd4bf401513d44025c388 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Wed, 6 Jul 2022 17:34:24 +0800 Subject: [PATCH 1/2] make it faster --- tests/test_utils/test_colo_checkpoint.py | 39 +++++++++++++----------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/tests/test_utils/test_colo_checkpoint.py b/tests/test_utils/test_colo_checkpoint.py index 6e7d4441d760..48742fc18a58 100644 --- a/tests/test_utils/test_colo_checkpoint.py +++ b/tests/test_utils/test_colo_checkpoint.py @@ -1,21 +1,20 @@ from abc import ABC, abstractmethod -import os, sys, shutil +import os, shutil import torch import torch.nn as nn import pytest import copy -import operator -import colossalai -from colossalai.context.parallel_mode import ParallelMode +from functools import partial + import torch.multiprocessing as mp import torch.distributed as dist + +import colossalai from colossalai.testing import rerun_if_address_is_in_use from colossalai.utils.cuda import get_current_device from colossalai.utils import free_port from colossalai.utils.model.colo_init_context import ColoInitContext -from colossalai.tensor import ColoTensorSpec, ComputePattern, ComputeSpec, DistSpecManager, distspec, ProcessGroup, ColoTensor -from colossalai.core import global_context as gpc -from functools import partial +from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, distspec, ProcessGroup from colossalai.nn.parallel.data_parallel import ColoDDP from colossalai.utils.checkpoint import save_checkpoint, load_checkpoint from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR @@ -46,15 +45,17 @@ def __len__(self): class DummyDataLoader(DummyDataGenerator): - batch_size = 128 - category = 16 - feature_size = 256 + + def __init__(self, batch_size, category, feature_size, length=10): + super().__init__(length) + self.batch_size = batch_size + self.category = category + self.feature_size = feature_size def generate(self): image_dict = {} - image_dict['pixel_values'] = torch.rand( - DummyDataLoader.batch_size, DummyDataLoader.feature_size, device=get_current_device()) * 2 - 1 - image_dict['label'] = torch.randint(DummyDataLoader.category, (DummyDataLoader.batch_size,), + image_dict['pixel_values'] = torch.rand(self.batch_size, self.feature_size, device=get_current_device()) * 2 - 1 + image_dict['label'] = torch.randint(self.category, (self.batch_size,), dtype=torch.int64, device=get_current_device()) return image_dict @@ -102,11 +103,15 @@ def remove(path): def run_checkpoint(init_spec_func, use_ddp, test_epoch, pg): - train_dataloader = DummyDataLoader(length=16) + batch = 3 + feature = 32 + category = 16 + train_dataloader = DummyDataLoader(batch, category, feature, length=16) with ColoInitContext(device=get_current_device()): - model = MLP(256, 16, 64) - model_reload = MLP(256, 16, 64) - model_ref = MLP(256, 16, 64) + model = MLP(feature, category) + model_reload = MLP(feature, category) + model_ref = MLP(feature, category) + model = model.cuda() model_reload = model_reload.cuda() model_ref = model_ref.cuda() From 1048396729953fcc86086f023745545f1e287282 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Thu, 14 Jul 2022 11:42:19 +0800 Subject: [PATCH 2/2] [hotfix] remove circle import --- colossalai/engine/_base_engine.py | 2 +- .../engine/schedule/_pipeline_schedule.py | 3 +- .../{engine => gemini}/ophooks/__init__.py | 0 .../ophooks/_memtracer_ophook.py | 2 +- .../ophooks/_shard_grad_ophook.py | 0 .../ophooks/_shard_param_ophook.py | 0 .../{engine => gemini}/ophooks/utils.py | 0 .../{engine => gemini}/paramhooks/__init__.py | 0 .../paramhooks/_param_hookmgr.py | 0 colossalai/initialize.py | 2 +- .../utils/profiler/legacy/mem_profiler.py | 2 +- .../profiler/stateful_tensor_mem_extention.py | 2 +- .../zero/sharded_model/sharded_model_v2.py | 4 +- colossalai/zero/utils/zero_hook.py | 2 +- docs/colossalai/colossalai.engine.ophooks.rst | 6 +- .../colossalai.engine.ophooks.zero_hook.rst | 4 +- docs/colossalai/colossalai.engine.rst | 2 +- tests/test_engine/test_param_hook.py | 86 ------------------- 18 files changed, 16 insertions(+), 101 deletions(-) rename colossalai/{engine => gemini}/ophooks/__init__.py (100%) rename colossalai/{engine => gemini}/ophooks/_memtracer_ophook.py (98%) rename colossalai/{engine => gemini}/ophooks/_shard_grad_ophook.py (100%) rename colossalai/{engine => gemini}/ophooks/_shard_param_ophook.py (100%) rename colossalai/{engine => gemini}/ophooks/utils.py (100%) rename colossalai/{engine => gemini}/paramhooks/__init__.py (100%) rename colossalai/{engine => gemini}/paramhooks/_param_hookmgr.py (100%) delete mode 100644 tests/test_engine/test_param_hook.py diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py index 074b9d0ccaad..146a29669227 100644 --- a/colossalai/engine/_base_engine.py +++ b/colossalai/engine/_base_engine.py @@ -7,7 +7,7 @@ from colossalai.logging import get_dist_logger from torch import Tensor -from colossalai.engine.ophooks import register_ophooks_recursively, BaseOpHook +from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule from typing import Optional, Type from colossalai.engine.gradient_handler import BaseGradientHandler diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py index 6e865ae8f7cf..97571fa024ba 100644 --- a/colossalai/engine/schedule/_pipeline_schedule.py +++ b/colossalai/engine/schedule/_pipeline_schedule.py @@ -12,7 +12,6 @@ from colossalai.logging import get_dist_logger from colossalai.utils import switch_virtual_pipeline_parallel_rank from colossalai.utils.cuda import get_current_device -from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2 from ._base_schedule import BaseSchedule @@ -157,6 +156,7 @@ def load_micro_batch(self): return self._move_to_device(mciro_batch_data) def pre_processing(self, engine): + from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2 # TODO: remove this after testing new zero with pipeline parallelism model = engine.model if isinstance(model, NaiveAMPModel): @@ -482,6 +482,7 @@ def __init__(self, self.num_model_chunks = num_model_chunks def pre_processing(self, engine): + from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2 if isinstance(engine.model, ShardedModelV2): self.dtype = torch.half elif isinstance(engine.model[0], NaiveAMPModel): diff --git a/colossalai/engine/ophooks/__init__.py b/colossalai/gemini/ophooks/__init__.py similarity index 100% rename from colossalai/engine/ophooks/__init__.py rename to colossalai/gemini/ophooks/__init__.py diff --git a/colossalai/engine/ophooks/_memtracer_ophook.py b/colossalai/gemini/ophooks/_memtracer_ophook.py similarity index 98% rename from colossalai/engine/ophooks/_memtracer_ophook.py rename to colossalai/gemini/ophooks/_memtracer_ophook.py index 4f16edfabbe0..71831f1aaf7d 100644 --- a/colossalai/engine/ophooks/_memtracer_ophook.py +++ b/colossalai/gemini/ophooks/_memtracer_ophook.py @@ -3,7 +3,7 @@ from pathlib import Path from colossalai.context.parallel_mode import ParallelMode import torch -from colossalai.engine.ophooks import BaseOpHook +from colossalai.gemini.ophooks import BaseOpHook from colossalai.registry import OPHOOKS from colossalai.logging import get_dist_logger from colossalai.core import global_context as gpc diff --git a/colossalai/engine/ophooks/_shard_grad_ophook.py b/colossalai/gemini/ophooks/_shard_grad_ophook.py similarity index 100% rename from colossalai/engine/ophooks/_shard_grad_ophook.py rename to colossalai/gemini/ophooks/_shard_grad_ophook.py diff --git a/colossalai/engine/ophooks/_shard_param_ophook.py b/colossalai/gemini/ophooks/_shard_param_ophook.py similarity index 100% rename from colossalai/engine/ophooks/_shard_param_ophook.py rename to colossalai/gemini/ophooks/_shard_param_ophook.py diff --git a/colossalai/engine/ophooks/utils.py b/colossalai/gemini/ophooks/utils.py similarity index 100% rename from colossalai/engine/ophooks/utils.py rename to colossalai/gemini/ophooks/utils.py diff --git a/colossalai/engine/paramhooks/__init__.py b/colossalai/gemini/paramhooks/__init__.py similarity index 100% rename from colossalai/engine/paramhooks/__init__.py rename to colossalai/gemini/paramhooks/__init__.py diff --git a/colossalai/engine/paramhooks/_param_hookmgr.py b/colossalai/gemini/paramhooks/_param_hookmgr.py similarity index 100% rename from colossalai/engine/paramhooks/_param_hookmgr.py rename to colossalai/gemini/paramhooks/_param_hookmgr.py diff --git a/colossalai/initialize.py b/colossalai/initialize.py index 086efaac3a5e..e907efddee69 100644 --- a/colossalai/initialize.py +++ b/colossalai/initialize.py @@ -22,7 +22,7 @@ from colossalai.engine.schedule import NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule, get_tensor_shape from colossalai.engine import Engine -from colossalai.engine.ophooks import BaseOpHook +from colossalai.gemini.ophooks import BaseOpHook from colossalai.utils import (get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param) from colossalai.utils.moe import sync_moe_model_param diff --git a/colossalai/utils/profiler/legacy/mem_profiler.py b/colossalai/utils/profiler/legacy/mem_profiler.py index c4d7ca2ef35a..f80f6ecf5773 100644 --- a/colossalai/utils/profiler/legacy/mem_profiler.py +++ b/colossalai/utils/profiler/legacy/mem_profiler.py @@ -2,7 +2,7 @@ from typing import Union from colossalai.engine import Engine from torch.utils.tensorboard import SummaryWriter -from colossalai.engine.ophooks import MemTracerOpHook +from colossalai.gemini.ophooks import MemTracerOpHook from colossalai.utils.profiler.legacy.prof_utils import BaseProfiler diff --git a/colossalai/utils/profiler/stateful_tensor_mem_extention.py b/colossalai/utils/profiler/stateful_tensor_mem_extention.py index 7498235538d9..127055c8c1ef 100644 --- a/colossalai/utils/profiler/stateful_tensor_mem_extention.py +++ b/colossalai/utils/profiler/stateful_tensor_mem_extention.py @@ -5,7 +5,7 @@ from enum import Enum from typing import List from colossalai.gemini.stateful_tensor import StatefulTensor -from colossalai.engine.ophooks import BaseOpHook +from colossalai.gemini.ophooks import BaseOpHook from colossalai.engine import Engine from colossalai.utils.profiler.extention import ProfilerExtension diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py index 9940ea5e57e2..a0214f609467 100644 --- a/colossalai/zero/sharded_model/sharded_model_v2.py +++ b/colossalai/zero/sharded_model/sharded_model_v2.py @@ -8,9 +8,9 @@ import torch.nn as nn from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.engine.ophooks import register_ophooks_recursively +from colossalai.gemini.ophooks import register_ophooks_recursively from colossalai.zero.utils import ZeroHook -from colossalai.engine.paramhooks import BaseParamHookMgr +from colossalai.gemini.paramhooks import BaseParamHookMgr from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device, disposable from colossalai.gemini.memory_tracer.memstats_collector import MemStatsCollector diff --git a/colossalai/zero/utils/zero_hook.py b/colossalai/zero/utils/zero_hook.py index e292660214e8..189d1ad2de94 100644 --- a/colossalai/zero/utils/zero_hook.py +++ b/colossalai/zero/utils/zero_hook.py @@ -8,7 +8,7 @@ from colossalai.utils import get_current_device from colossalai.zero.shard_utils import BaseShardStrategy -from colossalai.engine.ophooks import BaseOpHook +from colossalai.gemini.ophooks import BaseOpHook from colossalai.gemini.stateful_tensor_mgr import StatefulTensorMgr from colossalai.gemini.memory_tracer import MemStatsCollector diff --git a/docs/colossalai/colossalai.engine.ophooks.rst b/docs/colossalai/colossalai.engine.ophooks.rst index f4b8a8396038..0173aa2a4f8d 100644 --- a/docs/colossalai/colossalai.engine.ophooks.rst +++ b/docs/colossalai/colossalai.engine.ophooks.rst @@ -1,11 +1,11 @@ -colossalai.engine.ophooks +colossalai.gemini.ophooks ========================= -.. automodule:: colossalai.engine.ophooks +.. automodule:: colossalai.gemini.ophooks :members: .. toctree:: :maxdepth: 2 - colossalai.engine.ophooks.zero_hook + colossalai.gemini.ophooks.zero_hook diff --git a/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst b/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst index 270d1839ccf9..f7868dd3ad80 100644 --- a/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst +++ b/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst @@ -1,5 +1,5 @@ -colossalai.engine.ophooks.zero\_hook +colossalai.gemini.ophooks.zero\_hook ==================================== -.. automodule:: colossalai.engine.ophooks.zero_hook +.. automodule:: colossalai.gemini.ophooks.zero_hook :members: diff --git a/docs/colossalai/colossalai.engine.rst b/docs/colossalai/colossalai.engine.rst index 00028968a86b..740cb03342e9 100644 --- a/docs/colossalai/colossalai.engine.rst +++ b/docs/colossalai/colossalai.engine.rst @@ -8,5 +8,5 @@ colossalai.engine :maxdepth: 2 colossalai.engine.gradient_handler - colossalai.engine.ophooks + colossalai.gemini.ophooks colossalai.engine.schedule diff --git a/tests/test_engine/test_param_hook.py b/tests/test_engine/test_param_hook.py deleted file mode 100644 index 54639157fe62..000000000000 --- a/tests/test_engine/test_param_hook.py +++ /dev/null @@ -1,86 +0,0 @@ -import pytest -from colossalai.engine.paramhooks import BaseParamHookMgr -from torch import nn -import torch -import torch.nn.functional as F -import copy - -class SubNet(nn.Module): - def __init__(self, out_features) -> None: - super().__init__() - self.bias = nn.Parameter(torch.zeros(out_features)) - - def forward(self, x, weight): - return F.linear(x, weight, self.bias) - - -class Net(nn.Module): - def __init__(self, checkpoint=False) -> None: - super().__init__() - self.fc1 = nn.Linear(5, 5) - self.sub_fc = SubNet(5) - self.fc2 = nn.Linear(5, 1) - - def forward(self, x): - x = self.fc1(x) - x = self.sub_fc(x, self.fc1.weight) - x = self.fc1(x) - x = self.fc2(x) - return x - -def net_data(): - return (torch.randn(2, 5, dtype=torch.float, device='cuda'),) - -def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool: - if loose: - return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3) - return torch.allclose(tensor_a, tensor_b) - - -def test_base_param_hook(): - torch.manual_seed(0) - model = Net(checkpoint=True).cuda() - model.train() - inputs = net_data() - - def run_model(model, inputs, use_param_hook = False): - if use_param_hook: - class HooKWrapper: - def __init__(self) -> None: - self.hook_triggered_times = 0 - - def wrapper_func(self): - def hook(param, grad) -> torch.Tensor or None: - self.hook_triggered_times += 1 - return grad - return hook - - hookwrapper = HooKWrapper() - param_list = [p for p in model.parameters()] - hook_mgr = BaseParamHookMgr(param_list) - hook_mgr.register_backward_hooks(hookwrapper.wrapper_func()) - - model.zero_grad(set_to_none=True) - - with torch.cuda.amp.autocast(): - y = model(*inputs) - loss = y.sum() - loss.backward() - - if use_param_hook: - hook_mgr.remove_hooks() - return hookwrapper.hook_triggered_times - - model_copy = copy.deepcopy(model) - - run_model(model, inputs, False) - ret2 = run_model(model_copy, inputs, True) - - # Make sure param hook has only be fired once in case of parameter sharing - assert ret2 == len(list(model.parameters())) - - for p, p_copy in zip(model.parameters(), model_copy.parameters()): - assert allclose(p.grad, p_copy.grad), f"{p.grad} vs {p_copy.grad}" - -if __name__ == '__main__': - test_base_param_hook()