From 86be7440076de238bacdd4bf401513d44025c388 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Wed, 6 Jul 2022 17:34:24 +0800
Subject: [PATCH 1/2] make the colo checkpoint test faster

---
 tests/test_utils/test_colo_checkpoint.py | 39 +++++++++++++-----------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/tests/test_utils/test_colo_checkpoint.py b/tests/test_utils/test_colo_checkpoint.py
index 6e7d4441d760..48742fc18a58 100644
--- a/tests/test_utils/test_colo_checkpoint.py
+++ b/tests/test_utils/test_colo_checkpoint.py
@@ -1,21 +1,20 @@
 from abc import ABC, abstractmethod
-import os, sys, shutil
+import os, shutil
 import torch
 import torch.nn as nn
 import pytest
 import copy
-import operator
-import colossalai
-from colossalai.context.parallel_mode import ParallelMode
+from functools import partial
+
 import torch.multiprocessing as mp
 import torch.distributed as dist
+
+import colossalai
 from colossalai.testing import rerun_if_address_is_in_use
 from colossalai.utils.cuda import get_current_device
 from colossalai.utils import free_port
 from colossalai.utils.model.colo_init_context import ColoInitContext
-from colossalai.tensor import ColoTensorSpec, ComputePattern, ComputeSpec, DistSpecManager, distspec, ProcessGroup, ColoTensor
-from colossalai.core import global_context as gpc
-from functools import partial
+from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, distspec, ProcessGroup
 from colossalai.nn.parallel.data_parallel import ColoDDP
 from colossalai.utils.checkpoint import save_checkpoint, load_checkpoint
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
@@ -46,15 +45,17 @@ def __len__(self):
 
 
 class DummyDataLoader(DummyDataGenerator):
-    batch_size = 128
-    category = 16
-    feature_size = 256
+
+    def __init__(self, batch_size, category, feature_size, length=10):
+        super().__init__(length)
+        self.batch_size = batch_size
+        self.category = category
+        self.feature_size = feature_size
 
     def generate(self):
         image_dict = {}
-        image_dict['pixel_values'] = torch.rand(
-            DummyDataLoader.batch_size, DummyDataLoader.feature_size, device=get_current_device()) * 2 - 1
-        image_dict['label'] = torch.randint(DummyDataLoader.category, (DummyDataLoader.batch_size,),
+        image_dict['pixel_values'] = torch.rand(self.batch_size, self.feature_size, device=get_current_device()) * 2 - 1
+        image_dict['label'] = torch.randint(self.category, (self.batch_size,),
                                             dtype=torch.int64,
                                             device=get_current_device())
         return image_dict
@@ -102,11 +103,15 @@ def remove(path):
 
 
 def run_checkpoint(init_spec_func, use_ddp, test_epoch, pg):
-    train_dataloader = DummyDataLoader(length=16)
+    batch = 3
+    feature = 32
+    category = 16
+    train_dataloader = DummyDataLoader(batch, category, feature, length=16)
     with ColoInitContext(device=get_current_device()):
-        model = MLP(256, 16, 64)
-        model_reload = MLP(256, 16, 64)
-        model_ref = MLP(256, 16, 64)
+        model = MLP(feature, category)
+        model_reload = MLP(feature, category)
+        model_ref = MLP(feature, category)
+
     model = model.cuda()
     model_reload = model_reload.cuda()
     model_ref = model_ref.cuda()

From 1048396729953fcc86086f023745545f1e287282 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Thu, 14 Jul 2022 11:42:19 +0800
Subject: [PATCH 2/2] [hotfix] remove circular import

---
 colossalai/engine/_base_engine.py             |  2 +-
 .../engine/schedule/_pipeline_schedule.py     |  3 +-
 .../{engine => gemini}/ophooks/__init__.py    |  0
 .../ophooks/_memtracer_ophook.py              |  2 +-
 .../ophooks/_shard_grad_ophook.py             |  0
 .../ophooks/_shard_param_ophook.py            |  0
 .../{engine => gemini}/ophooks/utils.py       |  0
 .../{engine => gemini}/paramhooks/__init__.py |  0
 .../paramhooks/_param_hookmgr.py              |  0
 colossalai/initialize.py                      |  2 +-
 .../utils/profiler/legacy/mem_profiler.py     |  2 +-
 .../profiler/stateful_tensor_mem_extention.py |  2 +-
 .../zero/sharded_model/sharded_model_v2.py    |  4 +-
 colossalai/zero/utils/zero_hook.py            |  2 +-
 docs/colossalai/colossalai.engine.ophooks.rst |  6 +-
 .../colossalai.engine.ophooks.zero_hook.rst   |  4 +-
 docs/colossalai/colossalai.engine.rst         |  2 +-
 tests/test_engine/test_param_hook.py          | 86 -------------------
 18 files changed, 16 insertions(+), 101 deletions(-)
 rename colossalai/{engine => gemini}/ophooks/__init__.py (100%)
 rename colossalai/{engine => gemini}/ophooks/_memtracer_ophook.py (98%)
 rename colossalai/{engine => gemini}/ophooks/_shard_grad_ophook.py (100%)
 rename colossalai/{engine => gemini}/ophooks/_shard_param_ophook.py (100%)
 rename colossalai/{engine => gemini}/ophooks/utils.py (100%)
 rename colossalai/{engine => gemini}/paramhooks/__init__.py (100%)
 rename colossalai/{engine => gemini}/paramhooks/_param_hookmgr.py (100%)
 delete mode 100644 tests/test_engine/test_param_hook.py

diff --git a/colossalai/engine/_base_engine.py b/colossalai/engine/_base_engine.py
index 074b9d0ccaad..146a29669227 100644
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
@@ -7,7 +7,7 @@
 
 from colossalai.logging import get_dist_logger
 from torch import Tensor
-from colossalai.engine.ophooks import register_ophooks_recursively, BaseOpHook
+from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
 from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
 from typing import Optional, Type
 from colossalai.engine.gradient_handler import BaseGradientHandler
diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py
index 6e865ae8f7cf..97571fa024ba 100644
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -12,7 +12,6 @@
 from colossalai.logging import get_dist_logger
 from colossalai.utils import switch_virtual_pipeline_parallel_rank
 from colossalai.utils.cuda import get_current_device
-from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
 
 from ._base_schedule import BaseSchedule
 
@@ -157,6 +156,7 @@ def load_micro_batch(self):
         return self._move_to_device(mciro_batch_data)
 
     def pre_processing(self, engine):
+        from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
         # TODO: remove this after testing new zero with pipeline parallelism
         model = engine.model
         if isinstance(model, NaiveAMPModel):
@@ -482,6 +482,7 @@ def __init__(self,
         self.num_model_chunks = num_model_chunks
 
     def pre_processing(self, engine):
+        from colossalai.zero.sharded_model.sharded_model_v2 import ShardedModelV2
         if isinstance(engine.model, ShardedModelV2):
             self.dtype = torch.half
         elif isinstance(engine.model[0], NaiveAMPModel):
diff --git a/colossalai/engine/ophooks/__init__.py b/colossalai/gemini/ophooks/__init__.py
similarity index 100%
rename from colossalai/engine/ophooks/__init__.py
rename to colossalai/gemini/ophooks/__init__.py
diff --git a/colossalai/engine/ophooks/_memtracer_ophook.py b/colossalai/gemini/ophooks/_memtracer_ophook.py
similarity index 98%
rename from colossalai/engine/ophooks/_memtracer_ophook.py
rename to colossalai/gemini/ophooks/_memtracer_ophook.py
index 4f16edfabbe0..71831f1aaf7d 100644
--- a/colossalai/engine/ophooks/_memtracer_ophook.py
+++ b/colossalai/gemini/ophooks/_memtracer_ophook.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from colossalai.context.parallel_mode import ParallelMode
 import torch
-from colossalai.engine.ophooks import BaseOpHook
+from colossalai.gemini.ophooks import BaseOpHook
 from colossalai.registry import OPHOOKS
 from colossalai.logging import get_dist_logger
 from colossalai.core import global_context as gpc
diff --git a/colossalai/engine/ophooks/_shard_grad_ophook.py b/colossalai/gemini/ophooks/_shard_grad_ophook.py
similarity index 100%
rename from colossalai/engine/ophooks/_shard_grad_ophook.py
rename to colossalai/gemini/ophooks/_shard_grad_ophook.py
diff --git a/colossalai/engine/ophooks/_shard_param_ophook.py b/colossalai/gemini/ophooks/_shard_param_ophook.py
similarity index 100%
rename from colossalai/engine/ophooks/_shard_param_ophook.py
rename to colossalai/gemini/ophooks/_shard_param_ophook.py
diff --git a/colossalai/engine/ophooks/utils.py b/colossalai/gemini/ophooks/utils.py
similarity index 100%
rename from colossalai/engine/ophooks/utils.py
rename to colossalai/gemini/ophooks/utils.py
diff --git a/colossalai/engine/paramhooks/__init__.py b/colossalai/gemini/paramhooks/__init__.py
similarity index 100%
rename from colossalai/engine/paramhooks/__init__.py
rename to colossalai/gemini/paramhooks/__init__.py
diff --git a/colossalai/engine/paramhooks/_param_hookmgr.py b/colossalai/gemini/paramhooks/_param_hookmgr.py
similarity index 100%
rename from colossalai/engine/paramhooks/_param_hookmgr.py
rename to colossalai/gemini/paramhooks/_param_hookmgr.py
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 086efaac3a5e..e907efddee69 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -22,7 +22,7 @@
 
 from colossalai.engine.schedule import NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule, get_tensor_shape
 from colossalai.engine import Engine
-from colossalai.engine.ophooks import BaseOpHook
+from colossalai.gemini.ophooks import BaseOpHook
 
 from colossalai.utils import (get_current_device, is_using_ddp, is_using_pp, is_using_sequence, sync_model_param)
 from colossalai.utils.moe import sync_moe_model_param
diff --git a/colossalai/utils/profiler/legacy/mem_profiler.py b/colossalai/utils/profiler/legacy/mem_profiler.py
index c4d7ca2ef35a..f80f6ecf5773 100644
--- a/colossalai/utils/profiler/legacy/mem_profiler.py
+++ b/colossalai/utils/profiler/legacy/mem_profiler.py
@@ -2,7 +2,7 @@
 from typing import Union
 from colossalai.engine import Engine
 from torch.utils.tensorboard import SummaryWriter
-from colossalai.engine.ophooks import MemTracerOpHook
+from colossalai.gemini.ophooks import MemTracerOpHook
 from colossalai.utils.profiler.legacy.prof_utils import BaseProfiler
 
 
diff --git a/colossalai/utils/profiler/stateful_tensor_mem_extention.py b/colossalai/utils/profiler/stateful_tensor_mem_extention.py
index 7498235538d9..127055c8c1ef 100644
--- a/colossalai/utils/profiler/stateful_tensor_mem_extention.py
+++ b/colossalai/utils/profiler/stateful_tensor_mem_extention.py
@@ -5,7 +5,7 @@
 from enum import Enum
 from typing import List
 from colossalai.gemini.stateful_tensor import StatefulTensor
-from colossalai.engine.ophooks import BaseOpHook
+from colossalai.gemini.ophooks import BaseOpHook
 from colossalai.engine import Engine
 from colossalai.utils.profiler.extention import ProfilerExtension
 
diff --git a/colossalai/zero/sharded_model/sharded_model_v2.py b/colossalai/zero/sharded_model/sharded_model_v2.py
index 9940ea5e57e2..a0214f609467 100644
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -8,9 +8,9 @@
 import torch.nn as nn
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
-from colossalai.engine.ophooks import register_ophooks_recursively
+from colossalai.gemini.ophooks import register_ophooks_recursively
 from colossalai.zero.utils import ZeroHook
-from colossalai.engine.paramhooks import BaseParamHookMgr
+from colossalai.gemini.paramhooks import BaseParamHookMgr
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device, disposable
 from colossalai.gemini.memory_tracer.memstats_collector import MemStatsCollector
diff --git a/colossalai/zero/utils/zero_hook.py b/colossalai/zero/utils/zero_hook.py
index e292660214e8..189d1ad2de94 100644
--- a/colossalai/zero/utils/zero_hook.py
+++ b/colossalai/zero/utils/zero_hook.py
@@ -8,7 +8,7 @@
 from colossalai.utils import get_current_device
 
 from colossalai.zero.shard_utils import BaseShardStrategy
-from colossalai.engine.ophooks import BaseOpHook
+from colossalai.gemini.ophooks import BaseOpHook
 
 from colossalai.gemini.stateful_tensor_mgr import StatefulTensorMgr
 from colossalai.gemini.memory_tracer import MemStatsCollector
diff --git a/docs/colossalai/colossalai.engine.ophooks.rst b/docs/colossalai/colossalai.engine.ophooks.rst
index f4b8a8396038..0173aa2a4f8d 100644
--- a/docs/colossalai/colossalai.engine.ophooks.rst
+++ b/docs/colossalai/colossalai.engine.ophooks.rst
@@ -1,11 +1,11 @@
-colossalai.engine.ophooks
+colossalai.gemini.ophooks
 =========================
 
-.. automodule:: colossalai.engine.ophooks
+.. automodule:: colossalai.gemini.ophooks
    :members:
 
 
 .. toctree::
    :maxdepth: 2
 
-   colossalai.engine.ophooks.zero_hook
+   colossalai.gemini.ophooks.zero_hook
diff --git a/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst b/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst
index 270d1839ccf9..f7868dd3ad80 100644
--- a/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst
+++ b/docs/colossalai/colossalai.engine.ophooks.zero_hook.rst
@@ -1,5 +1,5 @@
-colossalai.engine.ophooks.zero\_hook
+colossalai.gemini.ophooks.zero\_hook
 ====================================
 
-.. automodule:: colossalai.engine.ophooks.zero_hook
+.. automodule:: colossalai.gemini.ophooks.zero_hook
    :members:
diff --git a/docs/colossalai/colossalai.engine.rst b/docs/colossalai/colossalai.engine.rst
index 00028968a86b..740cb03342e9 100644
--- a/docs/colossalai/colossalai.engine.rst
+++ b/docs/colossalai/colossalai.engine.rst
@@ -8,5 +8,5 @@ colossalai.engine
    :maxdepth: 2
 
    colossalai.engine.gradient_handler
-   colossalai.engine.ophooks
+   colossalai.gemini.ophooks
    colossalai.engine.schedule
diff --git a/tests/test_engine/test_param_hook.py b/tests/test_engine/test_param_hook.py
deleted file mode 100644
index 54639157fe62..000000000000
--- a/tests/test_engine/test_param_hook.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import pytest
-from colossalai.engine.paramhooks import BaseParamHookMgr
-from torch import nn
-import torch
-import torch.nn.functional as F
-import copy
-
-class SubNet(nn.Module):
-    def __init__(self, out_features) -> None:
-        super().__init__()
-        self.bias = nn.Parameter(torch.zeros(out_features))
-
-    def forward(self, x, weight):
-        return F.linear(x, weight, self.bias)
-
-
-class Net(nn.Module):
-    def __init__(self, checkpoint=False) -> None:
-        super().__init__()
-        self.fc1 = nn.Linear(5, 5)
-        self.sub_fc = SubNet(5)
-        self.fc2 = nn.Linear(5, 1)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.sub_fc(x, self.fc1.weight)
-        x = self.fc1(x)
-        x = self.fc2(x)
-        return x
-
-def net_data():
-    return (torch.randn(2, 5, dtype=torch.float, device='cuda'),)
-
-def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
-    if loose:
-        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
-    return torch.allclose(tensor_a, tensor_b)
-
-
-def test_base_param_hook():
-    torch.manual_seed(0)
-    model = Net(checkpoint=True).cuda()
-    model.train()
-    inputs = net_data()
-
-    def run_model(model, inputs, use_param_hook = False):
-        if use_param_hook:
-            class HooKWrapper:
-                def __init__(self) -> None:
-                    self.hook_triggered_times = 0
-
-                def wrapper_func(self):
-                    def hook(param, grad) -> torch.Tensor or None:
-                        self.hook_triggered_times += 1
-                        return grad
-                    return hook
-
-            hookwrapper = HooKWrapper()
-            param_list = [p for p in model.parameters()]
-            hook_mgr = BaseParamHookMgr(param_list)
-            hook_mgr.register_backward_hooks(hookwrapper.wrapper_func())
-        
-        model.zero_grad(set_to_none=True)
-
-        with torch.cuda.amp.autocast():
-            y = model(*inputs)
-            loss = y.sum()
-        loss.backward()
-
-        if use_param_hook:
-            hook_mgr.remove_hooks()
-            return hookwrapper.hook_triggered_times
-    
-    model_copy = copy.deepcopy(model)
-
-    run_model(model, inputs, False)
-    ret2 = run_model(model_copy, inputs, True)
-    
-    # Make sure param hook has only be fired once in case of parameter sharing
-    assert ret2 == len(list(model.parameters()))
-
-    for p, p_copy in zip(model.parameters(), model_copy.parameters()):
-        assert allclose(p.grad, p_copy.grad), f"{p.grad} vs {p_copy.grad}"
-
-if __name__ == '__main__':
-    test_base_param_hook()