diff --git a/colossalai/legacy/zero/gemini/__init__.py b/colossalai/legacy/zero/gemini/__init__.py
index b272980d34d8..f30bccea4a95 100644
--- a/colossalai/legacy/zero/gemini/__init__.py
+++ b/colossalai/legacy/zero/gemini/__init__.py
@@ -1,3 +1,4 @@
+from .colo_init_context import ColoInitContext, post_process_colo_init_ctx
 from .ophooks import BaseOpHook, register_ophooks_recursively
 from .stateful_tensor import StatefulTensor
 from .stateful_tensor_mgr import StatefulTensorMgr
@@ -11,4 +12,6 @@
     "AutoTensorPlacementPolicy",
     "register_ophooks_recursively",
     "BaseOpHook",
+    "ColoInitContext",
+    "post_process_colo_init_ctx",
 ]
diff --git a/colossalai/zero/gemini/colo_init_context.py b/colossalai/legacy/zero/gemini/colo_init_context.py
similarity index 100%
rename from colossalai/zero/gemini/colo_init_context.py
rename to colossalai/legacy/zero/gemini/colo_init_context.py
diff --git a/colossalai/shardformer/policies/gptj.py b/colossalai/shardformer/policies/gptj.py
index fe52b00fbebe..9feb826c4624 100644
--- a/colossalai/shardformer/policies/gptj.py
+++ b/colossalai/shardformer/policies/gptj.py
@@ -163,7 +163,6 @@ def get_held_layers(self) -> List[nn.Module]:
         layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages)
         if stage_manager.is_first_stage():
             held_layers.append(module.wte)
-            # held_layers.append(module.wpe)
             held_layers.append(module.drop)
         start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
         held_layers.extend(module.h[start_idx:end_idx])
diff --git a/colossalai/zero/__init__.py b/colossalai/zero/__init__.py
index 90d0f8de1916..5ad59e83234b 100644
--- a/colossalai/zero/__init__.py
+++ b/colossalai/zero/__init__.py
@@ -1,11 +1,4 @@
-from .gemini import (
-    ColoInitContext,
-    GeminiAdamOptimizer,
-    GeminiDDP,
-    GeminiOptimizer,
-    get_static_torch_model,
-    post_process_colo_init_ctx,
-)
+from .gemini import GeminiAdamOptimizer, GeminiDDP, GeminiOptimizer, get_static_torch_model
 from .low_level import LowLevelZeroOptimizer
 from .wrapper import zero_model_wrapper, zero_optim_wrapper
 
@@ -16,7 +9,5 @@
     "zero_model_wrapper",
     "zero_optim_wrapper",
     "LowLevelZeroOptimizer",
-    "ColoInitContext",
-    "post_process_colo_init_ctx",
     "get_static_torch_model",
 ]
diff --git a/colossalai/zero/gemini/__init__.py b/colossalai/zero/gemini/__init__.py
index 358d5c7fd289..6d93ca8edfda 100644
--- a/colossalai/zero/gemini/__init__.py
+++ b/colossalai/zero/gemini/__init__.py
@@ -1,5 +1,4 @@
 from .chunk import ChunkManager, TensorInfo, TensorState, search_chunk_configuration
-from .colo_init_context import ColoInitContext, post_process_colo_init_ctx
 from .gemini_ddp import GeminiDDP
 from .gemini_mgr import GeminiManager
 from .gemini_optimizer import GeminiAdamOptimizer, GeminiOptimizer
@@ -15,6 +14,4 @@
     "get_static_torch_model",
     "GeminiAdamOptimizer",
     "GeminiOptimizer",
-    "ColoInitContext",
-    "post_process_colo_init_ctx",
 ]