From b553453c6476ac151e481646cebe5485b7b84da3 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 23 Dec 2024 15:01:06 +0800
Subject: [PATCH 1/3] [fix] hotfix normalization

---
 colossalai/shardformer/layer/normalization.py | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index 1c2b44fc8908..2edca3b3e1c4 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -76,18 +76,24 @@ def forward(self, input):
 
     FusedRMSNormWithHook = NPUFusedRMSNormWithHook
 else:
-    from apex.normalization import FusedRMSNorm as ApexFusedRMSNorm
-
-    class CUDAFusedRMSNormWithHook(ApexFusedRMSNorm):
-        def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
-            super().__init__(normalized_shape, eps, elementwise_affine)
-
-        def forward(self, input):
-            output = super().forward(input)
-            output = hook_parameter_in_backward(output, self.weight)
-            return output
+    try:
+        from apex.normalization import FusedRMSNorm as ApexFusedRMSNorm
+
+        class CUDAFusedRMSNormWithHook(ApexFusedRMSNorm):
+            def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
+                super().__init__(normalized_shape, eps, elementwise_affine)
+
+            def forward(self, input):
+                output = super().forward(input)
+                output = hook_parameter_in_backward(output, self.weight)
+                return output
+
+        FusedRMSNormWithHook = CUDAFusedRMSNormWithHook
+    except ImportError:
+        warnings.warn(
+            "Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMSNorm kernel"
+        )
 
-    FusedRMSNormWithHook = CUDAFusedRMSNormWithHook
 
 FAST_LAYERNORM_SUPPORTED_SIZE = [
     1024,

From a0914ab8a7f5ee53cf6eb0467387b556bcf7545a Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 23 Dec 2024 15:20:40 +0800
Subject: [PATCH 2/3] [hotfix] force doc ci test

---
 docs/source/en/features/distributed_optimizers.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/features/distributed_optimizers.md b/docs/source/en/features/distributed_optimizers.md
index 279bc8f9d58e..45ca7e700901 100644
--- a/docs/source/en/features/distributed_optimizers.md
+++ b/docs/source/en/features/distributed_optimizers.md
@@ -20,12 +20,12 @@ We now demonstrate how to use Distributed Adafactor with booster API combining T
 ### step 1. Import libraries
 
 ```python
+import torch
+import colossalai
+from colossalai.booster import Booster
 from transformers import LlamaModel, LlamaConfig
 from colossalai.nn.optimizer.distributed_adafactor import DistributedAdaFactor
-from colossalai.booster import Booster
 from colossalai.booster.plugin import HybridParallelPlugin
-import colossalai
-import torch
 ```
 
 ### step 2. Initialize Distributed Environment and Parallism Group

From 3c5ab92b64060073c5b6729f4f425fa18d9a210a Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 23 Dec 2024 15:48:09 +0800
Subject: [PATCH 3/3] [hotfix] fallback doc

---
 docs/source/en/features/distributed_optimizers.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/features/distributed_optimizers.md b/docs/source/en/features/distributed_optimizers.md
index 45ca7e700901..279bc8f9d58e 100644
--- a/docs/source/en/features/distributed_optimizers.md
+++ b/docs/source/en/features/distributed_optimizers.md
@@ -20,12 +20,12 @@ We now demonstrate how to use Distributed Adafactor with booster API combining T
 ### step 1. Import libraries
 
 ```python
-import torch
-import colossalai
-from colossalai.booster import Booster
 from transformers import LlamaModel, LlamaConfig
 from colossalai.nn.optimizer.distributed_adafactor import DistributedAdaFactor
+from colossalai.booster import Booster
 from colossalai.booster.plugin import HybridParallelPlugin
+import colossalai
+import torch
 ```
 
 ### step 2. Initialize Distributed Environment and Parallism Group
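Note (reviewer sketch, not part of the patches): PATCH 1/3 wraps the apex import in a try/except so that machines without apex fall back with a warning instead of failing at import time. The snippet below is a minimal, self-contained illustration of that pattern; the names mirror the patch, but `hook_parameter_in_backward` is only referenced in a comment since it lives in ColossalAI's own `_operation` module, so this is not the actual file content.

```python
# Minimal illustration of the guarded-import fallback added in PATCH 1/3.
import warnings

FusedRMSNormWithHook = None  # remains None when apex is unavailable


try:
    from apex.normalization import FusedRMSNorm as ApexFusedRMSNorm

    class CUDAFusedRMSNormWithHook(ApexFusedRMSNorm):
        def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True):
            super().__init__(normalized_shape, eps, elementwise_affine)

        def forward(self, input):
            output = super().forward(input)
            # The real module calls hook_parameter_in_backward(output, self.weight)
            # here so the weight is tracked during the backward pass.
            return output

    FusedRMSNormWithHook = CUDAFusedRMSNormWithHook
except ImportError:
    warnings.warn(
        "Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMSNorm kernel"
    )
```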