From c67e789142b1627ff655a9d4214a578c10c04ca5 Mon Sep 17 00:00:00 2001 From: Xu Kai Date: Thu, 14 Sep 2023 16:24:49 +0800 Subject: [PATCH 1/6] change replace linear to shardformer --- colossalai/gptq/cai_gptq/__init__.py | 2 +- colossalai/gptq/cai_gptq/cai_quant_linear.py | 194 ++++++++++++++++-- colossalai/gptq/gptq_tp.py | 180 ---------------- colossalai/gptq/models/__init__.py | 2 - colossalai/gptq/models/bloom.py | 18 -- colossalai/gptq/models/llama.py | 19 -- .../inference/tensor_parallel/engine.py | 6 +- .../tensor_parallel/policies/bloom.py | 42 ++-- .../tensor_parallel/policies/llama.py | 51 +++++ 9 files changed, 259 insertions(+), 255 deletions(-) delete mode 100644 colossalai/gptq/gptq_tp.py delete mode 100644 colossalai/gptq/models/__init__.py delete mode 100644 colossalai/gptq/models/bloom.py delete mode 100644 colossalai/gptq/models/llama.py diff --git a/colossalai/gptq/cai_gptq/__init__.py b/colossalai/gptq/cai_gptq/__init__.py index fcdef7734438..af1f274dd3b0 100644 --- a/colossalai/gptq/cai_gptq/__init__.py +++ b/colossalai/gptq/cai_gptq/__init__.py @@ -9,6 +9,6 @@ HAS_AUTO_GPTQ = False if HAS_AUTO_GPTQ: - from .cai_quant_linear import CaiQuantLinear, make_cai_quant_linear + from .cai_quant_linear import CaiQuantLinear, ColCaiQuantLinear, RowCaiQuantLinear, make_cai_quant_linear from .gptq_op import CaiGPTQLinearOp from .gptq_triton import gptq_fused_linear_triton diff --git a/colossalai/gptq/cai_gptq/cai_quant_linear.py b/colossalai/gptq/cai_gptq/cai_quant_linear.py index 93312716992d..1fe09a925a0e 100644 --- a/colossalai/gptq/cai_gptq/cai_quant_linear.py +++ b/colossalai/gptq/cai_gptq/cai_quant_linear.py @@ -2,11 +2,16 @@ import math import warnings +from typing import Callable, List, Optional, Tuple, Union import numpy as np import torch +import torch.distributed as dist import torch.nn as nn -import triton +from torch.distributed import ProcessGroup + +from colossalai.lazy import LazyInitContext +from colossalai.shardformer.layer import ParallelModule from .gptq_op import CaiGPTQLinearOp @@ -21,14 +26,6 @@ class CaiQuantLinear(nn.Module): - max_dq_buffer_size = 1 - max_inner_outer_dim = 1 - max_input_len = 1 - prepared_buffers = False - device_to_buffers = { - "temp_state": None, - "temp_dq": None, - } def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): super().__init__() @@ -82,12 +79,6 @@ def pack(self, linear, scales, zeros, g_idx=None): if linear.bias is not None: self.bias = linear.bias.clone().half() - # wn = 16 - # pbits = 64 - # ptype = torch.int64 - # unsign_type = np.uint64 - # sign_type = np.int64 - wn = 8 pbits = 32 ptype = torch.int32 @@ -107,9 +98,6 @@ def pack(self, linear, scales, zeros, g_idx=None): i = 0 row = 0 - # print("weight shape ", intweight.shape, qweight.shape, out_qweight.shape, bits) - # print("weight shape ", intweight[0].shape, qweight[0].shape, out_qweight[0].shape) - # print("weight value ", intweight[0], qweight[0]) while row < qweight.shape[0]: if self.bits in [2, 4, 8]: @@ -202,6 +190,176 @@ def forward(self, x): return output.view(outshape) +def split_column_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): + + qweights = gptq_linear.qweight.split(gptq_linear.out_features // split_num, dim=-1) + qzeros = gptq_linear.qzeros.split(gptq_linear.out_features // (32 // cai_linear.bits) // split_num, dim=-1) + scales = gptq_linear.scales.split(gptq_linear.out_features // split_num, dim=-1) + g_idx = gptq_linear.g_idx + if gptq_linear.bias is not None: + bias = 
gptq_linear.bias.split(gptq_linear.out_features // split_num, dim=-1) + + cai_split_out_features = cai_linear.outfeatures // split_num + zero_split_block = cai_linear.outfeatures // (32 // cai_linear.bits) // split_num + + for i in range(split_num): + cai_linear.qweight[:, i * cai_split_out_features:(i + 1) * + cai_split_out_features] = qweights[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * + cai_split_out_features] + cai_linear.qzeros[:, i * zero_split_block:(i + 1) * + zero_split_block] = qzeros[i][:, tp_rank * zero_split_block:(tp_rank + 1) * zero_split_block] + cai_linear.scales[:, i * cai_split_out_features:(i + 1) * + cai_split_out_features] = scales[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * + cai_split_out_features] + if cai_linear.bias is not None: + cai_linear.bias[i * cai_split_out_features:(i + 1) * + cai_split_out_features] = bias[i][tp_rank * cai_split_out_features:(tp_rank + 1) * + cai_split_out_features] + + cai_linear.g_idx.copy_(g_idx) + + +def split_row_copy(gptq_linear, cai_linear, tp_rank=0, split_num=1): + + qweights = gptq_linear.qweight.split(gptq_linear.in_features // split_num, dim=0) + qzeros = gptq_linear.qzeros.split(gptq_linear.in_features // split_num, dim=0) + scales = gptq_linear.scales.split(gptq_linear.in_features // split_num, dim=0) + g_idxs = gptq_linear.g_idx.split(gptq_linear.in_features // split_num, dim=0) + + cai_split_in_features = cai_linear.infeatures // (32 // cai_linear.bits) // split_num + zero_split_block = cai_linear.infeatures // cai_linear.groupsize // split_num + idx_split_features = cai_linear.infeatures // split_num + + for i in range(split_num): + cai_linear.qweight[i * cai_split_in_features:(i + 1) * + cai_split_in_features, :] = qweights[i][tp_rank * cai_split_in_features:(tp_rank + 1) * + cai_split_in_features, :] + cai_linear.qzeros[i * zero_split_block:(i + 1) * + zero_split_block, :] = qzeros[i][tp_rank * zero_split_block:(tp_rank + 1) * + zero_split_block, :] + cai_linear.scales[i * zero_split_block:(i + 1) * + zero_split_block, :] = scales[i][tp_rank * zero_split_block:(tp_rank + 1) * + zero_split_block, :] + cai_linear.g_idx[i * idx_split_features:(i + 1) * + idx_split_features] = g_idxs[i][tp_rank * idx_split_features:(tp_rank + 1) * + idx_split_features] + if cai_linear.bias is not None: + cai_linear.bias.copy_(gptq_linear.bias) + + +class RowCaiQuantLinear(CaiQuantLinear, ParallelModule): + + def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): + + super().__init__(bits, + groupsize, + infeatures, + outfeatures, + bias, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=row_split) + self.process_group = None + + @staticmethod + def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, + **kwargs) -> ParallelModule: + LazyInitContext.materialize(module) + # get the attributes + in_features = module.in_features + out_features = module.out_features + bias = module.bias is not None + device = module.weight.device + + # ensure only one process group is passed + if isinstance(process_group, (list, tuple)): + assert len(process_group) == 1, \ + f'Expected only one process group, got {len(process_group)}.' 
+ process_group = process_group[0] + + tp_size = dist.get_world_size(process_group) + tp_rank = dist.get_rank(process_group) + + if in_features < tp_size: + return module + + if in_features % tp_size != 0: + raise ValueError( + f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!") + linear_1d = RowCaiQuantLinear(module.bits, + module.group_size, + module.in_features // tp_size, + module.out_features, + module.bias is not None, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=True) + linear_1d.process_group = process_group + + split_row_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) + return linear_1d + + def forward(self, x): + output = super().forward(x) + if self.tp_size > 1: + dist.all_reduce(output, op=dist.ReduceOp.SUM, group=self.process_group) + if self.bias is not None: + output.add_(self.bias) + return output + + +class ColCaiQuantLinear(CaiQuantLinear, ParallelModule): + + def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): + + super().__init__(bits, + groupsize, + infeatures, + outfeatures, + bias, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=row_split) + self.process_group = None + + @staticmethod + def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, + **kwargs) -> ParallelModule: + LazyInitContext.materialize(module) + # get the attributes + in_features = module.in_features + out_features = module.out_features + bias = module.bias is not None + device = module.weight.device + + # ensure only one process group is passed + if isinstance(process_group, (list, tuple)): + assert len(process_group) == 1, \ + f'Expected only one process group, got {len(process_group)}.' + process_group = process_group[0] + + tp_size = dist.get_world_size(process_group) + tp_rank = dist.get_rank(process_group) + + if in_features < tp_size: + return module + + if in_features % tp_size != 0: + raise ValueError( + f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!") + linear_1d = ColCaiQuantLinear(module.bits, + module.group_size, + module.in_features, + module.out_features // tp_size, + module.bias is not None, + tp_size=tp_size, + tp_rank=tp_rank) + linear_1d.process_group = process_group + + split_column_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) + return linear_1d + + def make_cai_quant_linear(module, names, bits, groupsize, name=''): if isinstance(module, CaiQuantLinear): return diff --git a/colossalai/gptq/gptq_tp.py b/colossalai/gptq/gptq_tp.py deleted file mode 100644 index cc6d184da458..000000000000 --- a/colossalai/gptq/gptq_tp.py +++ /dev/null @@ -1,180 +0,0 @@ -import warnings - -import torch -import torch.distributed as dist - -HAS_AUTO_GPTQ = False -try: - import auto_gptq - HAS_AUTO_GPTQ = True -except ImportError: - warnings.warn('please install auto-gptq from https://github.com/PanQiWei/AutoGPTQ') - HAS_AUTO_GPTQ = False - -from .cai_gptq import CaiQuantLinear -from .models import GPTQBloomConfig, GPTQLlamaConfig, reset_bloom_attention_params, reset_llama_attention_params - -model_config_map = { - "llama": GPTQLlamaConfig, - "bloom": GPTQBloomConfig, -} -attention_proc_map = { - "llama": reset_llama_attention_params, - "bloom": reset_bloom_attention_params, -} -if HAS_AUTO_GPTQ: - - def get_module_by_name_prefix(model, module_name: str): - for name, module in model.named_modules(): - if name.startswith(module_name): - return module - - def 
split_column_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): - - qweights = gptq_linear.qweight.split(gptq_linear.out_features // split_num, dim=-1) - qzeros = gptq_linear.qzeros.split(gptq_linear.out_features // (32 // cai_linear.bits) // split_num, dim=-1) - scales = gptq_linear.scales.split(gptq_linear.out_features // split_num, dim=-1) - g_idx = gptq_linear.g_idx - if gptq_linear.bias is not None: - bias = gptq_linear.bias.split(gptq_linear.out_features // split_num, dim=-1) - - cai_split_out_features = cai_linear.outfeatures // split_num - zero_split_block = cai_linear.outfeatures // (32 // cai_linear.bits) // split_num - - for i in range(split_num): - cai_linear.qweight[:, i * cai_split_out_features:(i + 1) * - cai_split_out_features] = qweights[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - cai_linear.qzeros[:, i * zero_split_block:(i + 1) * - zero_split_block] = qzeros[i][:, - tp_rank * zero_split_block:(tp_rank + 1) * zero_split_block] - cai_linear.scales[:, i * cai_split_out_features:(i + 1) * - cai_split_out_features] = scales[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - if cai_linear.bias is not None: - cai_linear.bias[i * cai_split_out_features:(i + 1) * - cai_split_out_features] = bias[i][tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - - cai_linear.g_idx.copy_(g_idx) - - def split_row_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): - - qweights = gptq_linear.qweight.split(gptq_linear.in_features // split_num, dim=0) - qzeros = gptq_linear.qzeros.split(gptq_linear.in_features // split_num, dim=0) - scales = gptq_linear.scales.split(gptq_linear.in_features // split_num, dim=0) - g_idxs = gptq_linear.g_idx.split(gptq_linear.in_features // split_num, dim=0) - - cai_split_in_features = cai_linear.infeatures // (32 // cai_linear.bits) // split_num - zero_split_block = cai_linear.infeatures // cai_linear.groupsize // split_num - idx_split_features = cai_linear.infeatures // split_num - - for i in range(split_num): - cai_linear.qweight[i * cai_split_in_features:(i + 1) * - cai_split_in_features, :] = qweights[i][tp_rank * cai_split_in_features:(tp_rank + 1) * - cai_split_in_features, :] - cai_linear.qzeros[i * zero_split_block:(i + 1) * - zero_split_block, :] = qzeros[i][tp_rank * zero_split_block:(tp_rank + 1) * - zero_split_block, :] - cai_linear.scales[i * zero_split_block:(i + 1) * - zero_split_block, :] = scales[i][tp_rank * zero_split_block:(tp_rank + 1) * - zero_split_block, :] - cai_linear.g_idx[i * idx_split_features:(i + 1) * - idx_split_features] = g_idxs[i][tp_rank * idx_split_features:(tp_rank + 1) * - idx_split_features] - if cai_linear.bias is not None: - cai_linear.bias.copy_(gptq_linear.bias) - - def replace_autogptq_linear(model, tp_size=1, tp_rank=0, tp_group=None): - - def all_reduce_hook(cai_linear, input, output): - dist.all_reduce(output, op=dist.ReduceOp.SUM, group=tp_group) - if cai_linear.bias is not None: - output.add_(cai_linear.bias) - - model_type_name = model.config.model_type - - gptq_model_config = model_config_map[model_type_name] - layers = get_module_by_name_prefix(model, gptq_model_config.layer_blocks) - - for layer in layers: - - attention_proc_map[model_type_name](layer, tp_size=tp_size) - for linear_name in gptq_model_config.linear_names[0]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #column split copy - cai_linear = CaiQuantLinear( - gptq_linear.bits, - gptq_linear.group_size, - 
gptq_linear.in_features, - gptq_linear.out_features // tp_size, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - ) - cai_linear.to(gptq_linear.qweight.device) - if len(gptq_model_config.linear_names[0]) == 1: - split_column_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank, split_num=3) - else: - split_column_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank, split_num=1) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) - - for linear_name in gptq_model_config.linear_names[1]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #row split copy - cai_linear = CaiQuantLinear(gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features // tp_size, - gptq_linear.out_features, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=True) - cai_linear.to(gptq_linear.qweight.device) - split_row_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank) - - if tp_size > 1: - cai_linear.register_forward_hook(all_reduce_hook) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) - - for linear_name in gptq_model_config.linear_names[2]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #column split copy - cai_linear = CaiQuantLinear( - gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features, - gptq_linear.out_features // tp_size, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - ) - cai_linear.to(gptq_linear.qweight.device) - split_column_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) - - for linear_name in gptq_model_config.linear_names[3]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #row split copy - cai_linear = CaiQuantLinear(gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features // tp_size, - gptq_linear.out_features, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=True) - cai_linear.to(gptq_linear.qweight.device) - split_row_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank) - - if tp_size > 1: - cai_linear.register_forward_hook(all_reduce_hook) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) diff --git a/colossalai/gptq/models/__init__.py b/colossalai/gptq/models/__init__.py deleted file mode 100644 index ed444b4ed9cb..000000000000 --- a/colossalai/gptq/models/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .bloom import GPTQBloomConfig, reset_bloom_attention_params -from .llama import GPTQLlamaConfig, reset_llama_attention_params diff --git a/colossalai/gptq/models/bloom.py b/colossalai/gptq/models/bloom.py deleted file mode 100644 index b57fa3a5abbe..000000000000 --- a/colossalai/gptq/models/bloom.py +++ /dev/null @@ -1,18 +0,0 @@ -from dataclasses import dataclass, field, fields - - -@dataclass -class GPTQBloomConfig(): - layer_name = "BloomBlock" - layer_blocks = "transformer.h" - linear_names = [["self_attention.query_key_value"], ["self_attention.dense"], ["mlp.dense_h_to_4h"], - ["mlp.dense_4h_to_h"]] - model_names = ["transformer.word_embeddings", "transformer.word_embeddings_layernorm", "transformer.ln_f"] - attention = 
"self_attention" - mlp = "mlp" - - -def reset_bloom_attention_params(layer, tp_size=1): - attention = getattr(layer, "self_attention") - attention.hidden_size = attention.hidden_size // tp_size - attention.num_heads = attention.num_heads // tp_size diff --git a/colossalai/gptq/models/llama.py b/colossalai/gptq/models/llama.py deleted file mode 100644 index 71690ba748a5..000000000000 --- a/colossalai/gptq/models/llama.py +++ /dev/null @@ -1,19 +0,0 @@ -from dataclasses import dataclass, field, fields - - -@dataclass -class GPTQLlamaConfig(): - layer_name = "LlamaDecoderLayer" - layer_blocks = "model.layers" - linear_names = [["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - ["mlp.up_proj", "mlp.gate_proj"], ["mlp.down_proj"]] - model_names = ["model.embed_tokens", "model.norm"] - attention = "self_attn" - mlp = "mlp" - - -def reset_llama_attention_params(layer, tp_size=1): - attention = getattr(layer, "self_attn") - attention.hidden_size = attention.hidden_size // tp_size - attention.num_heads = attention.num_heads // tp_size - attention.num_key_value_heads = attention.num_key_value_heads // tp_size diff --git a/colossalai/inference/tensor_parallel/engine.py b/colossalai/inference/tensor_parallel/engine.py index 94b44136bebc..82a31113e238 100644 --- a/colossalai/inference/tensor_parallel/engine.py +++ b/colossalai/inference/tensor_parallel/engine.py @@ -10,7 +10,6 @@ from transformers.tokenization_utils_base import BatchEncoding from colossalai.gptq.cai_gptq import CaiQuantLinear -from colossalai.gptq.gptq_tp import replace_autogptq_linear from colossalai.shardformer import ShardConfig, ShardFormer from colossalai.shardformer.policies.auto_policy import get_autopolicy @@ -179,12 +178,11 @@ def _shard_model_by(self, shardformer: ShardFormer, model: nn.Module) -> None: model_name = model.__class__.__name__ assert model_name in self.supported_models, f"Unsupported model cls {model_name} for TP inference." 
policy = get_autopolicy(model, inference_only=True) + self.model, _ = shardformer.optimize(model, policy) if self.shard_config.inference_gptq: - tp_rank = dist.get_rank(self.shard_config.tensor_parallel_process_group) - replace_autogptq_linear(model, tp_size=self.tp_size, tp_rank=tp_rank) self._post_init_gptq_buffer(model) - self.model, _ = shardformer.optimize(model, policy) + self.model = self.model.cuda() @property diff --git a/colossalai/inference/tensor_parallel/policies/bloom.py b/colossalai/inference/tensor_parallel/policies/bloom.py index 037b0ab85863..e36744cbde93 100644 --- a/colossalai/inference/tensor_parallel/policies/bloom.py +++ b/colossalai/inference/tensor_parallel/policies/bloom.py @@ -4,6 +4,7 @@ from torch.nn import LayerNorm import colossalai.shardformer.layer as col_nn +from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear from colossalai.shardformer.modeling.bloom import build_bloom_alibi_tensor_fn from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription from colossalai.shardformer.policies.bloom import BloomForCausalLMPolicy @@ -40,19 +41,34 @@ def module_policy(self): if not self.shard_config.inference_gptq: policy = super().module_policy() else: - policy[BloomModel] = ModulePolicyDescription( - attribute_replacement={ - "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, - }, - method_replacement={ - "build_alibi_tensor": build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group) - }, - sub_module_replacement=[ - SubModuleReplacementDescription( - suffix="word_embeddings", - target_module=col_nn.VocabParallelEmbedding1D, - ) - ]) + policy[BloomBlock] = ModulePolicyDescription(attribute_replacement={ + "self_attention.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, + "self_attention.split_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, + "self_attention.num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, + }, + sub_module_replacement=[ + SubModuleReplacementDescription( + suffix="self_attention.query_key_value", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 3}), + SubModuleReplacementDescription( + suffix="self_attention.dense", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}), + SubModuleReplacementDescription( + suffix="self_attention.attention_dropout", + target_module=col_nn.DropoutForParallelInput, + ), + SubModuleReplacementDescription( + suffix="mlp.dense_h_to_4h", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}), + SubModuleReplacementDescription( + suffix="mlp.dense_4h_to_h", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}), + ]) + # NOTE set inference mode to shard config self.shard_config._infer() diff --git a/colossalai/inference/tensor_parallel/policies/llama.py b/colossalai/inference/tensor_parallel/policies/llama.py index 6b6056501ac0..55d1fd66821d 100644 --- a/colossalai/inference/tensor_parallel/policies/llama.py +++ b/colossalai/inference/tensor_parallel/policies/llama.py @@ -3,6 +3,7 @@ import torch from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaModel, LlamaRMSNorm +from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear from colossalai.shardformer.layer import VocabParallelEmbedding1D from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription # import 
colossalai @@ -39,12 +40,62 @@ def module_policy(self): if not self.shard_config.inference_gptq: policy = super().module_policy() else: + decoder_attribute_replacement = { + "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, + "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size, + } + if getattr(self.model.config, "num_key_value_heads", False): + decoder_attribute_replacement["self_attn.num_key_value_heads"] = \ + self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size + + policy[LlamaDecoderLayer] = ModulePolicyDescription( + attribute_replacement=decoder_attribute_replacement, + sub_module_replacement=[ + SubModuleReplacementDescription( + suffix="self_attn.q_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="self_attn.k_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="self_attn.v_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="self_attn.o_proj", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="mlp.gate_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="mlp.up_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="mlp.down_proj", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}, + ) + ], + ) + self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription( suffix="embed_tokens", target_module=VocabParallelEmbedding1D, ), policy=policy, target_key=LlamaModel) + self.shard_config._infer() infer_forward = LlamaInferenceForwards.llama_model_forward From 699af477989445a796f38ca9760f5b185974376b Mon Sep 17 00:00:00 2001 From: Xu Kai Date: Thu, 14 Sep 2023 17:21:24 +0800 Subject: [PATCH 2/6] update bloom policy --- .../inference/tensor_parallel/policies/bloom.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/colossalai/inference/tensor_parallel/policies/bloom.py b/colossalai/inference/tensor_parallel/policies/bloom.py index e36744cbde93..1ec0457b4711 100644 --- a/colossalai/inference/tensor_parallel/policies/bloom.py +++ b/colossalai/inference/tensor_parallel/policies/bloom.py @@ -68,7 +68,19 @@ def module_policy(self): target_module=RowCaiQuantLinear, kwargs={'split_num': 1}), ]) - + policy[BloomModel] = ModulePolicyDescription( + attribute_replacement={ + "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, + }, + method_replacement={ + "build_alibi_tensor": build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group) + }, + sub_module_replacement=[ + SubModuleReplacementDescription( + suffix="word_embeddings", + target_module=col_nn.VocabParallelEmbedding1D, + ) + ]) # NOTE set inference mode to shard config self.shard_config._infer() From 6d1d579218cc19f06a65486e0eefc5e693a92ca7 Mon Sep 17 00:00:00 2001 From: Xu Kai Date: Fri, 15 Sep 2023 09:28:20 +0800 Subject: [PATCH 3/6] delete useless code --- colossalai/gptq/__init__.py | 2 +- colossalai/gptq/cai_gptq/__init__.py | 2 +- colossalai/gptq/cai_gptq/cai_quant_linear.py | 14 -------------- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/colossalai/gptq/__init__.py b/colossalai/gptq/__init__.py 
index 59b87d6ca692..09246ded2902 100644 --- a/colossalai/gptq/__init__.py +++ b/colossalai/gptq/__init__.py @@ -1,4 +1,4 @@ from .cai_gptq import HAS_AUTO_GPTQ if HAS_AUTO_GPTQ: - from .cai_gptq import CaiGPTQLinearOp, CaiQuantLinear, gptq_fused_linear_triton, make_cai_quant_linear + from .cai_gptq import CaiGPTQLinearOp, CaiQuantLinear, gptq_fused_linear_triton diff --git a/colossalai/gptq/cai_gptq/__init__.py b/colossalai/gptq/cai_gptq/__init__.py index af1f274dd3b0..ce2da621275f 100644 --- a/colossalai/gptq/cai_gptq/__init__.py +++ b/colossalai/gptq/cai_gptq/__init__.py @@ -9,6 +9,6 @@ HAS_AUTO_GPTQ = False if HAS_AUTO_GPTQ: - from .cai_quant_linear import CaiQuantLinear, ColCaiQuantLinear, RowCaiQuantLinear, make_cai_quant_linear + from .cai_quant_linear import CaiQuantLinear, ColCaiQuantLinear, RowCaiQuantLinear from .gptq_op import CaiGPTQLinearOp from .gptq_triton import gptq_fused_linear_triton diff --git a/colossalai/gptq/cai_gptq/cai_quant_linear.py b/colossalai/gptq/cai_gptq/cai_quant_linear.py index 1fe09a925a0e..8872d4e2dd65 100644 --- a/colossalai/gptq/cai_gptq/cai_quant_linear.py +++ b/colossalai/gptq/cai_gptq/cai_quant_linear.py @@ -358,17 +358,3 @@ def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, Lis split_column_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) return linear_1d - - -def make_cai_quant_linear(module, names, bits, groupsize, name=''): - if isinstance(module, CaiQuantLinear): - return - for attr in dir(module): - tmp = getattr(module, attr) - name1 = name + '.' + attr if name != '' else attr - if name1 in names: - delattr(module, attr) - setattr(module, attr, - CaiQuantLinear(bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None)) - for name1, child in module.named_children(): - make_cai_quant_linear(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1) From 2816c8a6947b1ffc03fecd367348107b89d27ffe Mon Sep 17 00:00:00 2001 From: Xu Kai Date: Fri, 15 Sep 2023 10:34:59 +0800 Subject: [PATCH 4/6] fix import bug and delete uselss code --- colossalai/gptq/cai_gptq/cai_quant_linear.py | 8 +------ .../inference/tensor_parallel/engine.py | 19 ++++++++--------- .../tensor_parallel/policies/bloom.py | 21 +++---------------- .../tensor_parallel/policies/llama.py | 20 ++++-------------- 4 files changed, 17 insertions(+), 51 deletions(-) diff --git a/colossalai/gptq/cai_gptq/cai_quant_linear.py b/colossalai/gptq/cai_gptq/cai_quant_linear.py index 8872d4e2dd65..ca12c34ed958 100644 --- a/colossalai/gptq/cai_gptq/cai_quant_linear.py +++ b/colossalai/gptq/cai_gptq/cai_quant_linear.py @@ -2,7 +2,7 @@ import math import warnings -from typing import Callable, List, Optional, Tuple, Union +from typing import List, Union import numpy as np import torch @@ -267,9 +267,6 @@ def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, Lis LazyInitContext.materialize(module) # get the attributes in_features = module.in_features - out_features = module.out_features - bias = module.bias is not None - device = module.weight.device # ensure only one process group is passed if isinstance(process_group, (list, tuple)): @@ -328,9 +325,6 @@ def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, Lis LazyInitContext.materialize(module) # get the attributes in_features = module.in_features - out_features = module.out_features - bias = module.bias is not None - device = module.weight.device # ensure only one process group is passed if isinstance(process_group, (list, tuple)): diff --git a/colossalai/inference/tensor_parallel/engine.py b/colossalai/inference/tensor_parallel/engine.py index 82a31113e238..1b34bc7f2be4 100644 --- a/colossalai/inference/tensor_parallel/engine.py +++ b/colossalai/inference/tensor_parallel/engine.py @@ -9,19 +9,9 @@ from transformers.generation.stopping_criteria import StoppingCriteriaList from transformers.tokenization_utils_base import BatchEncoding -from colossalai.gptq.cai_gptq import CaiQuantLinear from colossalai.shardformer import ShardConfig, ShardFormer from colossalai.shardformer.policies.auto_policy import get_autopolicy -HAS_GPTQ_CUDA = False -try: - from colossalai.kernel.op_builder.gptq import GPTQBuilder - gptq_cuda = GPTQBuilder().load() - HAS_GPTQ_CUDA = True -except ImportError: - warnings.warn('CUDA gptq is not installed') - HAS_GPTQ_CUDA = False - from .batch_infer_state import BatchInferState from .kvcache_manager import MemoryManager @@ -98,6 +88,15 @@ def _init_manager(self) -> None: self.layer_num) def _post_init_gptq_buffer(self, model: nn.Module) -> None: + from colossalai.gptq.cai_gptq import CaiQuantLinear + HAS_GPTQ_CUDA = False + try: + from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() + HAS_GPTQ_CUDA = True + except ImportError: + warnings.warn('CUDA gptq is not installed') + HAS_GPTQ_CUDA = False for name, submodule in model.named_modules(): if isinstance(submodule, CaiQuantLinear): diff --git a/colossalai/inference/tensor_parallel/policies/bloom.py b/colossalai/inference/tensor_parallel/policies/bloom.py index 1ec0457b4711..cca28e202125 100644 --- a/colossalai/inference/tensor_parallel/policies/bloom.py +++ b/colossalai/inference/tensor_parallel/policies/bloom.py @@ -4,7 +4,6 @@ from torch.nn import LayerNorm import colossalai.shardformer.layer as col_nn -from colossalai.gptq.cai_gptq import 
ColCaiQuantLinear, RowCaiQuantLinear from colossalai.shardformer.modeling.bloom import build_bloom_alibi_tensor_fn from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription from colossalai.shardformer.policies.bloom import BloomForCausalLMPolicy @@ -37,10 +36,9 @@ def __init__(self) -> None: def module_policy(self): from transformers.models.bloom.modeling_bloom import BloomAttention, BloomBlock, BloomForCausalLM, BloomModel - policy = {} - if not self.shard_config.inference_gptq: - policy = super().module_policy() - else: + policy = super().module_policy() + if self.shard_config.inference_gptq: + from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear policy[BloomBlock] = ModulePolicyDescription(attribute_replacement={ "self_attention.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, "self_attention.split_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, @@ -68,19 +66,6 @@ def module_policy(self): target_module=RowCaiQuantLinear, kwargs={'split_num': 1}), ]) - policy[BloomModel] = ModulePolicyDescription( - attribute_replacement={ - "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, - }, - method_replacement={ - "build_alibi_tensor": build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group) - }, - sub_module_replacement=[ - SubModuleReplacementDescription( - suffix="word_embeddings", - target_module=col_nn.VocabParallelEmbedding1D, - ) - ]) # NOTE set inference mode to shard config self.shard_config._infer() diff --git a/colossalai/inference/tensor_parallel/policies/llama.py b/colossalai/inference/tensor_parallel/policies/llama.py index 55d1fd66821d..3832ecc8c3d4 100644 --- a/colossalai/inference/tensor_parallel/policies/llama.py +++ b/colossalai/inference/tensor_parallel/policies/llama.py @@ -3,7 +3,6 @@ import torch from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaModel, LlamaRMSNorm -from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear from colossalai.shardformer.layer import VocabParallelEmbedding1D from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription # import colossalai @@ -36,18 +35,14 @@ def __init__(self) -> None: super().__init__() def module_policy(self): - policy = {} - if not self.shard_config.inference_gptq: - policy = super().module_policy() - else: + policy = super().module_policy() + + if self.shard_config.inference_gptq: + from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear decoder_attribute_replacement = { "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size, } - if getattr(self.model.config, "num_key_value_heads", False): - decoder_attribute_replacement["self_attn.num_key_value_heads"] = \ - self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size - policy[LlamaDecoderLayer] = ModulePolicyDescription( attribute_replacement=decoder_attribute_replacement, sub_module_replacement=[ @@ -89,13 +84,6 @@ def module_policy(self): ], ) - self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription( - suffix="embed_tokens", - target_module=VocabParallelEmbedding1D, - ), - policy=policy, - target_key=LlamaModel) - self.shard_config._infer() 
infer_forward = LlamaInferenceForwards.llama_model_forward From 63fb7f1271d2d95e15f50136e88ff76b13ef4393 Mon Sep 17 00:00:00 2001 From: Xu Kai Date: Fri, 15 Sep 2023 10:49:19 +0800 Subject: [PATCH 5/6] change colossalai/gptq to colossalai/quant/gptq --- colossalai/{ => inference/quant}/gptq/__init__.py | 0 colossalai/{ => inference/quant}/gptq/cai_gptq/__init__.py | 0 .../{ => inference/quant}/gptq/cai_gptq/cai_quant_linear.py | 0 colossalai/{ => inference/quant}/gptq/cai_gptq/gptq_op.py | 0 colossalai/{ => inference/quant}/gptq/cai_gptq/gptq_triton.py | 0 colossalai/inference/tensor_parallel/engine.py | 2 +- colossalai/inference/tensor_parallel/policies/bloom.py | 2 +- colossalai/inference/tensor_parallel/policies/llama.py | 3 ++- 8 files changed, 4 insertions(+), 3 deletions(-) rename colossalai/{ => inference/quant}/gptq/__init__.py (100%) rename colossalai/{ => inference/quant}/gptq/cai_gptq/__init__.py (100%) rename colossalai/{ => inference/quant}/gptq/cai_gptq/cai_quant_linear.py (100%) rename colossalai/{ => inference/quant}/gptq/cai_gptq/gptq_op.py (100%) rename colossalai/{ => inference/quant}/gptq/cai_gptq/gptq_triton.py (100%) diff --git a/colossalai/gptq/__init__.py b/colossalai/inference/quant/gptq/__init__.py similarity index 100% rename from colossalai/gptq/__init__.py rename to colossalai/inference/quant/gptq/__init__.py diff --git a/colossalai/gptq/cai_gptq/__init__.py b/colossalai/inference/quant/gptq/cai_gptq/__init__.py similarity index 100% rename from colossalai/gptq/cai_gptq/__init__.py rename to colossalai/inference/quant/gptq/cai_gptq/__init__.py diff --git a/colossalai/gptq/cai_gptq/cai_quant_linear.py b/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py similarity index 100% rename from colossalai/gptq/cai_gptq/cai_quant_linear.py rename to colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py diff --git a/colossalai/gptq/cai_gptq/gptq_op.py b/colossalai/inference/quant/gptq/cai_gptq/gptq_op.py similarity index 100% rename from colossalai/gptq/cai_gptq/gptq_op.py rename to colossalai/inference/quant/gptq/cai_gptq/gptq_op.py diff --git a/colossalai/gptq/cai_gptq/gptq_triton.py b/colossalai/inference/quant/gptq/cai_gptq/gptq_triton.py similarity index 100% rename from colossalai/gptq/cai_gptq/gptq_triton.py rename to colossalai/inference/quant/gptq/cai_gptq/gptq_triton.py diff --git a/colossalai/inference/tensor_parallel/engine.py b/colossalai/inference/tensor_parallel/engine.py index 1b34bc7f2be4..9e379892b182 100644 --- a/colossalai/inference/tensor_parallel/engine.py +++ b/colossalai/inference/tensor_parallel/engine.py @@ -88,7 +88,7 @@ def _init_manager(self) -> None: self.layer_num) def _post_init_gptq_buffer(self, model: nn.Module) -> None: - from colossalai.gptq.cai_gptq import CaiQuantLinear + from colossalai.inference.quant.gptq.cai_gptq import CaiQuantLinear HAS_GPTQ_CUDA = False try: from colossalai.kernel.op_builder.gptq import GPTQBuilder diff --git a/colossalai/inference/tensor_parallel/policies/bloom.py b/colossalai/inference/tensor_parallel/policies/bloom.py index cca28e202125..bceb14c6af7b 100644 --- a/colossalai/inference/tensor_parallel/policies/bloom.py +++ b/colossalai/inference/tensor_parallel/policies/bloom.py @@ -38,7 +38,7 @@ def module_policy(self): from transformers.models.bloom.modeling_bloom import BloomAttention, BloomBlock, BloomForCausalLM, BloomModel policy = super().module_policy() if self.shard_config.inference_gptq: - from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear + from 
colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear policy[BloomBlock] = ModulePolicyDescription(attribute_replacement={ "self_attention.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, "self_attention.split_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, diff --git a/colossalai/inference/tensor_parallel/policies/llama.py b/colossalai/inference/tensor_parallel/policies/llama.py index 3832ecc8c3d4..f281e2918b37 100644 --- a/colossalai/inference/tensor_parallel/policies/llama.py +++ b/colossalai/inference/tensor_parallel/policies/llama.py @@ -38,7 +38,8 @@ def module_policy(self): policy = super().module_policy() if self.shard_config.inference_gptq: - from colossalai.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear + from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear + decoder_attribute_replacement = { "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size, From 611311f9e359b3a21e9e67e4286fd1ec88cfde13 Mon Sep 17 00:00:00 2001 From: Xu Kai Date: Fri, 15 Sep 2023 10:55:25 +0800 Subject: [PATCH 6/6] update import linear for tests --- tests/test_gptq/test_gptq_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gptq/test_gptq_linear.py b/tests/test_gptq/test_gptq_linear.py index 718060c22908..fca5e1a0ebc3 100644 --- a/tests/test_gptq/test_gptq_linear.py +++ b/tests/test_gptq/test_gptq_linear.py @@ -21,7 +21,7 @@ from auto_gptq.utils.import_utils import dynamically_import_QuantLinear from exllama_kernels import prepare_buffers, set_tuning_params - from colossalai.gptq import CaiQuantLinear + from colossalai.inference.quant.gptq import CaiQuantLinear HAS_AUTO_GPTQ = True except: HAS_AUTO_GPTQ = False
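

Usage note (appended for context, not part of the patch series): after these changes, GPTQ tensor-parallel inference no longer goes through the removed replace_autogptq_linear helper; the Bloom/Llama inference policies swap in ColCaiQuantLinear / RowCaiQuantLinear while shardformer optimizes the model. The sketch below shows how that path might be driven end to end. The TPInferEngine constructor arguments, the ShardConfig flags, the checkpoint path, and the generation settings are assumptions for illustration based on the surrounding code, not taken verbatim from this series.

# Minimal sketch of the shardformer-based GPTQ TP path (assumed API, see note above).
import torch
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig

# Initialize the distributed environment (one process per TP rank).
colossalai.launch_from_torch(config={})

model_path = "/path/to/llama-7b-gptq-4bit"  # hypothetical quantized checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_path, device=torch.cuda.current_device(), inject_fused_attention=False
)

# inference_gptq=True is the flag the modified policies check before replacing
# attention/MLP linears with ColCaiQuantLinear / RowCaiQuantLinear during sharding.
shard_config = ShardConfig(enable_tensor_parallelism=True, inference_gptq=True)
engine = TPInferEngine(model, shard_config, 4, 128, 64)  # max batch size, max input len, max output len

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = engine.generate(inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))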