diff --git a/colossalai/gptq/cai_gptq/cai_quant_linear.py b/colossalai/gptq/cai_gptq/cai_quant_linear.py deleted file mode 100644 index 93312716992d..000000000000 --- a/colossalai/gptq/cai_gptq/cai_quant_linear.py +++ /dev/null @@ -1,216 +0,0 @@ -# Adapted from AutoGPTQ auto_gptq: https://github.com/PanQiWei/AutoGPTQ - -import math -import warnings - -import numpy as np -import torch -import torch.nn as nn -import triton - -from .gptq_op import CaiGPTQLinearOp - -HAS_GPTQ_CUDA = False -try: - from colossalai.kernel.op_builder.gptq import GPTQBuilder - gptq_cuda = GPTQBuilder().load() - HAS_GPTQ_CUDA = True -except ImportError: - warnings.warn('CUDA gptq is not installed') - HAS_GPTQ_CUDA = False - - -class CaiQuantLinear(nn.Module): - max_dq_buffer_size = 1 - max_inner_outer_dim = 1 - max_input_len = 1 - prepared_buffers = False - device_to_buffers = { - "temp_state": None, - "temp_dq": None, - } - - def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): - super().__init__() - if bits not in [2, 4, 8]: - raise NotImplementedError("Only 2,4,8 bits are supported.") - self.infeatures = infeatures - self.outfeatures = outfeatures - self.bits = bits - self.maxq = 2**self.bits - 1 - self.groupsize = groupsize if groupsize != -1 else infeatures - - self.register_buffer('qweight', torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32)) - self.register_buffer( - 'qzeros', - torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures // 32 * self.bits), dtype=torch.int32)) - self.register_buffer('scales', - torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures), dtype=torch.float16)) - if row_split: - self.register_buffer( - 'g_idx', - torch.tensor([(i + (tp_rank * self.infeatures)) // self.groupsize for i in range(infeatures)], - dtype=torch.int32)) - else: - self.register_buffer('g_idx', - torch.tensor([i // self.groupsize for i in range(infeatures)], dtype=torch.int32)) - - if bias: - self.register_buffer('bias', torch.zeros((outfeatures), dtype=torch.float16)) - else: - self.bias = None - - self.gptq_linear = CaiGPTQLinearOp(groupsize, bits) - - self.q4 = None - self.empty_tensor = torch.empty((1, 1), device="meta") - self.tp_size = tp_size - self.tp_rank = tp_rank - self.row_split = row_split - - def pack(self, linear, scales, zeros, g_idx=None): - - g_idx = g_idx.clone() if g_idx is not None else torch.tensor( - [i // self.groupsize for i in range(self.infeatures)], dtype=torch.int32) - - scales = scales.t().contiguous() - zeros = zeros.t().contiguous() - scale_zeros = zeros * scales - half_scales = scales.clone().half() - # print("scale shape ", scales.shape, scale_zeros.shape, linear.weight.shape) - self.scales = scales.clone().half() - if linear.bias is not None: - self.bias = linear.bias.clone().half() - - # wn = 16 - # pbits = 64 - # ptype = torch.int64 - # unsign_type = np.uint64 - # sign_type = np.int64 - - wn = 8 - pbits = 32 - ptype = torch.int32 - unsign_type = np.uint32 - sign_type = np.int32 - - intweight = [] - for idx in range(self.infeatures): - intweight.append( - torch.round( - (linear.weight.data[:, idx] + scale_zeros[g_idx[idx]]) / half_scales[g_idx[idx]]).to(ptype)[:, - None]) - intweight = torch.cat(intweight, dim=1) - intweight = intweight.t().contiguous() - intweight = intweight.numpy().astype(unsign_type) - qweight = np.zeros((intweight.shape[0] // pbits * self.bits, intweight.shape[1]), dtype=unsign_type) - - i = 0 - row = 0 - # print("weight shape ", intweight.shape, 
qweight.shape, out_qweight.shape, bits) - # print("weight shape ", intweight[0].shape, qweight[0].shape, out_qweight[0].shape) - # print("weight value ", intweight[0], qweight[0]) - - while row < qweight.shape[0]: - if self.bits in [2, 4, 8]: - for j in range(i, i + (pbits // self.bits)): - qweight[row] |= intweight[j] << (self.bits * (j - i)) - i += pbits // self.bits - row += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - qweight = qweight.astype(sign_type) - qweight1 = torch.from_numpy(qweight) - qweight1 = qweight1.contiguous() #.to("cuda") - self.qweight.data.copy_(qweight1) - - qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // pbits * self.bits), dtype=unsign_type) - zeros -= 1 - zeros = zeros.numpy().astype(unsign_type) - i = 0 - col = 0 - while col < qzeros.shape[1]: - if self.bits in [2, 4, 8]: - for j in range(i, i + (pbits // self.bits)): - qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) - i += pbits // self.bits - col += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - qzeros = qzeros.astype(sign_type) - qzeros = torch.from_numpy(qzeros) - qzeros = qzeros - self.qzeros.data.copy_(qzeros) - - if torch.equal(self.g_idx.to(g_idx.device), g_idx): - self.g_idx = None - else: - self.g_idx = g_idx - - def init_q4(self): - assert self.qweight.device.type == "cuda" - self.q4_width = self.qweight.shape[1] - if self.g_idx is not None: - if self.row_split and torch.equal( - self.g_idx, - torch.tensor( - [(i + (self.tp_rank * self.infeatures)) // self.groupsize for i in range(self.infeatures)], - dtype=torch.int32, - device=self.g_idx.device)): - self.g_idx = None - elif torch.equal( - self.g_idx, - torch.tensor([i // self.groupsize for i in range(self.infeatures)], - dtype=torch.int32, - device=self.g_idx.device)): - self.g_idx = None - - if self.g_idx is not None: - g_idx = self.g_idx.to("cpu") - else: - g_idx = self.empty_tensor - - self.q4 = gptq_cuda.make_q4(self.qweight, self.qzeros, self.scales, g_idx, torch.cuda.current_device()) - torch.cuda.synchronize() - - def forward(self, x): - outshape = x.shape[:-1] + (self.outfeatures,) - - if HAS_GPTQ_CUDA and self.bits == 4: - - if self.q4 is None: - self.init_q4() - - x = x.view(-1, x.shape[-1]) - output = torch.empty((x.shape[0], self.outfeatures), dtype=torch.float16, device=x.device) - gptq_cuda.q4_matmul(x.half(), self.q4, output) - if self.bias is not None and (not self.row_split or self.tp_size == 1): - output.add_(self.bias) - else: - if self.bias is not None and (not self.row_split or self.tp_size == 1): - bias = self.bias - else: - bias = None - output = self.gptq_linear( - x, - self.qweight, - self.scales, - self.qzeros, - g_idx=self.g_idx, - bias=bias, - ) - return output.view(outshape) - - -def make_cai_quant_linear(module, names, bits, groupsize, name=''): - if isinstance(module, CaiQuantLinear): - return - for attr in dir(module): - tmp = getattr(module, attr) - name1 = name + '.' + attr if name != '' else attr - if name1 in names: - delattr(module, attr) - setattr(module, attr, - CaiQuantLinear(bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None)) - for name1, child in module.named_children(): - make_cai_quant_linear(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1) diff --git a/colossalai/gptq/gptq_tp.py b/colossalai/gptq/gptq_tp.py deleted file mode 100644 index cc6d184da458..000000000000 --- a/colossalai/gptq/gptq_tp.py +++ /dev/null @@ -1,180 +0,0 @@ -import warnings - -import torch -import torch.distributed as dist - -HAS_AUTO_GPTQ = False -try: - import auto_gptq - HAS_AUTO_GPTQ = True -except ImportError: - warnings.warn('please install auto-gptq from https://github.com/PanQiWei/AutoGPTQ') - HAS_AUTO_GPTQ = False - -from .cai_gptq import CaiQuantLinear -from .models import GPTQBloomConfig, GPTQLlamaConfig, reset_bloom_attention_params, reset_llama_attention_params - -model_config_map = { - "llama": GPTQLlamaConfig, - "bloom": GPTQBloomConfig, -} -attention_proc_map = { - "llama": reset_llama_attention_params, - "bloom": reset_bloom_attention_params, -} -if HAS_AUTO_GPTQ: - - def get_module_by_name_prefix(model, module_name: str): - for name, module in model.named_modules(): - if name.startswith(module_name): - return module - - def split_column_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): - - qweights = gptq_linear.qweight.split(gptq_linear.out_features // split_num, dim=-1) - qzeros = gptq_linear.qzeros.split(gptq_linear.out_features // (32 // cai_linear.bits) // split_num, dim=-1) - scales = gptq_linear.scales.split(gptq_linear.out_features // split_num, dim=-1) - g_idx = gptq_linear.g_idx - if gptq_linear.bias is not None: - bias = gptq_linear.bias.split(gptq_linear.out_features // split_num, dim=-1) - - cai_split_out_features = cai_linear.outfeatures // split_num - zero_split_block = cai_linear.outfeatures // (32 // cai_linear.bits) // split_num - - for i in range(split_num): - cai_linear.qweight[:, i * cai_split_out_features:(i + 1) * - cai_split_out_features] = qweights[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - cai_linear.qzeros[:, i * zero_split_block:(i + 1) * - zero_split_block] = qzeros[i][:, - tp_rank * zero_split_block:(tp_rank + 1) * zero_split_block] - cai_linear.scales[:, i * cai_split_out_features:(i + 1) * - cai_split_out_features] = scales[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - if cai_linear.bias is not None: - cai_linear.bias[i * cai_split_out_features:(i + 1) * - cai_split_out_features] = bias[i][tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - - cai_linear.g_idx.copy_(g_idx) - - def split_row_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): - - qweights = gptq_linear.qweight.split(gptq_linear.in_features // split_num, dim=0) - qzeros = gptq_linear.qzeros.split(gptq_linear.in_features // split_num, dim=0) - scales = gptq_linear.scales.split(gptq_linear.in_features // split_num, dim=0) - g_idxs = gptq_linear.g_idx.split(gptq_linear.in_features // split_num, dim=0) - - cai_split_in_features = cai_linear.infeatures // (32 // cai_linear.bits) // split_num - zero_split_block = cai_linear.infeatures // cai_linear.groupsize // split_num - idx_split_features = cai_linear.infeatures // split_num - - for i in range(split_num): - cai_linear.qweight[i * cai_split_in_features:(i + 1) * - cai_split_in_features, :] = qweights[i][tp_rank * cai_split_in_features:(tp_rank + 1) * - cai_split_in_features, :] - cai_linear.qzeros[i * zero_split_block:(i + 1) * - zero_split_block, :] = qzeros[i][tp_rank * zero_split_block:(tp_rank + 1) * - zero_split_block, :] - cai_linear.scales[i * zero_split_block:(i + 1) * - zero_split_block, :] = scales[i][tp_rank * 
zero_split_block:(tp_rank + 1) * - zero_split_block, :] - cai_linear.g_idx[i * idx_split_features:(i + 1) * - idx_split_features] = g_idxs[i][tp_rank * idx_split_features:(tp_rank + 1) * - idx_split_features] - if cai_linear.bias is not None: - cai_linear.bias.copy_(gptq_linear.bias) - - def replace_autogptq_linear(model, tp_size=1, tp_rank=0, tp_group=None): - - def all_reduce_hook(cai_linear, input, output): - dist.all_reduce(output, op=dist.ReduceOp.SUM, group=tp_group) - if cai_linear.bias is not None: - output.add_(cai_linear.bias) - - model_type_name = model.config.model_type - - gptq_model_config = model_config_map[model_type_name] - layers = get_module_by_name_prefix(model, gptq_model_config.layer_blocks) - - for layer in layers: - - attention_proc_map[model_type_name](layer, tp_size=tp_size) - for linear_name in gptq_model_config.linear_names[0]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #column split copy - cai_linear = CaiQuantLinear( - gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features, - gptq_linear.out_features // tp_size, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - ) - cai_linear.to(gptq_linear.qweight.device) - if len(gptq_model_config.linear_names[0]) == 1: - split_column_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank, split_num=3) - else: - split_column_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank, split_num=1) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) - - for linear_name in gptq_model_config.linear_names[1]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #row split copy - cai_linear = CaiQuantLinear(gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features // tp_size, - gptq_linear.out_features, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=True) - cai_linear.to(gptq_linear.qweight.device) - split_row_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank) - - if tp_size > 1: - cai_linear.register_forward_hook(all_reduce_hook) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) - - for linear_name in gptq_model_config.linear_names[2]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #column split copy - cai_linear = CaiQuantLinear( - gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features, - gptq_linear.out_features // tp_size, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - ) - cai_linear.to(gptq_linear.qweight.device) - split_column_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank) - name1, name2 = linear_name.split(".") - parent_module = get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) - - for linear_name in gptq_model_config.linear_names[3]: - gptq_linear = get_module_by_name_prefix(layer, linear_name) - #row split copy - cai_linear = CaiQuantLinear(gptq_linear.bits, - gptq_linear.group_size, - gptq_linear.in_features // tp_size, - gptq_linear.out_features, - gptq_linear.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=True) - cai_linear.to(gptq_linear.qweight.device) - split_row_copy(gptq_linear, cai_linear, tp_size=tp_size, tp_rank=tp_rank) - - if tp_size > 1: - cai_linear.register_forward_hook(all_reduce_hook) - name1, name2 = linear_name.split(".") - parent_module = 
get_module_by_name_prefix(layer, name1) - setattr(parent_module, name2, cai_linear) diff --git a/colossalai/gptq/models/__init__.py b/colossalai/gptq/models/__init__.py deleted file mode 100644 index ed444b4ed9cb..000000000000 --- a/colossalai/gptq/models/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .bloom import GPTQBloomConfig, reset_bloom_attention_params -from .llama import GPTQLlamaConfig, reset_llama_attention_params diff --git a/colossalai/gptq/models/bloom.py b/colossalai/gptq/models/bloom.py deleted file mode 100644 index b57fa3a5abbe..000000000000 --- a/colossalai/gptq/models/bloom.py +++ /dev/null @@ -1,18 +0,0 @@ -from dataclasses import dataclass, field, fields - - -@dataclass -class GPTQBloomConfig(): - layer_name = "BloomBlock" - layer_blocks = "transformer.h" - linear_names = [["self_attention.query_key_value"], ["self_attention.dense"], ["mlp.dense_h_to_4h"], - ["mlp.dense_4h_to_h"]] - model_names = ["transformer.word_embeddings", "transformer.word_embeddings_layernorm", "transformer.ln_f"] - attention = "self_attention" - mlp = "mlp" - - -def reset_bloom_attention_params(layer, tp_size=1): - attention = getattr(layer, "self_attention") - attention.hidden_size = attention.hidden_size // tp_size - attention.num_heads = attention.num_heads // tp_size diff --git a/colossalai/gptq/models/llama.py b/colossalai/gptq/models/llama.py deleted file mode 100644 index 71690ba748a5..000000000000 --- a/colossalai/gptq/models/llama.py +++ /dev/null @@ -1,19 +0,0 @@ -from dataclasses import dataclass, field, fields - - -@dataclass -class GPTQLlamaConfig(): - layer_name = "LlamaDecoderLayer" - layer_blocks = "model.layers" - linear_names = [["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], ["self_attn.o_proj"], - ["mlp.up_proj", "mlp.gate_proj"], ["mlp.down_proj"]] - model_names = ["model.embed_tokens", "model.norm"] - attention = "self_attn" - mlp = "mlp" - - -def reset_llama_attention_params(layer, tp_size=1): - attention = getattr(layer, "self_attn") - attention.hidden_size = attention.hidden_size // tp_size - attention.num_heads = attention.num_heads // tp_size - attention.num_key_value_heads = attention.num_key_value_heads // tp_size diff --git a/colossalai/gptq/__init__.py b/colossalai/inference/quant/gptq/__init__.py similarity index 73% rename from colossalai/gptq/__init__.py rename to colossalai/inference/quant/gptq/__init__.py index 59b87d6ca692..09246ded2902 100644 --- a/colossalai/gptq/__init__.py +++ b/colossalai/inference/quant/gptq/__init__.py @@ -1,4 +1,4 @@ from .cai_gptq import HAS_AUTO_GPTQ if HAS_AUTO_GPTQ: - from .cai_gptq import CaiGPTQLinearOp, CaiQuantLinear, gptq_fused_linear_triton, make_cai_quant_linear + from .cai_gptq import CaiGPTQLinearOp, CaiQuantLinear, gptq_fused_linear_triton diff --git a/colossalai/gptq/cai_gptq/__init__.py b/colossalai/inference/quant/gptq/cai_gptq/__init__.py similarity index 79% rename from colossalai/gptq/cai_gptq/__init__.py rename to colossalai/inference/quant/gptq/cai_gptq/__init__.py index fcdef7734438..ce2da621275f 100644 --- a/colossalai/gptq/cai_gptq/__init__.py +++ b/colossalai/inference/quant/gptq/cai_gptq/__init__.py @@ -9,6 +9,6 @@ HAS_AUTO_GPTQ = False if HAS_AUTO_GPTQ: - from .cai_quant_linear import CaiQuantLinear, make_cai_quant_linear + from .cai_quant_linear import CaiQuantLinear, ColCaiQuantLinear, RowCaiQuantLinear from .gptq_op import CaiGPTQLinearOp from .gptq_triton import gptq_fused_linear_triton diff --git a/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py 
b/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py new file mode 100644 index 000000000000..ca12c34ed958 --- /dev/null +++ b/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py @@ -0,0 +1,354 @@ +# Adapted from AutoGPTQ auto_gptq: https://github.com/PanQiWei/AutoGPTQ + +import math +import warnings +from typing import List, Union + +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed import ProcessGroup + +from colossalai.lazy import LazyInitContext +from colossalai.shardformer.layer import ParallelModule + +from .gptq_op import CaiGPTQLinearOp + +HAS_GPTQ_CUDA = False +try: + from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() + HAS_GPTQ_CUDA = True +except ImportError: + warnings.warn('CUDA gptq is not installed') + HAS_GPTQ_CUDA = False + + +class CaiQuantLinear(nn.Module): + + def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): + super().__init__() + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + self.infeatures = infeatures + self.outfeatures = outfeatures + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize if groupsize != -1 else infeatures + + self.register_buffer('qweight', torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32)) + self.register_buffer( + 'qzeros', + torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures // 32 * self.bits), dtype=torch.int32)) + self.register_buffer('scales', + torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures), dtype=torch.float16)) + if row_split: + self.register_buffer( + 'g_idx', + torch.tensor([(i + (tp_rank * self.infeatures)) // self.groupsize for i in range(infeatures)], + dtype=torch.int32)) + else: + self.register_buffer('g_idx', + torch.tensor([i // self.groupsize for i in range(infeatures)], dtype=torch.int32)) + + if bias: + self.register_buffer('bias', torch.zeros((outfeatures), dtype=torch.float16)) + else: + self.bias = None + + self.gptq_linear = CaiGPTQLinearOp(groupsize, bits) + + self.q4 = None + self.empty_tensor = torch.empty((1, 1), device="meta") + self.tp_size = tp_size + self.tp_rank = tp_rank + self.row_split = row_split + + def pack(self, linear, scales, zeros, g_idx=None): + + g_idx = g_idx.clone() if g_idx is not None else torch.tensor( + [i // self.groupsize for i in range(self.infeatures)], dtype=torch.int32) + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + half_scales = scales.clone().half() + # print("scale shape ", scales.shape, scale_zeros.shape, linear.weight.shape) + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + wn = 8 + pbits = 32 + ptype = torch.int32 + unsign_type = np.uint32 + sign_type = np.int32 + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[g_idx[idx]]) / half_scales[g_idx[idx]]).to(ptype)[:, + None]) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(unsign_type) + qweight = np.zeros((intweight.shape[0] // pbits * self.bits, intweight.shape[1]), dtype=unsign_type) + + i = 0 + row = 0 + + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (pbits // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - 
i)) + i += pbits // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + qweight = qweight.astype(sign_type) + qweight1 = torch.from_numpy(qweight) + qweight1 = qweight1.contiguous() #.to("cuda") + self.qweight.data.copy_(qweight1) + + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // pbits * self.bits), dtype=unsign_type) + zeros -= 1 + zeros = zeros.numpy().astype(unsign_type) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (pbits // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += pbits // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + qzeros = qzeros.astype(sign_type) + qzeros = torch.from_numpy(qzeros) + qzeros = qzeros + self.qzeros.data.copy_(qzeros) + + if torch.equal(self.g_idx.to(g_idx.device), g_idx): + self.g_idx = None + else: + self.g_idx = g_idx + + def init_q4(self): + assert self.qweight.device.type == "cuda" + self.q4_width = self.qweight.shape[1] + if self.g_idx is not None: + if self.row_split and torch.equal( + self.g_idx, + torch.tensor( + [(i + (self.tp_rank * self.infeatures)) // self.groupsize for i in range(self.infeatures)], + dtype=torch.int32, + device=self.g_idx.device)): + self.g_idx = None + elif torch.equal( + self.g_idx, + torch.tensor([i // self.groupsize for i in range(self.infeatures)], + dtype=torch.int32, + device=self.g_idx.device)): + self.g_idx = None + + if self.g_idx is not None: + g_idx = self.g_idx.to("cpu") + else: + g_idx = self.empty_tensor + + self.q4 = gptq_cuda.make_q4(self.qweight, self.qzeros, self.scales, g_idx, torch.cuda.current_device()) + torch.cuda.synchronize() + + def forward(self, x): + outshape = x.shape[:-1] + (self.outfeatures,) + + if HAS_GPTQ_CUDA and self.bits == 4: + + if self.q4 is None: + self.init_q4() + + x = x.view(-1, x.shape[-1]) + output = torch.empty((x.shape[0], self.outfeatures), dtype=torch.float16, device=x.device) + gptq_cuda.q4_matmul(x.half(), self.q4, output) + if self.bias is not None and (not self.row_split or self.tp_size == 1): + output.add_(self.bias) + else: + if self.bias is not None and (not self.row_split or self.tp_size == 1): + bias = self.bias + else: + bias = None + output = self.gptq_linear( + x, + self.qweight, + self.scales, + self.qzeros, + g_idx=self.g_idx, + bias=bias, + ) + return output.view(outshape) + + +def split_column_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): + + qweights = gptq_linear.qweight.split(gptq_linear.out_features // split_num, dim=-1) + qzeros = gptq_linear.qzeros.split(gptq_linear.out_features // (32 // cai_linear.bits) // split_num, dim=-1) + scales = gptq_linear.scales.split(gptq_linear.out_features // split_num, dim=-1) + g_idx = gptq_linear.g_idx + if gptq_linear.bias is not None: + bias = gptq_linear.bias.split(gptq_linear.out_features // split_num, dim=-1) + + cai_split_out_features = cai_linear.outfeatures // split_num + zero_split_block = cai_linear.outfeatures // (32 // cai_linear.bits) // split_num + + for i in range(split_num): + cai_linear.qweight[:, i * cai_split_out_features:(i + 1) * + cai_split_out_features] = qweights[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * + cai_split_out_features] + cai_linear.qzeros[:, i * zero_split_block:(i + 1) * + zero_split_block] = qzeros[i][:, tp_rank * zero_split_block:(tp_rank + 1) * zero_split_block] + cai_linear.scales[:, i * cai_split_out_features:(i + 1) * + cai_split_out_features] = scales[i][:, tp_rank * 
cai_split_out_features:(tp_rank + 1) * + cai_split_out_features] + if cai_linear.bias is not None: + cai_linear.bias[i * cai_split_out_features:(i + 1) * + cai_split_out_features] = bias[i][tp_rank * cai_split_out_features:(tp_rank + 1) * + cai_split_out_features] + + cai_linear.g_idx.copy_(g_idx) + + +def split_row_copy(gptq_linear, cai_linear, tp_rank=0, split_num=1): + + qweights = gptq_linear.qweight.split(gptq_linear.in_features // split_num, dim=0) + qzeros = gptq_linear.qzeros.split(gptq_linear.in_features // split_num, dim=0) + scales = gptq_linear.scales.split(gptq_linear.in_features // split_num, dim=0) + g_idxs = gptq_linear.g_idx.split(gptq_linear.in_features // split_num, dim=0) + + cai_split_in_features = cai_linear.infeatures // (32 // cai_linear.bits) // split_num + zero_split_block = cai_linear.infeatures // cai_linear.groupsize // split_num + idx_split_features = cai_linear.infeatures // split_num + + for i in range(split_num): + cai_linear.qweight[i * cai_split_in_features:(i + 1) * + cai_split_in_features, :] = qweights[i][tp_rank * cai_split_in_features:(tp_rank + 1) * + cai_split_in_features, :] + cai_linear.qzeros[i * zero_split_block:(i + 1) * + zero_split_block, :] = qzeros[i][tp_rank * zero_split_block:(tp_rank + 1) * + zero_split_block, :] + cai_linear.scales[i * zero_split_block:(i + 1) * + zero_split_block, :] = scales[i][tp_rank * zero_split_block:(tp_rank + 1) * + zero_split_block, :] + cai_linear.g_idx[i * idx_split_features:(i + 1) * + idx_split_features] = g_idxs[i][tp_rank * idx_split_features:(tp_rank + 1) * + idx_split_features] + if cai_linear.bias is not None: + cai_linear.bias.copy_(gptq_linear.bias) + + +class RowCaiQuantLinear(CaiQuantLinear, ParallelModule): + + def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): + + super().__init__(bits, + groupsize, + infeatures, + outfeatures, + bias, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=row_split) + self.process_group = None + + @staticmethod + def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, + **kwargs) -> ParallelModule: + LazyInitContext.materialize(module) + # get the attributes + in_features = module.in_features + + # ensure only one process group is passed + if isinstance(process_group, (list, tuple)): + assert len(process_group) == 1, \ + f'Expected only one process group, got {len(process_group)}.' 
+ process_group = process_group[0] + + tp_size = dist.get_world_size(process_group) + tp_rank = dist.get_rank(process_group) + + if in_features < tp_size: + return module + + if in_features % tp_size != 0: + raise ValueError( + f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!") + linear_1d = RowCaiQuantLinear(module.bits, + module.group_size, + module.in_features // tp_size, + module.out_features, + module.bias is not None, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=True) + linear_1d.process_group = process_group + + split_row_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) + return linear_1d + + def forward(self, x): + output = super().forward(x) + if self.tp_size > 1: + dist.all_reduce(output, op=dist.ReduceOp.SUM, group=self.process_group) + if self.bias is not None: + output.add_(self.bias) + return output + + +class ColCaiQuantLinear(CaiQuantLinear, ParallelModule): + + def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): + + super().__init__(bits, + groupsize, + infeatures, + outfeatures, + bias, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=row_split) + self.process_group = None + + @staticmethod + def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, + **kwargs) -> ParallelModule: + LazyInitContext.materialize(module) + # get the attributes + in_features = module.in_features + + # ensure only one process group is passed + if isinstance(process_group, (list, tuple)): + assert len(process_group) == 1, \ + f'Expected only one process group, got {len(process_group)}.' + process_group = process_group[0] + + tp_size = dist.get_world_size(process_group) + tp_rank = dist.get_rank(process_group) + + if in_features < tp_size: + return module + + if in_features % tp_size != 0: + raise ValueError( + f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!") + linear_1d = ColCaiQuantLinear(module.bits, + module.group_size, + module.in_features, + module.out_features // tp_size, + module.bias is not None, + tp_size=tp_size, + tp_rank=tp_rank) + linear_1d.process_group = process_group + + split_column_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) + return linear_1d diff --git a/colossalai/gptq/cai_gptq/gptq_op.py b/colossalai/inference/quant/gptq/cai_gptq/gptq_op.py similarity index 100% rename from colossalai/gptq/cai_gptq/gptq_op.py rename to colossalai/inference/quant/gptq/cai_gptq/gptq_op.py diff --git a/colossalai/gptq/cai_gptq/gptq_triton.py b/colossalai/inference/quant/gptq/cai_gptq/gptq_triton.py similarity index 100% rename from colossalai/gptq/cai_gptq/gptq_triton.py rename to colossalai/inference/quant/gptq/cai_gptq/gptq_triton.py diff --git a/colossalai/inference/tensor_parallel/engine.py b/colossalai/inference/tensor_parallel/engine.py index 94b44136bebc..9e379892b182 100644 --- a/colossalai/inference/tensor_parallel/engine.py +++ b/colossalai/inference/tensor_parallel/engine.py @@ -9,20 +9,9 @@ from transformers.generation.stopping_criteria import StoppingCriteriaList from transformers.tokenization_utils_base import BatchEncoding -from colossalai.gptq.cai_gptq import CaiQuantLinear -from colossalai.gptq.gptq_tp import replace_autogptq_linear from colossalai.shardformer import ShardConfig, ShardFormer from colossalai.shardformer.policies.auto_policy import get_autopolicy -HAS_GPTQ_CUDA = False -try: - from colossalai.kernel.op_builder.gptq import GPTQBuilder - 
gptq_cuda = GPTQBuilder().load() - HAS_GPTQ_CUDA = True -except ImportError: - warnings.warn('CUDA gptq is not installed') - HAS_GPTQ_CUDA = False - from .batch_infer_state import BatchInferState from .kvcache_manager import MemoryManager @@ -99,6 +88,15 @@ def _init_manager(self) -> None: self.layer_num) def _post_init_gptq_buffer(self, model: nn.Module) -> None: + from colossalai.inference.quant.gptq.cai_gptq import CaiQuantLinear + HAS_GPTQ_CUDA = False + try: + from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() + HAS_GPTQ_CUDA = True + except ImportError: + warnings.warn('CUDA gptq is not installed') + HAS_GPTQ_CUDA = False for name, submodule in model.named_modules(): if isinstance(submodule, CaiQuantLinear): @@ -179,12 +177,11 @@ def _shard_model_by(self, shardformer: ShardFormer, model: nn.Module) -> None: model_name = model.__class__.__name__ assert model_name in self.supported_models, f"Unsupported model cls {model_name} for TP inference." policy = get_autopolicy(model, inference_only=True) + self.model, _ = shardformer.optimize(model, policy) if self.shard_config.inference_gptq: - tp_rank = dist.get_rank(self.shard_config.tensor_parallel_process_group) - replace_autogptq_linear(model, tp_size=self.tp_size, tp_rank=tp_rank) self._post_init_gptq_buffer(model) - self.model, _ = shardformer.optimize(model, policy) + self.model = self.model.cuda() @property diff --git a/colossalai/inference/tensor_parallel/policies/bloom.py b/colossalai/inference/tensor_parallel/policies/bloom.py index 037b0ab85863..bceb14c6af7b 100644 --- a/colossalai/inference/tensor_parallel/policies/bloom.py +++ b/colossalai/inference/tensor_parallel/policies/bloom.py @@ -36,23 +36,36 @@ def __init__(self) -> None: def module_policy(self): from transformers.models.bloom.modeling_bloom import BloomAttention, BloomBlock, BloomForCausalLM, BloomModel - policy = {} - if not self.shard_config.inference_gptq: - policy = super().module_policy() - else: - policy[BloomModel] = ModulePolicyDescription( - attribute_replacement={ - "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, - }, - method_replacement={ - "build_alibi_tensor": build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group) - }, - sub_module_replacement=[ - SubModuleReplacementDescription( - suffix="word_embeddings", - target_module=col_nn.VocabParallelEmbedding1D, - ) - ]) + policy = super().module_policy() + if self.shard_config.inference_gptq: + from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear + policy[BloomBlock] = ModulePolicyDescription(attribute_replacement={ + "self_attention.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, + "self_attention.split_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, + "self_attention.num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, + }, + sub_module_replacement=[ + SubModuleReplacementDescription( + suffix="self_attention.query_key_value", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 3}), + SubModuleReplacementDescription( + suffix="self_attention.dense", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}), + SubModuleReplacementDescription( + suffix="self_attention.attention_dropout", + target_module=col_nn.DropoutForParallelInput, + ), + SubModuleReplacementDescription( + suffix="mlp.dense_h_to_4h", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}), + 
SubModuleReplacementDescription( + suffix="mlp.dense_4h_to_h", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}), + ]) # NOTE set inference mode to shard config self.shard_config._infer() diff --git a/colossalai/inference/tensor_parallel/policies/llama.py b/colossalai/inference/tensor_parallel/policies/llama.py index 6b6056501ac0..f281e2918b37 100644 --- a/colossalai/inference/tensor_parallel/policies/llama.py +++ b/colossalai/inference/tensor_parallel/policies/llama.py @@ -35,16 +35,56 @@ def __init__(self) -> None: super().__init__() def module_policy(self): - policy = {} - if not self.shard_config.inference_gptq: - policy = super().module_policy() - else: - self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription( - suffix="embed_tokens", - target_module=VocabParallelEmbedding1D, - ), - policy=policy, - target_key=LlamaModel) + policy = super().module_policy() + + if self.shard_config.inference_gptq: + from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear + + decoder_attribute_replacement = { + "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size, + "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size, + } + policy[LlamaDecoderLayer] = ModulePolicyDescription( + attribute_replacement=decoder_attribute_replacement, + sub_module_replacement=[ + SubModuleReplacementDescription( + suffix="self_attn.q_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="self_attn.k_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="self_attn.v_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="self_attn.o_proj", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="mlp.gate_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="mlp.up_proj", + target_module=ColCaiQuantLinear, + kwargs={'split_num': 1}, + ), + SubModuleReplacementDescription( + suffix="mlp.down_proj", + target_module=RowCaiQuantLinear, + kwargs={'split_num': 1}, + ) + ], + ) + self.shard_config._infer() infer_forward = LlamaInferenceForwards.llama_model_forward diff --git a/tests/test_gptq/test_gptq_linear.py b/tests/test_gptq/test_gptq_linear.py index 718060c22908..fca5e1a0ebc3 100644 --- a/tests/test_gptq/test_gptq_linear.py +++ b/tests/test_gptq/test_gptq_linear.py @@ -21,7 +21,7 @@ from auto_gptq.utils.import_utils import dynamically_import_QuantLinear from exllama_kernels import prepare_buffers, set_tuning_params - from colossalai.gptq import CaiQuantLinear + from colossalai.inference.quant.gptq import CaiQuantLinear HAS_AUTO_GPTQ = True except: HAS_AUTO_GPTQ = False
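
For orientation only (not part of the patch): a minimal sketch of how the relocated quantized linears are consumed after this refactor, mirroring the BloomBlock/LlamaDecoderLayer policy entries above. ColCaiQuantLinear, RowCaiQuantLinear, the split_num kwarg, the attribute replacements, and the import path colossalai.inference.quant.gptq.cai_gptq are taken from the diff; the helper function name and the import path for the policy description classes are assumptions made for illustration.

# Illustrative sketch -- mirrors the policy entries added in this diff.
# Assumption: ModulePolicyDescription / SubModuleReplacementDescription are the
# shardformer policy helpers (their import path is not shown in the patch).
from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear
from colossalai.shardformer.policies.base_policy import (  # assumed location
    ModulePolicyDescription,
    SubModuleReplacementDescription,
)


def gptq_decoder_layer_policy(tp_size: int, hidden_size: int, num_heads: int) -> ModulePolicyDescription:
    """Hypothetical helper: the per-layer replacements a policy would register
    when shard_config.inference_gptq is enabled, following the Llama entry above."""
    return ModulePolicyDescription(
        attribute_replacement={
            "self_attn.hidden_size": hidden_size // tp_size,
            "self_attn.num_heads": num_heads // tp_size,
        },
        sub_module_replacement=[
            # Column-parallel projection: out_features are sharded across ranks,
            # no communication is needed in forward.
            SubModuleReplacementDescription(
                suffix="self_attn.q_proj",
                target_module=ColCaiQuantLinear,
                kwargs={"split_num": 1},
            ),
            # Row-parallel projection: in_features are sharded; RowCaiQuantLinear
            # all-reduces its output over the tensor-parallel group in forward.
            SubModuleReplacementDescription(
                suffix="self_attn.o_proj",
                target_module=RowCaiQuantLinear,
                kwargs={"split_num": 1},
            ),
        ],
    )

As in the new cai_quant_linear.py above, the row-split layer adds its bias only after the all-reduce (CaiQuantLinear.forward skips the bias when row_split is set and tp_size > 1), so the bias is applied exactly once per tensor-parallel group.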