diff --git a/colossalai/gptq/cai_gptq/cai_quant_linear.py b/colossalai/gptq/cai_gptq/cai_quant_linear.py index 78a37e7bbfb3..93312716992d 100644 --- a/colossalai/gptq/cai_gptq/cai_quant_linear.py +++ b/colossalai/gptq/cai_gptq/cai_quant_linear.py @@ -147,49 +147,6 @@ def pack(self, linear, scales, zeros, g_idx=None): else: self.g_idx = g_idx - def prepare_buffers(self): - assert self.qweight.device.type == "cuda" - device = self.qweight.device - if self.g_idx is not None: - if self.row_split and torch.equal( - self.g_idx, - torch.tensor( - [(i + (self.tp_rank * self.infeatures)) // self.groupsize for i in range(self.infeatures)], - dtype=torch.int32, - device=self.g_idx.device)): - self.g_idx = None - elif torch.equal( - self.g_idx, - torch.tensor([i // self.groupsize for i in range(self.infeatures)], - dtype=torch.int32, - device=self.g_idx.device)): - self.g_idx = None - - CaiQuantLinear.max_dq_buffer_size = max(CaiQuantLinear.max_dq_buffer_size, self.qweight.numel() * 8) - - if self.g_idx is not None: - CaiQuantLinear.max_inner_outer_dim = max(CaiQuantLinear.max_inner_outer_dim, self.infeatures, - self.outfeatures) - CaiQuantLinear.max_input_len = 4096 - # The temp_state buffer is required to reorder X in the act-order case. - # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. - CaiQuantLinear.device_to_buffers['temp_state'] = torch.zeros( - (CaiQuantLinear.max_input_len, CaiQuantLinear.max_inner_outer_dim), dtype=torch.float16, device=device) - CaiQuantLinear.device_to_buffers['temp_dp'] = torch.zeros((1, CaiQuantLinear.max_dq_buffer_size), - dtype=torch.float16, - device=device) - - gptq_cuda.prepare_buffers(torch.device(device), CaiQuantLinear.device_to_buffers['temp_state'], - CaiQuantLinear.device_to_buffers['temp_dp']) - - # Using the default from exllama repo here. 
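For context, a minimal sketch of how these exllama-style scratch buffers are sized: each int32 element of `qweight` packs eight 4-bit weights, so fully dequantizing the largest layer needs `qweight.numel() * 8` fp16 elements, and the act-order path needs a `(max_input_len, max_inner_outer_dim)` staging buffer to reorder X. The helper name and the mock layer below are illustrative only, not part of this PR.

```python
import torch
from types import SimpleNamespace

def plan_gptq_scratch_buffers(cai_linears, max_input_len=4096, use_act_order=False):
    """Illustrative sizing of the temp_state / temp_dq buffers (not the PR's API)."""
    max_dq_buffer_size = 1
    max_inner_outer_dim = 1
    for linear in cai_linears:
        # eight 4-bit values per int32 -> qweight.numel() * 8 dequantized fp16 weights
        max_dq_buffer_size = max(max_dq_buffer_size, linear.qweight.numel() * 8)
        if use_act_order:
            # reordering X needs room for one activation row per token
            max_inner_outer_dim = max(max_inner_outer_dim, linear.infeatures, linear.outfeatures)

    temp_state = torch.zeros((max_input_len if use_act_order else 1, max_inner_outer_dim),
                             dtype=torch.float16)
    temp_dq = torch.zeros((1, max_dq_buffer_size), dtype=torch.float16)
    return temp_state, temp_dq

# mock 4096x4096 layer: 4-bit weights packed 8-per-int32 -> qweight shape (512, 4096)
mock = SimpleNamespace(qweight=torch.zeros(512, 4096, dtype=torch.int32),
                       infeatures=4096, outfeatures=4096)
state, dq = plan_gptq_scratch_buffers([mock], use_act_order=True)
assert dq.numel() == 512 * 4096 * 8    # == 4096 * 4096 fp16 weights
```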
- matmul_recons_thd = 8 - matmul_fused_remap = False - matmul_no_half2 = False - gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) - - torch.cuda.empty_cache() - def init_q4(self): assert self.qweight.device.type == "cuda" self.q4_width = self.qweight.shape[1] @@ -219,21 +176,18 @@ def init_q4(self): def forward(self, x): outshape = x.shape[:-1] + (self.outfeatures,) - if HAS_GPTQ_CUDA: - if CaiQuantLinear.prepared_buffers == False: - self.prepare_buffers() - CaiQuantLinear.prepared_buffers = True + if HAS_GPTQ_CUDA and self.bits == 4: if self.q4 is None: self.init_q4() x = x.view(-1, x.shape[-1]) output = torch.empty((x.shape[0], self.outfeatures), dtype=torch.float16, device=x.device) - gptq_cuda.q4_matmul(x, self.q4, output) - if (self.bias is not None and not self.row_split) or self.tp_size == 1: + gptq_cuda.q4_matmul(x.half(), self.q4, output) + if self.bias is not None and (not self.row_split or self.tp_size == 1): output.add_(self.bias) else: - if (self.bias is not None and not self.row_split) or self.tp_size == 1: + if self.bias is not None and (not self.row_split or self.tp_size == 1): bias = self.bias else: bias = None diff --git a/colossalai/gptq/gptq_tp.py b/colossalai/gptq/gptq_tp.py index e8d1d7f00fe8..cc6d184da458 100644 --- a/colossalai/gptq/gptq_tp.py +++ b/colossalai/gptq/gptq_tp.py @@ -95,7 +95,7 @@ def all_reduce_hook(cai_linear, input, output): model_type_name = model.config.model_type gptq_model_config = model_config_map[model_type_name] - layers = get_module_by_name_prefix(model.model, gptq_model_config.layer_blocks) + layers = get_module_by_name_prefix(model, gptq_model_config.layer_blocks) for layer in layers: diff --git a/colossalai/inference/tensor_parallel/engine.py b/colossalai/inference/tensor_parallel/engine.py index a5a55702ade0..94b44136bebc 100644 --- a/colossalai/inference/tensor_parallel/engine.py +++ b/colossalai/inference/tensor_parallel/engine.py @@ -1,15 +1,28 @@ +import warnings from typing import Any, Callable, Dict, List, Optional, Union import torch +import torch.distributed as dist import torch.nn as nn from transformers import BloomForCausalLM, LlamaForCausalLM from transformers.generation import GenerationConfig from transformers.generation.stopping_criteria import StoppingCriteriaList from transformers.tokenization_utils_base import BatchEncoding +from colossalai.gptq.cai_gptq import CaiQuantLinear +from colossalai.gptq.gptq_tp import replace_autogptq_linear from colossalai.shardformer import ShardConfig, ShardFormer from colossalai.shardformer.policies.auto_policy import get_autopolicy +HAS_GPTQ_CUDA = False +try: + from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() + HAS_GPTQ_CUDA = True +except ImportError: + warnings.warn('CUDA gptq is not installed') + HAS_GPTQ_CUDA = False + from .batch_infer_state import BatchInferState from .kvcache_manager import MemoryManager @@ -66,6 +79,13 @@ def __init__(self, self.tp_size = -1 # to be set with given shard config in self.prepare_shard_config self.cache_manager = None + self.max_dq_buffer_size = 1 + self.max_inner_outer_dim = 1 + self.gptq_temp_state_buffer = None + self.gptq_temp_dq_buffer = None + self.bits = -1 + self.use_act_order = False + self.shard_config = shard_config self.model = None # optimize the original model by sharding with ShardFormer @@ -78,6 +98,41 @@ def _init_manager(self) -> None: self.cache_manager = MemoryManager(self.max_total_token_num, self.dtype, self.head_num, self.head_dim, self.layer_num) + 
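The revised bias condition above (together with the all_reduce_hook in gptq_tp.py) follows from how a row-split linear behaves under tensor parallelism: each rank holds a slice of the input features, its matmul is only a partial sum, the partials are all-reduced across ranks, and the bias must be applied exactly once. A single-process sketch in plain torch, with no distributed setup and purely illustrative shapes:

```python
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)          # (batch, in_features)
w = torch.randn(8, 6)          # (in_features, out_features)
b = torch.randn(6)

reference = x @ w + b

# "row split": rank 0 owns input features 0..3, rank 1 owns 4..7
partial0 = x[:, :4] @ w[:4, :]
partial1 = x[:, 4:] @ w[4:, :]

# the all-reduce sums the per-rank partials; the bias is added once
# after the reduction, not once per rank
reduced = partial0 + partial1 + b

assert torch.allclose(reduced, reference, atol=1e-5)
```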
def _post_init_gptq_buffer(self, model: nn.Module) -> None: + + for name, submodule in model.named_modules(): + if isinstance(submodule, CaiQuantLinear): + self.max_dq_buffer_size = max(self.max_dq_buffer_size, submodule.qweight.numel() * 8) + + if self.use_act_order: + self.max_inner_outer_dim = max(self.max_inner_outer_dim, submodule.infeatures, + submodule.outfeatures) + self.bits = submodule.bits + if not (HAS_GPTQ_CUDA and self.bits == 4): + return + + max_input_len = 1 + if self.use_act_order: + max_input_len = self.max_input_len + # The temp_state buffer is required to reorder X in the act-order case. + # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. + self.gptq_temp_state_buffer = torch.zeros((max_input_len, self.max_inner_outer_dim), + dtype=torch.float16, + device=torch.cuda.current_device()) + self.gptq_temp_dq_buffer = torch.zeros((1, self.max_dq_buffer_size), + dtype=torch.float16, + device=torch.cuda.current_device()) + + gptq_cuda.prepare_buffers(torch.device(torch.cuda.current_device()), self.gptq_temp_state_buffer, + self.gptq_temp_dq_buffer) + # Using the default from exllama repo here. + matmul_recons_thd = 8 + matmul_fused_remap = False + matmul_no_half2 = False + gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) + + torch.cuda.empty_cache() + def _optimize_model(self, model: nn.Module) -> None: """ Optimize the original model by sharding with ShardFormer. @@ -124,6 +179,11 @@ def _shard_model_by(self, shardformer: ShardFormer, model: nn.Module) -> None: model_name = model.__class__.__name__ assert model_name in self.supported_models, f"Unsupported model cls {model_name} for TP inference." policy = get_autopolicy(model, inference_only=True) + + if self.shard_config.inference_gptq: + tp_rank = dist.get_rank(self.shard_config.tensor_parallel_process_group) + replace_autogptq_linear(model, tp_size=self.tp_size, tp_rank=tp_rank) + self._post_init_gptq_buffer(model) self.model, _ = shardformer.optimize(model, policy) self.model = self.model.cuda() diff --git a/colossalai/inference/tensor_parallel/policies/bloom.py b/colossalai/inference/tensor_parallel/policies/bloom.py index 63791fe27284..037b0ab85863 100644 --- a/colossalai/inference/tensor_parallel/policies/bloom.py +++ b/colossalai/inference/tensor_parallel/policies/bloom.py @@ -3,6 +3,9 @@ import torch from torch.nn import LayerNorm +import colossalai.shardformer.layer as col_nn +from colossalai.shardformer.modeling.bloom import build_bloom_alibi_tensor_fn +from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription from colossalai.shardformer.policies.bloom import BloomForCausalLMPolicy from ..modeling.bloom import BloomInferenceForwards @@ -33,7 +36,23 @@ def __init__(self) -> None: def module_policy(self): from transformers.models.bloom.modeling_bloom import BloomAttention, BloomBlock, BloomForCausalLM, BloomModel - policy = super().module_policy() + policy = {} + if not self.shard_config.inference_gptq: + policy = super().module_policy() + else: + policy[BloomModel] = ModulePolicyDescription( + attribute_replacement={ + "num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size, + }, + method_replacement={ + "build_alibi_tensor": build_bloom_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group) + }, + sub_module_replacement=[ + SubModuleReplacementDescription( + suffix="word_embeddings", + target_module=col_nn.VocabParallelEmbedding1D, + ) + ]) # 
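Taken together with the example scripts added below, the intended flow is roughly the following; the checkpoint path and sizes are placeholders, and the colossalai.launch / spawn scaffolding used by the examples is omitted here:

```python
import torch
from auto_gptq import AutoGPTQForCausalLM

from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig

# load an AutoGPTQ checkpoint; with inference_gptq=True the engine replaces its
# quantized linears with CaiQuantLinear and allocates the shared exllama buffers
model = AutoGPTQForCausalLM.from_quantized("/path/to/quantized-model",   # placeholder path
                                           device=torch.cuda.current_device(),
                                           inject_fused_attention=False)

shard_config = ShardConfig(enable_tensor_parallelism=True,
                           inference_only=True,
                           inference_gptq=True)
engine = TPInferEngine(model, shard_config, 8, 512, 64)   # max_batch_size, max_input_len, max_output_len

input_tokens = {
    "input_ids": torch.randint(1, 1000, (8, 512), device="cuda"),
    "attention_mask": torch.ones((8, 512), device="cuda"),
}
outputs = engine.generate(input_tokens, max_new_tokens=64, do_sample=False)
```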
NOTE set inference mode to shard config self.shard_config._infer() diff --git a/colossalai/inference/tensor_parallel/policies/llama.py b/colossalai/inference/tensor_parallel/policies/llama.py index e819f2a8810c..6b6056501ac0 100644 --- a/colossalai/inference/tensor_parallel/policies/llama.py +++ b/colossalai/inference/tensor_parallel/policies/llama.py @@ -1,14 +1,13 @@ from functools import partial + import torch -from transformers.models.llama.modeling_llama import ( - LlamaAttention, - LlamaDecoderLayer, - LlamaModel, - LlamaRMSNorm -) +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaModel, LlamaRMSNorm +from colossalai.shardformer.layer import VocabParallelEmbedding1D +from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription # import colossalai from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy + from ..modeling.llama import LlamaInferenceForwards, get_llama_vllm_rmsnorm_forward try: @@ -18,23 +17,34 @@ print("you should install triton from https://github.com/openai/triton") HAS_TRITON_RMSNORM = False - + def get_triton_rmsnorm_forward(): if HAS_TRITON_RMSNORM: + def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor): return rmsnorm_forward(hidden_states, self.weight.data, self.variance_epsilon) - + return _triton_rmsnorm_forward else: return None - + + class LlamaModelInferPolicy(LlamaForCausalLMPolicy): def __init__(self) -> None: super().__init__() def module_policy(self): - policy = super().module_policy() + policy = {} + if not self.shard_config.inference_gptq: + policy = super().module_policy() + else: + self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription( + suffix="embed_tokens", + target_module=VocabParallelEmbedding1D, + ), + policy=policy, + target_key=LlamaModel) self.shard_config._infer() infer_forward = LlamaInferenceForwards.llama_model_forward @@ -59,12 +69,11 @@ def module_policy(self): else: # NOTE: adding rms_norm from cuda kernels caused precision issue, fix @tiandiao123 infer_forward = get_llama_vllm_rmsnorm_forward() - + if infer_forward is not None: method_replacement = {'forward': partial(infer_forward)} self.append_or_create_method_replacement(description=method_replacement, - policy=policy, - target_key=LlamaRMSNorm) + policy=policy, + target_key=LlamaRMSNorm) return policy - diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py index 4380ac30814d..303e0b008041 100644 --- a/colossalai/shardformer/shard/shard_config.py +++ b/colossalai/shardformer/shard/shard_config.py @@ -33,9 +33,9 @@ class ShardConfig: enable_sequence_parallelism: bool = False enable_sequence_overlap: bool = False inference_only: bool = False + inference_gptq: bool = False enable_sequence_parallelism: bool = False enable_sequence_overlap: bool = False - # pipeline_parallel_size: int # data_parallel_size: int # tensor_parallel_mode: Literal['1d', '2d', '2.5d', '3d'] diff --git a/examples/inference/gptq_bloom.py b/examples/inference/gptq_bloom.py new file mode 100644 index 000000000000..43e118cc0aa5 --- /dev/null +++ b/examples/inference/gptq_bloom.py @@ -0,0 +1,123 @@ +import argparse +import logging +import os +import time + +import torch +from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig +from auto_gptq.nn_modules.qlinear import GeneralQuantLinear +from transformers import AutoTokenizer, BloomForCausalLM, BloomTokenizerFast, LlamaForCausalLM, 
LlamaTokenizer + +import colossalai +from colossalai.inference.tensor_parallel.engine import TPInferEngine +from colossalai.logging import disable_existing_loggers +from colossalai.shardformer import ShardConfig +from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn + +os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' + + +def print_perf_stats(latency_set, config, bs, warmup=3): + # trim warmup queries + latency_set = list(latency_set) + latency_set = latency_set[warmup:] + count = len(latency_set) + + if count > 0: + latency_set.sort() + avg = sum(latency_set) / count + num_layers = getattr(config, "num_layers", config.num_hidden_layers) + num_parameters = num_layers * config.hidden_size * config.hidden_size * 12 + num_bytes = 2 # float16 + + print("Avg Per Token Latency: {0:8.2f} ms".format(avg * 1000)) + print("Avg BW: {0:8.2f} GB/s".format(1 / avg * num_parameters * num_bytes / 1e9)) + print("Avg flops: {0:8.2f} TFlops/s".format(1 / avg * num_parameters * num_bytes * bs / 1e12)) + print("Avg Throughput: tokens/s: {}".format((1000 / (avg * 1000)) * bs)) + + +def bench_bloom(args): + + pretrained_model_dir = args.path + quantized_model_dir = args.quantized_path + max_batch_size = args.batch_size + max_input_len = args.input_len + max_output_len = args.output_len + + tokenizer = BloomTokenizerFast.from_pretrained(pretrained_model_dir) + tokenizer.pad_token = tokenizer.eos_token + + # load quantized model to the first GPU + model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, + device=torch.cuda.current_device(), + inject_fused_attention=False) + + model = model.half() + + model_config = model.config + shard_config = ShardConfig(enable_tensor_parallelism=True if args.tp_size > 1 else False, inference_only=True) + infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) + generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) + + input_tokens = { + "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device='cuda'), + "attention_mask": torch.ones((max_batch_size, max_input_len), device='cuda') + } + + # init TPInferEngine and shard the original model + # To benchmark torch original, comment out the line of optimizing model + shard_config = ShardConfig(enable_tensor_parallelism=True if args.tp_size > 1 else False, + inference_only=True, + inference_gptq=True) + infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) + + # prepare data for generation + generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) + input_tokens = { + "input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)), + "attention_mask": torch.ones((max_batch_size, max_input_len)) + } + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + # print(f" input_tokens[{t}].shape: {input_tokens[t].shape}") + + iters = 10 + times = [] + for i in range(iters): + torch.cuda.synchronize() + start = time.time() + outputs = infer_engine.generate(input_tokens, **generate_kwargs) + torch.cuda.synchronize() + end = time.time() + out_len = outputs.shape[1] + print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s") + times.append((end - start) / (out_len - max_input_len)) + + print_perf_stats(times, model_config, max_batch_size) + + +def check_bloom(rank, world_size, port, args): + disable_existing_loggers() + colossalai.launch(config={}, rank=rank, 
world_size=world_size, host='localhost', port=port, backend='nccl') + bench_bloom(args) + + +@rerun_if_address_is_in_use() +@clear_cache_before_run() +def test_bloom(args): + spawn(check_bloom, args.tp_size, args=args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--path', type=str, help='Model path', required=True) + parser.add_argument('-q', '--quantized_path', type=str, help='Model path', required=True) + parser.add_argument('-tp', '--tp_size', type=int, default=1, help='Tensor parallel size') + parser.add_argument('-b', '--batch_size', type=int, default=16, help='Maximum batch size') + parser.add_argument('--input_len', type=int, default=1024, help='Maximum input length') + parser.add_argument('--output_len', type=int, default=128, help='Maximum output length') + + args = parser.parse_args() + + test_bloom(args) diff --git a/examples/inference/gptq_llama.py b/examples/inference/gptq_llama.py index ae398740dcdb..818ae0035e87 100644 --- a/examples/inference/gptq_llama.py +++ b/examples/inference/gptq_llama.py @@ -1,71 +1,135 @@ +import argparse import logging +import os +import time import torch from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig from auto_gptq.nn_modules.qlinear import GeneralQuantLinear from torch import distributed as dist +from torch.profiler import ProfilerActivity, profile, record_function from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, TextGenerationPipeline +import colossalai from colossalai.gptq import CaiQuantLinear from colossalai.gptq.gptq_tp import replace_autogptq_linear +from colossalai.inference.tensor_parallel.engine import TPInferEngine +from colossalai.logging import disable_existing_loggers +from colossalai.shardformer import ShardConfig +from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn -logging.basicConfig(format="%(asctime)s %(levelname)s [%(name)s] %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S") -dist.init_process_group(backend="nccl") -pretrained_model_dir = "/data/scratch/llama-7b-hf" -# quantized_model_dir = "llama-7b-with-act-4bit" -quantized_model_dir = "/home/lcxk/data3/test_gptq_llama/llama-7b-no-act-4bit" -rank = dist.get_rank() -world_size = dist.get_world_size() -# rank = 1 -# world_size=2 -torch.cuda.set_device(rank) -print("world size {0} rank {1} deivce {2}".format(world_size, rank, torch.cuda.current_device())) -tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) -examples = [ - tokenizer( - "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm.") -] - -# quantize_config = BaseQuantizeConfig( -# bits=4, # quantize model to 4-bit -# group_size=128, # it is recommended to set the value to 128 -# desc_act=False, # set to False can significantly speed up inference but the perplexity may slightly bad -# ) - -# # load un-quantized model, by default, the model will always be loaded into CPU memory -# model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - -# # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" -# model.quantize(examples) - -# # save quantized model -# model.save_quantized(quantized_model_dir) - -# # save quantized model using safetensors -# model.save_quantized(quantized_model_dir, use_safetensors=True) - -# load quantized model to the first GPU -model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, - 
device=torch.cuda.current_device(), - inject_fused_attention=False) - -replace_autogptq_linear(model, tp_size=world_size, tp_rank=rank) - -# if rank == 0: -# print(model.config) -# print(model) -# download quantized model from Hugging Face Hub and load to the first GPU -# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False) - -# inference with model.generate -print("input is:", "auto-gptq is") -print( - tokenizer.decode( - model.generate(**tokenizer("auto-gptq is", return_tensors="pt").to(model.device), max_new_tokens=128)[0])) -dist.barrier() -print("input is:", "today is") -print( - tokenizer.decode( - model.generate(**tokenizer("today is ", return_tensors="pt").to(model.device), max_new_tokens=128)[0])) +os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' + + +def init_to_get_rotary(self, base=10000): + self.config.head_dim_ = self.config.hidden_size // self.config.num_attention_heads + if not hasattr(self.config, "rope_scaling"): + rope_scaling_factor = 1.0 + else: + rope_scaling_factor = self.config.rope_scaling.factor if self.config.rope_scaling is not None else 1.0 + if hasattr(self.config, "max_sequence_length"): + max_seq_len = self.config.max_sequence_length + elif hasattr(self.config, "max_position_embeddings"): + max_seq_len = self.config.max_position_embeddings * rope_scaling_factor + else: + max_seq_len = 2048 * rope_scaling_factor + base = float(base) + inv_freq = 1.0 / (base**(torch.arange(0, self.config.head_dim_, 2, device="cpu", dtype=torch.float32) / + self.config.head_dim_)) + t = torch.arange(max_seq_len + 1024 * 64, device="cpu", dtype=torch.float32) / rope_scaling_factor + freqs = torch.outer(t, inv_freq) + + self._cos_cached = torch.cos(freqs).to(torch.float16).cuda() + self._sin_cached = torch.sin(freqs).to(torch.float16).cuda() + return + + +def print_perf_stats(latency_set, config, bs, warmup=3): + # trim warmup queries + latency_set = list(latency_set) + latency_set = latency_set[warmup:] + count = len(latency_set) + + if count > 0: + latency_set.sort() + avg = sum(latency_set) / count + num_layers = getattr(config, "num_layers", config.num_hidden_layers) + num_parameters = num_layers * config.hidden_size * config.hidden_size * 12 + num_bytes = 2 + + print("Avg Per Token Latency: {0:8.2f} ms".format(avg * 1000)) + print("Avg BW: {0:8.2f} GB/s".format(1 / avg * num_parameters * num_bytes / 1e9)) + print("Avg flops: {0:8.2f} TFlops/s".format(1 / avg * num_parameters * num_bytes * bs / 1e12)) + print("Avg Throughput: tokens/s: {}".format((1000 / (avg * 1000)) * bs)) + + +def run_llama_test(args): + pretrained_model_dir = args.path + quantized_model_dir = args.quantized_path + max_batch_size = args.batch_size + max_input_len = args.input_len + max_output_len = args.output_len + + tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) + tokenizer.pad_token_id = tokenizer.eos_token_id + + # load quantized model to the first GPU + model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, + device=torch.cuda.current_device(), + inject_fused_attention=False) + + init_to_get_rotary(model.model.model, base=10000) + + model_config = model.config + shard_config = ShardConfig(enable_tensor_parallelism=True if args.tp_size > 1 else False, + inference_only=True, + inference_gptq=True) + infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len) + + generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False) + + input_tokens = { + 
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device='cuda'), + "attention_mask": torch.ones((max_batch_size, max_input_len), device='cuda') + } + + iters = 10 + times = [] + + for i in range(iters): + torch.cuda.synchronize() + start = time.time() + outputs = infer_engine.generate(input_tokens, **generate_kwargs) + torch.cuda.synchronize() + end = time.time() + out_len = outputs.shape[1] + print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s") + times.append((end - start) / (out_len - max_input_len)) + + print_perf_stats(times, model_config, max_batch_size) + + +def check_llama(rank, world_size, port, args): + disable_existing_loggers() + colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') + run_llama_test(args) + + +@rerun_if_address_is_in_use() +@clear_cache_before_run() +def test_llama(args): + spawn(check_llama, args.tp_size, args=args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--path', type=str, help='Model path', required=True) + parser.add_argument('-q', '--quantized_path', type=str, help='Model path', required=True) + parser.add_argument('-tp', '--tp_size', type=int, default=1, help='Tensor parallel size') + parser.add_argument('-b', '--batch_size', type=int, default=16, help='Maximum batch size') + parser.add_argument('--input_len', type=int, default=1024, help='Maximum input length') + parser.add_argument('--output_len', type=int, default=128, help='Maximum output length') + + args = parser.parse_args() + + test_llama(args) diff --git a/tests/test_gptq/test_gptq_linear.py b/tests/test_gptq/test_gptq_linear.py index 0d0343a5c407..718060c22908 100644 --- a/tests/test_gptq/test_gptq_linear.py +++ b/tests/test_gptq/test_gptq_linear.py @@ -17,308 +17,133 @@ print("please install triton from https://github.com/openai/triton") try: - from auto_gptq.modeling._utils import autogptq_post_init, find_layers, pack_model - from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear - from auto_gptq.quantization import GPTQ - from auto_gptq.quantization.quantizer import Quantizer + from auto_gptq.modeling._utils import autogptq_post_init + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + from exllama_kernels import prepare_buffers, set_tuning_params - from colossalai.gptq import CaiGPTQLinearOp, CaiQuantLinear + from colossalai.gptq import CaiQuantLinear HAS_AUTO_GPTQ = True except: HAS_AUTO_GPTQ = False print("please install triton from https://github.com/PanQiWei/AutoGPTQ") -TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4') - -wbits = 4 -trits = False -nsamples = 1 -percdamp = .01 -groupsize = 128 -act_order = False -sym = False - - -class MLinear(nn.Module): - - def __init__(self, infeature, outfeature): - super(MLinear, self).__init__() - self.linear = torch.nn.Linear(infeature, outfeature, dtype=torch.float16) +import warnings - def forward(self, x): - out = self.linear(x) - return out - - -@torch.no_grad() -def model_quant(model, inps, dev): - print('Starting ...') - layers = [model] - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - cache = {'i': 0} +HAS_GPTQ_CUDA = False +try: + from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() + HAS_GPTQ_CUDA = True +except ImportError: + warnings.warn('CUDA gptq is not installed') + HAS_GPTQ_CUDA = False - class Catcher(nn.Module): +TRITON_CUDA_SUPPORT = 
version.parse(torch.version.cuda) > version.parse('11.4') - def __init__(self, module): - super().__init__() - self.module = module +max_inner_outer_dim = 1 +max_input_len = 1 +max_dq_buffer_size = 1 +gptq_temp_dq_buffer = None +gptq_temp_state_buffer = None - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - raise ValueError - layers[0] = Catcher(layers[0]) - # for batch in inps: - try: - model(inps.to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - - outs = torch.zeros(inps.shape[0], layers[0].linear.weight.shape[0]) - - print('Ready.') - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - subset = find_layers(layer) - gptq = {} - for name in subset: - gptq[name] = GPTQ(subset[name]) - # gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure(wbits, perchannel=True, sym=sym, mse=False, trits=trits) - - def add_batch(name): - - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - - return tmp - - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0))[0] - - for h in handles: - h.remove() - for name in subset: - print(f'Quantizing {name} in layer {i+1}/{len(layers)}...') - scale, zero, g_idx = gptq[name].fasterquant(percdamp=percdamp, group_size=groupsize, actorder=act_order) - # quantizers['%s' % (name)] = (gptq[name].quantizer.cpu(),scale.cpu(),zero.cpu(),g_idx.cpu()) - quantizers['%s' % (name)] = (gptq[name].layer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu()) - - gptq[name].free() - for j in range(nsamples): - layer = layer.to(dev) - outs[j] = layer(inps[j].unsqueeze(0))[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - return quantizers - - -def model_pack(model, quantizers, wbits, groupsize): - pack_model(model, quantizers, wbits, groupsize) - return model - - -def cai_linear_pack(linear, scales, zeros, out_qweight, out_qscales, out_qzeros, qg_idx, infeatures, groupsize, bits): - g_idx = qg_idx.clone() if qg_idx is not None else torch.tensor([i // groupsize for i in range(infeatures)], - dtype=torch.int32) - - scales = scales.t().contiguous() - zeros = zeros.t().contiguous() - scale_zeros = zeros * scales - half_scales = scales.clone().half() - # print("scale shape ", scales.shape, scale_zeros.shape, linear.weight.shape) - - out_qscales.data.copy_(scales) - - # wn = 16 - # pbits = 64 - # ptype = torch.int64 - # unsign_type = np.uint64 - # sign_type = np.int64 - - wn = 8 - pbits = 32 - ptype = torch.int32 - unsign_type = np.uint32 - sign_type = np.int32 - - intweight = [] - for idx in range(infeatures): - intweight.append( - torch.round( - (linear.weight.data[:, idx] + scale_zeros[g_idx[idx]]) / half_scales[g_idx[idx]]).to(ptype)[:, None]) - intweight = torch.cat(intweight, dim=1) - intweight = intweight.t().contiguous() - intweight = intweight.numpy().astype(unsign_type) - qweight = np.zeros((intweight.shape[0] // pbits * bits, intweight.shape[1]), dtype=unsign_type) - - i = 0 - row = 0 - # print("weight shape ", intweight.shape, qweight.shape, out_qweight.shape, bits) - # print("weight shape ", intweight[0].shape, qweight[0].shape, out_qweight[0].shape) - # print("weight value ", intweight[0], qweight[0]) - - while row < qweight.shape[0]: - if bits in [2, 4, 8]: - for j in range(i, i + (pbits // bits)): - qweight[row] |= intweight[j] << (bits * (j - i)) - i += pbits // bits - row += 1 - else: - raise 
NotImplementedError("Only 2,4,8 bits are supported.") - qweight = qweight.astype(sign_type) - qweight1 = torch.from_numpy(qweight) - qweight1 = qweight1.contiguous().to("cuda") - out_qweight.data.copy_(qweight1) - - qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // pbits * bits), dtype=unsign_type) - zeros -= 1 - zeros = zeros.numpy().astype(unsign_type) - i = 0 - col = 0 - while col < qzeros.shape[1]: - if bits in [2, 4, 8]: - for j in range(i, i + (pbits // bits)): - qzeros[:, col] |= zeros[:, j] << (bits * (j - i)) - i += pbits // bits - col += 1 - else: - raise NotImplementedError("Only 2,4,8 bits are supported.") - qzeros = qzeros.astype(sign_type) - qzeros = torch.from_numpy(qzeros) - qzeros = qzeros.to("cuda") - out_qzeros.data.copy_(qzeros) - - return out_qweight, out_qscales, out_qzeros - - -def get_model_param(model, quantizers): - layers = find_layers(model) - layers = {n: layers[n] for n in quantizers} - with torch.no_grad(): - for name in layers: - _, scale, zero, g_idx = quantizers[name] +def init_buffer(cai_linear, use_act_order=False): + global max_dq_buffer_size + global max_input_len + global max_dq_buffer_size + global max_inner_outer_dim + global gptq_temp_dq_buffer + global gptq_temp_state_buffer - return scale, zero, g_idx + max_dq_buffer_size = max(max_dq_buffer_size, cai_linear.qweight.numel() * 8) + if use_act_order: + max_inner_outer_dim = max(max_inner_outer_dim, cai_linear.infeatures, cai_linear.outfeatures) -def model_cai_pack(model, quantizers, qweight, qscales, qzeros, wbits, groupsize): - layers = find_layers(model) - layers = {n: layers[n] for n in quantizers} - with torch.no_grad(): - for name in layers: - _, scale, zero, g_idx = quantizers[name] - qweight, qscales, qzeros = cai_linear_pack(layers[name], scale, zero, qweight, qscales, qzeros, g_idx, - layers[name].weight.shape[-1], groupsize, wbits) + if use_act_order: + max_input_len = 4096 + # The temp_state buffer is required to reorder X in the act-order case. + # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. + gptq_temp_state_buffer = torch.zeros((max_input_len, max_inner_outer_dim), + dtype=torch.float16, + device=torch.cuda.current_device()) + gptq_temp_dq_buffer = torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=torch.cuda.current_device()) - # print("cai pack", layers) - return qweight, qscales, qzeros + gptq_cuda.prepare_buffers(torch.device(torch.cuda.current_device()), gptq_temp_state_buffer, gptq_temp_dq_buffer) + # Using the default from exllama repo here. 
+ matmul_recons_thd = 8 + matmul_fused_remap = False + matmul_no_half2 = False + gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) @pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ, reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq") def test_gptq_linear(): - infeature = 5120 - outfeature = 5120 + infeature = 1024 + outfeature = 1024 + group_size = 128 + wbits = 4 - weight = torch.randn(outfeature, infeature).to(torch.float16).to(torch.cuda.current_device()) - bias = torch.zeros(outfeature).to(torch.float16).to(torch.cuda.current_device()) - # wn = 16 - # ptype = torch.int64 + inps = torch.ones(1, 1, infeature).to(torch.float16).to(torch.cuda.current_device()) + batch_inps = torch.randn(1, 16, infeature).to(torch.float16).to(torch.cuda.current_device()) - wn = 8 - ptype = torch.int32 + device = torch.device("cuda:0") - qweight = torch.zeros(infeature // wn, outfeature, dtype=ptype, device=torch.cuda.current_device()).contiguous() - qscales = torch.zeros(infeature // groupsize, outfeature, dtype=torch.float16, - device=torch.cuda.current_device()).contiguous() - qzeros = torch.zeros(infeature // groupsize, outfeature // wn, dtype=ptype, - device=torch.cuda.current_device()).contiguous() + linear_class = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=group_size, bits=wbits) - act_func = nn.SiLU() - inps = torch.ones(1, 1, infeature).to(torch.float16).to(torch.cuda.current_device()) - batch_inps = torch.randn(1, 4096, infeature).to(torch.float16).to(torch.cuda.current_device()) + linear = linear_class( + bits=4, + group_size=group_size, + infeatures=infeature, + outfeatures=outfeature, + bias=False, + ) - linear = MLinear(infeature, outfeature) - linear.to(torch.cuda.current_device()) + torch.manual_seed(42) - with torch.no_grad(): - linear.linear.weight.data.copy_(weight) - linear.linear.bias.data.copy_(bias) + linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32) + linear.scales = linear.scales + 0.002 - with torch.no_grad(): - torch_out = linear(inps) - batch_torch_out = linear(batch_inps) - # torch_out = act_func(torch_out) - # batch_torch_out = act_func(batch_torch_out) + linear = linear.to(device) - # linear.to("cuda") - quantizers = model_quant(linear, inps, torch.cuda.current_device()) - # qweight, qscales, qzeros = model_cai_pack(linear, quantizers, qweight, qscales, qzeros, wbits, groupsize) + cai_linear = CaiQuantLinear(wbits, group_size, infeature, outfeature, True) + cai_linear.qweight.data.copy_(linear.qweight) + cai_linear.scales = cai_linear.scales + 0.002 + cai_linear = cai_linear.to(device) - scale, zero, g_idx = get_model_param(linear, quantizers) - cai_linear = CaiQuantLinear(wbits, groupsize, infeature, outfeature, True) + linear = autogptq_post_init(linear, use_act_order=False) - cai_linear.to("cuda") - cai_linear.pack(linear.linear, scale, zero, g_idx) - cai_linear.to("cuda") + max_inner_outer_dim = max(infeature, outfeature) + max_dq_buffer_size = linear.infeatures * linear.outfeatures + max_input_len = 2048 + buffers = { + "temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device), + "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device) + } - gptq_model = model_pack(linear, quantizers, wbits, groupsize) - gptq_model.to(torch.cuda.current_device()) - gptq_model = autogptq_post_init(gptq_model, False) + prepare_buffers(device, 
buffers["temp_state"], buffers["temp_dq"]) + + # Using the default from exllama repo here. + matmul_recons_thd = 8 + matmul_fused_remap = False + matmul_no_half2 = False + set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) with torch.no_grad(): - gptq_out = gptq_model(inps) - batch_gptq_out = gptq_model(batch_inps) + gptq_out = linear(inps) + batch_gptq_out = linear(batch_inps) torch.cuda.synchronize() cai_out = cai_linear(inps) torch.cuda.synchronize() batch_cai_out = cai_linear(batch_inps) torch.cuda.synchronize() - # batch_gptq_out = act_func(batch_gptq_out) - # gptq_out = act_func(gptq_out) assert torch.allclose(cai_out, gptq_out, rtol=1e-01, atol=1e-01) assert torch.allclose(batch_cai_out, batch_gptq_out, rtol=1e-01, atol=1e-01) - # mean_diff = torch.mean(torch.abs(cai_out - gptq_out)) - # max_diff = torch.max(torch.abs(cai_out - gptq_out)) - # print("cai vs gptq: mean_diff=%.8f, max_diff=%.8f" % (mean_diff, max_diff)) - # mean_diff = torch.mean(torch.abs(torch_out - gptq_out)) - # max_diff = torch.max(torch.abs(torch_out - gptq_out)) - # print("torch vs gptq: mean_diff=%.8f, max_diff=%.8f" % (mean_diff, max_diff)) - # mean_diff = torch.mean(torch.abs(torch_out - cai_out)) - # max_diff = torch.max(torch.abs(torch_out - cai_out)) - # print("torch vs cai: mean_diff=%.8f, max_diff=%.8f" % (mean_diff, max_diff)) - - # mean_diff = torch.mean(torch.abs(batch_cai_out - batch_gptq_out)) - # max_diff = torch.max(torch.abs(batch_cai_out - batch_gptq_out)) - # print("batch cai vs gptq: mean_diff=%.8f, max_diff=%.8f" % (mean_diff, max_diff)) - # mean_diff = torch.mean(torch.abs(batch_torch_out - batch_gptq_out)) - # max_diff = torch.max(torch.abs(batch_torch_out - batch_gptq_out)) - # print("batch torch vs gptq: mean_diff=%.8f, max_diff=%.8f" % (mean_diff, max_diff)) - # mean_diff = torch.mean(torch.abs(batch_torch_out - batch_cai_out)) - # max_diff = torch.max(torch.abs(batch_torch_out - batch_cai_out)) - # print("batch torch vs cai: mean_diff=%.8f, max_diff=%.8f" % (mean_diff, max_diff)) - if __name__ == "__main__":