From 3c03564ca045918d832c3a9e3889a222f607df95 Mon Sep 17 00:00:00 2001 From: shaoxin Date: Thu, 29 Jun 2023 20:16:02 +0800 Subject: [PATCH 1/3] support int8 & share context --- examples/cpp/llama/llama_example.cc | 36 ++- src/fastertransformer/.DS_Store | Bin 0 -> 6148 bytes src/fastertransformer/models/.DS_Store | Bin 0 -> 8196 bytes src/fastertransformer/models/llama/Llama.cc | 186 ++++++++----- src/fastertransformer/models/llama/Llama.h | 15 +- .../models/llama/LlamaContextDecoder.cc | 132 +++++++-- .../models/llama/LlamaContextDecoder.h | 11 +- .../models/llama/LlamaDecoder.cc | 9 +- .../models/llama/LlamaDecoder.h | 3 + .../models/llama/LlamaDecoderLayerWeight.cc | 253 ++++++++++++++---- .../models/llama/LlamaDecoderLayerWeight.h | 9 +- .../models/llama/LlamaWeight.cc | 6 +- .../models/llama/LlamaWeight.h | 3 + 13 files changed, 514 insertions(+), 149 deletions(-) create mode 100644 src/fastertransformer/.DS_Store create mode 100644 src/fastertransformer/models/.DS_Store diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a64c86bca..49d966772 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,6 +81,7 @@ void llama_example(const INIReader reader) int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -177,6 +178,7 @@ void llama_example(const INIReader reader) tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); } + int* d_stop_words = nullptr; deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); @@ -193,6 +195,7 @@ void llama_example(const INIReader reader) 1, "../examples/cpp/llama/start_ids.csv"); + int* d_input_ids; int* d_input_lengths; if (max_input_len == 0) { @@ -274,6 +277,7 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } + printf("******* Enter gpt_weights ********** \n"); const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, @@ -285,9 +289,12 @@ void llama_example(const INIReader reader) pipeline_para.world_size_, pipeline_para.rank_, use_gptj_residual, + int8_mode, prompt_learning_type, prefix_prompt_table_pair); + printf("******* Enter loadModel ********* \n"); + gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { @@ -305,6 +312,8 @@ void llama_example(const INIReader reader) false, // with_relative_position_bias true); // causal_mask + printf("******* Inilize Llama ********* \n"); + Llama gpt = Llama(head_num, size_per_head, inter_size, @@ -331,12 +340,22 @@ void llama_example(const INIReader reader) &allocator, false, &prop, - attention_type); + attention_type, + int8_mode, + nullptr, + 0, + 1.0f); int* d_output_ids; int* d_sequence_lengths; + + printf("******* deviceMalloc start ********* \n"); + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + + printf("******* deviceMalloc end ********* \n"); + std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map 
input_tensors = std::unordered_map{ {"input_ids", @@ -402,6 +421,8 @@ void llama_example(const INIReader reader) print_mem_usage(); + printf("******* before cudaDeviceSynchronize ********* \n"); + int ite = 1; cudaDeviceSynchronize(); mpi::barrier(); @@ -411,15 +432,21 @@ void llama_example(const INIReader reader) ite = 1; ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") + + printf("******* before gpt.forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } + + printf("******* end gpt.forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); + printf("******* end cudaDeviceSynchronize ********* \n"); + if (rank == 0) { std::string fName = "out"; @@ -430,8 +457,12 @@ void llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; + + printf("******* before cudaD2Hcpy ********* \n"); + cudaD2Hcpy(hBuf, d_output_ids, outCount); + printf("******* end cudaD2Hcpy ********* \n"); { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; @@ -465,10 +496,11 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") + printf("******* before gpt forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - + printf("******* after gpt forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d1845e2dff13bedf5e41cf42a41978e6bf67cb38 GIT binary patch literal 6148 zcmeHK%}T>S5Z<-5O({YT3VK`cS}<)D3SL63FJMFuDm5WNgK4%jsX3HF?)pN$h|lB9 z?&c5-coVTRu=~x<&u->}>pQRquM&4!|OBpn}{f&<68nz7<3F48o>j? 
zbt<4v<>raObvoFEiE|7V8g)A3YGs(mtXw``xLO_TLWMK#Xr!JPAO@-oH1x2C=l>=A zGPRHVY6^{r0b<~vF~FNscRGQh%-Q;_JUnX!v`1(t7+0VI0(#{V00Z1d4wX~;1?mvz X7%Vj6ENEBhfOHX1giuEe`~m}CV@gb8 literal 0 HcmV?d00001 diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cd5987b054b2d9ac9e7e3b48fcc55e66056ebd71 GIT binary patch literal 8196 zcmeHML2uJA6n^eHnlgrZ0BIbMB5|F@x@lq(m(q;`5?m-^2S6oR!xomsRg9 zH}DrY^GEnEoZx%5J2~!pMF{N5eu@2kFMgk$rY|KT(H#%kL~BG8pt5bPpt+{-IM=yS zGd)Ze;M22A9>$rBao(Ck`wp*wSHLUa74Qmp1^xyF@XY2CEO_qgQQvw6yaNBF0{s0D zqO$E6JJnWwb)Zox0BjxIs^D1t^oI?(1GZ!AR9p0*2u+36RAEvKVKN7y=@{=A{8U>_ zC!wwkKjzB9WGKQU9wJn55*=-Q>lN?{%qzgTd!1s6Xhfxxzdym)#xGSA@fLM~D=47@ z)Rd0sm`d89V~6+6;j#Z@-oLo-;DQ;B`t*T@v|IHnK0ElPg-tv(r2K~|T%Ies-nb#a~N;K?2CzgESw=Nmq{ZL7})kJrFd5X1{gs;$pkqUJu(_k9IEC!;9VR?e%bHYxDB5 z8QgvF@bSz3Np_k`-exGmJZWfEpI^0KaAbva)*I!S%z3*o<0eFryoA`o5}W>%w|-Sh z!tz%HS_NthUWqN)SBo_^*108lBUG4mY~Kl3dG!i>t5k5!84i9I-YOMF4YS#)dgU!p z!7R}kyg75^TS0uXmOs#_MORj8*!X2clt976^{k?=BF~}Pbx4liMqVwRKky281+)T7 z+N`a>K3)9&-^=9{@Cy9H3W)lCZ@-HGXTNydSUJ}YQQxC-Vcb+(MM0y|aafg(!(RPi bh~p4Y=5&mmYKtBe{~|!i;2W>NA64Kt&SGW% literal 0 HcmV?d00001 diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8e3f1cedf..8f66e7def 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -42,6 +42,7 @@ void Llama::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void Llama::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -165,6 +167,13 @@ void Llama::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -216,6 +225,11 @@ void Llama::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -246,8 +260,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -263,7 +279,9 @@ Llama::Llama(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -310,8 +328,10 @@ Llama::Llama(size_t 
head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -331,7 +351,9 @@ Llama::Llama(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -363,7 +385,9 @@ Llama::Llama(Llama const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -585,6 +609,23 @@ void Llama::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -686,6 +727,14 @@ void Llama::forward(std::unordered_map* output_ten {batch_size * beam_width}, has_prefix_prompt_ ? 
tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -877,67 +926,71 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - if (tensor_para_.world_size_ == 1) { - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_padded_, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_padded_, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - } - else { - FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_ - + tensor_para_.rank_ * local_vocab_size * hidden_units_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - nccl_logits_buf_ + vocab_size_units_offset - + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - CUDA_R_32F, - local_vocab_size, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - local_batch_size * beam_width * local_vocab_size, - tensor_para_.rank_, - tensor_para_, - stream_); - invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - tensor_para_.world_size_, - local_batch_size * beam_width, - local_vocab_size, - stream_); - } - + + // if (tensor_para_.world_size_ == 1) { + // float alpha = 1.0f; + // float beta = 0.0f; + // cublas_wrapper_->Gemm(CUBLAS_OP_T, + // CUBLAS_OP_N, + // vocab_size_padded_, // n + // local_batch_size * beam_width, + // hidden_units_, // k + // &alpha, + // padded_embedding_kernel_ptr_, + // gemm_data_type, + // hidden_units_, // k + // normed_decoder_output_buf_ + hidden_units_offset, + // gemm_data_type, + // hidden_units_, // k + // &beta, + // logits_buf_ + vocab_size_units_offset, + // CUDA_R_32F, + // vocab_size_padded_, /* n */ + // CUDA_R_32F, + // cublasGemmAlgo_t(-1)); + // } + // else { + // FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + // const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + // float alpha = 1.0f; + // float beta = 0.0f; + // cublas_wrapper_->Gemm(CUBLAS_OP_T, + // CUBLAS_OP_N, + // local_vocab_size, // n + // local_batch_size * beam_width, + // hidden_units_, // k + // &alpha, + // padded_embedding_kernel_ptr_ + // + tensor_para_.rank_ * local_vocab_size * hidden_units_, + // gemm_data_type, + // hidden_units_, // k + // normed_decoder_output_buf_ + hidden_units_offset, + // gemm_data_type, + // hidden_units_, // k + // &beta, + // nccl_logits_buf_ + vocab_size_units_offset + // + tensor_para_.rank_ * 
local_batch_size * beam_width * local_vocab_size, + // CUDA_R_32F, + // local_vocab_size, /* n */ + // CUDA_R_32F, + // cublasGemmAlgo_t(-1)); + + + // ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + // nccl_logits_buf_ + vocab_size_units_offset, + // local_batch_size * beam_width * local_vocab_size, + // tensor_para_.rank_, + // tensor_para_, + // stream_); + // invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + // nccl_logits_buf_ + vocab_size_units_offset, + // tensor_para_.world_size_, + // local_batch_size * beam_width, + // local_vocab_size, + // stream_); + // } + + int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ @@ -1229,5 +1282,4 @@ template class Llama; #ifdef ENABLE_BF16 template class Llama<__nv_bfloat16>; #endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index f318e6947..a0958280e 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -41,6 +41,7 @@ class Llama: public BaseLayer { float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class Llama: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class Llama: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -165,8 +172,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(size_t head_num, size_t size_per_head, @@ -195,8 +204,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(Llama const& Llama); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index e3afd1780..a1d78c852 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -40,10 +40,11 @@ void LlamaContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); +// TODO: 这里的SiluFfnLayer是不支持int8的dataType,再不做修改的情况下试一下会不会报错。 ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -69,7 +70,7 @@ void LlamaContextDecoder::allocateBuffer() } template -void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool 
use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -83,6 +84,20 @@ void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -97,6 +112,13 @@ void LlamaContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -147,6 +169,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -163,6 +186,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -185,6 +209,7 @@ LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decode pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -238,15 +263,23 @@ void LlamaContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
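
Note on the shared-context inputs consumed below: when several requests in a batch carry an identical prompt, `Llama<T>::forward` (patch 1) calls `invokeFindContextDups` and hands the context decoder two extra tensors, `compact_idx` and `batch_to_compact_idx`, so the context phase only runs once per unique prompt. The sketch below is a host-side illustration of the mapping those tensors encode, assuming `beam_width == 1`; the helper name `find_context_dups` and the `ContextDups` struct are illustrative only, the real work is done on the GPU by `invokeFindContextDups`.

```cpp
#include <map>
#include <vector>

// Host-side sketch of the index layout used by the shared-context path.
// compact_idx[j]          = original batch index of the j-th unique prompt
// batch_to_compact_idx[i] = slot in the compacted batch that request i maps to
struct ContextDups {
    std::vector<int> compact_idx;
    std::vector<int> batch_to_compact_idx;
};

inline ContextDups find_context_dups(const std::vector<std::vector<int>>& input_ids)
{
    ContextDups out;
    out.batch_to_compact_idx.resize(input_ids.size());
    std::map<std::vector<int>, int> seen;  // prompt token ids -> compact slot
    for (int i = 0; i < static_cast<int>(input_ids.size()); ++i) {
        auto it = seen.find(input_ids[i]);
        if (it == seen.end()) {
            const int slot = static_cast<int>(out.compact_idx.size());
            seen.emplace(input_ids[i], slot);
            out.compact_idx.push_back(i);
            out.batch_to_compact_idx[i] = slot;
        }
        else {
            out.batch_to_compact_idx[i] = it->second;
        }
    }
    return out;
}
```

The compacted batch size is `compact_idx.size()`; in `Llama<T>::forward` that value (`compact_size`) is compared against `shared_contexts_ratio_ * batch_size`, and compaction is kept only if enough duplicates were found to make it worthwhile.
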
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -254,6 +287,20 @@ void LlamaContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -271,6 +318,12 @@ void LlamaContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -279,7 +332,8 @@ void LlamaContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -295,8 +349,9 @@ void LlamaContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -308,11 +363,11 @@ void LlamaContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -328,7 +383,7 @@ void LlamaContextDecoder::forward(std::unordered_map* ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); } } - + // TODO: 这里用的LN跟neox不一样,不太清楚这里需不需要改成int8的LN invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -337,8 +392,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -346,7 +404,7 @@ void LlamaContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -381,17 +439,43 @@ void LlamaContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
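
The call that follows scatters the compacted results back out: K/V values produced for the compacted batch land first in the temporary `k_cache_layer_` / `v_cache_layer_` buffers and are then copied into the full-size cache by `invokeUnCompactCaches`, indexed by `batch_to_compact_idx`; decoder outputs are expanded the same way by `invokeUnCompactOutputs` at the end of `forward`. Below is a simplified CPU illustration of that scatter for the output features only; `uncompact_outputs` is a made-up helper name, and the real kernels additionally handle layer, head, and cache-stride bookkeeping.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Simplified illustration of the uncompact step: copy each compacted row back to
// every original batch entry that referenced it. "row_size" corresponds to
// seq_len * hidden_units_ in LlamaContextDecoder::forward.
inline void uncompact_outputs(std::vector<float>&       decoder_output,    // [request_batch_size * row_size]
                              const std::vector<float>& compact_features,  // [compact_batch_size * row_size]
                              const std::vector<int>&   batch_to_compact_idx,
                              std::size_t               row_size)
{
    for (std::size_t i = 0; i < batch_to_compact_idx.size(); ++i) {
        const std::size_t src = static_cast<std::size_t>(batch_to_compact_idx[i]);
        std::copy(compact_features.begin() + src * row_size,
                  compact_features.begin() + (src + 1) * row_size,
                  decoder_output.begin() + i * row_size);
    }
}
```
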
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, @@ -402,10 +486,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { + // TODO: modify or not ? invokeGeneralAddResidualT5PreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -472,7 +557,8 @@ void LlamaContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -483,12 +569,22 @@ void LlamaContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h index 788d1d45d..a2f91f7b8 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.h +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -56,13 +56,15 @@ class LlamaContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class LlamaContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: LlamaContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class LlamaContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git 
a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index a5cffa731..4685217b0 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -35,10 +35,11 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + // TODO: SiLu ftn layer not support int8 ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, @@ -133,6 +134,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -147,6 +149,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +170,7 @@ LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -260,6 +264,7 @@ void LlamaDecoder::forward(std::unordered_map* } } + // TODO 使用的是T5 LN,这里是没有int8的参数支持 invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -301,7 +306,7 @@ void LlamaDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h index 6cdd7df27..dc44b0f32 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.h +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -70,6 +70,8 @@ class LlamaDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: LlamaDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class LlamaDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 3e97b67d0..a8dadefea 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -24,15 +24,26 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode), use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "Llama doesn't support int8_model == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && 
int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -60,38 +71,92 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() ffn_weights.intermediate_weight2.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + + // 作一下标记 intermediate_weight => gate_proj; intermediate_weight2 => up_proj; output_weight => down_proj. + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight2.int8_kernel = nullptr; + ffn_weights.intermediate_weight2.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void LlamaDecoderLayerWeight::copyFrom(const LlamaDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], 
other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[4], other.int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + + // TODO: 不太清楚这里存的缩放因子对应的是gate_pro_weight 还是给 up_proj/down_proj用的,后面做一下验证,回来再改一下 + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_); + } + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -102,26 +167,12 @@ LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); - cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); - cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -137,42 +188,90 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + 
".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + // loadWeightFromBin(weights_ptr[6], + // {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + // dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + // model_file_type); + + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + // loadWeightFromBin(weights_ptr[10], + // {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + // dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + // model_file_type); + + + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[4], + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); - } - // FIXME(sunpeng17): check if the weights are correct - loadWeightFromBin(weights_ptr[6], + loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[8], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - - loadWeightFromBin(weights_ptr[10], + loadWeightFromBin(weights_ptr[10], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); - deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); - loadWeightFromBin( - weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." 
+ rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[4], + weight_only_scale_ptr[4], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + model_file_type); + + } } template @@ -194,6 +293,23 @@ void LlamaDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.intermediate_weight2.int8_kernel = int8_weights_ptr[3]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[4]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.intermediate_weight2.weight_only_quant_scale = weight_only_scale_ptr[3]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[4]; + } + } + is_maintain_buffer = true; } @@ -202,21 +318,48 @@ void LlamaDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + // deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + // deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + // deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias - deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + // deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias - deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + // deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel 
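
The `int8_mode_ == 1` branch above loads each of the five GEMM weights through `loadWeightFromBinAndQuantizeForWeightOnly`, producing an int8 kernel plus one floating-point scale per output column; that is why each `weight_only_scale_ptr` entry allocated in `mallocWeights()` is sized to the GEMM's output dimension (for example `3 * hidden_units_ / tensor_para_size_` for the fused QKV weight). The sketch below shows the usual symmetric per-column scheme such a loader implements; it is an illustration of the idea with a hypothetical `quantize_weight_only` helper, not the FasterTransformer loader, and it ignores the interleaved layout the weight-only GEMM kernels expect.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric per-output-column weight-only quantization (int8_mode == 1), sketched on
// the host. "weight" is row-major [k, n] as stored in the *.bin files; the result is an
// int8 kernel of the same shape plus one fp32 scale per column, matching the sizes
// allocated for int8_weights_ptr / weight_only_scale_ptr.
inline void quantize_weight_only(const std::vector<float>& weight,       // [k * n]
                                 std::size_t k, std::size_t n,
                                 std::vector<int8_t>&      int8_kernel,  // out: [k * n]
                                 std::vector<float>&       scales)       // out: [n]
{
    int8_kernel.resize(k * n);
    scales.resize(n);
    for (std::size_t col = 0; col < n; ++col) {
        float max_abs = 0.f;
        for (std::size_t row = 0; row < k; ++row) {
            max_abs = std::max(max_abs, std::fabs(weight[row * n + col]));
        }
        const float scale = (max_abs > 0.f) ? max_abs / 127.f : 1.f;
        scales[col] = scale;
        for (std::size_t row = 0; row < k; ++row) {
            const float q = std::round(weight[row * n + col] / scale);
            int8_kernel[row * n + col] = static_cast<int8_t>(std::max(-127.f, std::min(127.f, q)));
        }
    }
}
```

At inference time the weight-only GEMM dequantizes on the fly (roughly `output ≈ activation * (int8_kernel * scale)`), keeping activations in FP16 while halving weight memory relative to FP16 storage.
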
deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[4], hidden_units_); + } + } + } template struct LlamaDecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h index 008e1a3b4..5a76ba4c5 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int int8_mode); LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct LlamaDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[14]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(5, nullptr); + std::vector weight_only_scale_ptr = std::vector(5, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const LlamaDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 6105267ff..e9e11b6a1 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -29,6 +29,7 @@ LlamaWeight::LlamaWeight(const int hidden_un const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType 
prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ LlamaWeight::LlamaWeight(const int hidden_un layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ LlamaWeight::LlamaWeight(const int hidden_un for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ LlamaWeight::LlamaWeight(const LlamaWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index ec909ca49..eb90dc5bf 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -38,6 +38,7 @@ struct LlamaWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = false, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct LlamaWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_; From 6f0469f2c3840bbfe31aa39a276281f3710cbd7e Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 30 Jun 2023 14:08:17 +0800 Subject: [PATCH 2/3] update code with comments from reviewer --- .gitignore | 4 +- examples/cpp/llama/llama_example.cc | 1 - src/fastertransformer/.DS_Store | Bin 6148 -> 6148 bytes src/fastertransformer/layers/FfnLayer.cc | 3 +- src/fastertransformer/layers/FfnLayer.h | 1 + .../layers/TensorParallelSiluFfnLayer.cc | 4 +- .../layers/TensorParallelSiluFfnLayer.h | 3 +- .../adapter_layers/LinearAdapterLayer.cc | 3 +- src/fastertransformer/models/.DS_Store | Bin 8196 -> 8196 bytes src/fastertransformer/models/llama/Llama.cc | 120 +++++++++--------- .../models/llama/LlamaContextDecoder.cc | 5 +- .../models/llama/LlamaDecoder.cc | 3 +- 12 files changed, 78 insertions(+), 69 deletions(-) diff --git a/.gitignore b/.gitignore index 77849f435..8ecc0cca0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ __pycache__/ /models /notebooks **/.ipynb_checkpoints/ +.DS_Store /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ + diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 49d966772..14cff7bee 
100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -541,6 +541,5 @@ void llama_example(const INIReader reader) if (d_sequence_lengths != nullptr) { deviceFree(d_sequence_lengths); } - return; } diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store index d1845e2dff13bedf5e41cf42a41978e6bf67cb38..d4edc9af914042cba222ab33425eac19388ca6d5 100644 GIT binary patch literal 6148 zcmeHK%We}f6unOKm{1WS1X5Wbjl?!8Y0_4y*o3q!kYGa)EC3CeOlU`)j-$zhfTBoQ z1K+?eu!SGtUs%Dp9v96dQiC=d@-8 z-BJSzHO7b@kxvtfk&GDyi~@hV0vx+FI;6)mpi@f6?>3_N9mNj=IdlW*g2#u%eWdN5 z%1|P@PD7f48Nh#nkApGbg*-&fC&{=n#+Q{bDr)8k%n`;}VT;=t_v4?q z?rbvFEBWe<1Ah`Xo8LuYqgX0etcta1z3QLHSw9}c(@Ad-z2MqYDFgSsJaC`+qnT5` zaVW!h;D@6~Ir!cPl$X!_(37*CoQB>+`Sz0zt7=u9`qq5j+HW@OcI#l#u;&LYly}?v zi$&GCdFSrEC(dbb7RnF!{xE^lRM}OH=fLaBtp9C43IiD)qn&-qbOyz}TpfR)KmE&P zxG8PEUAi{6pv`WlVehm$O%|uqZa3_`R&zxhUOenVaXz6lv#1quuKnJY5?d!i&7m}QN!#eEwH3%xk})_frq4VmZ?JGb1Ed5d6LWIjh|-Lz>F|=tir@07eO>s!4Q&{ zokfK~+P9>_&d&)L)+xa{<`ZxJ(c&F!M)1$zU?0z@;ki3m={olK=n! delta 96 zcmZoMXfc=|#>B`mu~2NHo}wrd0|Nsi1A_nqLk>f6Qh9MfQcix-#KPs14MbQbv$B2M sEW#nova#U{<7Rdaeh#3%&4L`?nJ4p$SOT?xwSr6l8NsqSLSzjy0AfWI5&!@I diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc index 7ac441198..14bb5e3f6 100644 --- a/src/fastertransformer/layers/FfnLayer.cc +++ b/src/fastertransformer/layers/FfnLayer.cc @@ -684,6 +684,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse, + int int8_mode, bool use_gated_activation): FfnLayer(max_batch_size, max_seq_len, @@ -696,7 +697,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, allocator, is_free_buffer_after_forward, sparse, - 0, + int8_mode, use_gated_activation) { } diff --git a/src/fastertransformer/layers/FfnLayer.h b/src/fastertransformer/layers/FfnLayer.h index af7ae7606..f84915d2f 100644 --- a/src/fastertransformer/layers/FfnLayer.h +++ b/src/fastertransformer/layers/FfnLayer.h @@ -210,6 +210,7 @@ class SiluFfnLayer: public FfnLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse = false, + int int8_mode = 0, bool use_gated_activation = false); SiluFfnLayer(SiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc index 25a2da86b..af4714d82 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc @@ -76,7 +76,8 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b bool is_sparse, bool use_gated_activation, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + int int8_mode): SiluFfnLayer(max_batch_size, max_seq_len, head_num, @@ -88,6 +89,7 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b allocator, is_free_buffer_after_forward, is_sparse, + int8_mode, use_gated_activation), tensor_para_(tensor_para), custom_all_reduce_comm_(custom_all_reduce_comm), diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h index ae481373a..5f0e6d625 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h @@ -47,7 +47,8 @@ class TensorParallelSiluFfnLayer: public SiluFfnLayer { bool is_sparse, bool use_gated_activation = false, std::shared_ptr 
custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + int int8_mode = 0); TensorParallelSiluFfnLayer(TensorParallelSiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc index c5ea150f4..ef2ac7b3a 100644 --- a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc +++ b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc @@ -88,7 +88,8 @@ LinearAdapterLayer::LinearAdapterLayer(LinearAdapterConfig const& co is_sparse, false, custom_all_reduce_comm, - enable_custom_all_reduce)}, + enable_custom_all_reduce, + 0)}, layer_norm_type_{config.layerNormType()}, layer_norm_eps_{layer_norm_eps}, max_token_size_{max_batch_size * max_seq_len}, diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store index cd5987b054b2d9ac9e7e3b48fcc55e66056ebd71..de13d48fdd4e9e824db21adcab91399ef3727fa6 100644 GIT binary patch delta 22 dcmZp1XmQx^ONi6R#6m~G(8$7IGqW%wF91_V1=;`r delta 22 dcmZp1XmQx^ONi6h&`?Lg(8%0$GqW%wF91^U1=Rom diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8f66e7def..1eac9fd20 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -927,68 +927,68 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); - // if (tensor_para_.world_size_ == 1) { - // float alpha = 1.0f; - // float beta = 0.0f; - // cublas_wrapper_->Gemm(CUBLAS_OP_T, - // CUBLAS_OP_N, - // vocab_size_padded_, // n - // local_batch_size * beam_width, - // hidden_units_, // k - // &alpha, - // padded_embedding_kernel_ptr_, - // gemm_data_type, - // hidden_units_, // k - // normed_decoder_output_buf_ + hidden_units_offset, - // gemm_data_type, - // hidden_units_, // k - // &beta, - // logits_buf_ + vocab_size_units_offset, - // CUDA_R_32F, - // vocab_size_padded_, /* n */ - // CUDA_R_32F, - // cublasGemmAlgo_t(-1)); - // } - // else { - // FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - // const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - // float alpha = 1.0f; - // float beta = 0.0f; - // cublas_wrapper_->Gemm(CUBLAS_OP_T, - // CUBLAS_OP_N, - // local_vocab_size, // n - // local_batch_size * beam_width, - // hidden_units_, // k - // &alpha, - // padded_embedding_kernel_ptr_ - // + tensor_para_.rank_ * local_vocab_size * hidden_units_, - // gemm_data_type, - // hidden_units_, // k - // normed_decoder_output_buf_ + hidden_units_offset, - // gemm_data_type, - // hidden_units_, // k - // &beta, - // nccl_logits_buf_ + vocab_size_units_offset - // + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - // CUDA_R_32F, - // local_vocab_size, /* n */ - // CUDA_R_32F, - // cublasGemmAlgo_t(-1)); + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = 
vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); - // ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - // nccl_logits_buf_ + vocab_size_units_offset, - // local_batch_size * beam_width * local_vocab_size, - // tensor_para_.rank_, - // tensor_para_, - // stream_); - // invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - // nccl_logits_buf_ + vocab_size_units_offset, - // tensor_para_.world_size_, - // local_batch_size * beam_width, - // local_vocab_size, - // stream_); - // } + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } int tmp_local_batch_size = local_batch_size; diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index a1d78c852..0595358a0 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -44,7 +44,7 @@ void LlamaContextDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); -// TODO: 这里的SiluFfnLayer是不支持int8的dataType,再不做修改的情况下试一下会不会报错。 + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -60,7 +60,8 @@ void LlamaContextDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 4685217b0..b68c82207 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -55,7 +55,8 @@ void LlamaDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template From 0e6ae5b1a48ad55a451788c39815518d9d6b5c3f Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 30 Jun 2023 20:38:06 +0800 Subject: [PATCH 3/3] rm unused code --- examples/cpp/llama/llama_example.cc | 18 ------------------ src/fastertransformer/.DS_Store | Bin 6148 -> 0 bytes src/fastertransformer/models/.DS_Store | Bin 8196 -> 0 bytes 3 files changed, 18 deletions(-) delete mode 100644 src/fastertransformer/.DS_Store delete mode 100644 src/fastertransformer/models/.DS_Store diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 14cff7bee..b42aea9bc 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -277,7 +277,6 @@ void llama_example(const 
INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - printf("******* Enter gpt_weights ********** \n"); const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, @@ -293,8 +292,6 @@ void llama_example(const INIReader reader) prompt_learning_type, prefix_prompt_table_pair); - printf("******* Enter loadModel ********* \n"); - gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { @@ -312,8 +309,6 @@ void llama_example(const INIReader reader) false, // with_relative_position_bias true); // causal_mask - printf("******* Inilize Llama ********* \n"); - Llama gpt = Llama(head_num, size_per_head, inter_size, @@ -349,13 +344,10 @@ void llama_example(const INIReader reader) int* d_output_ids; int* d_sequence_lengths; - printf("******* deviceMalloc start ********* \n"); deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); - printf("******* deviceMalloc end ********* \n"); - std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -421,8 +413,6 @@ void llama_example(const INIReader reader) print_mem_usage(); - printf("******* before cudaDeviceSynchronize ********* \n"); - int ite = 1; cudaDeviceSynchronize(); mpi::barrier(); @@ -433,19 +423,16 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") - printf("******* before gpt.forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - printf("******* end gpt.forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); - printf("******* end cudaDeviceSynchronize ********* \n"); if (rank == 0) { @@ -458,11 +445,8 @@ void llama_example(const INIReader reader) size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; - printf("******* before cudaD2Hcpy ********* \n"); - cudaD2Hcpy(hBuf, d_output_ids, outCount); - printf("******* end cudaD2Hcpy ********* \n"); { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; @@ -496,11 +480,9 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") - printf("******* before gpt forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - printf("******* after gpt forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store deleted file mode 100644 index d4edc9af914042cba222ab33425eac19388ca6d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%We}f6unOKm{1WS1X5Wbjl?!8Y0_4y*o3q!kYGa)EC3CeOlU`)j-$zhfTBoQ z1K+?eu!SGtUs%Dp9v96dQiC=d@-8 z-BJSzHO7b@kxvtfk&GDyi~@hV0vx+FI;6)mpi@f6?>3_N9mNj=IdlW*g2#u%eWdN5 z%1|P@PD7f48Nh#nkApGbg*-&fC&{=n#+Q{bDr)8k%n`;}VT;=t_v4?q z?rbvFEBWe<1Ah`Xo8LuYqgX0etcta1z3QLHSw9}c(@Ad-z2MqYDFgSsJaC`+qnT5` zaVW!h;D@6~Ir!cPl$X!_(37*CoQB>+`Sz0zt7=u9`qq5j+HW@OcI#l#u;&LYly}?v zi$&GCdFSrEC(dbb7RnF!{xE^lRM}OH=fLaBtp9C43IiD)qn&-qbOyz}TpfR)KmE&P zxG8PEUAi{6pv`WlVehm$O%|uqZa3_`R&zxhUOenVaXz6lv#1quuKnJY5?d!i&7m}QN!#eEwH3%xk})_frq4VmZ?JGb1Ed5d6LWIjh|-Lz>F|=tir@07eO>s!4Q&{ zokfK~+P9>_&d&)L)+xa{<`ZxJ(c&F!M)1$zU?0z@;ki3m={olK=n! 
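Note on the int8 path: the int8_mode flag introduced in patch 1 is threaded through LlamaWeight, LlamaDecoderLayerWeight, and the Silu FFN layers above so that the weight objects and GEMMs can switch to an INT8 representation. As a rough mental model only — assuming a symmetric per-channel weight-only scheme, which is not necessarily the layout or recipe used by the FasterTransformer kernels, and with hypothetical helper names — the standalone sketch below shows how an FP32 weight matrix would be quantized to int8 values plus one FP32 scale per output channel:

// Illustrative only: minimal host-side sketch of symmetric per-channel
// weight-only INT8 quantization, the kind of scheme an int8_mode flag
// typically selects. Names and recipe are assumptions for illustration,
// not the FasterTransformer kernels or data layout.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize a [rows x cols] row-major FP32 weight to INT8 with one scale per
// output column (per-channel), so dequantization is w ~= int8_val * scale[col].
void quantize_per_channel(const std::vector<float>& w,
                          int rows,
                          int cols,
                          std::vector<int8_t>& w_int8,
                          std::vector<float>&  scales)
{
    w_int8.resize(w.size());
    scales.assign(cols, 0.f);
    for (int c = 0; c < cols; ++c) {
        float max_abs = 1e-8f;  // avoid division by zero for all-zero columns
        for (int r = 0; r < rows; ++r)
            max_abs = std::fmax(max_abs, std::fabs(w[r * cols + c]));
        scales[c] = max_abs / 127.f;
        for (int r = 0; r < rows; ++r) {
            int q = static_cast<int>(std::lround(w[r * cols + c] / scales[c]));
            q = q < -127 ? -127 : (q > 127 ? 127 : q);
            w_int8[r * cols + c] = static_cast<int8_t>(q);
        }
    }
}

int main()
{
    const int rows = 4, cols = 3;
    std::vector<float> w = {0.10f, -0.40f,  0.03f,
                            0.25f,  0.80f, -0.07f,
                           -0.50f,  0.20f,  0.01f,
                            0.05f, -0.60f,  0.02f};
    std::vector<int8_t> w_int8;
    std::vector<float>  scales;
    quantize_per_channel(w, rows, cols, w_int8, scales);

    // Rounding error is at most half a scale step per element.
    float max_err = 0.f;
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            max_err = std::fmax(max_err,
                                std::fabs(w[r * cols + c] - w_int8[r * cols + c] * scales[c]));
    printf("max dequantization error: %g\n", max_err);
    return 0;
}

Per-channel scales keep the quantization error proportional to each column's magnitude, which is why weight-only INT8 can usually be applied without activation calibration.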
diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store
deleted file mode 100644
index de13d48fdd4e9e824db21adcab91399ef3727fa6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
[unrecoverable base85 payload of the deleted .DS_Store omitted]
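For reference, patch 2 re-enables the previously commented-out logits projection in Llama.cc: with tensor parallelism, each rank runs the GEMM over its own vocab_size_padded_ / world_size slice of the padded vocabulary, ftNcclAllGather concatenates the per-rank slices along the leading dimension, and invokeTransposeAxis01 restores [batch * beam, vocab_size_padded] order. The standalone host-side sketch below (illustrative sizes, plain loops instead of cuBLAS/NCCL calls) reproduces only the index math of that gather-plus-transpose step:

// Standalone sketch (not FasterTransformer code) of the layout transformation
// after the vocab-parallel logits GEMM: the all-gather stacks the per-rank
// vocab slices along the leading dimension, and the transpose interleaves them
// back into [batch*beam, vocab_size_padded] order.
#include <cstdio>
#include <vector>

int main()
{
    const int world_size       = 2;  // tensor_para_.world_size_
    const int batch_beam       = 3;  // local_batch_size * beam_width
    const int vocab_padded     = 8;  // vocab_size_padded_
    const int local_vocab_size = vocab_padded / world_size;

    // After the all-gather, rank r's logits occupy rows
    // [r * batch_beam, (r + 1) * batch_beam) of a
    // [world_size * batch_beam, local_vocab_size] buffer.
    std::vector<float> gathered(world_size * batch_beam * local_vocab_size);
    for (int r = 0; r < world_size; ++r)
        for (int b = 0; b < batch_beam; ++b)
            for (int v = 0; v < local_vocab_size; ++v)
                gathered[(r * batch_beam + b) * local_vocab_size + v] =
                    100.f * b + r * local_vocab_size + v;  // fake logit encodes the global vocab id

    // invokeTransposeAxis01(world_size, batch_beam, local_vocab_size) is
    // equivalent to this re-indexing into [batch_beam, vocab_padded]:
    std::vector<float> logits(batch_beam * vocab_padded);
    for (int b = 0; b < batch_beam; ++b)
        for (int r = 0; r < world_size; ++r)
            for (int v = 0; v < local_vocab_size; ++v)
                logits[b * vocab_padded + r * local_vocab_size + v] =
                    gathered[(r * batch_beam + b) * local_vocab_size + v];

    // Each row now holds the full padded vocabulary for one beam entry.
    for (int b = 0; b < batch_beam; ++b) {
        for (int v = 0; v < vocab_padded; ++v)
            printf("%5.0f ", logits[b * vocab_padded + v]);
        printf("\n");
    }
    return 0;
}

After the transpose, row b holds the complete padded vocabulary for beam entry b, matching the single-GPU layout produced by the world_size_ == 1 branch of the same hunk.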