diff --git a/.gitignore b/.gitignore index 77849f435..8ecc0cca0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ __pycache__/ /models /notebooks **/.ipynb_checkpoints/ +.DS_Store /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ + diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a64c86bca..b42aea9bc 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,6 +81,7 @@ void llama_example(const INIReader reader) int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -177,6 +178,7 @@ void llama_example(const INIReader reader) tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); } + int* d_stop_words = nullptr; deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); @@ -193,6 +195,7 @@ void llama_example(const INIReader reader) 1, "../examples/cpp/llama/start_ids.csv"); + int* d_input_ids; int* d_input_lengths; if (max_input_len == 0) { @@ -285,6 +288,7 @@ void llama_example(const INIReader reader) pipeline_para.world_size_, pipeline_para.rank_, use_gptj_residual, + int8_mode, prompt_learning_type, prefix_prompt_table_pair); @@ -331,12 +335,19 @@ void llama_example(const INIReader reader) &allocator, false, &prop, - attention_type); + attention_type, + int8_mode, + nullptr, + 0, + 1.0f); int* d_output_ids; int* d_sequence_lengths; + + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -411,15 +422,18 @@ void llama_example(const INIReader reader) ite = 1; ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } + cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); + if (rank == 0) { std::string fName = "out"; @@ -430,6 +444,7 @@ void llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; + cudaD2Hcpy(hBuf, d_output_ids, outCount); { @@ -468,7 +483,6 @@ void llama_example(const INIReader reader) for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - cudaDeviceSynchronize(); mpi::barrier(); @@ -509,6 +523,5 @@ void llama_example(const INIReader reader) if (d_sequence_lengths != nullptr) { deviceFree(d_sequence_lengths); } - return; } diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc index 7ac441198..14bb5e3f6 100644 --- a/src/fastertransformer/layers/FfnLayer.cc +++ b/src/fastertransformer/layers/FfnLayer.cc @@ -684,6 +684,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse, + int int8_mode, bool use_gated_activation): FfnLayer(max_batch_size, max_seq_len, @@ -696,7 +697,7 @@ 
SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, allocator, is_free_buffer_after_forward, sparse, - 0, + int8_mode, use_gated_activation) { } diff --git a/src/fastertransformer/layers/FfnLayer.h b/src/fastertransformer/layers/FfnLayer.h index af7ae7606..f84915d2f 100644 --- a/src/fastertransformer/layers/FfnLayer.h +++ b/src/fastertransformer/layers/FfnLayer.h @@ -210,6 +210,7 @@ class SiluFfnLayer: public FfnLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse = false, + int int8_mode = 0, bool use_gated_activation = false); SiluFfnLayer(SiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc index 25a2da86b..af4714d82 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc @@ -76,7 +76,8 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b bool is_sparse, bool use_gated_activation, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + int int8_mode): SiluFfnLayer(max_batch_size, max_seq_len, head_num, @@ -88,6 +89,7 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b allocator, is_free_buffer_after_forward, is_sparse, + int8_mode, use_gated_activation), tensor_para_(tensor_para), custom_all_reduce_comm_(custom_all_reduce_comm), diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h index ae481373a..5f0e6d625 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h @@ -47,7 +47,8 @@ class TensorParallelSiluFfnLayer: public SiluFfnLayer { bool is_sparse, bool use_gated_activation = false, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + int int8_mode = 0); TensorParallelSiluFfnLayer(TensorParallelSiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc index c5ea150f4..ef2ac7b3a 100644 --- a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc +++ b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc @@ -88,7 +88,8 @@ LinearAdapterLayer::LinearAdapterLayer(LinearAdapterConfig const& co is_sparse, false, custom_all_reduce_comm, - enable_custom_all_reduce)}, + enable_custom_all_reduce, + 0)}, layer_norm_type_{config.layerNormType()}, layer_norm_eps_{layer_norm_eps}, max_token_size_{max_batch_size * max_seq_len}, diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8e3f1cedf..1eac9fd20 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -42,6 +42,7 @@ void Llama::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void Llama::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -165,6 +167,13 @@ void Llama::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = 
(int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -216,6 +225,11 @@ void Llama::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -246,8 +260,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -263,7 +279,9 @@ Llama::Llama(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -310,8 +328,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -331,7 +351,9 @@ Llama::Llama(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -363,7 +385,9 @@ Llama::Llama(Llama const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -585,6 +609,23 @@ void Llama::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { 
cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -686,6 +727,14 @@ void Llama::forward(std::unordered_map* output_ten {batch_size * beam_width}, has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -877,6 +926,7 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); + if (tensor_para_.world_size_ == 1) { float alpha = 1.0f; float beta = 0.0f; @@ -924,6 +974,8 @@ void Llama::forward(std::unordered_map* output_ten local_vocab_size, /* n */ CUDA_R_32F, cublasGemmAlgo_t(-1)); + + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, nccl_logits_buf_ + vocab_size_units_offset, local_batch_size * beam_width * local_vocab_size, @@ -937,7 +989,8 @@ void Llama::forward(std::unordered_map* output_ten local_vocab_size, stream_); } - + + int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ @@ -1229,5 +1282,4 @@ template class Llama; #ifdef ENABLE_BF16 template class Llama<__nv_bfloat16>; #endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index f318e6947..a0958280e 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -41,6 +41,7 @@ class Llama: public BaseLayer { float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class Llama: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class Llama: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -165,8 +172,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(size_t head_num, size_t size_per_head, @@ -195,8 +204,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(Llama const& Llama); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index e3afd1780..0595358a0 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -40,10 +40,11 @@ void 
LlamaContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -59,7 +60,8 @@ void LlamaContextDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template @@ -69,7 +71,7 @@ void LlamaContextDecoder::allocateBuffer() } template -void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -83,6 +85,20 @@ void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -97,6 +113,13 @@ void LlamaContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -147,6 +170,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -163,6 +187,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -185,6 +210,7 @@ LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decode pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -238,15 +264,23 @@ void 
LlamaContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. - FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -254,6 +288,20 @@ void LlamaContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -271,6 +319,12 @@ void LlamaContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -279,7 +333,8 @@ void LlamaContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -295,8 +350,9 @@ void LlamaContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -308,11 +364,11 @@ void LlamaContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -328,7 +384,7 @@ void LlamaContextDecoder::forward(std::unordered_map* ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); } } - + // TODO: the LayerNorm used here differs from the NeoX one; unclear whether it needs to be switched to the int8 LayerNorm invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -337,8 +393,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -346,7 +405,7 @@ void LlamaContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -381,17 +440,43 @@ void LlamaContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch.
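+ // Concretely: when use_shared_contexts is set, the attention call above wrote this layer's K/V
+ // results into the temporary k_cache_layer_ / v_cache_layer_ buffers (sized for the compacted batch),
+ // and invokeUnCompactCaches below scatters them back into the full key/value cache through
+ // batch_to_compact_idx, so every entry of the original request batch gets its own cache slice for
+ // the generation phase.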
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, @@ -402,10 +487,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { + // TODO: modify or not ? invokeGeneralAddResidualT5PreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -472,7 +558,8 @@ void LlamaContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -483,12 +570,22 @@ void LlamaContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h index 788d1d45d..a2f91f7b8 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.h +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -56,13 +56,15 @@ class LlamaContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class LlamaContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: LlamaContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class LlamaContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git 
a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index a5cffa731..b68c82207 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -35,10 +35,11 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + // TODO: the SiLU FFN layer does not support int8 yet ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, @@ -54,7 +55,8 @@ void LlamaDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template @@ -133,6 +135,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -147,6 +150,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +171,7 @@ LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -260,6 +265,7 @@ void LlamaDecoder::forward(std::unordered_map* } } + // TODO: this uses the T5 LayerNorm, which takes no int8 parameters here invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -301,7 +307,7 @@ void LlamaDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h index 6cdd7df27..dc44b0f32 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.h +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -70,6 +70,8 @@ class LlamaDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: LlamaDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class LlamaDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 3e97b67d0..a8dadefea 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -24,15 +24,26 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode),
use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "Llama doesn't support int8_mode == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -60,38 +71,92 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() ffn_weights.intermediate_weight2.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + + // Note the mapping: intermediate_weight => gate_proj; intermediate_weight2 => up_proj; output_weight => down_proj. + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight2.int8_kernel = nullptr; + ffn_weights.intermediate_weight2.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void LlamaDecoderLayerWeight::copyFrom(const LlamaDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4],
hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[4], other.int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + + // TODO: unclear whether the scale stored here belongs to the gate_proj weight or is used by up_proj/down_proj; verify later and fix if needed + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_); + } + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -102,26 +167,12 @@ LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); - cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); - cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -137,42
+188,90 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + // loadWeightFromBin(weights_ptr[6], + // {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + // dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + // model_file_type); + + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + // loadWeightFromBin(weights_ptr[10], + // {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + // dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + // model_file_type); + + + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[4], + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); - } - // FIXME(sunpeng17): check if the weights are correct - loadWeightFromBin(weights_ptr[6], + loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[8], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - - loadWeightFromBin(weights_ptr[10], + loadWeightFromBin(weights_ptr[10], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." 
+ rank_spec + ".bin", + model_file_type); - deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); - loadWeightFromBin( - weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[4], + weight_only_scale_ptr[4], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + model_file_type); + + } } template @@ -194,6 +293,23 @@ void LlamaDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.intermediate_weight2.int8_kernel = int8_weights_ptr[3]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[4]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.intermediate_weight2.weight_only_quant_scale = weight_only_scale_ptr[3]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[4]; + } + } + is_maintain_buffer = true; } @@ -202,21 +318,48 @@ void LlamaDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + // deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + // deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + // deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias - deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // 
intermediate_weight2 kernel + // deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias - deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + // deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[4], hidden_units_); + } + } + } template struct LlamaDecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h index 008e1a3b4..5a76ba4c5 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int int8_mode); LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct LlamaDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[14]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(5, nullptr); + std::vector weight_only_scale_ptr = std::vector(5, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const LlamaDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git 
a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 6105267ff..e9e11b6a1 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -29,6 +29,7 @@ LlamaWeight::LlamaWeight(const int hidden_un const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ LlamaWeight::LlamaWeight(const int hidden_un layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ LlamaWeight::LlamaWeight(const int hidden_un for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ LlamaWeight::LlamaWeight(const LlamaWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index ec909ca49..eb90dc5bf 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -38,6 +38,7 @@ struct LlamaWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = false, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct LlamaWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_;
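// Usage sketch (illustrative, not part of the patch): how the two knobs introduced by this change are
// expected to be wired. The config section/key names and the constructor argument order follow the
// call site in examples/cpp/llama/llama_example.cc above; the helper function below and its name are
// hypothetical and only restate the shared-contexts gating from Llama<T>::forward() as standalone,
// compilable code.
//
//   int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0);
//   // 0 = no quantization, 1 = weight-only int8; int8_mode == 2 is rejected by the
//   // FT_CHECK_WITH_INFO in LlamaDecoderLayerWeight.
//   // shared_contexts_ratio is the last Llama constructor argument (1.0f in the example).

#include <cstddef>

// Prompt compaction is attempted only for a real batch of non-empty prompts, and kept only if the
// number of distinct prompts reported by invokeFindContextDups stays within ratio * batch_size.
bool should_use_shared_contexts(float shared_contexts_ratio,
                                std::size_t batch_size,
                                int max_input_length,
                                int compact_size /* distinct prompts found by invokeFindContextDups */)
{
    bool use = (shared_contexts_ratio > 0.0f) && (max_input_length >= 1) && (batch_size > 1);
    if (use) {
        use = compact_size <= shared_contexts_ratio * static_cast<float>(batch_size);
    }
    return use;
}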