From 3c03564ca045918d832c3a9e3889a222f607df95 Mon Sep 17 00:00:00 2001 From: shaoxin Date: Thu, 29 Jun 2023 20:16:02 +0800 Subject: [PATCH 1/3] support int8 & share context --- examples/cpp/llama/llama_example.cc | 36 ++- src/fastertransformer/.DS_Store | Bin 0 -> 6148 bytes src/fastertransformer/models/.DS_Store | Bin 0 -> 8196 bytes src/fastertransformer/models/llama/Llama.cc | 186 ++++++++----- src/fastertransformer/models/llama/Llama.h | 15 +- .../models/llama/LlamaContextDecoder.cc | 132 +++++++-- .../models/llama/LlamaContextDecoder.h | 11 +- .../models/llama/LlamaDecoder.cc | 9 +- .../models/llama/LlamaDecoder.h | 3 + .../models/llama/LlamaDecoderLayerWeight.cc | 253 ++++++++++++++---- .../models/llama/LlamaDecoderLayerWeight.h | 9 +- .../models/llama/LlamaWeight.cc | 6 +- .../models/llama/LlamaWeight.h | 3 + 13 files changed, 514 insertions(+), 149 deletions(-) create mode 100644 src/fastertransformer/.DS_Store create mode 100644 src/fastertransformer/models/.DS_Store diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a64c86bca..49d966772 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,6 +81,7 @@ void llama_example(const INIReader reader) int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -177,6 +178,7 @@ void llama_example(const INIReader reader) tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); } + int* d_stop_words = nullptr; deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); @@ -193,6 +195,7 @@ void llama_example(const INIReader reader) 1, "../examples/cpp/llama/start_ids.csv"); + int* d_input_ids; int* d_input_lengths; if (max_input_len == 0) { @@ -274,6 +277,7 @@ void llama_example(const INIReader reader) cublas_wrapper.setFP32GemmConfig(); } + printf("******* Enter gpt_weights ********** \n"); const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, @@ -285,9 +289,12 @@ void llama_example(const INIReader reader) pipeline_para.world_size_, pipeline_para.rank_, use_gptj_residual, + int8_mode, prompt_learning_type, prefix_prompt_table_pair); + printf("******* Enter loadModel ********* \n"); + gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { @@ -305,6 +312,8 @@ void llama_example(const INIReader reader) false, // with_relative_position_bias true); // causal_mask + printf("******* Inilize Llama ********* \n"); + Llama gpt = Llama(head_num, size_per_head, inter_size, @@ -331,12 +340,22 @@ void llama_example(const INIReader reader) &allocator, false, &prop, - attention_type); + attention_type, + int8_mode, + nullptr, + 0, + 1.0f); int* d_output_ids; int* d_sequence_lengths; + + printf("******* deviceMalloc start ********* \n"); + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + + printf("******* deviceMalloc end ********* \n"); + std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map 
input_tensors = std::unordered_map{ {"input_ids", @@ -402,6 +421,8 @@ void llama_example(const INIReader reader) print_mem_usage(); + printf("******* before cudaDeviceSynchronize ********* \n"); + int ite = 1; cudaDeviceSynchronize(); mpi::barrier(); @@ -411,15 +432,21 @@ void llama_example(const INIReader reader) ite = 1; ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") + + printf("******* before gpt.forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } + + printf("******* end gpt.forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); + printf("******* end cudaDeviceSynchronize ********* \n"); + if (rank == 0) { std::string fName = "out"; @@ -430,8 +457,12 @@ void llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; + + printf("******* before cudaD2Hcpy ********* \n"); + cudaD2Hcpy(hBuf, d_output_ids, outCount); + printf("******* end cudaD2Hcpy ********* \n"); { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; @@ -465,10 +496,11 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") + printf("******* before gpt forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - + printf("******* after gpt forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d1845e2dff13bedf5e41cf42a41978e6bf67cb38 GIT binary patch literal 6148 zcmeHK%}T>S5Z<-5O({YT3VK`cS}<)D3SL63FJMFuDm5WNgK4%jsX3HF?)pN$h|lB9 z?&c5-coVTRu=~x<&u->}>pQRquM&4!|OBpn}{f&<68nz7<3F48o>j? 
zbt<4v<>raObvoFEiE|7V8g)A3YGs(mtXw``xLO_TLWMK#Xr!JPAO@-oH1x2C=l>=A zGPRHVY6^{r0b<~vF~FNscRGQh%-Q;_JUnX!v`1(t7+0VI0(#{V00Z1d4wX~;1?mvz X7%Vj6ENEBhfOHX1giuEe`~m}CV@gb8 literal 0 HcmV?d00001 diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..cd5987b054b2d9ac9e7e3b48fcc55e66056ebd71 GIT binary patch literal 8196 zcmeHML2uJA6n^eHnlgrZ0BIbMB5|F@x@lq(m(q;`5?m-^2S6oR!xomsRg9 zH}DrY^GEnEoZx%5J2~!pMF{N5eu@2kFMgk$rY|KT(H#%kL~BG8pt5bPpt+{-IM=yS zGd)Ze;M22A9>$rBao(Ck`wp*wSHLUa74Qmp1^xyF@XY2CEO_qgQQvw6yaNBF0{s0D zqO$E6JJnWwb)Zox0BjxIs^D1t^oI?(1GZ!AR9p0*2u+36RAEvKVKN7y=@{=A{8U>_ zC!wwkKjzB9WGKQU9wJn55*=-Q>lN?{%qzgTd!1s6Xhfxxzdym)#xGSA@fLM~D=47@ z)Rd0sm`d89V~6+6;j#Z@-oLo-;DQ;B`t*T@v|IHnK0ElPg-tv(r2K~|T%Ies-nb#a~N;K?2CzgESw=Nmq{ZL7})kJrFd5X1{gs;$pkqUJu(_k9IEC!;9VR?e%bHYxDB5 z8QgvF@bSz3Np_k`-exGmJZWfEpI^0KaAbva)*I!S%z3*o<0eFryoA`o5}W>%w|-Sh z!tz%HS_NthUWqN)SBo_^*108lBUG4mY~Kl3dG!i>t5k5!84i9I-YOMF4YS#)dgU!p z!7R}kyg75^TS0uXmOs#_MORj8*!X2clt976^{k?=BF~}Pbx4liMqVwRKky281+)T7 z+N`a>K3)9&-^=9{@Cy9H3W)lCZ@-HGXTNydSUJ}YQQxC-Vcb+(MM0y|aafg(!(RPi bh~p4Y=5&mmYKtBe{~|!i;2W>NA64Kt&SGW% literal 0 HcmV?d00001 diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8e3f1cedf..8f66e7def 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -42,6 +42,7 @@ void Llama::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void Llama::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -165,6 +167,13 @@ void Llama::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = (int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -216,6 +225,11 @@ void Llama::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -246,8 +260,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -263,7 +279,9 @@ Llama::Llama(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -310,8 +328,10 @@ Llama::Llama(size_t 
head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -331,7 +351,9 @@ Llama::Llama(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -363,7 +385,9 @@ Llama::Llama(Llama const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -585,6 +609,23 @@ void Llama::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -686,6 +727,14 @@ void Llama::forward(std::unordered_map* output_ten {batch_size * beam_width}, has_prefix_prompt_ ? 
tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -877,67 +926,71 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); - if (tensor_para_.world_size_ == 1) { - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - vocab_size_padded_, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - logits_buf_ + vocab_size_units_offset, - CUDA_R_32F, - vocab_size_padded_, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - } - else { - FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - float alpha = 1.0f; - float beta = 0.0f; - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // n - local_batch_size * beam_width, - hidden_units_, // k - &alpha, - padded_embedding_kernel_ptr_ - + tensor_para_.rank_ * local_vocab_size * hidden_units_, - gemm_data_type, - hidden_units_, // k - normed_decoder_output_buf_ + hidden_units_offset, - gemm_data_type, - hidden_units_, // k - &beta, - nccl_logits_buf_ + vocab_size_units_offset - + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - CUDA_R_32F, - local_vocab_size, /* n */ - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - local_batch_size * beam_width * local_vocab_size, - tensor_para_.rank_, - tensor_para_, - stream_); - invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - nccl_logits_buf_ + vocab_size_units_offset, - tensor_para_.world_size_, - local_batch_size * beam_width, - local_vocab_size, - stream_); - } - + + // if (tensor_para_.world_size_ == 1) { + // float alpha = 1.0f; + // float beta = 0.0f; + // cublas_wrapper_->Gemm(CUBLAS_OP_T, + // CUBLAS_OP_N, + // vocab_size_padded_, // n + // local_batch_size * beam_width, + // hidden_units_, // k + // &alpha, + // padded_embedding_kernel_ptr_, + // gemm_data_type, + // hidden_units_, // k + // normed_decoder_output_buf_ + hidden_units_offset, + // gemm_data_type, + // hidden_units_, // k + // &beta, + // logits_buf_ + vocab_size_units_offset, + // CUDA_R_32F, + // vocab_size_padded_, /* n */ + // CUDA_R_32F, + // cublasGemmAlgo_t(-1)); + // } + // else { + // FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + // const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; + // float alpha = 1.0f; + // float beta = 0.0f; + // cublas_wrapper_->Gemm(CUBLAS_OP_T, + // CUBLAS_OP_N, + // local_vocab_size, // n + // local_batch_size * beam_width, + // hidden_units_, // k + // &alpha, + // padded_embedding_kernel_ptr_ + // + tensor_para_.rank_ * local_vocab_size * hidden_units_, + // gemm_data_type, + // hidden_units_, // k + // normed_decoder_output_buf_ + hidden_units_offset, + // gemm_data_type, + // hidden_units_, // k + // &beta, + // nccl_logits_buf_ + vocab_size_units_offset + // + tensor_para_.rank_ * 
local_batch_size * beam_width * local_vocab_size, + // CUDA_R_32F, + // local_vocab_size, /* n */ + // CUDA_R_32F, + // cublasGemmAlgo_t(-1)); + + + // ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + // nccl_logits_buf_ + vocab_size_units_offset, + // local_batch_size * beam_width * local_vocab_size, + // tensor_para_.rank_, + // tensor_para_, + // stream_); + // invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + // nccl_logits_buf_ + vocab_size_units_offset, + // tensor_para_.world_size_, + // local_batch_size * beam_width, + // local_vocab_size, + // stream_); + // } + + int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ @@ -1229,5 +1282,4 @@ template class Llama; #ifdef ENABLE_BF16 template class Llama<__nv_bfloat16>; #endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index f318e6947..a0958280e 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -41,6 +41,7 @@ class Llama: public BaseLayer { float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class Llama: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class Llama: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -165,8 +172,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(size_t head_num, size_t size_per_head, @@ -195,8 +204,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(Llama const& Llama); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index e3afd1780..a1d78c852 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -40,10 +40,11 @@ void LlamaContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); +// TODO: 这里的SiluFfnLayer是不支持int8的dataType,再不做修改的情况下试一下会不会报错。 ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -69,7 +70,7 @@ void LlamaContextDecoder::allocateBuffer() } template -void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool 
use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -83,6 +84,20 @@ void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -97,6 +112,13 @@ void LlamaContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -147,6 +169,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -163,6 +186,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -185,6 +209,7 @@ LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decode pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -238,15 +263,23 @@ void LlamaContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. 
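
Note on the shared-context inputs consumed below: when several requests in a batch carry an identical prompt, `Llama<T>::forward` (patch 1) calls `invokeFindContextDups` and hands the context decoder two extra tensors, `compact_idx` and `batch_to_compact_idx`, so the context phase only runs once per unique prompt. The sketch below is a host-side illustration of the mapping those tensors encode, assuming `beam_width == 1`; the helper name `find_context_dups` and the `ContextDups` struct are illustrative only, the real work is done on the GPU by `invokeFindContextDups`.

```cpp
#include <map>
#include <vector>

// Host-side sketch of the index layout used by the shared-context path.
// compact_idx[j]          = original batch index of the j-th unique prompt
// batch_to_compact_idx[i] = slot in the compacted batch that request i maps to
struct ContextDups {
    std::vector<int> compact_idx;
    std::vector<int> batch_to_compact_idx;
};

inline ContextDups find_context_dups(const std::vector<std::vector<int>>& input_ids)
{
    ContextDups out;
    out.batch_to_compact_idx.resize(input_ids.size());
    std::map<std::vector<int>, int> seen;  // prompt token ids -> compact slot
    for (int i = 0; i < static_cast<int>(input_ids.size()); ++i) {
        auto it = seen.find(input_ids[i]);
        if (it == seen.end()) {
            const int slot = static_cast<int>(out.compact_idx.size());
            seen.emplace(input_ids[i], slot);
            out.compact_idx.push_back(i);
            out.batch_to_compact_idx[i] = slot;
        }
        else {
            out.batch_to_compact_idx[i] = it->second;
        }
    }
    return out;
}
```

The compacted batch size is `compact_idx.size()`; in `Llama<T>::forward` that value (`compact_size`) is compared against `shared_contexts_ratio_ * batch_size`, and compaction is kept only if enough duplicates were found to make it worthwhile.
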
- FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -254,6 +287,20 @@ void LlamaContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -271,6 +318,12 @@ void LlamaContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -279,7 +332,8 @@ void LlamaContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -295,8 +349,9 @@ void LlamaContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -308,11 +363,11 @@ void LlamaContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -328,7 +383,7 @@ void LlamaContextDecoder::forward(std::unordered_map* ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); } } - + // TODO: 这里用的LN跟neox不一样,不太清楚这里需不需要改成int8的LN invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -337,8 +392,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -346,7 +404,7 @@ void LlamaContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -381,17 +439,43 @@ void LlamaContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch. 
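
The call that follows scatters the compacted results back out: K/V values produced for the compacted batch land first in the temporary `k_cache_layer_` / `v_cache_layer_` buffers and are then copied into the full-size cache by `invokeUnCompactCaches`, indexed by `batch_to_compact_idx`; decoder outputs are expanded the same way by `invokeUnCompactOutputs` at the end of `forward`. Below is a simplified CPU illustration of that scatter for the output features only; `uncompact_outputs` is a made-up helper name, and the real kernels additionally handle layer, head, and cache-stride bookkeeping.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Simplified illustration of the uncompact step: copy each compacted row back to
// every original batch entry that referenced it. "row_size" corresponds to
// seq_len * hidden_units_ in LlamaContextDecoder::forward.
inline void uncompact_outputs(std::vector<float>&       decoder_output,    // [request_batch_size * row_size]
                              const std::vector<float>& compact_features,  // [compact_batch_size * row_size]
                              const std::vector<int>&   batch_to_compact_idx,
                              std::size_t               row_size)
{
    for (std::size_t i = 0; i < batch_to_compact_idx.size(); ++i) {
        const std::size_t src = static_cast<std::size_t>(batch_to_compact_idx[i]);
        std::copy(compact_features.begin() + src * row_size,
                  compact_features.begin() + (src + 1) * row_size,
                  decoder_output.begin() + i * row_size);
    }
}
```
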
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, @@ -402,10 +486,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { + // TODO: modify or not ? invokeGeneralAddResidualT5PreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -472,7 +557,8 @@ void LlamaContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -483,12 +569,22 @@ void LlamaContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h index 788d1d45d..a2f91f7b8 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.h +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -56,13 +56,15 @@ class LlamaContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class LlamaContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: LlamaContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class LlamaContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git 
a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index a5cffa731..4685217b0 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -35,10 +35,11 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + // TODO: SiLu ftn layer not support int8 ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, @@ -133,6 +134,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -147,6 +149,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +170,7 @@ LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -260,6 +264,7 @@ void LlamaDecoder::forward(std::unordered_map* } } + // TODO 使用的是T5 LN,这里是没有int8的参数支持 invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -301,7 +306,7 @@ void LlamaDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h index 6cdd7df27..dc44b0f32 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.h +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -70,6 +70,8 @@ class LlamaDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: LlamaDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class LlamaDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 3e97b67d0..a8dadefea 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -24,15 +24,26 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode), use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "Llama doesn't support int8_model == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && 
int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -60,38 +71,92 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() ffn_weights.intermediate_weight2.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + + // 作一下标记 intermediate_weight => gate_proj; intermediate_weight2 => up_proj; output_weight => down_proj. + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight2.int8_kernel = nullptr; + ffn_weights.intermediate_weight2.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void LlamaDecoderLayerWeight::copyFrom(const LlamaDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], 
other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[4], other.int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + + // TODO: 不太清楚这里存的缩放因子对应的是gate_pro_weight 还是给 up_proj/down_proj用的,后面做一下验证,回来再改一下 + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_); + } + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -102,26 +167,12 @@ LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); - cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); - cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -137,42 +188,90 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + 
".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + // loadWeightFromBin(weights_ptr[6], + // {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + // dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + // model_file_type); + + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + // loadWeightFromBin(weights_ptr[10], + // {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + // dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + // model_file_type); + + + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[4], + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); - } - // FIXME(sunpeng17): check if the weights are correct - loadWeightFromBin(weights_ptr[6], + loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[8], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - - loadWeightFromBin(weights_ptr[10], + loadWeightFromBin(weights_ptr[10], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", + model_file_type); - deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); - loadWeightFromBin( - weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." 
+ rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[4], + weight_only_scale_ptr[4], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + model_file_type); + + } } template @@ -194,6 +293,23 @@ void LlamaDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.intermediate_weight2.int8_kernel = int8_weights_ptr[3]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[4]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.intermediate_weight2.weight_only_quant_scale = weight_only_scale_ptr[3]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[4]; + } + } + is_maintain_buffer = true; } @@ -202,21 +318,48 @@ void LlamaDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + // deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + // deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + // deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias - deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + // deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias - deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + // deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel 
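
The `int8_mode_ == 1` branch above loads each of the five GEMM weights through `loadWeightFromBinAndQuantizeForWeightOnly`, producing an int8 kernel plus one floating-point scale per output column; that is why each `weight_only_scale_ptr` entry allocated in `mallocWeights()` is sized to the GEMM's output dimension (for example `3 * hidden_units_ / tensor_para_size_` for the fused QKV weight). The sketch below shows the usual symmetric per-column scheme such a loader implements; it is an illustration of the idea with a hypothetical `quantize_weight_only` helper, not the FasterTransformer loader, and it ignores the interleaved layout the weight-only GEMM kernels expect.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Symmetric per-output-column weight-only quantization (int8_mode == 1), sketched on
// the host. "weight" is row-major [k, n] as stored in the *.bin files; the result is an
// int8 kernel of the same shape plus one fp32 scale per column, matching the sizes
// allocated for int8_weights_ptr / weight_only_scale_ptr.
inline void quantize_weight_only(const std::vector<float>& weight,       // [k * n]
                                 std::size_t k, std::size_t n,
                                 std::vector<int8_t>&      int8_kernel,  // out: [k * n]
                                 std::vector<float>&       scales)       // out: [n]
{
    int8_kernel.resize(k * n);
    scales.resize(n);
    for (std::size_t col = 0; col < n; ++col) {
        float max_abs = 0.f;
        for (std::size_t row = 0; row < k; ++row) {
            max_abs = std::max(max_abs, std::fabs(weight[row * n + col]));
        }
        const float scale = (max_abs > 0.f) ? max_abs / 127.f : 1.f;
        scales[col] = scale;
        for (std::size_t row = 0; row < k; ++row) {
            const float q = std::round(weight[row * n + col] / scale);
            int8_kernel[row * n + col] = static_cast<int8_t>(std::max(-127.f, std::min(127.f, q)));
        }
    }
}
```

At inference time the weight-only GEMM dequantizes on the fly (roughly `output ≈ activation * (int8_kernel * scale)`), keeping activations in FP16 while halving weight memory relative to FP16 storage.
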
deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[4], hidden_units_); + } + } + } template struct LlamaDecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h index 008e1a3b4..5a76ba4c5 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int int8_mode); LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct LlamaDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[14]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(5, nullptr); + std::vector weight_only_scale_ptr = std::vector(5, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const LlamaDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 6105267ff..e9e11b6a1 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -29,6 +29,7 @@ LlamaWeight::LlamaWeight(const int hidden_un const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType 
prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ LlamaWeight::LlamaWeight(const int hidden_un layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ LlamaWeight::LlamaWeight(const int hidden_un for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ LlamaWeight::LlamaWeight(const LlamaWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index ec909ca49..eb90dc5bf 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -38,6 +38,7 @@ struct LlamaWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = false, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct LlamaWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_; From 6f0469f2c3840bbfe31aa39a276281f3710cbd7e Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 30 Jun 2023 14:08:17 +0800 Subject: [PATCH 2/3] update code with comments from reviewer --- .gitignore | 4 +- examples/cpp/llama/llama_example.cc | 1 - src/fastertransformer/.DS_Store | Bin 6148 -> 6148 bytes src/fastertransformer/layers/FfnLayer.cc | 3 +- src/fastertransformer/layers/FfnLayer.h | 1 + .../layers/TensorParallelSiluFfnLayer.cc | 4 +- .../layers/TensorParallelSiluFfnLayer.h | 3 +- .../adapter_layers/LinearAdapterLayer.cc | 3 +- src/fastertransformer/models/.DS_Store | Bin 8196 -> 8196 bytes src/fastertransformer/models/llama/Llama.cc | 120 +++++++++--------- .../models/llama/LlamaContextDecoder.cc | 5 +- .../models/llama/LlamaDecoder.cc | 3 +- 12 files changed, 78 insertions(+), 69 deletions(-) diff --git a/.gitignore b/.gitignore index 77849f435..8ecc0cca0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ __pycache__/ /models /notebooks **/.ipynb_checkpoints/ +.DS_Store /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ + diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 49d966772..14cff7bee 
100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -541,6 +541,5 @@ void llama_example(const INIReader reader) if (d_sequence_lengths != nullptr) { deviceFree(d_sequence_lengths); } - return; } diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store index d1845e2dff13bedf5e41cf42a41978e6bf67cb38..d4edc9af914042cba222ab33425eac19388ca6d5 100644 GIT binary patch literal 6148 zcmeHK%We}f6unOKm{1WS1X5Wbjl?!8Y0_4y*o3q!kYGa)EC3CeOlU`)j-$zhfTBoQ z1K+?eu!SGtUs%Dp9v96dQiC=d@-8 z-BJSzHO7b@kxvtfk&GDyi~@hV0vx+FI;6)mpi@f6?>3_N9mNj=IdlW*g2#u%eWdN5 z%1|P@PD7f48Nh#nkApGbg*-&fC&{=n#+Q{bDr)8k%n`;}VT;=t_v4?q z?rbvFEBWe<1Ah`Xo8LuYqgX0etcta1z3QLHSw9}c(@Ad-z2MqYDFgSsJaC`+qnT5` zaVW!h;D@6~Ir!cPl$X!_(37*CoQB>+`Sz0zt7=u9`qq5j+HW@OcI#l#u;&LYly}?v zi$&GCdFSrEC(dbb7RnF!{xE^lRM}OH=fLaBtp9C43IiD)qn&-qbOyz}TpfR)KmE&P zxG8PEUAi{6pv`WlVehm$O%|uqZa3_`R&zxhUOenVaXz6lv#1quuKnJY5?d!i&7m}QN!#eEwH3%xk})_frq4VmZ?JGb1Ed5d6LWIjh|-Lz>F|=tir@07eO>s!4Q&{ zokfK~+P9>_&d&)L)+xa{<`ZxJ(c&F!M)1$zU?0z@;ki3m={olK=n! delta 96 zcmZoMXfc=|#>B`mu~2NHo}wrd0|Nsi1A_nqLk>f6Qh9MfQcix-#KPs14MbQbv$B2M sEW#nova#U{<7Rdaeh#3%&4L`?nJ4p$SOT?xwSr6l8NsqSLSzjy0AfWI5&!@I diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc index 7ac441198..14bb5e3f6 100644 --- a/src/fastertransformer/layers/FfnLayer.cc +++ b/src/fastertransformer/layers/FfnLayer.cc @@ -684,6 +684,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse, + int int8_mode, bool use_gated_activation): FfnLayer(max_batch_size, max_seq_len, @@ -696,7 +697,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, allocator, is_free_buffer_after_forward, sparse, - 0, + int8_mode, use_gated_activation) { } diff --git a/src/fastertransformer/layers/FfnLayer.h b/src/fastertransformer/layers/FfnLayer.h index af7ae7606..f84915d2f 100644 --- a/src/fastertransformer/layers/FfnLayer.h +++ b/src/fastertransformer/layers/FfnLayer.h @@ -210,6 +210,7 @@ class SiluFfnLayer: public FfnLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse = false, + int int8_mode = 0, bool use_gated_activation = false); SiluFfnLayer(SiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc index 25a2da86b..af4714d82 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc @@ -76,7 +76,8 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b bool is_sparse, bool use_gated_activation, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + int int8_mode): SiluFfnLayer(max_batch_size, max_seq_len, head_num, @@ -88,6 +89,7 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b allocator, is_free_buffer_after_forward, is_sparse, + int8_mode, use_gated_activation), tensor_para_(tensor_para), custom_all_reduce_comm_(custom_all_reduce_comm), diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h index ae481373a..5f0e6d625 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h @@ -47,7 +47,8 @@ class TensorParallelSiluFfnLayer: public SiluFfnLayer { bool is_sparse, bool use_gated_activation = false, std::shared_ptr 
custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + int int8_mode = 0); TensorParallelSiluFfnLayer(TensorParallelSiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc index c5ea150f4..ef2ac7b3a 100644 --- a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc +++ b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc @@ -88,7 +88,8 @@ LinearAdapterLayer::LinearAdapterLayer(LinearAdapterConfig const& co is_sparse, false, custom_all_reduce_comm, - enable_custom_all_reduce)}, + enable_custom_all_reduce, + 0)}, layer_norm_type_{config.layerNormType()}, layer_norm_eps_{layer_norm_eps}, max_token_size_{max_batch_size * max_seq_len}, diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store index cd5987b054b2d9ac9e7e3b48fcc55e66056ebd71..de13d48fdd4e9e824db21adcab91399ef3727fa6 100644 GIT binary patch delta 22 dcmZp1XmQx^ONi6R#6m~G(8$7IGqW%wF91_V1=;`r delta 22 dcmZp1XmQx^ONi6h&`?Lg(8%0$GqW%wF91^U1=Rom diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8f66e7def..1eac9fd20 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -927,68 +927,68 @@ void Llama::forward(std::unordered_map* output_ten sync_check_cuda_error(); - // if (tensor_para_.world_size_ == 1) { - // float alpha = 1.0f; - // float beta = 0.0f; - // cublas_wrapper_->Gemm(CUBLAS_OP_T, - // CUBLAS_OP_N, - // vocab_size_padded_, // n - // local_batch_size * beam_width, - // hidden_units_, // k - // &alpha, - // padded_embedding_kernel_ptr_, - // gemm_data_type, - // hidden_units_, // k - // normed_decoder_output_buf_ + hidden_units_offset, - // gemm_data_type, - // hidden_units_, // k - // &beta, - // logits_buf_ + vocab_size_units_offset, - // CUDA_R_32F, - // vocab_size_padded_, /* n */ - // CUDA_R_32F, - // cublasGemmAlgo_t(-1)); - // } - // else { - // FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); - // const int local_vocab_size = vocab_size_padded_ / tensor_para_.world_size_; - // float alpha = 1.0f; - // float beta = 0.0f; - // cublas_wrapper_->Gemm(CUBLAS_OP_T, - // CUBLAS_OP_N, - // local_vocab_size, // n - // local_batch_size * beam_width, - // hidden_units_, // k - // &alpha, - // padded_embedding_kernel_ptr_ - // + tensor_para_.rank_ * local_vocab_size * hidden_units_, - // gemm_data_type, - // hidden_units_, // k - // normed_decoder_output_buf_ + hidden_units_offset, - // gemm_data_type, - // hidden_units_, // k - // &beta, - // nccl_logits_buf_ + vocab_size_units_offset - // + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, - // CUDA_R_32F, - // local_vocab_size, /* n */ - // CUDA_R_32F, - // cublasGemmAlgo_t(-1)); + if (tensor_para_.world_size_ == 1) { + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + vocab_size_padded_, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + logits_buf_ + vocab_size_units_offset, + CUDA_R_32F, + vocab_size_padded_, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); + } + else { + FT_CHECK(vocab_size_padded_ % tensor_para_.world_size_ == 0); + const int local_vocab_size = 
vocab_size_padded_ / tensor_para_.world_size_; + float alpha = 1.0f; + float beta = 0.0f; + cublas_wrapper_->Gemm(CUBLAS_OP_T, + CUBLAS_OP_N, + local_vocab_size, // n + local_batch_size * beam_width, + hidden_units_, // k + &alpha, + padded_embedding_kernel_ptr_ + + tensor_para_.rank_ * local_vocab_size * hidden_units_, + gemm_data_type, + hidden_units_, // k + normed_decoder_output_buf_ + hidden_units_offset, + gemm_data_type, + hidden_units_, // k + &beta, + nccl_logits_buf_ + vocab_size_units_offset + + tensor_para_.rank_ * local_batch_size * beam_width * local_vocab_size, + CUDA_R_32F, + local_vocab_size, /* n */ + CUDA_R_32F, + cublasGemmAlgo_t(-1)); - // ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, - // nccl_logits_buf_ + vocab_size_units_offset, - // local_batch_size * beam_width * local_vocab_size, - // tensor_para_.rank_, - // tensor_para_, - // stream_); - // invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, - // nccl_logits_buf_ + vocab_size_units_offset, - // tensor_para_.world_size_, - // local_batch_size * beam_width, - // local_vocab_size, - // stream_); - // } + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + local_batch_size * beam_width * local_vocab_size, + tensor_para_.rank_, + tensor_para_, + stream_); + invokeTransposeAxis01(logits_buf_ + vocab_size_units_offset, + nccl_logits_buf_ + vocab_size_units_offset, + tensor_para_.world_size_, + local_batch_size * beam_width, + local_vocab_size, + stream_); + } int tmp_local_batch_size = local_batch_size; diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index a1d78c852..0595358a0 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -44,7 +44,7 @@ void LlamaContextDecoder::initialize() custom_all_reduce_comm_, enable_custom_all_reduce_); -// TODO: 这里的SiluFfnLayer是不支持int8的dataType,再不做修改的情况下试一下会不会报错。 + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -60,7 +60,8 @@ void LlamaContextDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template diff --git a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index 4685217b0..b68c82207 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -55,7 +55,8 @@ void LlamaDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template From 0e6ae5b1a48ad55a451788c39815518d9d6b5c3f Mon Sep 17 00:00:00 2001 From: shaoxin Date: Fri, 30 Jun 2023 20:38:06 +0800 Subject: [PATCH 3/3] rm unused code --- examples/cpp/llama/llama_example.cc | 18 ------------------ src/fastertransformer/.DS_Store | Bin 6148 -> 0 bytes src/fastertransformer/models/.DS_Store | Bin 8196 -> 0 bytes 3 files changed, 18 deletions(-) delete mode 100644 src/fastertransformer/.DS_Store delete mode 100644 src/fastertransformer/models/.DS_Store diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index 14cff7bee..b42aea9bc 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -277,7 +277,6 @@ void llama_example(const 
INIReader reader) cublas_wrapper.setFP32GemmConfig(); } - printf("******* Enter gpt_weights ********** \n"); const bool use_gptj_residual = false; fastertransformer::LlamaWeight gpt_weights(hidden_units, inter_size, @@ -293,8 +292,6 @@ void llama_example(const INIReader reader) prompt_learning_type, prefix_prompt_table_pair); - printf("******* Enter loadModel ********* \n"); - gpt_weights.loadModel(model_dir); unsigned long long random_seed; if (rank == 0) { @@ -312,8 +309,6 @@ void llama_example(const INIReader reader) false, // with_relative_position_bias true); // causal_mask - printf("******* Inilize Llama ********* \n"); - Llama gpt = Llama(head_num, size_per_head, inter_size, @@ -349,13 +344,10 @@ void llama_example(const INIReader reader) int* d_output_ids; int* d_sequence_lengths; - printf("******* deviceMalloc start ********* \n"); deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); - printf("******* deviceMalloc end ********* \n"); - std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -421,8 +413,6 @@ void llama_example(const INIReader reader) print_mem_usage(); - printf("******* before cudaDeviceSynchronize ********* \n"); - int ite = 1; cudaDeviceSynchronize(); mpi::barrier(); @@ -433,19 +423,16 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") - printf("******* before gpt.forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - printf("******* end gpt.forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); - printf("******* end cudaDeviceSynchronize ********* \n"); if (rank == 0) { @@ -458,11 +445,8 @@ void llama_example(const INIReader reader) size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; - printf("******* before cudaD2Hcpy ********* \n"); - cudaD2Hcpy(hBuf, d_output_ids, outCount); - printf("******* end cudaD2Hcpy ********* \n"); { std::cout << "Writing " << outCount << " elements\n"; int zeroCount = 0; @@ -496,11 +480,9 @@ void llama_example(const INIReader reader) ft_nvtx::setScope("total_time"); PUSH_RANGE("total time") - printf("******* before gpt forward ********* \n"); for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - printf("******* after gpt forward ********* \n"); cudaDeviceSynchronize(); mpi::barrier(); diff --git a/src/fastertransformer/.DS_Store b/src/fastertransformer/.DS_Store deleted file mode 100644 index d4edc9af914042cba222ab33425eac19388ca6d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%We}f6unOKm{1WS1X5Wbjl?!8Y0_4y*o3q!kYGa)EC3CeOlU`)j-$zhfTBoQ z1K+?eu!SGtUs%Dp9v96dQiC=d@-8 z-BJSzHO7b@kxvtfk&GDyi~@hV0vx+FI;6)mpi@f6?>3_N9mNj=IdlW*g2#u%eWdN5 z%1|P@PD7f48Nh#nkApGbg*-&fC&{=n#+Q{bDr)8k%n`;}VT;=t_v4?q z?rbvFEBWe<1Ah`Xo8LuYqgX0etcta1z3QLHSw9}c(@Ad-z2MqYDFgSsJaC`+qnT5` zaVW!h;D@6~Ir!cPl$X!_(37*CoQB>+`Sz0zt7=u9`qq5j+HW@OcI#l#u;&LYly}?v zi$&GCdFSrEC(dbb7RnF!{xE^lRM}OH=fLaBtp9C43IiD)qn&-qbOyz}TpfR)KmE&P zxG8PEUAi{6pv`WlVehm$O%|uqZa3_`R&zxhUOenVaXz6lv#1quuKnJY5?d!i&7m}QN!#eEwH3%xk})_frq4VmZ?JGb1Ed5d6LWIjh|-Lz>F|=tir@07eO>s!4Q&{ zokfK~+P9>_&d&)L)+xa{<`ZxJ(c&F!M)1$zU?0z@;ki3m={olK=n! 
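Note on the int8 path: the int8_mode flag introduced in patch 1 is threaded through LlamaWeight, LlamaDecoderLayerWeight, and the Silu FFN layers above so that the weight objects and GEMMs can switch to an INT8 representation. As a rough mental model only — assuming a symmetric per-channel weight-only scheme, which is not necessarily the layout or recipe used by the FasterTransformer kernels, and with hypothetical helper names — the standalone sketch below shows how an FP32 weight matrix would be quantized to int8 values plus one FP32 scale per output channel:

// Illustrative only: minimal host-side sketch of symmetric per-channel
// weight-only INT8 quantization, the kind of scheme an int8_mode flag
// typically selects. Names and recipe are assumptions for illustration,
// not the FasterTransformer kernels or data layout.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize a [rows x cols] row-major FP32 weight to INT8 with one scale per
// output column (per-channel), so dequantization is w ~= int8_val * scale[col].
void quantize_per_channel(const std::vector<float>& w,
                          int rows,
                          int cols,
                          std::vector<int8_t>& w_int8,
                          std::vector<float>&  scales)
{
    w_int8.resize(w.size());
    scales.assign(cols, 0.f);
    for (int c = 0; c < cols; ++c) {
        float max_abs = 1e-8f;  // avoid division by zero for all-zero columns
        for (int r = 0; r < rows; ++r)
            max_abs = std::fmax(max_abs, std::fabs(w[r * cols + c]));
        scales[c] = max_abs / 127.f;
        for (int r = 0; r < rows; ++r) {
            int q = static_cast<int>(std::lround(w[r * cols + c] / scales[c]));
            q = q < -127 ? -127 : (q > 127 ? 127 : q);
            w_int8[r * cols + c] = static_cast<int8_t>(q);
        }
    }
}

int main()
{
    const int rows = 4, cols = 3;
    std::vector<float> w = {0.10f, -0.40f,  0.03f,
                            0.25f,  0.80f, -0.07f,
                           -0.50f,  0.20f,  0.01f,
                            0.05f, -0.60f,  0.02f};
    std::vector<int8_t> w_int8;
    std::vector<float>  scales;
    quantize_per_channel(w, rows, cols, w_int8, scales);

    // Rounding error is at most half a scale step per element.
    float max_err = 0.f;
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            max_err = std::fmax(max_err,
                                std::fabs(w[r * cols + c] - w_int8[r * cols + c] * scales[c]));
    printf("max dequantization error: %g\n", max_err);
    return 0;
}

Per-channel scales keep the quantization error proportional to each column's magnitude, which is why weight-only INT8 can usually be applied without activation calibration.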
diff --git a/src/fastertransformer/models/.DS_Store b/src/fastertransformer/models/.DS_Store
deleted file mode 100644
index de13d48fdd4e9e824db21adcab91399ef3727fa6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
[unrecoverable base85 payload of the deleted .DS_Store omitted]
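For reference, patch 2 re-enables the previously commented-out logits projection in Llama.cc: with tensor parallelism, each rank runs the GEMM over its own vocab_size_padded_ / world_size slice of the padded vocabulary, ftNcclAllGather concatenates the per-rank slices along the leading dimension, and invokeTransposeAxis01 restores [batch * beam, vocab_size_padded] order. The standalone host-side sketch below (illustrative sizes, plain loops instead of cuBLAS/NCCL calls) reproduces only the index math of that gather-plus-transpose step:

// Standalone sketch (not FasterTransformer code) of the layout transformation
// after the vocab-parallel logits GEMM: the all-gather stacks the per-rank
// vocab slices along the leading dimension, and the transpose interleaves them
// back into [batch*beam, vocab_size_padded] order.
#include <cstdio>
#include <vector>

int main()
{
    const int world_size       = 2;  // tensor_para_.world_size_
    const int batch_beam       = 3;  // local_batch_size * beam_width
    const int vocab_padded     = 8;  // vocab_size_padded_
    const int local_vocab_size = vocab_padded / world_size;

    // After the all-gather, rank r's logits occupy rows
    // [r * batch_beam, (r + 1) * batch_beam) of a
    // [world_size * batch_beam, local_vocab_size] buffer.
    std::vector<float> gathered(world_size * batch_beam * local_vocab_size);
    for (int r = 0; r < world_size; ++r)
        for (int b = 0; b < batch_beam; ++b)
            for (int v = 0; v < local_vocab_size; ++v)
                gathered[(r * batch_beam + b) * local_vocab_size + v] =
                    100.f * b + r * local_vocab_size + v;  // fake logit encodes the global vocab id

    // invokeTransposeAxis01(world_size, batch_beam, local_vocab_size) is
    // equivalent to this re-indexing into [batch_beam, vocab_padded]:
    std::vector<float> logits(batch_beam * vocab_padded);
    for (int b = 0; b < batch_beam; ++b)
        for (int r = 0; r < world_size; ++r)
            for (int v = 0; v < local_vocab_size; ++v)
                logits[b * vocab_padded + r * local_vocab_size + v] =
                    gathered[(r * batch_beam + b) * local_vocab_size + v];

    // Each row now holds the full padded vocabulary for one beam entry.
    for (int b = 0; b < batch_beam; ++b) {
        for (int v = 0; v < vocab_padded; ++v)
            printf("%5.0f ", logits[b * vocab_padded + v]);
        printf("\n");
    }
    return 0;
}

After the transpose, row b holds the complete padded vocabulary for beam entry b, matching the single-GPU layout produced by the world_size_ == 1 branch of the same hunk.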