diff --git a/.gitignore b/.gitignore index 77849f435..8ecc0cca0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ __pycache__/ /models /notebooks **/.ipynb_checkpoints/ +.DS_Store /3rdparty/NeMo/ -/3rdparty/apex/ \ No newline at end of file +/3rdparty/apex/ + diff --git a/examples/cpp/llama/llama_example.cc b/examples/cpp/llama/llama_example.cc index a64c86bca..b42aea9bc 100644 --- a/examples/cpp/llama/llama_example.cc +++ b/examples/cpp/llama/llama_example.cc @@ -81,6 +81,7 @@ void llama_example(const INIReader reader) int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"); int pipeline_para_size = reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"); + int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0); const size_t head_num = reader.GetInteger(model_name, "head_num"); const size_t size_per_head = reader.GetInteger(model_name, "size_per_head"); @@ -177,6 +178,7 @@ void llama_example(const INIReader reader) tiled_stop_words.insert(tiled_stop_words.end(), stop_words.begin(), stop_words.end()); } + int* d_stop_words = nullptr; deviceMalloc(&d_stop_words, tiled_stop_words.size(), false); cudaH2Dcpy(d_stop_words, tiled_stop_words.data(), tiled_stop_words.size()); @@ -193,6 +195,7 @@ void llama_example(const INIReader reader) 1, "../examples/cpp/llama/start_ids.csv"); + int* d_input_ids; int* d_input_lengths; if (max_input_len == 0) { @@ -285,6 +288,7 @@ void llama_example(const INIReader reader) pipeline_para.world_size_, pipeline_para.rank_, use_gptj_residual, + int8_mode, prompt_learning_type, prefix_prompt_table_pair); @@ -331,12 +335,19 @@ void llama_example(const INIReader reader) &allocator, false, &prop, - attention_type); + attention_type, + int8_mode, + nullptr, + 0, + 1.0f); int* d_output_ids; int* d_sequence_lengths; + + deviceMalloc(&d_output_ids, request_batch_size * beam_width * total_output_len, false); deviceMalloc(&d_sequence_lengths, request_batch_size * beam_width, false); + std::vector output_seq_len(request_batch_size, total_output_len); std::unordered_map input_tensors = std::unordered_map{ {"input_ids", @@ -411,15 +422,18 @@ void llama_example(const INIReader reader) ite = 1; ft_nvtx::setScope("warmup_time"); PUSH_RANGE("warmup time") + for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } + cudaDeviceSynchronize(); mpi::barrier(); POP_RANGE; ft_nvtx::resetScope(); + if (rank == 0) { std::string fName = "out"; @@ -430,6 +444,7 @@ void llama_example(const INIReader reader) else { size_t outCount = total_output_len * request_batch_size * beam_width; int* hBuf = new int[outCount]; + cudaD2Hcpy(hBuf, d_output_ids, outCount); { @@ -468,7 +483,6 @@ void llama_example(const INIReader reader) for (int i = 0; i < ite; ++i) { gpt.forward(&output_tensors, &input_tensors, &gpt_weights); } - cudaDeviceSynchronize(); mpi::barrier(); @@ -509,6 +523,5 @@ void llama_example(const INIReader reader) if (d_sequence_lengths != nullptr) { deviceFree(d_sequence_lengths); } - return; } diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc index 7ac441198..14bb5e3f6 100644 --- a/src/fastertransformer/layers/FfnLayer.cc +++ b/src/fastertransformer/layers/FfnLayer.cc @@ -684,6 +684,7 @@ SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse, + int int8_mode, bool use_gated_activation): FfnLayer(max_batch_size, max_seq_len, @@ -696,7 +697,7 @@ 
SiluFfnLayer::SiluFfnLayer(size_t max_batch_size, allocator, is_free_buffer_after_forward, sparse, - 0, + int8_mode, use_gated_activation) { } diff --git a/src/fastertransformer/layers/FfnLayer.h b/src/fastertransformer/layers/FfnLayer.h index af7ae7606..f84915d2f 100644 --- a/src/fastertransformer/layers/FfnLayer.h +++ b/src/fastertransformer/layers/FfnLayer.h @@ -210,6 +210,7 @@ class SiluFfnLayer: public FfnLayer { IAllocator* allocator, bool is_free_buffer_after_forward, bool sparse = false, + int int8_mode = 0, bool use_gated_activation = false); SiluFfnLayer(SiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc index 25a2da86b..af4714d82 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.cc @@ -76,7 +76,8 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b bool is_sparse, bool use_gated_activation, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + int int8_mode): SiluFfnLayer(max_batch_size, max_seq_len, head_num, @@ -88,6 +89,7 @@ TensorParallelSiluFfnLayer::TensorParallelSiluFfnLayer(size_t max_b allocator, is_free_buffer_after_forward, is_sparse, + int8_mode, use_gated_activation), tensor_para_(tensor_para), custom_all_reduce_comm_(custom_all_reduce_comm), diff --git a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h index ae481373a..5f0e6d625 100644 --- a/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h +++ b/src/fastertransformer/layers/TensorParallelSiluFfnLayer.h @@ -47,7 +47,8 @@ class TensorParallelSiluFfnLayer: public SiluFfnLayer { bool is_sparse, bool use_gated_activation = false, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + int int8_mode = 0); TensorParallelSiluFfnLayer(TensorParallelSiluFfnLayer const& ffn_layer); diff --git a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc index c5ea150f4..ef2ac7b3a 100644 --- a/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc +++ b/src/fastertransformer/layers/adapter_layers/LinearAdapterLayer.cc @@ -88,7 +88,8 @@ LinearAdapterLayer::LinearAdapterLayer(LinearAdapterConfig const& co is_sparse, false, custom_all_reduce_comm, - enable_custom_all_reduce)}, + enable_custom_all_reduce, + 0)}, layer_norm_type_{config.layerNormType()}, layer_norm_eps_{layer_norm_eps}, max_token_size_{max_batch_size * max_seq_len}, diff --git a/src/fastertransformer/models/llama/Llama.cc b/src/fastertransformer/models/llama/Llama.cc index 8e3f1cedf..1eac9fd20 100644 --- a/src/fastertransformer/models/llama/Llama.cc +++ b/src/fastertransformer/models/llama/Llama.cc @@ -42,6 +42,7 @@ void Llama::initialize() is_free_buffer_after_forward_, is_context_qk_buf_float_, attention_type_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -59,6 +60,7 @@ void Llama::initialize() cublas_wrapper_, allocator_, is_free_buffer_after_forward_, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); @@ -165,6 +167,13 @@ void Llama::allocateBuffer( generation_should_stop_ = (bool*)allocator_->reMalloc(generation_should_stop_, sizeof(bool), true, true); + if (shared_contexts_ratio_ > 0.0f) { + shared_contexts_idx_ = 
(int*)allocator_->reMalloc(shared_contexts_idx_, batch_size * sizeof(int), false); + batch_to_compact_idx_ = (int*)allocator_->reMalloc(batch_to_compact_idx_, batchxbeam * sizeof(int), false); + compact_idx_ = (int*)allocator_->reMalloc(compact_idx_, batch_size * sizeof(int), false); + compact_size_ = (int*)allocator_->reMalloc(compact_size_, sizeof(int), false); + } + is_allocate_buffer_ = true; } @@ -216,6 +225,11 @@ void Llama::freeBuffer() allocator_->free((void**)(&generation_should_stop_), true); + if (shared_contexts_ratio_ > 0.0f) { + allocator_->free((void**)(&shared_contexts_idx_)); + allocator_->free((void**)(&compact_size_)); + } + is_allocate_buffer_ = false; } } @@ -246,8 +260,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -263,7 +279,9 @@ Llama::Llama(size_t head_num, use_gptj_residual_(use_gptj_residual), hidden_units_(head_num * size_per_head), local_head_num_(head_num / 1), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { tensor_para_.world_size_ = 1; tensor_para_.rank_ = 0; @@ -310,8 +328,10 @@ Llama::Llama(size_t head_num, bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, - int enable_custom_all_reduce): + int enable_custom_all_reduce, + float shared_contexts_ratio): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop), head_num_(head_num), size_per_head_(size_per_head), @@ -331,7 +351,9 @@ Llama::Llama(size_t head_num, local_head_num_(head_num / tensor_para.world_size_), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce), - attention_type_(attention_type) + attention_type_(attention_type), + int8_mode_(int8_mode), + shared_contexts_ratio_(shared_contexts_ratio) { int local_vacab_size = ceil(vocab_size_ / 1.f / tensor_para_.world_size_); if (std::is_same::value) { @@ -363,7 +385,9 @@ Llama::Llama(Llama const& gpt): vocab_size_padded_(gpt.vocab_size_padded_), custom_all_reduce_comm_(gpt.custom_all_reduce_comm_), enable_custom_all_reduce_(gpt.enable_custom_all_reduce_), - attention_type_(gpt.attention_type_) + attention_type_(gpt.attention_type_), + int8_mode_(gpt.int8_mode_), + shared_contexts_ratio_(gpt.shared_contexts_ratio_) { initialize(); } @@ -585,6 +609,23 @@ void Llama::forward(std::unordered_map* output_ten cudaMemsetAsync(cache_indirections_[0], 0, 2 * sizeof(int) * batch_size * beam_width * max_seq_len, stream_); } + int compact_size; + bool use_shared_contexts = (shared_contexts_ratio_ > 0.0f) && (max_input_length >= 1) && (batch_size > 1); + if (use_shared_contexts) { + invokeFindContextDups(shared_contexts_idx_, + batch_to_compact_idx_, + compact_idx_, + compact_size_, + input_tensors->at("input_ids").getPtr(), + batch_size, + beam_width, + max_input_length, + stream_); + cudaD2Hcpy(&compact_size, compact_size_, 1); + use_shared_contexts = compact_size <= shared_contexts_ratio_ * batch_size; + sync_check_cuda_error(); + } + // Prefix prompts if (has_prefix_prompt_) { 
cudaMemcpyAsync(prompt_learning_weight_batch_, @@ -686,6 +727,14 @@ void Llama::forward(std::unordered_map* output_ten {batch_size * beam_width}, has_prefix_prompt_ ? tiled_prompt_lengths_buf_ : nullptr}}}; + if (use_shared_contexts) { + decoder_input_tensors.insert( + {"compact_idx", Tensor(MEMORY_GPU, TYPE_INT32, {(size_t)compact_size}, compact_idx_)}); + decoder_input_tensors.insert( + {"batch_to_compact_idx", + Tensor(MEMORY_GPU, TYPE_INT32, {batch_size * beam_width}, batch_to_compact_idx_)}); + } + std::unordered_map decoder_output_tensors{ {"decoder_output", Tensor{MEMORY_GPU, @@ -877,6 +926,7 @@ void Llama::forward(std::unordered_map* output_ten stream_); sync_check_cuda_error(); + if (tensor_para_.world_size_ == 1) { float alpha = 1.0f; float beta = 0.0f; @@ -924,6 +974,8 @@ void Llama::forward(std::unordered_map* output_ten local_vocab_size, /* n */ CUDA_R_32F, cublasGemmAlgo_t(-1)); + + ftNcclAllGather(nccl_logits_buf_ + vocab_size_units_offset, nccl_logits_buf_ + vocab_size_units_offset, local_batch_size * beam_width * local_vocab_size, @@ -937,7 +989,8 @@ void Llama::forward(std::unordered_map* output_ten local_vocab_size, stream_); } - + + int tmp_local_batch_size = local_batch_size; bool is_initialize_random_table = step == max_input_length; std::unordered_map dynamic_decode_input_tensors{ @@ -1229,5 +1282,4 @@ template class Llama; #ifdef ENABLE_BF16 template class Llama<__nv_bfloat16>; #endif - } // namespace fastertransformer diff --git a/src/fastertransformer/models/llama/Llama.h b/src/fastertransformer/models/llama/Llama.h index f318e6947..a0958280e 100644 --- a/src/fastertransformer/models/llama/Llama.h +++ b/src/fastertransformer/models/llama/Llama.h @@ -41,6 +41,7 @@ class Llama: public BaseLayer { float layernorm_eps_; static constexpr bool neox_rotary_style_ = true; + float shared_contexts_ratio_; int start_id_; int end_id_; @@ -54,6 +55,7 @@ class Llama: public BaseLayer { int enable_custom_all_reduce_; AttentionType attention_type_; + const int int8_mode_ = 0; size_t vocab_size_padded_; const bool is_context_qk_buf_float_ = @@ -120,6 +122,11 @@ class Llama: public BaseLayer { bool* generation_should_stop_ = nullptr; + int* shared_contexts_idx_ = nullptr; + int* compact_idx_ = nullptr; + int* batch_to_compact_idx_ = nullptr; + int* compact_size_ = nullptr; + T* context_decoder_input_buf_; T* context_decoder_output_buf_; float* output_log_probs_buf_; @@ -165,8 +172,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(size_t head_num, size_t size_per_head, @@ -195,8 +204,10 @@ class Llama: public BaseLayer { bool is_free_buffer_after_forward, cudaDeviceProp* cuda_device_prop = nullptr, AttentionType attention_type = AttentionType::UNFUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, - int enable_custom_all_reduce = 0); + int enable_custom_all_reduce = 0, + float shared_contexts_ratio = 1.0f); Llama(Llama const& Llama); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.cc b/src/fastertransformer/models/llama/LlamaContextDecoder.cc index e3afd1780..0595358a0 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.cc @@ -40,10 +40,11 @@ void 
LlamaContextDecoder::initialize() is_free_buffer_after_forward_, is_qk_buf_float_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 0, // max_seq_len head_num_, @@ -59,7 +60,8 @@ void LlamaContextDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template @@ -69,7 +71,7 @@ void LlamaContextDecoder::allocateBuffer() } template -void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) +void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts) { decoder_normed_input_ = reinterpret_cast( allocator_->reMalloc(decoder_normed_input_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); @@ -83,6 +85,20 @@ void LlamaContextDecoder::allocateBuffer(size_t batch_size, size_t seq_len) padding_offset_ = reinterpret_cast(allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * seq_len, false)); cu_seqlens_ = reinterpret_cast(allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false)); + + if (use_shared_contexts) { + compact_decoder_features_ = reinterpret_cast( + allocator_->reMalloc(compact_decoder_features_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + compact_attention_mask_ = reinterpret_cast( + allocator_->reMalloc(compact_attention_mask_, sizeof(T) * batch_size * seq_len * seq_len, false)); + compact_input_lengths_ = + reinterpret_cast(allocator_->reMalloc(compact_input_lengths_, sizeof(int) * batch_size, false)); + k_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(k_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + v_cache_layer_ = reinterpret_cast( + allocator_->reMalloc(v_cache_layer_, sizeof(T) * batch_size * seq_len * hidden_units_, false)); + } + is_allocate_buffer_ = true; } @@ -97,6 +113,13 @@ void LlamaContextDecoder::freeBuffer() allocator_->free((void**)(&h_pinned_token_num_ptr_), true); allocator_->free((void**)(&padding_offset_)); allocator_->free((void**)(&cu_seqlens_)); + if (compact_decoder_features_ != nullptr) { + allocator_->free((void**)(&compact_decoder_features_)); + allocator_->free((void**)(&compact_attention_mask_)); + allocator_->free((void**)(&compact_input_lengths_)); + allocator_->free((void**)(&k_cache_layer_)); + allocator_->free((void**)(&v_cache_layer_)); + } is_allocate_buffer_ = false; } } @@ -147,6 +170,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -163,6 +187,7 @@ LlamaContextDecoder::LlamaContextDecoder(size_t pipeline_para_(pipeline_para), is_qk_buf_float_(is_qk_buf_float), attention_type_(attention_type), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -185,6 +210,7 @@ LlamaContextDecoder::LlamaContextDecoder(LlamaContextDecoder const& decode pipeline_para_(decoder.pipeline_para_), is_qk_buf_float_(decoder.is_qk_buf_float_), attention_type_(decoder.attention_type_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -238,15 +264,23 @@ void 
LlamaContextDecoder::forward(std::unordered_map* // For example, the shape of decoder_input becomes [ite, batch_size, seq_len, hidden_dimension] during // computing. - FT_CHECK(input_tensors->size() == 5); + FT_CHECK(input_tensors->size() >= 5); FT_CHECK(output_tensors->size() == 4); - const int batch_size = input_tensors->at("decoder_input").shape[0]; - const int seq_len = input_tensors->at("decoder_input").shape[1]; + const bool use_shared_contexts = input_tensors->find("compact_idx") != input_tensors->end(); + FT_CHECK(!use_shared_contexts || (input_tensors->find("batch_to_compact_idx") != input_tensors->end())); + const size_t request_batch_size = input_tensors->at("decoder_input").shape[0]; + // compacted batch size. + const size_t batch_size = + use_shared_contexts ? input_tensors->at("compact_idx").shape[0] : input_tensors->at("decoder_input").shape[0]; + const int seq_len = input_tensors->at("decoder_input").shape[1]; // max_input_len + // The maximum length of generation. + const size_t max_seq_len = output_tensors->at("value_cache").shape[3]; + const int max_prompt_length = input_tensors->at("attention_mask").shape[3] - input_tensors->at("attention_mask").shape[2]; const DataType data_type = getTensorType(); - allocateBuffer(batch_size, seq_len); + allocateBuffer(batch_size, seq_len, use_shared_contexts); T* decoder_input = input_tensors->at("decoder_input").getPtr(); T* decoder_output = output_tensors->at("decoder_output").getPtr(); @@ -254,6 +288,20 @@ void LlamaContextDecoder::forward(std::unordered_map* const T** d_prefix_prompt_batch = input_tensors->at("d_prefix_prompt_batch").getPtr(); const int* d_prefix_prompt_lengths = input_tensors->at("d_prefix_prompt_lengths").getPtr(); + if (use_shared_contexts) { + invokeCompactInputs(compact_decoder_features_, + compact_attention_mask_, + compact_input_lengths_, + decoder_input, + attention_mask, + input_tensors->at("input_lengths").getPtr(), + input_tensors->at("compact_idx").getPtr(), + batch_size, + seq_len, + hidden_units_, + stream_); + } + const int local_batch_size = getLocalBatchSize(batch_size, seq_len, pipeline_para_.world_size_); FT_CHECK(batch_size % local_batch_size == 0); const int iteration_num = batch_size / local_batch_size; @@ -271,6 +319,12 @@ void LlamaContextDecoder::forward(std::unordered_map* self_v_cache_size.push_back(*t); } + if (use_shared_contexts) { + // we use k_cache_layer_ and v_cache_layer_ + self_k_cache_size[3] = seq_len; + self_v_cache_size[2] = seq_len; + } + AttentionType attention_type = (d_prefix_prompt_lengths != nullptr) ? getUnfusedAttentionType(attention_type_) : attention_type_; @@ -279,7 +333,8 @@ void LlamaContextDecoder::forward(std::unordered_map* for (int ite = 0; ite < iteration_num; ite++) { size_t h_token_num = local_batch_size * seq_len; if (is_unpadded_mha) { - const int* base_input_lengths = input_tensors->at("input_lengths").getPtr(); + const int* base_input_lengths = + use_shared_contexts ? compact_input_lengths_ : input_tensors->at("input_lengths").getPtr(); invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_, &h_token_num, padding_offset_, @@ -295,8 +350,9 @@ void LlamaContextDecoder::forward(std::unordered_map* } if (l == 0 && is_unpadded_mha) { + const T* base_input = (use_shared_contexts ? 
compact_decoder_features_ : decoder_input); invokeRemovePadding(decoder_layer_output_, - decoder_input + ite * local_batch_size * seq_len * hidden_units_, + base_input + ite * local_batch_size * seq_len * hidden_units_, padding_offset_, h_token_num, hidden_units_, @@ -308,11 +364,11 @@ void LlamaContextDecoder::forward(std::unordered_map* T* layer_output = decoder_layer_output_; if (!is_unpadded_mha) { if (l == 0) { - layer_input = decoder_input; + layer_input = use_shared_contexts ? compact_decoder_features_ : decoder_input; layer_input += ite * local_batch_size * seq_len * hidden_units_; } if (l == num_layer_ - 1) { - layer_output = decoder_output; + layer_output = use_shared_contexts ? compact_decoder_features_ : decoder_output; layer_output += ite * local_batch_size * seq_len * hidden_units_; } } @@ -328,7 +384,7 @@ void LlamaContextDecoder::forward(std::unordered_map* ftNcclAllGather(layer_input, layer_input, data_size, tensor_para_.rank_, tensor_para_, stream_); } } - + // TODO: the LayerNorm used here differs from the NeoX one; unclear whether it needs to be switched to the int8 LayerNorm invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -337,8 +393,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, stream_); + sync_check_cuda_error(); + const T* attention_ptr = use_shared_contexts ? compact_attention_mask_ : attention_mask; + TensorMap self_attention_input_tensors{ {"input_query", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, decoder_normed_input_}}, @@ -346,7 +405,7 @@ void LlamaContextDecoder::forward(std::unordered_map* Tensor{MEMORY_GPU, data_type, {(size_t)local_batch_size, (size_t)1, (size_t)seq_len, (size_t)(seq_len + max_prompt_length)}, - attention_mask + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, + attention_ptr + local_batch_size * ite * seq_len * (seq_len + max_prompt_length)}}, {"attention_type", Tensor{MEMORY_CPU, TYPE_VOID, {1}, &attention_type}}, {"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {(size_t)1}, &is_final}}, {"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {(size_t)1}, &l}}}; @@ -381,17 +440,43 @@ void LlamaContextDecoder::forward(std::unordered_map* } cache_offset += ite_cache_offset; + T* k_cache_ptr = use_shared_contexts ? k_cache_layer_ : k_cache.getPtrWithOffset(cache_offset); + T* v_cache_ptr = use_shared_contexts ? v_cache_layer_ : v_cache.getPtrWithOffset(cache_offset); + TensorMap self_attention_output_tensors{ {"hidden_features", Tensor{MEMORY_GPU, data_type, {h_token_num, (size_t)hidden_units_}, self_attn_output_}}, - {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache.getPtrWithOffset(cache_offset)}}, - {"value_cache", - Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache.getPtrWithOffset(cache_offset)}}}; + {"key_cache", Tensor{MEMORY_GPU, data_type, self_k_cache_size, k_cache_ptr}}, + {"value_cache", Tensor{MEMORY_GPU, data_type, self_v_cache_size, v_cache_ptr}}}; self_attention_layer_->forward(&self_attention_output_tensors, &self_attention_input_tensors, &gpt_decoder_layer_weight->at(l)->self_attention_weights); + if (use_shared_contexts) { + // Even with local batches, we must process the whole K/V caches as any + // element in batch_idx_to_compact_idx may reference the local batch + // we're processing. We also need to discard references that aren't in + // that particular local batch.
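+ // Concretely: when use_shared_contexts is set, the attention call above wrote this layer's K/V
+ // results into the temporary k_cache_layer_ / v_cache_layer_ buffers (sized for the compacted batch),
+ // and invokeUnCompactCaches below scatters them back into the full key/value cache through
+ // batch_to_compact_idx, so every entry of the original request batch gets its own cache slice for
+ // the generation phase.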
+ const size_t cache_stride_per_batch = hidden_units_ / tensor_para_.world_size_ * max_seq_len; + const size_t cache_layer_offset = + (l - getFirstLayerParallelId()) * request_batch_size * cache_stride_per_batch; + invokeUnCompactCaches(k_cache.getPtrWithOffset(cache_layer_offset), + v_cache.getPtrWithOffset(cache_layer_offset), + k_cache_layer_, + v_cache_layer_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch_size (uncompact) + v_cache.shape[2], // local_head_num + max_seq_len, + seq_len, + size_per_head_, + local_batch_size, + ite, + stream_); + sync_check_cuda_error(); + } + if (is_final == false) { if (use_gptj_residual_) { invokeGeneralLayerNorm(decoder_normed_input_, @@ -402,10 +487,11 @@ void LlamaContextDecoder::forward(std::unordered_map* h_token_num, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { + // TODO: modify or not ? invokeGeneralAddResidualT5PreLayerNorm( self_attn_output_, decoder_normed_input_, @@ -472,7 +558,8 @@ void LlamaContextDecoder::forward(std::unordered_map* } if ((l == num_layer_ - 1) && is_unpadded_mha) { - invokeRebuildPadding(decoder_output + ite * local_batch_size * seq_len * hidden_units_, + T* base_ptr = use_shared_contexts ? compact_decoder_features_ : decoder_output; + invokeRebuildPadding(base_ptr + ite * local_batch_size * seq_len * hidden_units_, decoder_layer_output_, padding_offset_, h_token_num, @@ -483,12 +570,22 @@ void LlamaContextDecoder::forward(std::unordered_map* } } + if (use_shared_contexts) { + invokeUnCompactOutputs(decoder_output, + compact_decoder_features_, + input_tensors->at("batch_to_compact_idx").getPtr(), + request_batch_size, // batch + seq_len * hidden_units_, + stream_); + sync_check_cuda_error(); + } + // TODO(bhsueh) We could optimize this point by only computing the last token for the last layer invokeLookupHiddenStateOfLastToken(output_tensors->at("last_token_hidden_units").getPtr(), output_tensors->at("decoder_output").getPtr(), input_tensors->at("input_lengths").getPtr(), seq_len, - batch_size, + request_batch_size, hidden_units_, stream_); sync_check_cuda_error(); diff --git a/src/fastertransformer/models/llama/LlamaContextDecoder.h b/src/fastertransformer/models/llama/LlamaContextDecoder.h index 788d1d45d..a2f91f7b8 100644 --- a/src/fastertransformer/models/llama/LlamaContextDecoder.h +++ b/src/fastertransformer/models/llama/LlamaContextDecoder.h @@ -56,13 +56,15 @@ class LlamaContextDecoder: public BaseLayer { AttentionType attention_type_; + int int8_mode_ = 0; + bool is_qk_buf_float_; BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; void allocateBuffer() override; - void allocateBuffer(size_t batch_size, size_t seq_len); + void allocateBuffer(size_t batch_size, size_t seq_len, bool use_shared_contexts); void freeBuffer() override; bool isValidLayerParallelId(uint l); @@ -81,6 +83,12 @@ class LlamaContextDecoder: public BaseLayer { int* padding_offset_ = nullptr; int* cu_seqlens_ = nullptr; + T* compact_decoder_features_ = nullptr; + T* compact_attention_mask_ = nullptr; + int* compact_input_lengths_ = nullptr; + T* k_cache_layer_ = nullptr; + T* v_cache_layer_ = nullptr; + public: LlamaContextDecoder(size_t head_num, size_t size_per_head, @@ -98,6 +106,7 @@ class LlamaContextDecoder: public BaseLayer { bool is_free_buffer_after_forward, bool is_qk_buf_float, AttentionType attention_type = AttentionType::FUSED_MHA, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git 
a/src/fastertransformer/models/llama/LlamaDecoder.cc b/src/fastertransformer/models/llama/LlamaDecoder.cc index a5cffa731..b68c82207 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.cc +++ b/src/fastertransformer/models/llama/LlamaDecoder.cc @@ -35,10 +35,11 @@ void LlamaDecoder::initialize() !use_gptj_residual_, is_free_buffer_after_forward_, false, - 0, + int8_mode_, custom_all_reduce_comm_, enable_custom_all_reduce_); + // TODO: the SiLU FFN layer does not support int8 yet ffn_layer_ = new TensorParallelSiluFfnLayer(0, // max_batch_size 1, head_num_, @@ -54,7 +55,8 @@ void LlamaDecoder::initialize() false, true, // use_gated_activation = true; custom_all_reduce_comm_, - enable_custom_all_reduce_); + enable_custom_all_reduce_, + int8_mode_); } template @@ -133,6 +135,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode, std::shared_ptr custom_all_reduce_comm, int enable_custom_all_reduce): BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), @@ -147,6 +150,7 @@ LlamaDecoder::LlamaDecoder(size_t head_num, hidden_units_(head_num_ * size_per_head), tensor_para_(tensor_para), pipeline_para_(pipeline_para), + int8_mode_(int8_mode), custom_all_reduce_comm_(custom_all_reduce_comm), enable_custom_all_reduce_(enable_custom_all_reduce) { @@ -167,6 +171,7 @@ LlamaDecoder::LlamaDecoder(LlamaDecoder const& decoder): hidden_units_(decoder.hidden_units_), tensor_para_(decoder.tensor_para_), pipeline_para_(decoder.pipeline_para_), + int8_mode_(decoder.int8_mode_), custom_all_reduce_comm_(decoder.custom_all_reduce_comm_), enable_custom_all_reduce_(decoder.enable_custom_all_reduce_) { @@ -260,6 +265,7 @@ void LlamaDecoder::forward(std::unordered_map* } } + // TODO: this uses the T5 LayerNorm, which takes no int8 parameters here invokeGeneralT5LayerNorm(decoder_normed_input_, layer_input, gpt_decoder_layer_weight->at(l)->pre_layernorm_weights.gamma, @@ -301,7 +307,7 @@ void LlamaDecoder::forward(std::unordered_map* local_batch_size, hidden_units_, (float*)nullptr, - 0, + int8_mode_, stream_); } else { diff --git a/src/fastertransformer/models/llama/LlamaDecoder.h b/src/fastertransformer/models/llama/LlamaDecoder.h index 6cdd7df27..dc44b0f32 100644 --- a/src/fastertransformer/models/llama/LlamaDecoder.h +++ b/src/fastertransformer/models/llama/LlamaDecoder.h @@ -70,6 +70,8 @@ class LlamaDecoder: public BaseLayer { BaseAttentionLayer* self_attention_layer_; FfnLayer* ffn_layer_; + int int8_mode_ = 0; + public: LlamaDecoder(size_t head_num, size_t size_per_head, @@ -85,6 +87,7 @@ class LlamaDecoder: public BaseLayer { cublasMMWrapper* cublas_wrapper, IAllocator* allocator, bool is_free_buffer_after_forward, + int int8_mode = 0, std::shared_ptr custom_all_reduce_comm = nullptr, int enable_custom_all_reduce_ = 0); diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc index 3e97b67d0..a8dadefea 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc @@ -24,15 +24,26 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size, const int tensor_para_rank, - const bool use_gptj_residual): + const bool use_gptj_residual, + const int int8_mode): hidden_units_(hidden_units), inter_size_(inter_size), tensor_para_size_(tensor_para_size), tensor_para_rank_(tensor_para_rank), + int8_mode_(int8_mode),
use_gptj_residual_(use_gptj_residual) { mallocWeights(); setWeightPtr(); + + FT_CHECK_WITH_INFO(int8_mode_ != 2, "Llama doesn't support int8_mode == 2"); + FT_CHECK_WITH_INFO(!(std::is_same::value && int8_mode_ == 1), + "Weight only quant does not work with FP32 compute."); +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const int int8_mode): int8_mode_(int8_mode) +{ } template @@ -60,38 +71,92 @@ LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() ffn_weights.intermediate_weight2.bias = nullptr; ffn_weights.output_weight.kernel = nullptr; ffn_weights.output_weight.bias = nullptr; + + if (int8_mode_ != 0) { + for (int i = 0; i < int8_weights_ptr.size(); i++) { + if (int8_weights_ptr[i] != nullptr) { + deviceFree(int8_weights_ptr[i]); + } + } + + if (int8_mode_ == 1) { + for (int i = 0; i < weight_only_scale_ptr.size(); i++) { + if (weight_only_scale_ptr[i] != nullptr) { + deviceFree(weight_only_scale_ptr[i]); + } + } + } + + self_attention_weights.query_weight.int8_kernel = nullptr; + self_attention_weights.query_weight.weight_only_quant_scale = nullptr; + self_attention_weights.attention_output_weight.int8_kernel = nullptr; + self_attention_weights.attention_output_weight.weight_only_quant_scale = nullptr; + + // Note the mapping: intermediate_weight => gate_proj; intermediate_weight2 => up_proj; output_weight => down_proj. + ffn_weights.intermediate_weight.int8_kernel = nullptr; + ffn_weights.intermediate_weight.weight_only_quant_scale = nullptr; + ffn_weights.intermediate_weight2.int8_kernel = nullptr; + ffn_weights.intermediate_weight2.weight_only_quant_scale = nullptr; + ffn_weights.output_weight.int8_kernel = nullptr; + ffn_weights.output_weight.weight_only_quant_scale = nullptr; + } + is_maintain_buffer = false; } } template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): - hidden_units_(other.hidden_units_), - inter_size_(other.inter_size_), - tensor_para_size_(other.tensor_para_size_), - tensor_para_rank_(other.tensor_para_rank_), - use_gptj_residual_(other.use_gptj_residual_) +void LlamaDecoderLayerWeight::copyFrom(const LlamaDecoderLayerWeight& other) { - mallocWeights(); cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); if (!use_gptj_residual_) { cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); } - - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + + if (int8_mode_ == 0) { + cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4],
hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); + } + else { + cudaD2Dcpy(int8_weights_ptr[0], other.int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[1], other.int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + cudaD2Dcpy(int8_weights_ptr[2], other.int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[3], other.int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + cudaD2Dcpy(int8_weights_ptr[4], other.int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + if (int8_mode_ == 1) { + cudaD2Dcpy(weight_only_scale_ptr[0], other.weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[1], other.weight_only_scale_ptr[1], hidden_units_); + cudaD2Dcpy(weight_only_scale_ptr[2], other.weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + + // TODO: unclear whether the scale stored here belongs to the gate_proj weight or is used by up_proj/down_proj; verify later and fix if needed + cudaD2Dcpy(weight_only_scale_ptr[3], other.weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + cudaD2Dcpy(weight_only_scale_ptr[4], other.weight_only_scale_ptr[4], hidden_units_); + } + } +} + +template +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other): + hidden_units_(other.hidden_units_), + inter_size_(other.inter_size_), + tensor_para_size_(other.tensor_para_size_), + tensor_para_rank_(other.tensor_para_rank_), + int8_mode_(other.int8_mode_), + use_gptj_residual_(other.use_gptj_residual_) +{ + mallocWeights(); + copyFrom(other); setWeightPtr(); } @@ -102,26 +167,12 @@ LlamaDecoderLayerWeight& LlamaDecoderLayerWeight::operator=(const LlamaDec inter_size_ = other.inter_size_; tensor_para_size_ = other.tensor_para_size_; tensor_para_rank_ = other.tensor_para_rank_; + int8_mode_ = other.int8_mode_; use_gptj_residual_ = other.use_gptj_residual_; mallocWeights(); - cudaD2Dcpy(weights_ptr[0], other.weights_ptr[0], hidden_units_); - cudaD2Dcpy(weights_ptr[1], other.weights_ptr[1], hidden_units_); - cudaD2Dcpy(weights_ptr[2], other.weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[3], other.weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[4], other.weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); - if (!use_gptj_residual_) { - cudaD2Dcpy(weights_ptr[5], other.weights_ptr[5], hidden_units_); - } - cudaD2Dcpy(weights_ptr[6], other.weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[7], other.weights_ptr[7], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[8], other.weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[9], other.weights_ptr[9], inter_size_ / tensor_para_size_); - cudaD2Dcpy(weights_ptr[10], other.weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); - cudaD2Dcpy(weights_ptr[11], other.weights_ptr[11], hidden_units_); - cudaD2Dcpy(weights_ptr[12], other.weights_ptr[12], hidden_units_); - cudaD2Dcpy(weights_ptr[13], other.weights_ptr[13], hidden_units_); + copyFrom(other); setWeightPtr(); return *this; } @@ -137,42
+188,90 @@ void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType loadWeightFromBin( weights_ptr[1], {(size_t)hidden_units_}, dir_path + ".input_layernorm.weight.bin", model_file_type); - loadWeightFromBin(weights_ptr[2], + deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); + + if (!use_gptj_residual_) { + deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); + } + + // FIXME(sunpeng17): check if the weights are correct + // loadWeightFromBin(weights_ptr[6], + // {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + // dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + // model_file_type); + + deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); + + // loadWeightFromBin(weights_ptr[10], + // {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + // dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + // model_file_type); + + + deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + + deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); + loadWeightFromBin( + weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + + if (int8_mode_ == 0) { + loadWeightFromBin(weights_ptr[2], {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, dir_path + ".attention.query_key_value.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[3], (size_t)(3 * hidden_units_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[4], + loadWeightFromBin(weights_ptr[4], {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".attention.dense.weight." + rank_spec + ".bin", model_file_type); - if (!use_gptj_residual_) { - deviceFill(weights_ptr[5], (size_t)hidden_units_, (T)0.0); - } - // FIXME(sunpeng17): check if the weights are correct - loadWeightFromBin(weights_ptr[6], + loadWeightFromBin(weights_ptr[6], {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[7], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - loadWeightFromBin(weights_ptr[8], - {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + loadWeightFromBin(weights_ptr[8], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[9], (size_t)(inter_size_ / tensor_para_size_), (T)0.0); - - loadWeightFromBin(weights_ptr[10], + loadWeightFromBin(weights_ptr[10], {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", model_file_type); - deviceFill(weights_ptr[11], (size_t)(hidden_units_), (T)0.0); + } + else if (int8_mode_ == 1) { + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[0], + weight_only_scale_ptr[0], + {(size_t)hidden_units_, (size_t)(3 * hidden_units_ / tensor_para_size_)}, + dir_path + ".attention.query_key_value.weight." 
+ rank_spec + ".bin", + model_file_type); - deviceFill(weights_ptr[12], (size_t)(hidden_units_), (T)0.0); - loadWeightFromBin( - weights_ptr[13], {(size_t)hidden_units_}, dir_path + ".post_attention_layernorm.weight.bin", model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[1], + weight_only_scale_ptr[1], + {(size_t)(hidden_units_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".attention.dense.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[2], + weight_only_scale_ptr[2], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.gate_proj.weight." + rank_spec + ".bin", + model_file_type); + + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[3], + weight_only_scale_ptr[3], + {(size_t)hidden_units_, (size_t)(inter_size_ / tensor_para_size_)}, + dir_path + ".mlp.up_proj.weight." + rank_spec + ".bin", + model_file_type); + loadWeightFromBinAndQuantizeForWeightOnly(int8_weights_ptr[4], + weight_only_scale_ptr[4], + {(size_t)(inter_size_ / tensor_para_size_), (size_t)hidden_units_}, + dir_path + ".mlp.down_proj.weight." + rank_spec + ".bin", + model_file_type); + + } } template @@ -194,6 +293,23 @@ void LlamaDecoderLayerWeight::setWeightPtr() post_attention_layernorm_weights.beta = weights_ptr[12]; post_attention_layernorm_weights.gamma = weights_ptr[13]; + + if (int8_mode_ != 0) { + self_attention_weights.query_weight.int8_kernel = int8_weights_ptr[0]; + self_attention_weights.attention_output_weight.int8_kernel = int8_weights_ptr[1]; + ffn_weights.intermediate_weight.int8_kernel = int8_weights_ptr[2]; + ffn_weights.intermediate_weight2.int8_kernel = int8_weights_ptr[3]; + ffn_weights.output_weight.int8_kernel = int8_weights_ptr[4]; + + if (int8_mode_ == 1) { + self_attention_weights.query_weight.weight_only_quant_scale = weight_only_scale_ptr[0]; + self_attention_weights.attention_output_weight.weight_only_quant_scale = weight_only_scale_ptr[1]; + ffn_weights.intermediate_weight.weight_only_quant_scale = weight_only_scale_ptr[2]; + ffn_weights.intermediate_weight2.weight_only_quant_scale = weight_only_scale_ptr[3]; + ffn_weights.output_weight.weight_only_quant_scale = weight_only_scale_ptr[4]; + } + } + is_maintain_buffer = true; } @@ -202,21 +318,48 @@ void LlamaDecoderLayerWeight::mallocWeights() { deviceMalloc(&weights_ptr[0], hidden_units_); // pre layernorm beta deviceMalloc(&weights_ptr[1], hidden_units_); // pre layernorm gamma - deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel + // deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv kernel deviceMalloc(&weights_ptr[3], 3 * hidden_units_ / tensor_para_size_); // qkv bias - deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + // deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight if (!use_gptj_residual_) { deviceMalloc(&weights_ptr[5], hidden_units_); // attention output bias } - deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + // deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel deviceMalloc(&weights_ptr[7], inter_size_ / tensor_para_size_); // intermediate_weight bias - deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // 
intermediate_weight2 kernel + // deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel deviceMalloc(&weights_ptr[9], inter_size_ / tensor_para_size_); // intermediate_weight2 bias - deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + // deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel deviceMalloc(&weights_ptr[11], hidden_units_); // output_weight bias deviceMalloc(&weights_ptr[12], hidden_units_); // post attn layernorm beta deviceMalloc(&weights_ptr[13], hidden_units_); // post attn layernorm gamma + + if (int8_mode_ == 0) { + deviceMalloc(&weights_ptr[2], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); // qkv weight + deviceMalloc(&weights_ptr[4], hidden_units_ / tensor_para_size_ * hidden_units_); // attention output weight + deviceMalloc(&weights_ptr[6], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight kernel + deviceMalloc(&weights_ptr[8], hidden_units_ * inter_size_ / tensor_para_size_); // intermediate_weight2 kernel + deviceMalloc(&weights_ptr[10], inter_size_ / tensor_para_size_ * hidden_units_); // output_weight kernel + } + else { + // Alloc FFN and Attention int8 weights + deviceMalloc(&int8_weights_ptr[0], hidden_units_ * 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[1], hidden_units_ / tensor_para_size_ * hidden_units_); + deviceMalloc(&int8_weights_ptr[2], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[3], hidden_units_ * inter_size_ / tensor_para_size_); + deviceMalloc(&int8_weights_ptr[4], inter_size_ / tensor_para_size_ * hidden_units_); + + + if (int8_mode_ == 1) { + // Alloc scales for weight only quant for attention and FFN weights + deviceMalloc(&weight_only_scale_ptr[0], 3 * hidden_units_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[1], hidden_units_); + deviceMalloc(&weight_only_scale_ptr[2], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[3], inter_size_ / tensor_para_size_); + deviceMalloc(&weight_only_scale_ptr[4], hidden_units_); + } + } + } template struct LlamaDecoderLayerWeight; diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h index 008e1a3b4..5a76ba4c5 100644 --- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h +++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h @@ -29,11 +29,13 @@ template struct LlamaDecoderLayerWeight { public: LlamaDecoderLayerWeight() = default; + LlamaDecoderLayerWeight(const int int8_mode); LlamaDecoderLayerWeight(const int hidden_units, const int inter_size, const int tensor_para_size = 1, const int tensor_para_rank = 0, - const bool use_gptj_residual = true); + const bool use_gptj_residual = true, + const int int8_mode = 0); ~LlamaDecoderLayerWeight(); LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other); LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other); @@ -54,9 +56,14 @@ struct LlamaDecoderLayerWeight { const int attention_dense_bias_weight_id = 5; bool is_maintain_buffer = false; T* weights_ptr[14]; + int int8_mode_ = 0; + + std::vector int8_weights_ptr = std::vector(5, nullptr); + std::vector weight_only_scale_ptr = std::vector(5, nullptr); void setWeightPtr(); void mallocWeights(); + void copyFrom(const LlamaDecoderLayerWeight& other); }; } // namespace fastertransformer diff --git 
a/src/fastertransformer/models/llama/LlamaWeight.cc b/src/fastertransformer/models/llama/LlamaWeight.cc index 6105267ff..e9e11b6a1 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.cc +++ b/src/fastertransformer/models/llama/LlamaWeight.cc @@ -29,6 +29,7 @@ LlamaWeight::LlamaWeight(const int hidden_un const int layer_para_size, const int layer_para_rank, const bool use_gptj_residual, + const int int8_mode, PromptLearningType prompt_learning_type, std::map> prompt_learning_pair): hidden_units_(hidden_units), @@ -41,6 +42,7 @@ LlamaWeight::LlamaWeight(const int hidden_un layer_para_size_(layer_para_size), layer_para_rank_(layer_para_rank), use_gptj_residual_(use_gptj_residual), + int8_mode_(int8_mode), prompt_learning_type_(prompt_learning_type), prompt_learning_pair_(prompt_learning_pair) { @@ -62,7 +64,7 @@ LlamaWeight::LlamaWeight(const int hidden_un for (int l = 0; l < num_layer_; l++) { if (isValidLayerParallelId(l)) { decoder_layer_weights.push_back(new LlamaDecoderLayerWeight( - hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_)); + hidden_units_, inter_size_, tensor_para_size_, tensor_para_rank_, use_gptj_residual_, int8_mode_)); } else { // Layer-parallelism: allocate empty layer because @@ -103,6 +105,7 @@ LlamaWeight::LlamaWeight(const LlamaWeight& other): layer_para_size_(other.layer_para_size_), layer_para_rank_(other.layer_para_rank_), use_gptj_residual_(other.use_gptj_residual_), + int8_mode_(other.int8_mode_), prompt_token_weight_size_(other.prompt_token_weight_size_), malloc_load_prompt_weights_(other.malloc_load_prompt_weights_), prompt_learning_type_(other.prompt_learning_type_), @@ -149,6 +152,7 @@ LlamaWeight& LlamaWeight::operator=(const LlamaWeight& other) layer_para_size_ = other.layer_para_size_; layer_para_rank_ = other.layer_para_rank_; use_gptj_residual_ = other.use_gptj_residual_; + int8_mode_ = other.int8_mode_; prompt_token_weight_size_ = other.prompt_token_weight_size_; malloc_load_prompt_weights_ = other.malloc_load_prompt_weights_; prompt_learning_type_ = other.prompt_learning_type_; diff --git a/src/fastertransformer/models/llama/LlamaWeight.h b/src/fastertransformer/models/llama/LlamaWeight.h index ec909ca49..eb90dc5bf 100644 --- a/src/fastertransformer/models/llama/LlamaWeight.h +++ b/src/fastertransformer/models/llama/LlamaWeight.h @@ -38,6 +38,7 @@ struct LlamaWeight { const int layer_para_size = 1, const int layer_para_rank = 0, const bool use_gptj_residual_ = false, + const int int8_mode = 0, PromptLearningType prompt_learning_type = PromptLearningType::no_prompt, std::map> prompt_learning_pair = std::map>{}); @@ -88,6 +89,8 @@ struct LlamaWeight { int layer_para_size_; int layer_para_rank_; + size_t int8_mode_ = 0; + // residual type bool use_gptj_residual_;
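// Usage sketch (illustrative, not part of the patch): how the two knobs introduced by this change are
// expected to be wired. The config section/key names and the constructor argument order follow the
// call site in examples/cpp/llama/llama_example.cc above; the helper function below and its name are
// hypothetical and only restate the shared-contexts gating from Llama<T>::forward() as standalone,
// compilable code.
//
//   int int8_mode = reader.GetInteger("ft_instance_hyperparameter", "int8_mode", 0);
//   // 0 = no quantization, 1 = weight-only int8; int8_mode == 2 is rejected by the
//   // FT_CHECK_WITH_INFO in LlamaDecoderLayerWeight.
//   // shared_contexts_ratio is the last Llama constructor argument (1.0f in the example).

#include <cstddef>

// Prompt compaction is attempted only for a real batch of non-empty prompts, and kept only if the
// number of distinct prompts reported by invokeFindContextDups stays within ratio * batch_size.
bool should_use_shared_contexts(float shared_contexts_ratio,
                                std::size_t batch_size,
                                int max_input_length,
                                int compact_size /* distinct prompts found by invokeFindContextDups */)
{
    bool use = (shared_contexts_ratio > 0.0f) && (max_input_length >= 1) && (batch_size > 1);
    if (use) {
        use = compact_size <= shared_contexts_ratio * static_cast<float>(batch_size);
    }
    return use;
}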