diff --git a/src/fastertransformer/layers/FfnLayer.cc b/src/fastertransformer/layers/FfnLayer.cc
index 14bb5e3f6..4b18b54ad 100644
--- a/src/fastertransformer/layers/FfnLayer.cc
+++ b/src/fastertransformer/layers/FfnLayer.cc
@@ -81,7 +81,7 @@ void FfnLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, c
     }
 
     // TODO: INT8 and Sparsity are currently not implemented (geglu or reglu)
-    const bool use_gated_activation = use_gated_activation_ && ffn_weights->intermediate_weight2.kernel != nullptr;
+    const bool use_gated_activation = use_gated_activation_ && (ffn_weights->intermediate_weight2.kernel != nullptr || ffn_weights->intermediate_weight2.int8_kernel != nullptr);
 
     // moe can't be used with use_gated_activation currently
     FT_CHECK(!(use_gated_activation && use_moe));
diff --git a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc
index a8dadefea..b7e0fe002 100644
--- a/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc
+++ b/src/fastertransformer/models/llama/LlamaDecoderLayerWeight.cc
@@ -50,7 +50,7 @@ template<typename T>
 LlamaDecoderLayerWeight<T>::~LlamaDecoderLayerWeight()
 {
     if (is_maintain_buffer == true) {
-        for (int i = 0; i < 12; i++) {
+        for (int i = 0; i < 14; i++) {
             if (!use_gptj_residual_ && i != attention_dense_bias_weight_id) {
                 cudaFree(weights_ptr[i]);
             }
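
Note on the hunks above (not part of the patch): a minimal sketch of the intent behind the FfnLayer.cc change, assuming FasterTransformer's DenseWeight keeps the floating-point weight in kernel and the quantized weight in int8_kernel, so in INT8 mode kernel is null and the old check would silently skip the gated (geglu/reglu) path. The struct and helper names below are illustrative stand-ins, not the library's API.

    #include <cstdint>

    // Illustrative stand-ins for the weight structs (names and layout assumed).
    struct DenseWeightSketch {
        const float*  kernel      = nullptr;  // FP weight; null when running INT8
        const int8_t* int8_kernel = nullptr;  // quantized weight; null when running FP
    };

    struct FfnWeightSketch {
        DenseWeightSketch intermediate_weight2;  // second projection used by geglu/reglu
    };

    // The gated path should be taken when either representation of the second
    // intermediate weight is present, which mirrors the patched condition.
    inline bool has_gated_weights(const FfnWeightSketch& w)
    {
        return w.intermediate_weight2.kernel != nullptr
               || w.intermediate_weight2.int8_kernel != nullptr;
    }

The LlamaDecoderLayerWeight.cc hunk looks like matching bookkeeping: if two extra pointers (for example, the second intermediate weight's kernel and bias) were appended to weights_ptr, the destructor must free 14 entries rather than 12, otherwise the last two allocations leak.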