From 2f87167f70ca6bd268b47744c2cf57689093c8d4 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 30 Nov 2020 14:34:03 +0000
Subject: [PATCH 1/9] Layer normalization code from Marian

---
 LICENSE                       |   2 +
 src/operator/nn/layer_norm.cc | 164 +++++++++++++++++++++++++++++++---
 2 files changed, 154 insertions(+), 12 deletions(-)

diff --git a/LICENSE b/LICENSE
index d9c398334c26..610bb5fdfca0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -309,6 +309,8 @@
     (Except 3rdparty/intgemm/test/3rd_party/catch.hpp is under the Boost license, listed separately)
     builtin_fp16.h - For details, see 3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
     Copyright (c) 2009-2015 by llvm/compiler-rt contributors
+    LayerNormCPUKernel from Marian - For details, see src/operator/nn/layer_norm.cc
+    Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam Mickiewicz University

 =======================================================================================
 3-clause BSD license

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 11178b358c2d..79f2fe6addef 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -15,6 +15,37 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
+ *
+ * Function LayerNormCPUKernel is adapted from Marian
+ * https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
+ * under the MIT license
+ * MIT License
+ *
+ * Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
+ * Mickiewicz University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
  */

 /*!
@@ -68,16 +99,117 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
   return true;
 }

-template<>
-void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+/* CPU optimized kernel for LayerNorm assuming axis = -1.
+ * Data is the underlying storage data type.
+ * Accum is the type to use for accumulation.
+ *   There isn't a reduction operator for half_t, and half_t isn't efficient to
+ *   use on the CPU anyway, so use float for reduction of half_t.
+ *
+ * width is the number of values being summed to compute a mean.
+ * instances is how many independent layer normalization problems are packed into the tensors.
+ *
+ * Inputs:
+ * data is instances x width
+ * gamma is width
+ * beta is width
+ *
+ * Outputs:
+ * out is instances x width, can be the same as data
+ * mean is instances: the mean of each problem
+ * std is instances: the standard deviation of each problem
+ */
+template <typename Data,
+          typename Accum = typename
+            std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
+                             float,
+                             Data>::type>
+void LayerNormCPUKernel(size_t width,
+                        size_t instances,
+                        Data eps,
+                        const Data *data,
+                        const Data *gamma,
+                        const Data *beta,
+                        Data *out,
+                        Data *mean,
+                        Data *std) {
+  // Parallelize over independent instances to normalize.
+#pragma omp parallel for
+  for (size_t j = 0; j < instances; ++j) {
+    const Data *from = data + j * width;
+
+    // Sum the values to compute mean.
+    Accum sum = 0.f;
+#pragma omp simd reduction(+ : sum)
+    for (size_t i = 0; i < width; ++i) {
+      sum += from[i];
+    }
+    Accum mean_value = sum / width;
+    mean[j] = static_cast<Data>(mean_value);
+
+    // Sum squares from mean to compute stddev.
+    Accum squares = 0.f;
+#pragma omp simd reduction(+ : squares)
+    for (size_t i = 0; i < width; ++i) {
+      Accum off = from[i] - mean_value;
+      squares += off * off;
+    }
+    Accum sigma = std::sqrt(squares / width + eps);
+    std[j] = static_cast<Data>(sigma);
+
+    // Write normalized values.
+    Data *to = out + j * width;
+#pragma omp simd
+    for (size_t i = 0; i < width; ++i) {
+      to[i] = (from[i] - mean_value) * gamma[i] / sigma + beta[i];
+    }
+  }
+}
+
+/* Wrap the above LayerNormCPUKernel in MXNet's API.  Returns true if it
+ * is able to run.
+ */
+bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
+                  const OpContext& ctx, const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs) {
+  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 3U);
+
+  switch (req[layernorm::kOut]) {
+    case kNullOp:
+      return true;
+    case kWriteTo:
+      break;
+    case kWriteInplace:
+      break;
+    default:
+      // Should only be kAddTo, which isn't supported by the other implementations either.
+      return false;
+  }
+  // Axis must be the last one.
+  int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
+  if (axis != inputs[layernorm::kData].ndim() - 1) {
+    return false;
+  }
+  MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
+    LayerNormCPUKernel<DType>(
+        inputs[layernorm::kData].shape_[axis],
+        outputs[layernorm::kMean].Size(),
+        param.eps,
+        inputs[layernorm::kData].dptr<DType>(),
+        inputs[layernorm::kGamma].dptr<DType>(),
+        inputs[layernorm::kBeta].dptr<DType>(),
+        outputs[layernorm::kOut].dptr<DType>(),
+        outputs[layernorm::kMean].dptr<DType>(),
+        outputs[layernorm::kStd].dptr<DType>());
+  });
+  return true;
 }

 #if MSHADOW_USE_MKL == 1
-void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                          const OpContext& ctx,
                          const std::vector<TBlob>& inputs,
                          const std::vector<OpReqType>& req,
                          const std::vector<TBlob>& outputs) {
@@ -113,13 +245,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                                outputs[layernorm::kStd].dptr<DType>(),
                                static_cast<DType>(param.eps));
     });
+    return true;
   } else {
     // fallback
-    LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+    return false;
   }
 }
 #endif

+template<>
+void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+#if MSHADOW_USE_MKL == 1
+  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
+  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+  LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+}

 template<>
 void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
@@ -175,11 +319,7 @@ axis to be the last item in the input shape.
 })
 .set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
-#else
 .set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
                                            const std::vector<nnvm::NodeEntry>& ograds) {
   std::vector<nnvm::NodeEntry> heads;
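Note: the arithmetic implemented by `LayerNormCPUKernel` above can be sketched in NumPy for reference (a minimal sketch, not part of the patch; the helper name `layer_norm_reference` is made up here). In particular, `eps` is added inside the square root, matching `std::sqrt(squares / width + eps)` in the kernel:

```python
import numpy as np

def layer_norm_reference(data, gamma, beta, eps=1e-5):
    # Per-instance mean and standard deviation over the last axis (width).
    mean = data.mean(axis=-1, keepdims=True)
    # eps sits inside the square root, as in the kernel above.
    std = np.sqrt(((data - mean) ** 2).mean(axis=-1, keepdims=True) + eps)
    out = (data - mean) * gamma / std + beta
    return out, mean.squeeze(-1), std.squeeze(-1)
```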
From 95efe8fcb0e79821c8751f754c2ca8896b15a5a9 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 30 Nov 2020 17:07:58 +0000
Subject: [PATCH 2/9] Remove MKL version of LayerNorm.

Experiment with OMP_NUM_THREADS=4, times in seconds, on a c5.12xlarge:

|batch x channel| New code | MKL      |
|    1x   32 | 0.0000288| 0.0000278|
|  128x   32 | 0.0000308| 0.0000311|
| 2560x   32 | 0.0000712| 0.0000672|
| 4096x   32 | 0.0000946| 0.0000910|
| 8192x   32 | 0.0001597| 0.0001523|
|16384x   32 | 0.0002905| 0.0002619|
|    1x   64 | 0.0000264| 0.0000256|
|  128x   64 | 0.0000339| 0.0000330|
| 2560x   64 | 0.0000829| 0.0000972|
| 4096x   64 | 0.0001137| 0.0001356|
| 8192x   64 | 0.0002027| 0.0002435|
|16384x   64 | 0.0003715| 0.0004639|
|    1x  128 | 0.0000262| 0.0000263|
|  128x  128 | 0.0000325| 0.0000389|
| 2560x  128 | 0.0001074| 0.0001580|
| 4096x  128 | 0.0001505| 0.0002336|
| 8192x  128 | 0.0002861| 0.0004481|
|16384x  128 | 0.0005648| 0.0008613|
|    1x  256 | 0.0000273| 0.0000276|
|  128x  256 | 0.0000390| 0.0000431|
| 2560x  256 | 0.0001533| 0.0002811|
| 4096x  256 | 0.0002258| 0.0004300|
| 8192x  256 | 0.0004300| 0.0008464|
|16384x  256 | 0.0010436| 0.0017613|
|    1x  512 | 0.0000256| 0.0000302|
|  128x  512 | 0.0000408| 0.0000551|
| 2560x  512 | 0.0002444| 0.0005225|
| 4096x  512 | 0.0003828| 0.0008147|
| 8192x  512 | 0.0008832| 0.0017192|
|16384x  512 | 0.0058463| 0.0074497|
|    1x  768 | 0.0000252| 0.0000308|
|  128x  768 | 0.0000450| 0.0000676|
| 2560x  768 | 0.0003440| 0.0007719|
| 4096x  768 | 0.0005890| 0.0013346|
| 8192x  768 | 0.0014946| 0.0026145|
|16384x  768 | 0.0089495| 0.0113557|
|    1x 1024 | 0.0000285| 0.0000308|
|  128x 1024 | 0.0000487| 0.0000786|
| 2560x 1024 | 0.0004614| 0.0010190|
| 4096x 1024 | 0.0008083| 0.0017376|
| 8192x 1024 | 0.0059020| 0.0075588|
|16384x 1024 | 0.0116553| 0.0146855|

Benchmark program:

```python
import mxnet as mx
import time

def time_procedure(shape, count):
  data = mx.nd.random_uniform(shape=shape, low=-1.0, high = 1.0)
  factors = mx.nd.random_uniform(shape=(shape[-1],))
  mx.nd.waitall()
  begin = time.time()
  for i in range(0, count):
    out = mx.nd.LayerNorm(data, factors, factors)
    mx.nd.waitall()
  return (time.time() - begin) / count

count = 200
for channel in [32, 64, 128, 256, 512, 768, 1024]:
  for batch in [1, 128, 2560, 4096, 8192, 16384]:
    s = (batch, channel)
    timing = time_procedure(s, count)
    print("{:5d}x{:5d} | {:.7f}".format(s[0], s[1], timing))
```
---
 src/operator/nn/layer_norm.cc | 52 -----------------------------------
 1 file changed, 52 deletions(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 79f2fe6addef..5d3095058861 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -58,10 +58,6 @@
 #include <nnvm/op_attr_types.h>
 #include "../elemwise_op_common.h"

-#if MSHADOW_USE_MKL == 1
-#include "../mkl_functions-inl.h"
-#endif
-
 namespace mxnet {
 namespace op {

@@ -208,59 +204,11 @@ bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
   return true;
 }

-#if MSHADOW_USE_MKL == 1
-bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
-                         const OpContext& ctx,
-                         const std::vector<TBlob>& inputs,
-                         const std::vector<OpReqType>& req,
-                         const std::vector<TBlob>& outputs) {
-  using namespace mshadow;
-  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
-  if (req[0] == kNullOp) return;
-  CHECK_NE(req[0], kAddTo);
-  CHECK_EQ(inputs.size(), 3U);
-  int axis = GetRealAxis(param.axis, inputs[0].ndim());
-
-  // This optimization only applys for LayerNorm on the last dimension with dtype FP32 or FP64.
-  if (axis == (inputs[layernorm::kData].ndim() - 1) &&
-      (inputs[0].type_flag_ == kFloat32 || inputs[0].type_flag_ == kFloat64)) {
-    // Compute necessary data for the reduce operation.
-    mxnet::TShape red_src_shape, red_dst_shape;
-    BroadcastReduceShapeCompact(inputs[layernorm::kData].shape_, outputs[layernorm::kMean].shape_,
-                                &red_src_shape, &red_dst_shape);
-    const TBlob in_data = inputs[layernorm::kData].reshape(red_src_shape);
-    const TBlob mean_data = outputs[layernorm::kMean].reshape(red_dst_shape);
-    const TBlob std_data = outputs[layernorm::kStd].reshape(red_dst_shape);
-    const int outter_size = red_dst_shape.Size();
-    const int channel_size = red_src_shape.Size() / red_dst_shape.Size();
-
-    // call
-    MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
-      mkl_func::LayerNormLastDim(outter_size, channel_size,
-                                 in_data.dptr<DType>(),
-                                 outputs[layernorm::kOut].dptr<DType>(),
-                                 inputs[layernorm::kGamma].dptr<DType>(),
-                                 inputs[layernorm::kBeta].dptr<DType>(),
-                                 outputs[layernorm::kMean].dptr<DType>(),
-                                 outputs[layernorm::kStd].dptr<DType>(),
-                                 static_cast<DType>(param.eps));
-    });
-    return true;
-  } else {
-    // fallback
-    return false;
-  }
-}
-#endif
-
 template<>
 void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx, const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
-#if MSHADOW_USE_MKL == 1
-  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
-#endif
   if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
   LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
 }

From 40d3326f4e1f2ca95470f1b4b5e13bcee56dcb1b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 4 Dec 2020 14:11:03 +0000
Subject: [PATCH 3/9] Enable pragma omp simd on MSVC

---
 CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3479387d7c3..1f16b16dcfe9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -447,6 +447,16 @@ if(USE_OPENMP)
   if(OPENMP_FOUND)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    # Enable pragma omp simd
+    # "While the name of this switch is 'experimental', the switch itself, and
+    # the functionality it enables is fully supported and production-ready.
+    # The name reflects that it doesn't enable any complete subset or
+    # version of an OpenMP standard."
+    # -- https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
+    if(MSVC)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
+    endif()
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     add_definitions(-DMXNET_USE_OPENMP=1)

From 3605226bc03c14239fc1df93d753bba410c1318d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 7 Dec 2020 10:08:12 +0000
Subject: [PATCH 4/9] Fix MSVC error C3016: 'j': index variable in OpenMP
 'for' statement must have signed integral type

---
 src/operator/nn/layer_norm.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 5d3095058861..6b9efabdb3b9 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -130,8 +130,10 @@ void LayerNormCPUKernel(size_t width,
                         Data *mean,
                         Data *std) {
   // Parallelize over independent instances to normalize.
+  // MSVC says index variable in OpenMP 'for' statement must have signed integral type.
+  const ssize_t signed_instances = static_cast<ssize_t>(instances);
 #pragma omp parallel for
-  for (size_t j = 0; j < instances; ++j) {
+  for (ssize_t j = 0; j < signed_instances; ++j) {
     const Data *from = data + j * width;

     // Sum the values to compute mean.

From dcb61aaf323d916fd094b49b114bf83fad535692 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 7 Dec 2020 12:24:39 +0000
Subject: [PATCH 5/9] Try to make MSVC happy since it doesn't have ssize_t

---
 src/operator/nn/layer_norm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 6b9efabdb3b9..fdb5dc09ae51 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -131,9 +131,9 @@ void LayerNormCPUKernel(size_t width,
                         Data *std) {
   // Parallelize over independent instances to normalize.
   // MSVC says index variable in OpenMP 'for' statement must have signed integral type.
-  const ssize_t signed_instances = static_cast<ssize_t>(instances);
+  const mshadow::index_t signed_instances = static_cast<mshadow::index_t>(instances);
 #pragma omp parallel for
-  for (ssize_t j = 0; j < signed_instances; ++j) {
+  for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
     const Data *from = data + j * width;

     // Sum the values to compute mean.

From a11dc7e3428693586a660c0f5838ee92899d4b6b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 7 Dec 2020 14:32:37 +0000
Subject: [PATCH 6/9] Change gcc 8 PPA to ppa:jonathonf/gcc

---
 ci/docker/install/ubuntu_gcc8.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci/docker/install/ubuntu_gcc8.sh b/ci/docker/install/ubuntu_gcc8.sh
index cd31f8213c1a..7488aaecd4ad 100755
--- a/ci/docker/install/ubuntu_gcc8.sh
+++ b/ci/docker/install/ubuntu_gcc8.sh
@@ -17,7 +17,6 @@
 # specific language governing permissions and limitations
 # under the License.

-sudo add-apt-repository ppa:jonathonf/gcc-8.0
-sudo add-apt-repository ppa:jonathonf/gcc-7.3
+sudo add-apt-repository ppa:jonathonf/gcc
 sudo apt-get update || true
 sudo apt-get install -y gcc-8 g++-8

From 2d2a91e82040a93a2aa31455e3b50c0a0110867d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 21 Dec 2020 11:50:42 +0000
Subject: [PATCH 7/9] Option to use MKL version requested by @samskalicky

---
 CMakeLists.txt                |  4 +++
 src/operator/nn/layer_norm.cc | 48 +++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f16b16dcfe9..c4b37bb8a2ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT
 option(USE_F16C "Build with x86 F16C instruction support" ON)  # autodetects support if ON
 option(USE_LAPACK "Build with lapack support" ON)
 option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently slower than internal.  No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
 if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
   option(USE_MKLDNN "Build with MKL-DNN support" ON)
 else()
@@ -279,6 +280,9 @@ if(ENABLE_TESTCOVERAGE)
   link_libraries(gcov)
 endif()

+if(USE_MKL_LAYERNORM)
+  add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
+endif()
 if(USE_MKLDNN)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(MSVC)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index fdb5dc09ae51..9c60e08b01f5 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -206,12 +206,60 @@ bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
   return true;
 }

+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+  if (req[0] == kNullOp) return true;
+  CHECK_NE(req[0], kAddTo);
+  CHECK_EQ(inputs.size(), 3U);
+  int axis = GetRealAxis(param.axis, inputs[0].ndim());
+
+  // This optimization only applies to LayerNorm on the last dimension with dtype FP32 or FP64.
+  if (axis == (inputs[layernorm::kData].ndim() - 1) &&
+      (inputs[0].type_flag_ == kFloat32 || inputs[0].type_flag_ == kFloat64)) {
+    // Compute necessary data for the reduce operation.
+    mxnet::TShape red_src_shape, red_dst_shape;
+    BroadcastReduceShapeCompact(inputs[layernorm::kData].shape_, outputs[layernorm::kMean].shape_,
+                                &red_src_shape, &red_dst_shape);
+    const TBlob in_data = inputs[layernorm::kData].reshape(red_src_shape);
+    const TBlob mean_data = outputs[layernorm::kMean].reshape(red_dst_shape);
+    const TBlob std_data = outputs[layernorm::kStd].reshape(red_dst_shape);
+    const int outter_size = red_dst_shape.Size();
+    const int channel_size = red_src_shape.Size() / red_dst_shape.Size();
+
+    // call
+    MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
+      mkl_func::LayerNormLastDim(outter_size, channel_size,
+                                 in_data.dptr<DType>(),
+                                 outputs[layernorm::kOut].dptr<DType>(),
+                                 inputs[layernorm::kGamma].dptr<DType>(),
+                                 inputs[layernorm::kBeta].dptr<DType>(),
+                                 outputs[layernorm::kMean].dptr<DType>(),
+                                 outputs[layernorm::kStd].dptr<DType>(),
+                                 static_cast<DType>(param.eps));
+    });
+    return true;
+  } else {
+    // fallback
+    return false;
+  }
+}
+#endif
+
 template<>
 void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx, const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
   LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
 }

From e5093eb1b79373efb25e14b435700617a2098073 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 21 Dec 2020 13:00:48 +0000
Subject: [PATCH 8/9] Fix order if MKL override is on

---
 src/operator/nn/layer_norm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 9c60e08b01f5..08bd1a3955ec 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -256,10 +256,10 @@ void LayerNormCompute(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx, const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
-  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
 #if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
   if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
 #endif
+  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
   LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
 }
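Note: whichever path `LayerNormCompute<cpu>` now dispatches to (MKL when built with USE_MKL_LAYERNORM, otherwise `LayerNormCPU`, with `LayerNormComputeGeneral` as the final fallback), the output should agree with a plain NumPy computation up to floating-point tolerance. A quick check along these lines (a sketch, not part of the patch; it reuses the hypothetical `layer_norm_reference` helper from the note after patch 1):

```python
import mxnet as mx
import numpy as np

data = np.random.uniform(-1.0, 1.0, size=(128, 256)).astype(np.float32)
gamma = np.random.uniform(size=(256,)).astype(np.float32)
beta = np.random.uniform(size=(256,)).astype(np.float32)

# Run the operator on the last axis, which is the case all three backends handle.
out = mx.nd.LayerNorm(mx.nd.array(data), mx.nd.array(gamma), mx.nd.array(beta),
                      axis=-1, eps=1e-5)
expected, _, _ = layer_norm_reference(data, gamma, beta, eps=1e-5)
np.testing.assert_allclose(out.asnumpy(), expected, rtol=1e-4, atol=1e-5)
```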
From a5665587e05a65a6b003b47e3cc98eeed7fa1f5d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 28 Dec 2020 13:18:33 +0000
Subject: [PATCH 9/9] Have CI test MKL layer norm in build_ubuntu_cpu_mkl

---
 Makefile                       | 5 +++++
 ci/docker/runtime_functions.sh | 1 +
 2 files changed, 6 insertions(+)

diff --git a/Makefile b/Makefile
index b332b9b5b7b2..d4e221d4bb61 100644
--- a/Makefile
+++ b/Makefile
@@ -178,6 +178,11 @@ ifeq ($(USE_MKLDNN), 1)
 	LIB_DEP += $(MKLDNNROOT)/lib/libdnnl.a
 endif

+# Use MKL's layernorm implementation.  Only has an impact if MKL is compiled in.
+ifeq ($(USE_MKL_LAYERNORM), 1)
+	CFLAGS += -DMXNET_USE_MKL_LAYERNORM=1
+endif
+
 # setup opencv
 ifeq ($(USE_OPENCV), 1)
 	CFLAGS += -DMXNET_USE_OPENCV=1

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 68499443e51c..60f1c289c0a7 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -501,6 +501,7 @@ build_ubuntu_cpu_mkl() {
         DEV=1                         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=mkl                  \
+        USE_MKL_LAYERNORM=1           \
         USE_TVM_OP=1                  \
         USE_MKLDNN=0                  \
         USE_INTEL_PATH=/opt/intel     \
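Note: only normalization along the last axis takes the `LayerNormCPU` fast path; for any other axis `LayerNormCPU` returns false and execution falls through to `LayerNormComputeGeneral`. A small probe in the spirit of the benchmark in patch 2 can make the difference visible (a sketch, not part of the patch; absolute numbers depend on the machine and OMP_NUM_THREADS):

```python
import mxnet as mx
import time

def time_axis(shape, axis, count=200):
    data = mx.nd.random_uniform(shape=shape, low=-1.0, high=1.0)
    # gamma/beta must match the length of the normalized axis.
    factors = mx.nd.random_uniform(shape=(shape[axis],))
    mx.nd.waitall()
    begin = time.time()
    for _ in range(count):
        out = mx.nd.LayerNorm(data, factors, factors, axis=axis)
        mx.nd.waitall()
    return (time.time() - begin) / count

# axis=1 (the last axis here) hits the optimized kernel; axis=0 falls back
# to the generic implementation.
print("last axis : {:.7f}".format(time_axis((4096, 512), axis=1)))
print("first axis: {:.7f}".format(time_axis((4096, 512), axis=0)))
```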