diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3479387d7c3..c4b37bb8a2ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT
 option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON
 option(USE_LAPACK "Build with lapack support" ON)
 option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently slower than internal. No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
 if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
   option(USE_MKLDNN "Build with MKL-DNN support" ON)
 else()
@@ -279,6 +280,9 @@ if(ENABLE_TESTCOVERAGE)
   link_libraries(gcov)
 endif()
 
+if(USE_MKL_LAYERNORM)
+  add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
+endif()
 if(USE_MKLDNN)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(MSVC)
@@ -447,6 +451,16 @@ if(USE_OPENMP)
   if(OPENMP_FOUND)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    # Enable pragma omp simd
+    # "While the name of this switch is 'experimental', the switch itself, and
+    # the functionality it enables is fully supported and production-ready.
+    # The name reflects that it doesn’t enable any complete subset or
+    # version of an OpenMP standard."
+    # -- https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
+    if(MSVC)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
+    endif()
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     add_definitions(-DMXNET_USE_OPENMP=1)
diff --git a/LICENSE b/LICENSE
index 79ca84b2377a..e937dd75c0e3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -248,6 +248,7 @@
     docs/python_docs/themes/mx-theme
     3rdparty/intgemm
     3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
+    src/operator/nn/layer_norm.cc
 
     =======================================================================================
     3-clause BSD license
diff --git a/Makefile b/Makefile
index b332b9b5b7b2..d4e221d4bb61 100644
--- a/Makefile
+++ b/Makefile
@@ -178,6 +178,11 @@ ifeq ($(USE_MKLDNN), 1)
 	LIB_DEP += $(MKLDNNROOT)/lib/libdnnl.a
 endif
 
+# Use MKL's layernorm implementation. Only has an impact if MKL is compiled in.
+ifeq ($(USE_MKL_LAYERNORM), 1)
+	CFLAGS += -DMXNET_USE_MKL_LAYERNORM=1
+endif
+
 # setup opencv
 ifeq ($(USE_OPENCV), 1)
 	CFLAGS += -DMXNET_USE_OPENCV=1
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 68499443e51c..60f1c289c0a7 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -501,6 +501,7 @@ build_ubuntu_cpu_mkl() {
         DEV=1 \
         USE_CPP_PACKAGE=1 \
         USE_BLAS=mkl \
+        USE_MKL_LAYERNORM=1 \
        USE_TVM_OP=1 \
        USE_MKLDNN=0 \
        USE_INTEL_PATH=/opt/intel \
diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 11178b358c2d..08bd1a3955ec 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -15,6 +15,37 @@
  * KIND, either express or implied. See the License for the
  * specific language governing permissions and limitations
  * under the License.
+ *
+ * Function LayerNormCPUKernel is adapted from Marian
+ * https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
+ * under the MIT license
+ * MIT License
+ *
+ * Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
+ * Mickiewicz University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
  */
 
 /*!
@@ -27,10 +58,6 @@
 #include <nnvm/op_attr_types.h>
 #include "../elemwise_op_common.h"
-#if MSHADOW_USE_MKL == 1
-#include "../mkl_functions-inl.h"
-#endif
-
 namespace mxnet {
 namespace op {
@@ -68,23 +95,126 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-template<>
-void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+/* CPU optimized kernel for LayerNorm assuming axis = -1.
+ * Data is the underlying storage data type.
+ * Accum is the type to use for accumulation.
+ * Apparently there isn't a reduction operator for half_t and anyway it isn't
+ * efficient to use on the CPU, so use float for reduction of half_t.
+ *
+ * width is the number of values being summed to compute a mean.
+ * instances is how many independent layer normalization problems are packed into the tensors.
+ *
+ * Inputs:
+ * data is instances x width
+ * gamma is width
+ * beta is width
+ *
+ * Outputs:
+ * out is instances x width, can be same as data
+ * mean is instances: means of each problem
+ * std is instances: standard deviation of each problem
+ *
+ */
+template <typename Data,
+          typename Accum = typename
+            std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
+                             float,
+                             Data>::type>
+void LayerNormCPUKernel(size_t width,
+                        size_t instances,
+                        Data eps,
+                        const Data *data,
+                        const Data *gamma,
+                        const Data *beta,
+                        Data *out,
+                        Data *mean,
+                        Data *std) {
+  // Parallelize over independent instances to normalize.
+  // MSVC says index variable in OpenMP 'for' statement must have signed integral type.
+  const mshadow::index_t signed_instances = static_cast<mshadow::index_t>(instances);
+#pragma omp parallel for
+  for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
+    const Data *from = data + j * width;
+
+    // Sum the values to compute mean.
+    Accum sum = 0.f;
+#pragma omp simd reduction(+ : sum)
+    for (size_t i = 0; i < width; ++i) {
+      sum += from[i];
+    }
+    Accum mean_value = sum / width;
+    mean[j] = static_cast<Data>(mean_value);
+
+    // Sum squares from mean to compute stddev.
+    Accum squares = 0.f;
+#pragma omp simd reduction(+ : squares)
+    for (size_t i = 0; i < width; ++i) {
+      Accum off = from[i] - mean_value;
+      squares += off * off;
+    }
+    Accum sigma = std::sqrt(squares / width + eps);
+    std[j] = static_cast<Data>(sigma);
+
+    // Write normalized values.
+    Data *to = out + j * width;
+#pragma omp simd
+    for (size_t i = 0; i < width; ++i) {
+      to[i] = (from[i] - mean_value) * gamma[i] / sigma + beta[i];
+    }
+  }
 }
 
-#if MSHADOW_USE_MKL == 1
-void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+/* Wrap the above LayerNormCPUKernel in MXNet's API. Returns true if it
+ * is able to run.
+ */
+bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
+                  const OpContext& ctx, const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs) {
+  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 3U);
+
+  switch (req[layernorm::kOut]) {
+    case kNullOp:
+      return true;
+    case kWriteTo:
+      break;
+    case kWriteInplace:
+      break;
+    default:
+      // Should only be kAddTo, which isn't supported by the other implementations either.
+      return false;
+  }
+  // Axis must be the last one.
+  int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
+  if (axis != inputs[layernorm::kData].ndim() - 1) {
+    return false;
+  }
+  MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
+    LayerNormCPUKernel<DType>(
+      inputs[layernorm::kData].shape_[axis],
+      outputs[layernorm::kMean].Size(),
+      param.eps,
+      inputs[layernorm::kData].dptr<DType>(),
+      inputs[layernorm::kGamma].dptr<DType>(),
+      inputs[layernorm::kBeta].dptr<DType>(),
+      outputs[layernorm::kOut].dptr<DType>(),
+      outputs[layernorm::kMean].dptr<DType>(),
+      outputs[layernorm::kStd].dptr<DType>());
+  });
+  return true;
+}
+
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                          const OpContext& ctx, const std::vector<TBlob>& inputs,
                          const std::vector<OpReqType>& req,
                          const std::vector<TBlob>& outputs) {
   using namespace mshadow;
   const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
-  if (req[0] == kNullOp) return;
+  if (req[0] == kNullOp) return true;
   CHECK_NE(req[0], kAddTo);
   CHECK_EQ(inputs.size(), 3U);
   int axis = GetRealAxis(param.axis, inputs[0].ndim());
@@ -113,13 +243,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                                  outputs[layernorm::kStd].dptr<DType>(),
                                  static_cast<DType>(param.eps));
     });
+    return true;
   } else {
     // fallback
-    LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+    return false;
   }
 }
 #endif
 
+template<>
+void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
+  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+  LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+}
+
 template<>
 void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
@@ -175,11 +317,7 @@ axis to be the last item in the input shape.
 })
 .set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
-#else
 .set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
                                            const std::vector<nnvm::NodeEntry>& ograds) {
   std::vector<nnvm::NodeEntry> heads;
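
Reviewer note (illustrative, not part of the patch): per row, LayerNormCPUKernel computes the mean, then a standard deviation with eps added under the square root, and finally writes (x - mean) * gamma / std + beta. The following is a minimal, single-threaded C++ sketch of that same computation for comparison; the helper name reference_layer_norm is hypothetical and the buffers are plain float arrays.

// Single-threaded reference for the per-row layer norm computed above.
// Illustrative only; assumes float storage and biased variance (divide by width).
#include <cmath>
#include <cstddef>

void reference_layer_norm(std::size_t width, std::size_t instances, float eps,
                          const float* data, const float* gamma, const float* beta,
                          float* out, float* mean, float* std_out) {
  for (std::size_t j = 0; j < instances; ++j) {
    const float* from = data + j * width;
    // Mean of the row.
    float sum = 0.f;
    for (std::size_t i = 0; i < width; ++i) sum += from[i];
    const float mu = sum / width;
    mean[j] = mu;
    // Biased standard deviation with eps inside the square root.
    float squares = 0.f;
    for (std::size_t i = 0; i < width; ++i) {
      const float off = from[i] - mu;
      squares += off * off;
    }
    const float sigma = std::sqrt(squares / width + eps);
    std_out[j] = sigma;
    // Scale and shift.
    float* to = out + j * width;
    for (std::size_t i = 0; i < width; ++i)
      to[i] = (from[i] - mu) * gamma[i] / sigma + beta[i];
  }
}

With the same width, instances, eps, and float inputs, its output should agree with the OpenMP kernel in the patch up to floating-point rounding.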