Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT
option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON
option(USE_LAPACK "Build with lapack support" ON)
option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently slower than internal. No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
option(USE_MKLDNN "Build with MKL-DNN support" ON)
else()
Expand Down Expand Up @@ -279,6 +280,9 @@ if(ENABLE_TESTCOVERAGE)
link_libraries(gcov)
endif()

if(USE_MKL_LAYERNORM)
add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
endif()
if(USE_MKLDNN)
# CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
if(MSVC)
Expand Down Expand Up @@ -447,6 +451,16 @@ if(USE_OPENMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
# Enable pragma omp simd
# "While the name of this switch is 'experimental', the switch itself, and
# the functionality it enables is fully supported and production-ready.
# The name reflects that it doesn’t enable any complete subset or
# version of an OpenMP standard."
# -- https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
if(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
endif()
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
add_definitions(-DMXNET_USE_OPENMP=1)
Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@
docs/python_docs/themes/mx-theme
3rdparty/intgemm
3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
src/operator/nn/layer_norm.cc

=======================================================================================
3-clause BSD license
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,11 @@ ifeq ($(USE_MKLDNN), 1)
LIB_DEP += $(MKLDNNROOT)/lib/libdnnl.a
endif

# Use MKL's layernorm implementation. Only has an impact if MKL is compiled in.
ifeq ($(USE_MKL_LAYERNORM), 1)
CFLAGS += -DMXNET_USE_MKL_LAYERNORM=1
endif

# setup opencv
ifeq ($(USE_OPENCV), 1)
CFLAGS += -DMXNET_USE_OPENCV=1
Expand Down
1 change: 1 addition & 0 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ build_ubuntu_cpu_mkl() {
DEV=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=mkl \
USE_MKL_LAYERNORM=1 \
USE_TVM_OP=1 \
USE_MKLDNN=0 \
USE_INTEL_PATH=/opt/intel \
Expand Down
174 changes: 156 additions & 18 deletions src/operator/nn/layer_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,37 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
 * Function LayerNormCPUKernel is adapted from Marian
* https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
* under the MIT license
* MIT License
*
* Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
* Mickiewicz University
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* All or part of this file was contributed by Intel under license:
* Copyright (C) 2017-2018 Intel Corporation
* SPDX-License-Identifier: MIT
*
*/

/*!
Expand All @@ -27,10 +58,6 @@
#include <nnvm/op_attr_types.h>
#include "../elemwise_op_common.h"

#if MSHADOW_USE_MKL == 1
#include "../mkl_functions-inl.h"
#endif

namespace mxnet {
namespace op {

Expand Down Expand Up @@ -68,23 +95,126 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
return true;
}

template<>
void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
/* CPU optimized kernel for LayerNorm assuming axis = -1.
* Data is the underlying storage data type.
* Accum is the type to use for accumulation.
 * There is no reduction operator for half_t and, in any case, half precision
 * is not efficient to accumulate in on the CPU, so float is used to reduce half_t.
*
* width is the number of values being summed to compute a mean.
* instances is how many independent layer normalization problems are packed into the tensors.
*
* Inputs:
* data is instances x width
* gamma is width
* beta is width
*
* Outputs:
* out is instances x width, can be same as data
* mean is instances: means of each problem
* std is instances: standard deviation of each problem
*
*/
template <typename Data, typename Accum = typename
/* By default accumulate in float32 for float16. Otherwise use same type. */
std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
float,
Data>::type>
void LayerNormCPUKernel(size_t width,
size_t instances,
Data eps,
const Data *data,
const Data *gamma,
const Data *beta,
Data *out,
Data *mean,
Data *std) {
// Parallelize over independent instances to normalize.
// MSVC says index variable in OpenMP 'for' statement must have signed integral type.
const mshadow::index_t signed_instances = static_cast<mshadow::index_t>(instances);
#pragma omp parallel for
for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
const Data *from = data + j * width;

// Sum the values to compute mean.
Accum sum = 0.f;
#pragma omp simd reduction(+ : sum)
for (size_t i = 0; i < width; ++i) {
sum += from[i];
}
Accum mean_value = sum / width;
mean[j] = static_cast<Data>(mean_value);

// Sum squares from mean to compute stddev.
Accum squares = 0.f;
#pragma omp simd reduction(+ : squares)
for (size_t i = 0; i < width; ++i) {
Accum off = from[i] - mean_value;
squares += off * off;
}
Accum sigma = std::sqrt(squares / width + eps);
std[j] = static_cast<Data>(sigma);

// Write normalized values.
Data *to = out + j * width;
#pragma omp simd
for (size_t i = 0; i < width; ++i) {
to[i] = (from[i] - mean_value) * gamma[i] / sigma + beta[i];
}
}
}

#if MSHADOW_USE_MKL == 1
void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
/* Wrap the above LayerNormCPUKernel in MXNet's operator API.  Returns true
 * if it was able to run (i.e. the request type and axis are supported);
 * callers fall back to another implementation when it returns false.
 */
bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
                  const OpContext& ctx, const std::vector<TBlob>& inputs,
                  const std::vector<OpReqType>& req,
                  const std::vector<TBlob>& outputs) {
  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
  CHECK_EQ(inputs.size(), 3U);
  CHECK_EQ(outputs.size(), 3U);

  const OpReqType out_req = req[layernorm::kOut];
  if (out_req == kNullOp) {
    // No output requested: nothing to compute.
    return true;
  }
  if (out_req != kWriteTo && out_req != kWriteInplace) {
    // Should only be kAddTo, which isn't supported by the other implementations either.
    return false;
  }
  // This kernel only handles normalization over the last axis.
  const int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
  if (axis != inputs[layernorm::kData].ndim() - 1) {
    return false;
  }
  MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
    LayerNormCPUKernel<DType>(
        inputs[layernorm::kData].shape_[axis],
        outputs[layernorm::kMean].Size(),
        param.eps,
        inputs[layernorm::kData].dptr<DType>(),
        inputs[layernorm::kGamma].dptr<DType>(),
        inputs[layernorm::kBeta].dptr<DType>(),
        outputs[layernorm::kOut].dptr<DType>(),
        outputs[layernorm::kMean].dptr<DType>(),
        outputs[layernorm::kStd].dptr<DType>());
  });
  return true;
}

#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
if (req[0] == kNullOp) return;
if (req[0] == kNullOp) return true;
CHECK_NE(req[0], kAddTo);
CHECK_EQ(inputs.size(), 3U);
int axis = GetRealAxis(param.axis, inputs[0].ndim());
Expand Down Expand Up @@ -113,13 +243,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
outputs[layernorm::kStd].dptr<DType>(),
static_cast<DType>(param.eps));
});
return true;
} else {
// fallback
LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
return false;
}
}
#endif

template<>
void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
                           const OpContext& ctx, const std::vector<TBlob>& inputs,
                           const std::vector<OpReqType>& req,
                           const std::vector<TBlob>& outputs) {
#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
  // Prefer MKL when compiled in; it declines (returns false) on unsupported cases.
  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) {
    return;
  }
#endif
  // Optimized CPU kernel handles the common last-axis case; otherwise use
  // the generic implementation.
  if (!LayerNormCPU(attrs, ctx, inputs, req, outputs)) {
    LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
  }
}

template<>
void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
Expand Down Expand Up @@ -175,11 +317,7 @@ axis to be the last item in the input shape.
})
.set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
#if MSHADOW_USE_MKL == 1
.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
#else
.set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
#endif
.set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>& ograds) {
std::vector<nnvm::NodeEntry> heads;
Expand Down