From 2f87167f70ca6bd268b47744c2cf57689093c8d4 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 30 Nov 2020 14:34:03 +0000
Subject: [PATCH 1/9] Layer normalization code from Marian

---
 LICENSE                       |   2 +
 src/operator/nn/layer_norm.cc | 164 +++++++++++++++++++++++++++++++---
 2 files changed, 154 insertions(+), 12 deletions(-)

diff --git a/LICENSE b/LICENSE
index d9c398334c26..610bb5fdfca0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -309,6 +309,8 @@
     (Except 3rdparty/intgemm/test/3rd_party/catch.hpp is under the Boost license, listed separately)
     builtin_fp16.h - For details, see 3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
     Copyright (c) 2009-2015 by llvm/compiler-rt contributors
+    LayerNormCPUKernel from Marian - For details, see src/operator/nn/layer_norm.cc
+    Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam Mickiewicz University

 =======================================================================================
 3-clause BSD license

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 11178b358c2d..79f2fe6addef 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -15,6 +15,37 @@
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
+ *
+ * Function LayerNormCPUKernel is adapted from Marian
+ * https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
+ * under the MIT license
+ * MIT License
+ *
+ * Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
+ * Mickiewicz University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
  */

 /*!
@@ -68,16 +99,117 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
   return true;
 }

-template<>
-void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+/* CPU optimized kernel for LayerNorm assuming axis = -1.
+ * Data is the underlying storage data type.
+ * Accum is the type to use for accumulation.
+ *   There isn't a reduction operator for half_t, and half_t isn't efficient to
+ *   use on the CPU anyway, so use float for reduction of half_t.
+ *
+ * width is the number of values being summed to compute a mean.
+ * instances is how many independent layer normalization problems are packed into the tensors.
+ *
+ * Inputs:
+ * data is instances x width
+ * gamma is width
+ * beta is width
+ *
+ * Outputs:
+ * out is instances x width, can be the same as data
+ * mean is instances: the mean of each problem
+ * std is instances: the standard deviation of each problem
+ */
+template <typename Data,
+          typename Accum = typename
+            std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
+                             float,
+                             Data>::type>
+void LayerNormCPUKernel(size_t width,
+                        size_t instances,
+                        Data eps,
+                        const Data *data,
+                        const Data *gamma,
+                        const Data *beta,
+                        Data *out,
+                        Data *mean,
+                        Data *std) {
+  // Parallelize over independent instances to normalize.
+#pragma omp parallel for
+  for (size_t j = 0; j < instances; ++j) {
+    const Data *from = data + j * width;
+
+    // Sum the values to compute mean.
+    Accum sum = 0.f;
+#pragma omp simd reduction(+ : sum)
+    for (size_t i = 0; i < width; ++i) {
+      sum += from[i];
+    }
+    Accum mean_value = sum / width;
+    mean[j] = static_cast<Data>(mean_value);
+
+    // Sum squares from mean to compute stddev.
+    Accum squares = 0.f;
+#pragma omp simd reduction(+ : squares)
+    for (size_t i = 0; i < width; ++i) {
+      Accum off = from[i] - mean_value;
+      squares += off * off;
+    }
+    Accum sigma = std::sqrt(squares / width + eps);
+    std[j] = static_cast<Data>(sigma);
+
+    // Write normalized values.
+    Data *to = out + j * width;
+#pragma omp simd
+    for (size_t i = 0; i < width; ++i) {
+      to[i] = (from[i] - mean_value) * gamma[i] / sigma + beta[i];
+    }
+  }
+}
+
+/* Wrap the above LayerNormCPUKernel in MXNet's API.  Returns true if it
+ * is able to run.
+ */
+bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
+                  const OpContext& ctx, const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs) {
+  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 3U);
+
+  switch (req[layernorm::kOut]) {
+    case kNullOp:
+      return true;
+    case kWriteTo:
+      break;
+    case kWriteInplace:
+      break;
+    default:
+      // Should only be kAddTo, which isn't supported by the other implementations either.
+      return false;
+  }
+  // Axis must be the last one.
+  int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
+  if (axis != inputs[layernorm::kData].ndim() - 1) {
+    return false;
+  }
+  MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
+    LayerNormCPUKernel<DType>(
+        inputs[layernorm::kData].shape_[axis],
+        outputs[layernorm::kMean].Size(),
+        param.eps,
+        inputs[layernorm::kData].dptr<DType>(),
+        inputs[layernorm::kGamma].dptr<DType>(),
+        inputs[layernorm::kBeta].dptr<DType>(),
+        outputs[layernorm::kOut].dptr<DType>(),
+        outputs[layernorm::kMean].dptr<DType>(),
+        outputs[layernorm::kStd].dptr<DType>());
+  });
+  return true;
 }

 #if MSHADOW_USE_MKL == 1
-void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                          const OpContext& ctx,
                          const std::vector<TBlob>& inputs,
                          const std::vector<OpReqType>& req,
                          const std::vector<TBlob>& outputs) {
@@ -113,13 +245,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                                outputs[layernorm::kStd].dptr<DType>(),
                                static_cast<DType>(param.eps));
     });
+    return true;
   } else {
     // fallback
-    LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+    return false;
   }
 }
 #endif

+template<>
+void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+#if MSHADOW_USE_MKL == 1
+  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
+  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+  LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+}

 template<>
 void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
@@ -175,11 +319,7 @@ axis to be the last item in the input shape.
 })
 .set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
-#else
 .set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
                                            const std::vector<nnvm::NodeEntry>& ograds) {
   std::vector<nnvm::NodeEntry> heads;
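Note: the arithmetic implemented by `LayerNormCPUKernel` above can be sketched in NumPy for reference (a minimal sketch, not part of the patch; the helper name `layer_norm_reference` is made up here). In particular, `eps` is added inside the square root, matching `std::sqrt(squares / width + eps)` in the kernel:

```python
import numpy as np

def layer_norm_reference(data, gamma, beta, eps=1e-5):
    # Per-instance mean and standard deviation over the last axis (width).
    mean = data.mean(axis=-1, keepdims=True)
    # eps sits inside the square root, as in the kernel above.
    std = np.sqrt(((data - mean) ** 2).mean(axis=-1, keepdims=True) + eps)
    out = (data - mean) * gamma / std + beta
    return out, mean.squeeze(-1), std.squeeze(-1)
```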
From 95efe8fcb0e79821c8751f754c2ca8896b15a5a9 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 30 Nov 2020 17:07:58 +0000
Subject: [PATCH 2/9] Remove MKL version of LayerNorm.

Experiment with OMP_NUM_THREADS=4, times in seconds, on a c5.12xlarge:

|batch x channel| New code | MKL      |
|    1x   32 | 0.0000288| 0.0000278|
|  128x   32 | 0.0000308| 0.0000311|
| 2560x   32 | 0.0000712| 0.0000672|
| 4096x   32 | 0.0000946| 0.0000910|
| 8192x   32 | 0.0001597| 0.0001523|
|16384x   32 | 0.0002905| 0.0002619|
|    1x   64 | 0.0000264| 0.0000256|
|  128x   64 | 0.0000339| 0.0000330|
| 2560x   64 | 0.0000829| 0.0000972|
| 4096x   64 | 0.0001137| 0.0001356|
| 8192x   64 | 0.0002027| 0.0002435|
|16384x   64 | 0.0003715| 0.0004639|
|    1x  128 | 0.0000262| 0.0000263|
|  128x  128 | 0.0000325| 0.0000389|
| 2560x  128 | 0.0001074| 0.0001580|
| 4096x  128 | 0.0001505| 0.0002336|
| 8192x  128 | 0.0002861| 0.0004481|
|16384x  128 | 0.0005648| 0.0008613|
|    1x  256 | 0.0000273| 0.0000276|
|  128x  256 | 0.0000390| 0.0000431|
| 2560x  256 | 0.0001533| 0.0002811|
| 4096x  256 | 0.0002258| 0.0004300|
| 8192x  256 | 0.0004300| 0.0008464|
|16384x  256 | 0.0010436| 0.0017613|
|    1x  512 | 0.0000256| 0.0000302|
|  128x  512 | 0.0000408| 0.0000551|
| 2560x  512 | 0.0002444| 0.0005225|
| 4096x  512 | 0.0003828| 0.0008147|
| 8192x  512 | 0.0008832| 0.0017192|
|16384x  512 | 0.0058463| 0.0074497|
|    1x  768 | 0.0000252| 0.0000308|
|  128x  768 | 0.0000450| 0.0000676|
| 2560x  768 | 0.0003440| 0.0007719|
| 4096x  768 | 0.0005890| 0.0013346|
| 8192x  768 | 0.0014946| 0.0026145|
|16384x  768 | 0.0089495| 0.0113557|
|    1x 1024 | 0.0000285| 0.0000308|
|  128x 1024 | 0.0000487| 0.0000786|
| 2560x 1024 | 0.0004614| 0.0010190|
| 4096x 1024 | 0.0008083| 0.0017376|
| 8192x 1024 | 0.0059020| 0.0075588|
|16384x 1024 | 0.0116553| 0.0146855|

Benchmark program:

```python
import mxnet as mx
import time

def time_procedure(shape, count):
  data = mx.nd.random_uniform(shape=shape, low=-1.0, high = 1.0)
  factors = mx.nd.random_uniform(shape=(shape[-1],))
  mx.nd.waitall()
  begin = time.time()
  for i in range(0, count):
    out = mx.nd.LayerNorm(data, factors, factors)
    mx.nd.waitall()
  return (time.time() - begin) / count

count = 200
for channel in [32, 64, 128, 256, 512, 768, 1024]:
  for batch in [1, 128, 2560, 4096, 8192, 16384]:
    s = (batch, channel)
    timing = time_procedure(s, count)
    print("{:5d}x{:5d} | {:.7f}".format(s[0], s[1], timing))
```
---
 src/operator/nn/layer_norm.cc | 52 -----------------------------------
 1 file changed, 52 deletions(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 79f2fe6addef..5d3095058861 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -58,10 +58,6 @@
 #include <nnvm/op_attr_types.h>
 #include "../elemwise_op_common.h"

-#if MSHADOW_USE_MKL == 1
-#include "../mkl_functions-inl.h"
-#endif
-
 namespace mxnet {
 namespace op {

@@ -208,59 +204,11 @@ bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
   return true;
 }

-#if MSHADOW_USE_MKL == 1
-bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
-                         const OpContext& ctx,
-                         const std::vector<TBlob>& inputs,
-                         const std::vector<OpReqType>& req,
-                         const std::vector<TBlob>& outputs) {
-  using namespace mshadow;
-  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
-  if (req[0] == kNullOp) return;
-  CHECK_NE(req[0], kAddTo);
-  CHECK_EQ(inputs.size(), 3U);
-  int axis = GetRealAxis(param.axis, inputs[0].ndim());
-
-  // This optimization only applys for LayerNorm on the last dimension with dtype FP32 or FP64.
-  if (axis == (inputs[layernorm::kData].ndim() - 1) &&
-      (inputs[0].type_flag_ == kFloat32 || inputs[0].type_flag_ == kFloat64)) {
-    // Compute necessary data for the reduce operation.
-    mxnet::TShape red_src_shape, red_dst_shape;
-    BroadcastReduceShapeCompact(inputs[layernorm::kData].shape_, outputs[layernorm::kMean].shape_,
-                                &red_src_shape, &red_dst_shape);
-    const TBlob in_data = inputs[layernorm::kData].reshape(red_src_shape);
-    const TBlob mean_data = outputs[layernorm::kMean].reshape(red_dst_shape);
-    const TBlob std_data = outputs[layernorm::kStd].reshape(red_dst_shape);
-    const int outter_size = red_dst_shape.Size();
-    const int channel_size = red_src_shape.Size() / red_dst_shape.Size();
-
-    // call
-    MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
-      mkl_func::LayerNormLastDim(outter_size, channel_size,
-                                 in_data.dptr<DType>(),
-                                 outputs[layernorm::kOut].dptr<DType>(),
-                                 inputs[layernorm::kGamma].dptr<DType>(),
-                                 inputs[layernorm::kBeta].dptr<DType>(),
-                                 outputs[layernorm::kMean].dptr<DType>(),
-                                 outputs[layernorm::kStd].dptr<DType>(),
-                                 static_cast<DType>(param.eps));
-    });
-    return true;
-  } else {
-    // fallback
-    return false;
-  }
-}
-#endif
-
 template<>
 void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx, const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
-#if MSHADOW_USE_MKL == 1
-  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
-#endif
   if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
   LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
 }

From 40d3326f4e1f2ca95470f1b4b5e13bcee56dcb1b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Fri, 4 Dec 2020 14:11:03 +0000
Subject: [PATCH 3/9] Enable pragma omp simd on MSVC

---
 CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3479387d7c3..1f16b16dcfe9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -447,6 +447,16 @@ if(USE_OPENMP)
   if(OPENMP_FOUND)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    # Enable pragma omp simd
+    # "While the name of this switch is 'experimental', the switch itself, and
+    # the functionality it enables is fully supported and production-ready.
+    # The name reflects that it doesn't enable any complete subset or
+    # version of an OpenMP standard."
+    # -- https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
+    if(MSVC)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
+    endif()
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     add_definitions(-DMXNET_USE_OPENMP=1)

From 3605226bc03c14239fc1df93d753bba410c1318d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 7 Dec 2020 10:08:12 +0000
Subject: [PATCH 4/9] Fix MSVC error C3016: 'j': index variable in OpenMP
 'for' statement must have signed integral type

---
 src/operator/nn/layer_norm.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 5d3095058861..6b9efabdb3b9 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -130,8 +130,10 @@ void LayerNormCPUKernel(size_t width,
                         Data *mean,
                         Data *std) {
   // Parallelize over independent instances to normalize.
+  // MSVC says index variable in OpenMP 'for' statement must have signed integral type.
+  const ssize_t signed_instances = static_cast<ssize_t>(instances);
 #pragma omp parallel for
-  for (size_t j = 0; j < instances; ++j) {
+  for (ssize_t j = 0; j < signed_instances; ++j) {
     const Data *from = data + j * width;

     // Sum the values to compute mean.

From dcb61aaf323d916fd094b49b114bf83fad535692 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 7 Dec 2020 12:24:39 +0000
Subject: [PATCH 5/9] Try to make MSVC happy since it doesn't have ssize_t

---
 src/operator/nn/layer_norm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 6b9efabdb3b9..fdb5dc09ae51 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -131,9 +131,9 @@ void LayerNormCPUKernel(size_t width,
                         Data *std) {
   // Parallelize over independent instances to normalize.
   // MSVC says index variable in OpenMP 'for' statement must have signed integral type.
-  const ssize_t signed_instances = static_cast<ssize_t>(instances);
+  const mshadow::index_t signed_instances = static_cast<mshadow::index_t>(instances);
 #pragma omp parallel for
-  for (ssize_t j = 0; j < signed_instances; ++j) {
+  for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
     const Data *from = data + j * width;

     // Sum the values to compute mean.

From a11dc7e3428693586a660c0f5838ee92899d4b6b Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 7 Dec 2020 14:32:37 +0000
Subject: [PATCH 6/9] Change gcc 8 PPA to ppa:jonathonf/gcc

---
 ci/docker/install/ubuntu_gcc8.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ci/docker/install/ubuntu_gcc8.sh b/ci/docker/install/ubuntu_gcc8.sh
index cd31f8213c1a..7488aaecd4ad 100755
--- a/ci/docker/install/ubuntu_gcc8.sh
+++ b/ci/docker/install/ubuntu_gcc8.sh
@@ -17,7 +17,6 @@
 # specific language governing permissions and limitations
 # under the License.

-sudo add-apt-repository ppa:jonathonf/gcc-8.0
-sudo add-apt-repository ppa:jonathonf/gcc-7.3
+sudo add-apt-repository ppa:jonathonf/gcc
 sudo apt-get update || true
 sudo apt-get install -y gcc-8 g++-8

From 2d2a91e82040a93a2aa31455e3b50c0a0110867d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 21 Dec 2020 11:50:42 +0000
Subject: [PATCH 7/9] Option to use MKL version requested by @samskalicky

---
 CMakeLists.txt                |  4 +++
 src/operator/nn/layer_norm.cc | 48 +++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1f16b16dcfe9..c4b37bb8a2ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT
 option(USE_F16C "Build with x86 F16C instruction support" ON)  # autodetects support if ON
 option(USE_LAPACK "Build with lapack support" ON)
 option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently slower than internal.  No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
 if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
   option(USE_MKLDNN "Build with MKL-DNN support" ON)
 else()
@@ -279,6 +280,9 @@ if(ENABLE_TESTCOVERAGE)
   link_libraries(gcov)
 endif()

+if(USE_MKL_LAYERNORM)
+  add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
+endif()
 if(USE_MKLDNN)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(MSVC)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index fdb5dc09ae51..9c60e08b01f5 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -206,12 +206,60 @@ bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
   return true;
 }

+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+  if (req[0] == kNullOp) return true;
+  CHECK_NE(req[0], kAddTo);
+  CHECK_EQ(inputs.size(), 3U);
+  int axis = GetRealAxis(param.axis, inputs[0].ndim());
+
+  // This optimization only applies to LayerNorm on the last dimension with dtype FP32 or FP64.
+  if (axis == (inputs[layernorm::kData].ndim() - 1) &&
+      (inputs[0].type_flag_ == kFloat32 || inputs[0].type_flag_ == kFloat64)) {
+    // Compute necessary data for the reduce operation.
+    mxnet::TShape red_src_shape, red_dst_shape;
+    BroadcastReduceShapeCompact(inputs[layernorm::kData].shape_, outputs[layernorm::kMean].shape_,
+                                &red_src_shape, &red_dst_shape);
+    const TBlob in_data = inputs[layernorm::kData].reshape(red_src_shape);
+    const TBlob mean_data = outputs[layernorm::kMean].reshape(red_dst_shape);
+    const TBlob std_data = outputs[layernorm::kStd].reshape(red_dst_shape);
+    const int outter_size = red_dst_shape.Size();
+    const int channel_size = red_src_shape.Size() / red_dst_shape.Size();
+
+    // call
+    MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
+      mkl_func::LayerNormLastDim(outter_size, channel_size,
+                                 in_data.dptr<DType>(),
+                                 outputs[layernorm::kOut].dptr<DType>(),
+                                 inputs[layernorm::kGamma].dptr<DType>(),
+                                 inputs[layernorm::kBeta].dptr<DType>(),
+                                 outputs[layernorm::kMean].dptr<DType>(),
+                                 outputs[layernorm::kStd].dptr<DType>(),
+                                 static_cast<DType>(param.eps));
+    });
+    return true;
+  } else {
+    // fallback
+    return false;
+  }
+}
+#endif
+
 template<>
 void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx, const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
   if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
   LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
 }

From e5093eb1b79373efb25e14b435700617a2098073 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 21 Dec 2020 13:00:48 +0000
Subject: [PATCH 8/9] Fix order if MKL override is on

---
 src/operator/nn/layer_norm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 9c60e08b01f5..08bd1a3955ec 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -256,10 +256,10 @@ void LayerNormCompute(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx, const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
-  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
 #if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
   if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
 #endif
+  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
   LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
 }
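Note: whichever path `LayerNormCompute<cpu>` now dispatches to (MKL when built with USE_MKL_LAYERNORM, otherwise `LayerNormCPU`, with `LayerNormComputeGeneral` as the final fallback), the output should agree with a plain NumPy computation up to floating-point tolerance. A quick check along these lines (a sketch, not part of the patch; it reuses the hypothetical `layer_norm_reference` helper from the note after patch 1):

```python
import mxnet as mx
import numpy as np

data = np.random.uniform(-1.0, 1.0, size=(128, 256)).astype(np.float32)
gamma = np.random.uniform(size=(256,)).astype(np.float32)
beta = np.random.uniform(size=(256,)).astype(np.float32)

# Run the operator on the last axis, which is the case all three backends handle.
out = mx.nd.LayerNorm(mx.nd.array(data), mx.nd.array(gamma), mx.nd.array(beta),
                      axis=-1, eps=1e-5)
expected, _, _ = layer_norm_reference(data, gamma, beta, eps=1e-5)
np.testing.assert_allclose(out.asnumpy(), expected, rtol=1e-4, atol=1e-5)
```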
From a5665587e05a65a6b003b47e3cc98eeed7fa1f5d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Mon, 28 Dec 2020 13:18:33 +0000
Subject: [PATCH 9/9] Have CI test MKL layer norm in build_ubuntu_cpu_mkl

---
 Makefile                       | 5 +++++
 ci/docker/runtime_functions.sh | 1 +
 2 files changed, 6 insertions(+)

diff --git a/Makefile b/Makefile
index b332b9b5b7b2..d4e221d4bb61 100644
--- a/Makefile
+++ b/Makefile
@@ -178,6 +178,11 @@ ifeq ($(USE_MKLDNN), 1)
 	LIB_DEP += $(MKLDNNROOT)/lib/libdnnl.a
 endif

+# Use MKL's layernorm implementation.  Only has an impact if MKL is compiled in.
+ifeq ($(USE_MKL_LAYERNORM), 1)
+	CFLAGS += -DMXNET_USE_MKL_LAYERNORM=1
+endif
+
 # setup opencv
 ifeq ($(USE_OPENCV), 1)
 	CFLAGS += -DMXNET_USE_OPENCV=1

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 68499443e51c..60f1c289c0a7 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -501,6 +501,7 @@ build_ubuntu_cpu_mkl() {
         DEV=1                         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=mkl                  \
+        USE_MKL_LAYERNORM=1           \
         USE_TVM_OP=1                  \
         USE_MKLDNN=0                  \
         USE_INTEL_PATH=/opt/intel     \
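Note: only normalization along the last axis takes the `LayerNormCPU` fast path; for any other axis `LayerNormCPU` returns false and execution falls through to `LayerNormComputeGeneral`. A small probe in the spirit of the benchmark in patch 2 can make the difference visible (a sketch, not part of the patch; absolute numbers depend on the machine and OMP_NUM_THREADS):

```python
import mxnet as mx
import time

def time_axis(shape, axis, count=200):
    data = mx.nd.random_uniform(shape=shape, low=-1.0, high=1.0)
    # gamma/beta must match the length of the normalized axis.
    factors = mx.nd.random_uniform(shape=(shape[axis],))
    mx.nd.waitall()
    begin = time.time()
    for _ in range(count):
        out = mx.nd.LayerNorm(data, factors, factors, axis=axis)
        mx.nd.waitall()
    return (time.time() - begin) / count

# axis=1 (the last axis here) hits the optimized kernel; axis=0 falls back
# to the generic implementation.
print("last axis : {:.7f}".format(time_axis((4096, 512), axis=1)))
print("first axis: {:.7f}".format(time_axis((4096, 512), axis=0)))
```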