diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3479387d7c3..c4b37bb8a2ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT
 option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON
 option(USE_LAPACK "Build with lapack support" ON)
 option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently slower than internal. No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
 if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
   option(USE_MKLDNN "Build with MKL-DNN support" ON)
 else()
@@ -279,6 +280,9 @@ if(ENABLE_TESTCOVERAGE)
   link_libraries(gcov)
 endif()
 
+if(USE_MKL_LAYERNORM)
+  add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
+endif()
 if(USE_MKLDNN)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(MSVC)
@@ -447,6 +451,16 @@ if(USE_OPENMP)
   if(OPENMP_FOUND)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    # Enable pragma omp simd
+    # "While the name of this switch is 'experimental', the switch itself, and
+    # the functionality it enables is fully supported and production-ready.
+    # The name reflects that it doesn’t enable any complete subset or
+    # version of an OpenMP standard."
+    # -- https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
+    if(MSVC)
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
+    endif()
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
     add_definitions(-DMXNET_USE_OPENMP=1)
diff --git a/LICENSE b/LICENSE
index 79ca84b2377a..e937dd75c0e3 100644
--- a/LICENSE
+++ b/LICENSE
@@ -248,6 +248,7 @@
     docs/python_docs/themes/mx-theme
     3rdparty/intgemm
     3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
+    src/operator/nn/layer_norm.cc
 
     =======================================================================================
     3-clause BSD license
diff --git a/Makefile b/Makefile
index b332b9b5b7b2..d4e221d4bb61 100644
--- a/Makefile
+++ b/Makefile
@@ -178,6 +178,11 @@ ifeq ($(USE_MKLDNN), 1)
 	LIB_DEP += $(MKLDNNROOT)/lib/libdnnl.a
 endif
 
+# Use MKL's layernorm implementation. Only has an impact if MKL is compiled in.
+ifeq ($(USE_MKL_LAYERNORM), 1)
+	CFLAGS += -DMXNET_USE_MKL_LAYERNORM=1
+endif
+
 # setup opencv
 ifeq ($(USE_OPENCV), 1)
 	CFLAGS += -DMXNET_USE_OPENCV=1
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 68499443e51c..60f1c289c0a7 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -501,6 +501,7 @@ build_ubuntu_cpu_mkl() {
         DEV=1 \
         USE_CPP_PACKAGE=1 \
         USE_BLAS=mkl \
+        USE_MKL_LAYERNORM=1 \
        USE_TVM_OP=1 \
        USE_MKLDNN=0 \
        USE_INTEL_PATH=/opt/intel \
diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 11178b358c2d..08bd1a3955ec 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -15,6 +15,37 @@
  * KIND, either express or implied. See the License for the
  * specific language governing permissions and limitations
  * under the License.
+ *
+ * Function LayerNormCPUKernel is adapted from Marian
+ * https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
+ * under the MIT license
+ * MIT License
+ *
+ * Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
+ * Mickiewicz University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
  */
 
 /*!
@@ -27,10 +58,6 @@
 #include <nnvm/op_attr_types.h>
 #include "../elemwise_op_common.h"
-#if MSHADOW_USE_MKL == 1
-#include "../mkl_functions-inl.h"
-#endif
-
 namespace mxnet {
 namespace op {
@@ -68,23 +95,126 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-template<>
-void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
-                           const OpContext& ctx, const std::vector<TBlob>& inputs,
-                           const std::vector<OpReqType>& req,
-                           const std::vector<TBlob>& outputs) {
-  return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+/* CPU optimized kernel for LayerNorm assuming axis = -1.
+ * Data is the underlying storage data type.
+ * Accum is the type to use for accumulation.
+ * Apparently there isn't a reduction operator for half_t and anyway it isn't
+ * efficient to use on the CPU, so use float for reduction of half_t.
+ *
+ * width is the number of values being summed to compute a mean.
+ * instances is how many independent layer normalization problems are packed into the tensors.
+ *
+ * Inputs:
+ * data is instances x width
+ * gamma is width
+ * beta is width
+ *
+ * Outputs:
+ * out is instances x width, can be same as data
+ * mean is instances: means of each problem
+ * std is instances: standard deviation of each problem
+ *
+ */
+template <typename Data,
+          typename Accum = typename
+            std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
+                             float,
+                             Data>::type>
+void LayerNormCPUKernel(size_t width,
+                        size_t instances,
+                        Data eps,
+                        const Data *data,
+                        const Data *gamma,
+                        const Data *beta,
+                        Data *out,
+                        Data *mean,
+                        Data *std) {
+  // Parallelize over independent instances to normalize.
+  // MSVC says index variable in OpenMP 'for' statement must have signed integral type.
+  const mshadow::index_t signed_instances = static_cast<mshadow::index_t>(instances);
+#pragma omp parallel for
+  for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
+    const Data *from = data + j * width;
+
+    // Sum the values to compute mean.
+    Accum sum = 0.f;
+#pragma omp simd reduction(+ : sum)
+    for (size_t i = 0; i < width; ++i) {
+      sum += from[i];
+    }
+    Accum mean_value = sum / width;
+    mean[j] = static_cast<Data>(mean_value);
+
+    // Sum squares from mean to compute stddev.
+    Accum squares = 0.f;
+#pragma omp simd reduction(+ : squares)
+    for (size_t i = 0; i < width; ++i) {
+      Accum off = from[i] - mean_value;
+      squares += off * off;
+    }
+    Accum sigma = std::sqrt(squares / width + eps);
+    std[j] = static_cast<Data>(sigma);
+
+    // Write normalized values.
+    Data *to = out + j * width;
+#pragma omp simd
+    for (size_t i = 0; i < width; ++i) {
+      to[i] = (from[i] - mean_value) * gamma[i] / sigma + beta[i];
+    }
+  }
 }
 
-#if MSHADOW_USE_MKL == 1
-void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+/* Wrap the above LayerNormCPUKernel in MXNet's API. Returns true if it
+ * is able to run.
+ */
+bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
+                  const OpContext& ctx, const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs) {
+  const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 3U);
+
+  switch (req[layernorm::kOut]) {
+    case kNullOp:
+      return true;
+    case kWriteTo:
+      break;
+    case kWriteInplace:
+      break;
+    default:
+      // Should only be kAddTo, which isn't supported by the other implementations either.
+      return false;
+  }
+  // Axis must be the last one.
+  int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
+  if (axis != inputs[layernorm::kData].ndim() - 1) {
+    return false;
+  }
+  MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
+    LayerNormCPUKernel<DType>(
+      inputs[layernorm::kData].shape_[axis],
+      outputs[layernorm::kMean].Size(),
+      param.eps,
+      inputs[layernorm::kData].dptr<DType>(),
+      inputs[layernorm::kGamma].dptr<DType>(),
+      inputs[layernorm::kBeta].dptr<DType>(),
+      outputs[layernorm::kOut].dptr<DType>(),
+      outputs[layernorm::kMean].dptr<DType>(),
+      outputs[layernorm::kStd].dptr<DType>());
+  });
+  return true;
+}
+
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                          const OpContext& ctx, const std::vector<TBlob>& inputs,
                          const std::vector<OpReqType>& req,
                          const std::vector<TBlob>& outputs) {
   using namespace mshadow;
   const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
-  if (req[0] == kNullOp) return;
+  if (req[0] == kNullOp) return true;
   CHECK_NE(req[0], kAddTo);
   CHECK_EQ(inputs.size(), 3U);
   int axis = GetRealAxis(param.axis, inputs[0].ndim());
@@ -113,13 +243,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
                                  outputs[layernorm::kStd].dptr<DType>(),
                                  static_cast<DType>(param.eps));
     });
+    return true;
   } else {
     // fallback
-    LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+    return false;
   }
 }
 #endif
 
+template<>
+void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+  if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
+  if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+  LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+}
+
 template<>
 void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
@@ -175,11 +317,7 @@ axis to be the last item in the input shape.
 })
 .set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
-#else
 .set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
                                            const std::vector<nnvm::NodeEntry>& ograds) {
   std::vector<nnvm::NodeEntry> heads;
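
Reviewer note (illustrative, not part of the patch): per row, LayerNormCPUKernel computes the mean, then a standard deviation with eps added under the square root, and finally writes (x - mean) * gamma / std + beta. The following is a minimal, single-threaded C++ sketch of that same computation for comparison; the helper name reference_layer_norm is hypothetical and the buffers are plain float arrays.

// Single-threaded reference for the per-row layer norm computed above.
// Illustrative only; assumes float storage and biased variance (divide by width).
#include <cmath>
#include <cstddef>

void reference_layer_norm(std::size_t width, std::size_t instances, float eps,
                          const float* data, const float* gamma, const float* beta,
                          float* out, float* mean, float* std_out) {
  for (std::size_t j = 0; j < instances; ++j) {
    const float* from = data + j * width;
    // Mean of the row.
    float sum = 0.f;
    for (std::size_t i = 0; i < width; ++i) sum += from[i];
    const float mu = sum / width;
    mean[j] = mu;
    // Biased standard deviation with eps inside the square root.
    float squares = 0.f;
    for (std::size_t i = 0; i < width; ++i) {
      const float off = from[i] - mu;
      squares += off * off;
    }
    const float sigma = std::sqrt(squares / width + eps);
    std_out[j] = sigma;
    // Scale and shift.
    float* to = out + j * width;
    for (std::size_t i = 0; i < width; ++i)
      to[i] = (from[i] - mu) * gamma[i] / sigma + beta[i];
  }
}

With the same width, instances, eps, and float inputs, its output should agree with the OpenMP kernel in the patch up to floating-point rounding.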