From 0d0dd526804989661244c7724b6cf9bcb85f1a0d Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Wed, 31 Oct 2018 21:15:02 +0800
Subject: [PATCH 1/6] Refactor L2_normalization

---
 src/operator/l2_normalization-inl.h | 59 +++++++++++++++++++----------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index d53e0c5caf98..2f86de3692fe 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -86,6 +86,7 @@ class L2NormalizationOp : public Operator {
     CHECK_EQ(out_data.size(), 2U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
+    auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));
@@ -94,13 +95,17 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> out = out_data[l2_normalization::kOut]
         .get_with_shape<xpu, 2, DType>(dshape, s);
       Tensor<xpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<xpu, 1, DType>(s);
-      norm = sumall_except_dim<0>(F<mxnet::op::mshadow_op::square>(data));
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
-          s, norm.size(0), norm.dptr_, norm.dptr_, DType(param_.eps));
-      });
-      norm = F<mxnet::op::mshadow_op::square_root>(norm);
-      out = data / broadcast<0>(norm, out.shape_);
+#pragma omp parallel for num_threads(omp_threads)
+      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+        norm[shape0] = DType(param_.eps);
+        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
+        }
+        norm[shape0] = std::sqrt(norm[shape0]);
+        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
+        }
+      }
     } else if (param_.mode == l2_normalization::kChannel) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -112,13 +117,19 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[2]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 1);
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
-          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
-      });
-      norm = F<mxnet::op::mshadow_op::square_root>(norm);
-      out = data / broadcast_with_axis(norm, 0, orig_shape[1]);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+        for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+          norm[shape0][shape2] = DType(param_.eps);
+          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+            norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
+          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
+          }
+        }
+      }
     } else if (param_.mode == l2_normalization::kSpatial) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -130,13 +141,19 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[1]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 2);
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
-          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
-      });
-      norm = F<mxnet::op::mshadow_op::square_root>(norm);
-      out = data / broadcast_with_axis(norm, 1, dshape[2]);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          norm[shape0][shape1] = DType(param_.eps);
+          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+            norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
+          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
+          }
+        }
+      }
     } else {
       LOG(FATAL) << "Unexpected mode in l2 normalization";
     }
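Per instance i, the loops introduced above compute norm[i] = sqrt(eps + sum_j data[i][j]^2) and then out[i][j] = data[i][j] / norm[i]; seeding the accumulator with eps reproduces the old expression-template path, which likewise added eps before taking the square root. A minimal standalone sketch of the instance-mode kernel (the function name and the raw-pointer, row-major layout are illustrative assumptions, not the operator's real interface):

#include <cmath>
#include <cstddef>

template <typename DType>
void l2_normalize_rows(const DType* data, DType* out, DType* norm,
                       std::size_t rows, std::size_t cols, DType eps) {
#pragma omp parallel for
  for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(rows); ++i) {
    DType acc = eps;  // eps goes inside the sqrt, as in the patch
    for (std::size_t j = 0; j < cols; ++j)
      acc += data[i * cols + j] * data[i * cols + j];
    norm[i] = std::sqrt(acc);
    for (std::size_t j = 0; j < cols; ++j)
      out[i * cols + j] = data[i * cols + j] / norm[i];
  }
}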
From f0ba8bffa385fcdd5d927e2260b21fbf27a6d0a4 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 09:05:53 +0800
Subject: [PATCH 2/6] Fix windows build

---
 src/operator/l2_normalization-inl.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index 2f86de3692fe..90edec2c95a0 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -35,6 +35,11 @@
 #include "./operator_common.h"
 #include "./mshadow_op.h"
 
+/* VisualStudio only supports openmp 2.0 */
+#ifdef _MSC_VER
+#define collapse(x)
+#endif
+
 namespace mxnet {
 namespace op {
 
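The workaround relies on the fact that tokens following #pragma omp are subject to macro replacement, so with collapse(x) defined to nothing under _MSC_VER the clause simply disappears and the directive degrades to a plain parallel for over the outer loop, which OpenMP 2.0 accepts. A minimal sketch of the effect (the function and names are illustrative):

/* VisualStudio only supports openmp 2.0 */
#ifdef _MSC_VER
#define collapse(x)  /* the clause vanishes after preprocessing */
#endif

void scale(float* a, int n, int m) {
#pragma omp parallel for collapse(2)  /* MSVC compiles this as: parallel for */
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      a[i * m + j] *= 2.0f;
}

The cost is that MSVC parallelizes only the outermost loop, while compilers with OpenMP 3.0 or later still fuse both loops into one iteration space.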
From 3e28a054adfd100948fe239da548b62c57812ba0 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 09:42:38 +0800
Subject: [PATCH 3/6] Fix windows build

---
 src/operator/l2_normalization-inl.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index 90edec2c95a0..74287659627c 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -101,13 +101,13 @@ class L2NormalizationOp : public Operator {
         .get_with_shape<xpu, 2, DType>(dshape, s);
       Tensor<xpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<xpu, 1, DType>(s);
 #pragma omp parallel for num_threads(omp_threads)
-      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
         norm[shape0] = DType(param_.eps);
-        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
           norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
         }
         norm[shape0] = std::sqrt(norm[shape0]);
-        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
           out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
         }
       }
@@ -123,14 +123,14 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
 #pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
-        for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
           norm[shape0][shape2] = DType(param_.eps);
-          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
             norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
           }
           norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
-          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
             out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
           }
         }
@@ -147,14 +147,14 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
 #pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
-        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
           norm[shape0][shape1] = DType(param_.eps);
-          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
             norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
           }
           norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
-          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
             out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
           }
         }
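This second Windows fix is needed because OpenMP 2.0, the highest version MSVC implements, requires the index variable of a parallelized for loop to have signed integral type, so the size_t counters become int with the shape bounds cast explicitly. A minimal illustration (the function is hypothetical):

#include <cstddef>

void halve(float* a, std::size_t n) {
  // MSVC with /openmp rejects an unsigned induction variable:
  //   #pragma omp parallel for
  //   for (std::size_t i = 0; i < n; ++i) a[i] *= 0.5f;
#pragma omp parallel for
  for (int i = 0; i < static_cast<int>(n); ++i)  // signed index: accepted
    a[i] *= 0.5f;
}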
From 393ec379d52aca17903dbfe61d4dfe793c347dc1 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 10:32:04 +0800
Subject: [PATCH 4/6] Move cpu optimization into l2_normalization.cc

---
 src/operator/l2_normalization-inl.h |  66 ++++++------------
 src/operator/l2_normalization.cc    | 102 +++++++++++++++++++++++++++-
 2 files changed, 122 insertions(+), 46 deletions(-)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index 74287659627c..c7e71424ada9 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -35,11 +35,6 @@
 #include "./operator_common.h"
 #include "./mshadow_op.h"
 
-/* VisualStudio only supports openmp 2.0 */
-#ifdef _MSC_VER
-#define collapse(x)
-#endif
-
 namespace mxnet {
 namespace op {
 
@@ -91,7 +86,6 @@ class L2NormalizationOp : public Operator {
     CHECK_EQ(out_data.size(), 2U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
-    auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));
@@ -100,17 +94,13 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> out = out_data[l2_normalization::kOut]
         .get_with_shape<xpu, 2, DType>(dshape, s);
       Tensor<xpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<xpu, 1, DType>(s);
-#pragma omp parallel for num_threads(omp_threads)
-      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
-        norm[shape0] = DType(param_.eps);
-        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-          norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
-        }
-        norm[shape0] = std::sqrt(norm[shape0]);
-        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-          out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
-        }
-      }
+      norm = sumall_except_dim<0>(F<mxnet::op::mshadow_op::square>(data));
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+          s, norm.size(0), norm.dptr_, norm.dptr_, DType(param_.eps));
+      });
+      norm = F<mxnet::op::mshadow_op::square_root>(norm);
+      out = data / broadcast<0>(norm, out.shape_);
     } else if (param_.mode == l2_normalization::kChannel) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -122,19 +112,13 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[2]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-#pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
-        for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
-          norm[shape0][shape2] = DType(param_.eps);
-          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-            norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
-          }
-          norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
-          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
-          }
-        }
-      }
+      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 1);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
+      });
+      norm = F<mxnet::op::mshadow_op::square_root>(norm);
+      out = data / broadcast_with_axis(norm, 0, orig_shape[1]);
     } else if (param_.mode == l2_normalization::kSpatial) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -146,19 +130,13 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[1]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-#pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
-        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-          norm[shape0][shape1] = DType(param_.eps);
-          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
-            norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
-          }
-          norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
-          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
-            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
-          }
-        }
-      }
+      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 2);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
+      });
+      norm = F<mxnet::op::mshadow_op::square_root>(norm);
+      out = data / broadcast_with_axis(norm, 1, dshape[2]);
     } else {
       LOG(FATAL) << "Unexpected mode in l2 normalization";
     }
@@ -238,7 +216,7 @@ class L2NormalizationOp : public Operator {
     }
   }
 
- private:
+ protected:
   L2NormalizationParam param_;
 };  // class L2NormalizationOp
 
diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index f2f485ae6d1b..6801a0a20576 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -23,13 +23,111 @@
 * \brief l2 normalization operator
 */
 #include "./l2_normalization-inl.h"
+
+/* VisualStudio only supports openmp 2.0 */
+#ifdef _MSC_VER
+#define collapse(x)
+#endif
+
 namespace mxnet {
 namespace op {
+
+template<typename DType>
+class L2NormalizationOpCPU : public L2NormalizationOp<cpu, DType> {
+ public:
+  explicit L2NormalizationOpCPU(L2NormalizationParam p)
+      : L2NormalizationOp<cpu, DType>(p) {}
+  void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data,
+               const std::vector<TBlob> &aux_args) override {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    if (req[l2_normalization::kOut] == kNullOp) return;
+    CHECK_EQ(req[l2_normalization::kOut], kWriteTo);
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 2U);
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    TShape orig_shape = in_data[l2_normalization::kData].shape_;
+    auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+    if (this->param_.mode == l2_normalization::kInstance) {
+      Shape<2> dshape = Shape2(orig_shape[0],
+        orig_shape.ProdShape(1, orig_shape.ndim()));
+      Tensor<cpu, 2, DType> data = in_data[l2_normalization::kData]
+        .get_with_shape<cpu, 2, DType>(dshape, s);
+      Tensor<cpu, 2, DType> out = out_data[l2_normalization::kOut]
+        .get_with_shape<cpu, 2, DType>(dshape, s);
+      Tensor<cpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<cpu, 1, DType>(s);
+#pragma omp parallel for num_threads(omp_threads)
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        norm[shape0] = DType(this->param_.eps);
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+          norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
+        }
+        norm[shape0] = std::sqrt(norm[shape0]);
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+          out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
+        }
+      }
+    } else if (this->param_.mode == l2_normalization::kChannel) {
+      CHECK_GE(orig_shape.ndim(), 3U);
+      Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
+        orig_shape.ProdShape(2, orig_shape.ndim()));
+      Tensor<cpu, 3, DType> data = in_data[l2_normalization::kData]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Tensor<cpu, 3, DType> out = out_data[l2_normalization::kOut]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Shape<2> norm_shape = Shape2(dshape[0], dshape[2]);
+      Tensor<cpu, 2, DType> norm = out_data[l2_normalization::kNorm]
+        .get_with_shape<cpu, 2, DType>(norm_shape, s);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
+          norm[shape0][shape2] = DType(this->param_.eps);
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+            norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
+          }
+        }
+      }
+    } else if (this->param_.mode == l2_normalization::kSpatial) {
+      CHECK_GE(orig_shape.ndim(), 3U);
+      Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
+        orig_shape.ProdShape(2, orig_shape.ndim()));
+      Tensor<cpu, 3, DType> data = in_data[l2_normalization::kData]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Tensor<cpu, 3, DType> out = out_data[l2_normalization::kOut]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Shape<2> norm_shape = Shape2(dshape[0], dshape[1]);
+      Tensor<cpu, 2, DType> norm = out_data[l2_normalization::kNorm]
+        .get_with_shape<cpu, 2, DType>(norm_shape, s);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+          norm[shape0][shape1] = DType(this->param_.eps);
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
+            norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
+          }
+        }
+      }
+    } else {
+      LOG(FATAL) << "Unexpected mode in l2 normalization";
+    }
+  }
+};
+
 template<>
 Operator* CreateOp<cpu>(L2NormalizationParam param, int dtype) {
   Operator* op = nullptr;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new L2NormalizationOp<cpu, DType>(param);
+    op = new L2NormalizationOpCPU<DType>(param);
   });
   return op;
 }
@@ -37,7 +135,7 @@ Operator* CreateOp<cpu>(L2NormalizationParam param, int dtype) {
 // DO_BIND_DISPATCH comes from static_operator_common.h
 Operator* L2NormalizationProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                                 std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+  DO_BIND_DISPATCH(CreateOp, this->param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(L2NormalizationParam);
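This patch restores the device-generic mshadow implementation in the templated L2NormalizationOp and moves the hand-written OpenMP loops into a CPU-only subclass in the .cc file, so the GPU build never sees the pragmas or the MSVC collapse macro; param_ becomes protected so the subclass can read it. Inside L2NormalizationOpCPU every access is spelled this->param_ because, in a class template, members of a dependent base class are not found by unqualified name lookup. A minimal illustration of that C++ rule (the types here are placeholders):

template <typename T>
struct Base {
 protected:
  int param_;
};

template <typename T>
struct Derived : Base<T> {
  int Get() {
    // return param_;     // error: param_ lives in a dependent base class
    return this->param_;  // OK: 'this->' defers lookup to instantiation
  }
};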
From 2e7570265ddc478465cd960f9ad825478c1ca2f9 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 20:24:51 +0800
Subject: [PATCH 5/6] Retrigger CI

---
 src/operator/l2_normalization.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index 6801a0a20576..56702792297d 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -50,6 +50,7 @@ class L2NormalizationOpCPU : public L2NormalizationOp<cpu, DType> {
     Stream<cpu> *s = ctx.get_stream<cpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
     auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
     if (this->param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));

From 5c1e34511113f50db0299107d0818b558ccc54f8 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Mon, 5 Nov 2018 09:26:21 +0800
Subject: [PATCH 6/6] Retrigger CI

---
 src/operator/l2_normalization.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index 56702792297d..6801a0a20576 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -50,7 +50,6 @@ class L2NormalizationOpCPU : public L2NormalizationOp<cpu, DType> {
     Stream<cpu> *s = ctx.get_stream<cpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
     auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-
    if (this->param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));