From 0d0dd526804989661244c7724b6cf9bcb85f1a0d Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Wed, 31 Oct 2018 21:15:02 +0800
Subject: [PATCH 1/6] Refactor L2_normalization

---
 src/operator/l2_normalization-inl.h | 59 +++++++++++++++++++----------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index d53e0c5caf98..2f86de3692fe 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -86,6 +86,7 @@ class L2NormalizationOp : public Operator {
     CHECK_EQ(out_data.size(), 2U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
+    auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));
@@ -94,13 +95,17 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> out = out_data[l2_normalization::kOut]
         .get_with_shape<xpu, 2, DType>(dshape, s);
       Tensor<xpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<xpu, 1, DType>(s);
-      norm = sumall_except_dim<0>(F<mxnet::op::mshadow_op::square>(data));
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
-          s, norm.size(0), norm.dptr_, norm.dptr_, DType(param_.eps));
-      });
-      norm = F<mxnet::op::mshadow_op::square_root>(norm);
-      out = data / broadcast<0>(norm, out.shape_);
+#pragma omp parallel for num_threads(omp_threads)
+      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+        norm[shape0] = DType(param_.eps);
+        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
+        }
+        norm[shape0] = std::sqrt(norm[shape0]);
+        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
+        }
+      }
     } else if (param_.mode == l2_normalization::kChannel) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -112,13 +117,19 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[2]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 1);
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
-          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
-      });
-      norm = F<mxnet::op::mshadow_op::square_root>(norm);
-      out = data / broadcast_with_axis(norm, 0, orig_shape[1]);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+        for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+          norm[shape0][shape2] = DType(param_.eps);
+          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+            norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
+          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
+          }
+        }
+      }
     } else if (param_.mode == l2_normalization::kSpatial) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -130,13 +141,19 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[1]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 2);
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
-          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
-      });
-      norm = F<mxnet::op::mshadow_op::square_root>(norm);
-      out = data / broadcast_with_axis(norm, 1, dshape[2]);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          norm[shape0][shape1] = DType(param_.eps);
+          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+            norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
+          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
+          }
+        }
+      }
     } else {
       LOG(FATAL) << "Unexpected mode in l2 normalization";
     }
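Per instance i, the loops introduced above compute norm[i] = sqrt(eps + sum_j data[i][j]^2) and then out[i][j] = data[i][j] / norm[i]; seeding the accumulator with eps reproduces the old expression-template path, which likewise added eps before taking the square root. A minimal standalone sketch of the instance-mode kernel (the function name and the raw-pointer, row-major layout are illustrative assumptions, not the operator's real interface):

#include <cmath>
#include <cstddef>

template <typename DType>
void l2_normalize_rows(const DType* data, DType* out, DType* norm,
                       std::size_t rows, std::size_t cols, DType eps) {
#pragma omp parallel for
  for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(rows); ++i) {
    DType acc = eps;  // eps goes inside the sqrt, as in the patch
    for (std::size_t j = 0; j < cols; ++j)
      acc += data[i * cols + j] * data[i * cols + j];
    norm[i] = std::sqrt(acc);
    for (std::size_t j = 0; j < cols; ++j)
      out[i * cols + j] = data[i * cols + j] / norm[i];
  }
}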
From f0ba8bffa385fcdd5d927e2260b21fbf27a6d0a4 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 09:05:53 +0800
Subject: [PATCH 2/6] Fix windows build

---
 src/operator/l2_normalization-inl.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index 2f86de3692fe..90edec2c95a0 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -35,6 +35,11 @@
 #include "./operator_common.h"
 #include "./mshadow_op.h"
 
+/* VisualStudio only supports openmp 2.0 */
+#ifdef _MSC_VER
+#define collapse(x)
+#endif
+
 namespace mxnet {
 namespace op {
 
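The workaround relies on the fact that tokens following #pragma omp are subject to macro replacement, so with collapse(x) defined to nothing under _MSC_VER the clause simply disappears and the directive degrades to a plain parallel for over the outer loop, which OpenMP 2.0 accepts. A minimal sketch of the effect (the function and names are illustrative):

/* VisualStudio only supports openmp 2.0 */
#ifdef _MSC_VER
#define collapse(x)  /* the clause vanishes after preprocessing */
#endif

void scale(float* a, int n, int m) {
#pragma omp parallel for collapse(2)  /* MSVC compiles this as: parallel for */
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      a[i * m + j] *= 2.0f;
}

The cost is that MSVC parallelizes only the outermost loop, while compilers with OpenMP 3.0 or later still fuse both loops into one iteration space.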
From 3e28a054adfd100948fe239da548b62c57812ba0 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 09:42:38 +0800
Subject: [PATCH 3/6] Fix windows build

---
 src/operator/l2_normalization-inl.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index 90edec2c95a0..74287659627c 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -101,13 +101,13 @@ class L2NormalizationOp : public Operator {
         .get_with_shape<xpu, 2, DType>(dshape, s);
       Tensor<xpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<xpu, 1, DType>(s);
 #pragma omp parallel for num_threads(omp_threads)
-      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
         norm[shape0] = DType(param_.eps);
-        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
           norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
         }
         norm[shape0] = std::sqrt(norm[shape0]);
-        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
           out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
         }
       }
@@ -123,14 +123,14 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
 #pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
-        for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
           norm[shape0][shape2] = DType(param_.eps);
-          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
             norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
           }
           norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
-          for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
             out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
           }
         }
@@ -147,14 +147,14 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
 #pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (size_t shape0 = 0; shape0 < dshape[0]; shape0++) {
-        for (size_t shape1 = 0; shape1 < dshape[1]; shape1++) {
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
           norm[shape0][shape1] = DType(param_.eps);
-          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
             norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
           }
           norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
-          for (size_t shape2 = 0; shape2 < dshape[2]; shape2++) {
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
             out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
           }
         }
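This second Windows fix is needed because OpenMP 2.0, the highest version MSVC implements, requires the index variable of a parallelized for loop to have signed integral type, so the size_t counters become int with the shape bounds cast explicitly. A minimal illustration (the function is hypothetical):

#include <cstddef>

void halve(float* a, std::size_t n) {
  // MSVC with /openmp rejects an unsigned induction variable:
  //   #pragma omp parallel for
  //   for (std::size_t i = 0; i < n; ++i) a[i] *= 0.5f;
#pragma omp parallel for
  for (int i = 0; i < static_cast<int>(n); ++i)  // signed index: accepted
    a[i] *= 0.5f;
}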
From 393ec379d52aca17903dbfe61d4dfe793c347dc1 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 10:32:04 +0800
Subject: [PATCH 4/6] Move cpu optimization into l2_normalization.cc

---
 src/operator/l2_normalization-inl.h |  66 ++++++------------
 src/operator/l2_normalization.cc    | 102 +++++++++++++++++++++++++++-
 2 files changed, 122 insertions(+), 46 deletions(-)

diff --git a/src/operator/l2_normalization-inl.h b/src/operator/l2_normalization-inl.h
index 74287659627c..c7e71424ada9 100644
--- a/src/operator/l2_normalization-inl.h
+++ b/src/operator/l2_normalization-inl.h
@@ -35,11 +35,6 @@
 #include "./operator_common.h"
 #include "./mshadow_op.h"
 
-/* VisualStudio only supports openmp 2.0 */
-#ifdef _MSC_VER
-#define collapse(x)
-#endif
-
 namespace mxnet {
 namespace op {
 
@@ -91,7 +86,6 @@ class L2NormalizationOp : public Operator {
     CHECK_EQ(out_data.size(), 2U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
-    auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
     if (param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));
@@ -100,17 +94,13 @@ class L2NormalizationOp : public Operator {
       Tensor<xpu, 2, DType> out = out_data[l2_normalization::kOut]
         .get_with_shape<xpu, 2, DType>(dshape, s);
       Tensor<xpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<xpu, 1, DType>(s);
-#pragma omp parallel for num_threads(omp_threads)
-      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
-        norm[shape0] = DType(param_.eps);
-        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-          norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
-        }
-        norm[shape0] = std::sqrt(norm[shape0]);
-        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-          out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
-        }
-      }
+      norm = sumall_except_dim<0>(F<mxnet::op::mshadow_op::square>(data));
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+          s, norm.size(0), norm.dptr_, norm.dptr_, DType(param_.eps));
+      });
+      norm = F<mxnet::op::mshadow_op::square_root>(norm);
+      out = data / broadcast<0>(norm, out.shape_);
     } else if (param_.mode == l2_normalization::kChannel) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -122,19 +112,13 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[2]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-#pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
-        for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
-          norm[shape0][shape2] = DType(param_.eps);
-          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-            norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
-          }
-          norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
-          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
-          }
-        }
-      }
+      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 1);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
+      });
+      norm = F<mxnet::op::mshadow_op::square_root>(norm);
+      out = data / broadcast_with_axis(norm, 0, orig_shape[1]);
     } else if (param_.mode == l2_normalization::kSpatial) {
       CHECK_GE(orig_shape.ndim(), 3U);
       Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
@@ -146,19 +130,13 @@ class L2NormalizationOp : public Operator {
       Shape<2> norm_shape = Shape2(dshape[0], dshape[1]);
       Tensor<xpu, 2, DType> norm = out_data[l2_normalization::kNorm]
         .get_with_shape<xpu, 2, DType>(norm_shape, s);
-#pragma omp parallel for num_threads(omp_threads) collapse(2)
-      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
-        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
-          norm[shape0][shape1] = DType(param_.eps);
-          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
-            norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
-          }
-          norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
-          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
-            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
-          }
-        }
-      }
+      norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(data), 2);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+          s, norm.size(0) * norm.size(1), norm.dptr_, norm.dptr_, DType(param_.eps));
+      });
+      norm = F<mxnet::op::mshadow_op::square_root>(norm);
+      out = data / broadcast_with_axis(norm, 1, dshape[2]);
     } else {
       LOG(FATAL) << "Unexpected mode in l2 normalization";
     }
@@ -238,7 +216,7 @@ class L2NormalizationOp : public Operator {
     }
   }
 
- private:
+ protected:
   L2NormalizationParam param_;
 };  // class L2NormalizationOp
 
diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index f2f485ae6d1b..6801a0a20576 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -23,13 +23,111 @@
 * \brief l2 normalization operator
 */
 #include "./l2_normalization-inl.h"
+
+/* VisualStudio only supports openmp 2.0 */
+#ifdef _MSC_VER
+#define collapse(x)
+#endif
+
 namespace mxnet {
 namespace op {
+
+template<typename DType>
+class L2NormalizationOpCPU : public L2NormalizationOp<cpu, DType> {
+ public:
+  explicit L2NormalizationOpCPU(L2NormalizationParam p)
+      : L2NormalizationOp<cpu, DType>(p) {}
+  void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data,
+               const std::vector<TBlob> &aux_args) override {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    if (req[l2_normalization::kOut] == kNullOp) return;
+    CHECK_EQ(req[l2_normalization::kOut], kWriteTo);
+    CHECK_EQ(in_data.size(), 1U);
+    CHECK_EQ(out_data.size(), 2U);
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    TShape orig_shape = in_data[l2_normalization::kData].shape_;
+    auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+    if (this->param_.mode == l2_normalization::kInstance) {
+      Shape<2> dshape = Shape2(orig_shape[0],
+        orig_shape.ProdShape(1, orig_shape.ndim()));
+      Tensor<cpu, 2, DType> data = in_data[l2_normalization::kData]
+        .get_with_shape<cpu, 2, DType>(dshape, s);
+      Tensor<cpu, 2, DType> out = out_data[l2_normalization::kOut]
+        .get_with_shape<cpu, 2, DType>(dshape, s);
+      Tensor<cpu, 1, DType> norm = out_data[l2_normalization::kNorm].get<cpu, 1, DType>(s);
+#pragma omp parallel for num_threads(omp_threads)
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        norm[shape0] = DType(this->param_.eps);
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+          norm[shape0] += data[shape0][shape1] * data[shape0][shape1];
+        }
+        norm[shape0] = std::sqrt(norm[shape0]);
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+          out[shape0][shape1] = data[shape0][shape1] / norm[shape0];
+        }
+      }
+    } else if (this->param_.mode == l2_normalization::kChannel) {
+      CHECK_GE(orig_shape.ndim(), 3U);
+      Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
+        orig_shape.ProdShape(2, orig_shape.ndim()));
+      Tensor<cpu, 3, DType> data = in_data[l2_normalization::kData]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Tensor<cpu, 3, DType> out = out_data[l2_normalization::kOut]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Shape<2> norm_shape = Shape2(dshape[0], dshape[2]);
+      Tensor<cpu, 2, DType> norm = out_data[l2_normalization::kNorm]
+        .get_with_shape<cpu, 2, DType>(norm_shape, s);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
+          norm[shape0][shape2] = DType(this->param_.eps);
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+            norm[shape0][shape2] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape2] = std::sqrt(norm[shape0][shape2]);
+          for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape2];
+          }
+        }
+      }
+    } else if (this->param_.mode == l2_normalization::kSpatial) {
+      CHECK_GE(orig_shape.ndim(), 3U);
+      Shape<3> dshape = Shape3(orig_shape[0], orig_shape[1],
+        orig_shape.ProdShape(2, orig_shape.ndim()));
+      Tensor<cpu, 3, DType> data = in_data[l2_normalization::kData]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Tensor<cpu, 3, DType> out = out_data[l2_normalization::kOut]
+        .get_with_shape<cpu, 3, DType>(dshape, s);
+      Shape<2> norm_shape = Shape2(dshape[0], dshape[1]);
+      Tensor<cpu, 2, DType> norm = out_data[l2_normalization::kNorm]
+        .get_with_shape<cpu, 2, DType>(norm_shape, s);
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+      for (int shape0 = 0; shape0 < static_cast<int>(dshape[0]); shape0++) {
+        for (int shape1 = 0; shape1 < static_cast<int>(dshape[1]); shape1++) {
+          norm[shape0][shape1] = DType(this->param_.eps);
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
+            norm[shape0][shape1] += data[shape0][shape1][shape2] * data[shape0][shape1][shape2];
+          }
+          norm[shape0][shape1] = std::sqrt(norm[shape0][shape1]);
+          for (int shape2 = 0; shape2 < static_cast<int>(dshape[2]); shape2++) {
+            out[shape0][shape1][shape2] = data[shape0][shape1][shape2] / norm[shape0][shape1];
+          }
+        }
+      }
+    } else {
+      LOG(FATAL) << "Unexpected mode in l2 normalization";
+    }
+  }
+};
+
 template<>
 Operator* CreateOp<cpu>(L2NormalizationParam param, int dtype) {
   Operator* op = nullptr;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new L2NormalizationOp<cpu, DType>(param);
+    op = new L2NormalizationOpCPU<DType>(param);
   });
   return op;
 }
@@ -37,7 +135,7 @@ Operator* CreateOp<cpu>(L2NormalizationParam param, int dtype) {
 // DO_BIND_DISPATCH comes from static_operator_common.h
 Operator* L2NormalizationProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                                 std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+  DO_BIND_DISPATCH(CreateOp, this->param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(L2NormalizationParam);
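This patch restores the device-generic mshadow implementation in the templated L2NormalizationOp and moves the hand-written OpenMP loops into a CPU-only subclass in the .cc file, so the GPU build never sees the pragmas or the MSVC collapse macro; param_ becomes protected so the subclass can read it. Inside L2NormalizationOpCPU every access is spelled this->param_ because, in a class template, members of a dependent base class are not found by unqualified name lookup. A minimal illustration of that C++ rule (the types here are placeholders):

template <typename T>
struct Base {
 protected:
  int param_;
};

template <typename T>
struct Derived : Base<T> {
  int Get() {
    // return param_;     // error: param_ lives in a dependent base class
    return this->param_;  // OK: 'this->' defers lookup to instantiation
  }
};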
From 2e7570265ddc478465cd960f9ad825478c1ca2f9 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Thu, 1 Nov 2018 20:24:51 +0800
Subject: [PATCH 5/6] Retrigger CI

---
 src/operator/l2_normalization.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index 6801a0a20576..56702792297d 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -50,6 +50,7 @@ class L2NormalizationOpCPU : public L2NormalizationOp<cpu, DType> {
     Stream<cpu> *s = ctx.get_stream<cpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
     auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
     if (this->param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));

From 5c1e34511113f50db0299107d0818b558ccc54f8 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Mon, 5 Nov 2018 09:26:21 +0800
Subject: [PATCH 6/6] Retrigger CI

---
 src/operator/l2_normalization.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/l2_normalization.cc b/src/operator/l2_normalization.cc
index 56702792297d..6801a0a20576 100644
--- a/src/operator/l2_normalization.cc
+++ b/src/operator/l2_normalization.cc
@@ -50,7 +50,6 @@ class L2NormalizationOpCPU : public L2NormalizationOp<cpu, DType> {
     Stream<cpu> *s = ctx.get_stream<cpu>();
     TShape orig_shape = in_data[l2_normalization::kData].shape_;
     auto omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-
    if (this->param_.mode == l2_normalization::kInstance) {
       Shape<2> dshape = Shape2(orig_shape[0],
         orig_shape.ProdShape(1, orig_shape.ndim()));