From ad38228e1b334bf657f4133b903febc42f0d74d2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Gawrych Date: Mon, 15 Nov 2021 09:07:35 +0100 Subject: [PATCH 1/7] Improve performance of take operator --- src/operator/tensor/indexing_op.cc | 94 ++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 30825415e481..4602640db94f 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -60,6 +60,46 @@ struct TakeZeroAxisCPU { } }; +template +struct TakeNonzeroAxisCPU { + /*! + * \brief Map function for take operator + * \param i global thread id + * \param out_data ptr to output buffer + * \param in_data ptr to input buffer + * \param idx ptr to indices buffer + * \param outer_dim_stride stride of dimension before axis + * \param axis_dim_stride stride of axis dimension + * \param idx_size size of the indices tensor + * \param axis_dim dim size of the axis dimension + * \param axis axis id + */ + template + MSHADOW_XINLINE static void Map(index_t i, + DType* out_data, + const DType* in_data, + const IType* indices, + const index_t outer_dim_stride, + const index_t axis_dim_stride, + const int idx_size, + const int axis_dim, + const int axis) { + for (index_t j = 0; j < static_cast(idx_size); ++j) { // 4 + int index = indices[j]; + if (clip) { + index = (index < 0) ? 0 : index; + index = (index > axis_dim - 1) ? (axis_dim - 1) : index; + } else { + index %= axis_dim; + index += (index < 0) ? axis_dim : 0; + } + size_t in_offset = i * outer_dim_stride + index * axis_dim_stride; + size_t out_offset = (i * idx_size + j) * axis_dim_stride; + memcpy(out_data + out_offset, in_data + in_offset, axis_dim_stride * sizeof(DType)); + } + } +}; + /* * \brief returns true if all indices are between [min, max] * \param data_ptr the indices to check @@ -323,6 +363,7 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { using namespace mxnet_op; + if (req[take_::kOut] == kNullOp) return; const TakeParam& param = nnvm::get(attrs.parsed); @@ -375,39 +416,32 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs, for (int i = arrshape.ndim() - 1; i >= 0; stride *= arrshape[i], --i) { in_strides[i] = stride; } - mshadow::Shape<10> out_strides; - stride = 1; - for (int i = oshape.ndim() - 1; i >= 0; stride *= oshape[i], --i) { - out_strides[i] = stride; + int outer_dimensions = 1; + for (int i = 0; i < actual_axis; i++) { + outer_dimensions *= oshape[i]; } if (param.mode == take_::kClip) { - Kernel, cpu>::Launch(s, - oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - out_strides[actual_axis - 1], - in_strides[actual_axis - 1], - in_strides[actual_axis], - arrshape.ndim(), - oshape.ndim(), - idxshape.ndim(), - arrshape[actual_axis], - actual_axis); + Kernel, cpu>::Launch(s, + outer_dimensions, + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + in_strides[actual_axis - 1], + in_strides[actual_axis], + idxshape.Size(), + arrshape[actual_axis], + actual_axis); } else { - Kernel, cpu>::Launch(s, - oshape.Size(), - outputs[take_::kOut].dptr(), - inputs[take_::kArr].dptr(), - inputs[take_::kIdx].dptr(), - out_strides[actual_axis - 1], - in_strides[actual_axis - 1], - in_strides[actual_axis], - arrshape.ndim(), - oshape.ndim(), - idxshape.ndim(), - arrshape[actual_axis], - actual_axis); + Kernel, cpu>::Launch(s, + outer_dimensions, + outputs[take_::kOut].dptr(), + inputs[take_::kArr].dptr(), + inputs[take_::kIdx].dptr(), + in_strides[actual_axis - 1], + in_strides[actual_axis], + idxshape.Size(), + arrshape[actual_axis], + actual_axis); } } }); From aba82312dec344538fabc46228c9ded396925ace Mon Sep 17 00:00:00 2001 From: Bartlomiej Gawrych Date: Mon, 15 Nov 2021 09:38:45 +0100 Subject: [PATCH 2/7] remove comment --- src/operator/tensor/indexing_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 4602640db94f..c46d1926ca35 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -84,7 +84,7 @@ struct TakeNonzeroAxisCPU { const int idx_size, const int axis_dim, const int axis) { - for (index_t j = 0; j < static_cast(idx_size); ++j) { // 4 + for (index_t j = 0; j < static_cast(idx_size); ++j) { int index = indices[j]; if (clip) { index = (index < 0) ? 0 : index; From 6ce5ee94c921695b0e7c7d41130cf6ae65ae98f8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Gawrych Date: Fri, 19 Nov 2021 15:47:15 +0100 Subject: [PATCH 3/7] Fix build --- src/operator/tensor/indexing_op.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index c46d1926ca35..ee2470c2aa11 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -95,7 +95,12 @@ struct TakeNonzeroAxisCPU { } size_t in_offset = i * outer_dim_stride + index * axis_dim_stride; size_t out_offset = (i * idx_size + j) * axis_dim_stride; - memcpy(out_data + out_offset, in_data + in_offset, axis_dim_stride * sizeof(DType)); + #pragma GCC diagnostic push +#if __GNUC__ >= 8 +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + std::memcpy(out_data + out_offset, in_data + in_offset, axis_dim_stride * sizeof(DType)); +#pragma GCC diagnostic pop } } }; From e6839311a0afc3981046395b47263d1659ef3f7a Mon Sep 17 00:00:00 2001 From: Bartlomiej Gawrych Date: Mon, 22 Nov 2021 08:04:07 +0100 Subject: [PATCH 4/7] fix sanity --- src/operator/tensor/indexing_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index ee2470c2aa11..28dea2631885 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -95,7 +95,7 @@ struct TakeNonzeroAxisCPU { } size_t in_offset = i * outer_dim_stride + index * axis_dim_stride; size_t out_offset = (i * idx_size + j) * axis_dim_stride; - #pragma GCC diagnostic push +#pragma GCC diagnostic push #if __GNUC__ >= 8 #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif From 8a46b85f0d2a07a4112f1313d06bcf7b55e45f13 Mon Sep 17 00:00:00 2001 From: Bartlomiej Gawrych Date: Mon, 29 Nov 2021 15:52:02 +0100 Subject: [PATCH 5/7] Add comment --- src/operator/tensor/indexing_op.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 81a04aa24027..af4b559d0692 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -217,6 +217,7 @@ inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, /*! \brief name the struct TakeNonzeroAxis for general take when * axis is not zero, use TakeZeroAxisGPU or TakeZeroAxisCPU for axis zero + * or TakeNonZeroAxisCPU for CPU optimized version */ template struct TakeNonzeroAxis { From 300e3b3a164ea57b1f8f88b7f3a054d9184ae679 Mon Sep 17 00:00:00 2001 From: Bartlomiej Gawrych Date: Tue, 7 Dec 2021 15:55:21 +0100 Subject: [PATCH 6/7] review --- src/operator/tensor/indexing_op.cc | 6 +++--- src/operator/tensor/indexing_op.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 28dea2631885..28eca41b2ae4 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -67,7 +67,7 @@ struct TakeNonzeroAxisCPU { * \param i global thread id * \param out_data ptr to output buffer * \param in_data ptr to input buffer - * \param idx ptr to indices buffer + * \param indices ptr to indices buffer * \param outer_dim_stride stride of dimension before axis * \param axis_dim_stride stride of axis dimension * \param idx_size size of the indices tensor @@ -87,8 +87,8 @@ struct TakeNonzeroAxisCPU { for (index_t j = 0; j < static_cast(idx_size); ++j) { int index = indices[j]; if (clip) { - index = (index < 0) ? 0 : index; - index = (index > axis_dim - 1) ? (axis_dim - 1) : index; + index = std::max(index, 0); + index = std::min(axis_dim - 1, index); } else { index %= axis_dim; index += (index < 0) ? axis_dim : 0; diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index af4b559d0692..ed75c8fd270a 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -215,9 +215,9 @@ inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -/*! \brief name the struct TakeNonzeroAxis for general take when - * axis is not zero, use TakeZeroAxisGPU or TakeZeroAxisCPU for axis zero - * or TakeNonZeroAxisCPU for CPU optimized version +/*! \brief TakeNonzeroAxis is desinated for general take when + * axis is not zero (for CPU optimized version use TakeNonZeroAxisCPU and + for axis zero use TakeZeroAxisGPU or TakeZeroAxisCPU) */ template struct TakeNonzeroAxis { From 4cd7a1953d9db4ca3900d41beba6fcb2d53e2916 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Mon, 17 Jan 2022 21:09:11 -0500 Subject: [PATCH 7/7] Update src/operator/tensor/indexing_op.h Co-authored-by: bartekkuncer --- src/operator/tensor/indexing_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index ed75c8fd270a..cd97be8dcfc0 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -215,7 +215,7 @@ inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -/*! \brief TakeNonzeroAxis is desinated for general take when +/*! \brief TakeNonzeroAxis is designated for general take when * axis is not zero (for CPU optimized version use TakeNonZeroAxisCPU and for axis zero use TakeZeroAxisGPU or TakeZeroAxisCPU) */