From 66e4f16cb15eab504ce05393e8962b9c4d6ffc2c Mon Sep 17 00:00:00 2001 From: reminisce Date: Wed, 15 Mar 2017 16:13:53 -0700 Subject: [PATCH 01/16] Initial check-in of new pooling op --- src/operator/mxnet_op.h | 16 ++ src/operator/nn/pool.cuh | 144 +++++++++++++++ src/operator/nn/pool.h | 117 ++++++++++++ src/operator/pooling-inl.h | 150 +++++++-------- src/operator/pooling.cc | 26 +-- src/operator/pooling.cu | 15 +- src/operator/pooling_v1-inl.h | 335 ++++++++++++++++++++++++++++++++++ src/operator/pooling_v1.cc | 89 +++++++++ src/operator/pooling_v1.cu | 36 ++++ 9 files changed, 819 insertions(+), 109 deletions(-) create mode 100644 src/operator/nn/pool.cuh create mode 100644 src/operator/nn/pool.h create mode 100644 src/operator/pooling_v1-inl.h create mode 100644 src/operator/pooling_v1.cc create mode 100644 src/operator/pooling_v1.cu diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 6d8b45fc1188..d46b978e0218 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -139,6 +139,22 @@ MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { } +struct fill { + template + MSHADOW_XINLINE static void Map(int i, DType* out, const DType val) { + out[i] = val; + } +}; + + +struct set_zero { + template + MSHADOW_XINLINE static void Map(int i, DType* out) { + out[i] = static_cast(0); + } +}; + + template struct Kernel; diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh new file mode 100644 index 000000000000..e8f53a5d4ada --- /dev/null +++ b/src/operator/nn/pool.cuh @@ -0,0 +1,144 @@ +#ifndef MXNET_OPERATOR_NN_POOL_CUH_ +#define MXNET_OPERATOR_NN_POOL_CUH_ + +#include +#include +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +/*! 
+ * \brief Get the number of blocks for cuda kernel given N + */ +inline int cuda_get_num_blocks(const int N) { + using namespace mshadow::cuda; + return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); +} + +template +__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in_data, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + OpReqType req_type, DType* const out_data, int32_t* mask) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const DType* const in_slice = + in_data + (n * channels + c) * height * width; + int in_index = hstart * width + wstart; + DType max_val = in_slice[in_index]; + int max_idx = in_index; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_index = h * width + w; + const DType in_val = in_slice[in_index]; + if (in_val > max_val) { + max_val = in_val; + max_idx = in_index; + } + } + } + mask[index] = max_idx; + KERNEL_ASSIGN(out_data[index], req_type, max_val); + } +} + +template +__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* const out_data, + const int32_t* const mask, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + OpReqType req_type, DType* 
const in_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width; + const int h = (index / width) % height; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_height); + const int pwstart = + (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); + DType gradient = 0; + const int offset = (n * channels + c) * pooled_height * pooled_width; + const DType* const out_data_slice = out_data + offset; + const int* const mask_slice = mask + offset; + int in_index = h * width + w; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + int pooled_index = ph * pooled_width + pw; + if (mask_slice[pooled_index] == in_index) { + gradient += out_data_slice[pooled_index]; + } + } + } + KERNEL_ASSIGN(in_data[index], req_type, gradient); + } +} + +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data, int32_t* mask = nullptr) { + if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, mask); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } + } +} + +template +inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, 
const int pool_type, OpReqType req_type, DType* in_data, + const int32_t* mask = nullptr) { + if (mxnet::kNullOp == req_type) return; + if (mxnet::kAddTo != req_type) { + mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + } + if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_data, mask, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, in_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_POOL_CUH_ diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h new file mode 100644 index 000000000000..15b0f7132844 --- /dev/null +++ b/src/operator/nn/pool.h @@ -0,0 +1,117 @@ +#ifndef MXNET_OPERATOR_NN_POOL_H_ +#define MXNET_OPERATOR_NN_POOL_H_ + +#include +#include +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +namespace pool_enum { +enum PoolingOpInputs {kData}; +enum PoolingOpOutputs {kOut, kMask}; +enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; +enum PoolingOpPadConventionType {kValid, kFull}; +} // namespace pool_enum + +template +inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, int32_t* mask) { + const index_t height = ishape[2], width = ishape[3]; + const index_t pooled_height = oshape[2], pooled_width = oshape[3]; + const index_t kernel_h = kernel[0], kernel_w = kernel[1]; + const index_t pad_h = pad[0], pad_w = pad[1]; + const index_t stride_h = stride[0], stride_w = stride[1]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) 
{ + for (index_t ph = 0; ph < pooled_height; ++ph) { + for (index_t pw = 0; pw < pooled_width; ++pw) { + index_t tmp_h = ph * stride_h; + index_t tmp_w = pw * stride_w; + index_t hend = std::min(tmp_h + kernel_h - pad_h, height); + index_t wend = std::min(tmp_w + kernel_w - pad_w, width); + index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); + index_t wstart = (tmp_w > pad_w? tmp_w - pad_w : 0); + const index_t pool_index = ph * pooled_width + pw; + index_t in_index = hstart * width + wstart; + DType max_val = in_data[in_index]; + mask[pool_index] = in_index; + for (index_t h = hstart; h < hend; ++h) { + for (index_t w = wstart; w < wend; ++w) { + in_index = h * width + w; + if (in_data[in_index] > max_val) { + max_val = in_data[in_index]; + mask[pool_index] = in_index; + } + } + } + KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + } + } + in_data += in_data_offset; + out_data += out_data_offset; + mask += out_data_offset; + } + } +} + +template +inline void unpool_max_2d_cpu(mshadow::Stream* s, const DType* out_data, const int32_t* mask, + const TShape& ishape, const TShape& oshape, DType* in_data) { + const index_t pooled_height = oshape[2], pooled_width = oshape[3]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (index_t ph = 0; ph < pooled_height; ++ph) { + for (index_t pw = 0; pw < pooled_width; ++pw) { + const index_t out_index = ph * pooled_width + pw; + in_data[mask[out_index]] += out_data[out_index]; + } + } + in_data += in_data_offset; + out_data += out_data_offset; + mask += out_data_offset; + } + } +} + +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data, int32_t* mask = nullptr) { + if (kernel.ndim() 
== 2) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, mask); + } + } +} + +template +inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, DType* in_data, + const int32_t* mask = nullptr) { + if (mxnet::kNullOp == req_type) return; + if (mxnet::kAddTo != req_type) { + mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + } + if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_2d_cpu(s, out_data, mask, ishape, oshape, in_data); + } + } +} + +} // namespace op +} // namespace mxnet +#ifdef __CUDACC__ +#include "./pool.cuh" +#endif + +#endif // MXNET_OPERATOR_NN_POOL_H_ diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 9ea0155b21be..5258844341c4 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -1,8 +1,8 @@ /*! 
- * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling-inl.h * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #ifndef MXNET_OPERATOR_POOLING_INL_H_ @@ -17,16 +17,19 @@ #include #include #include "./operator_common.h" +#include "./nn/pool.h" namespace mxnet { namespace op { +#if 0 namespace pool_enum { enum PoolingOpInputs {kData}; enum PoolingOpOutputs {kOut}; enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; enum PoolingOpPadConventionType {kValid, kFull}; } // namespace pool_enum +#endif struct PoolingParam : public dmlc::Parameter { TShape kernel; @@ -63,107 +66,60 @@ struct PoolingParam : public dmlc::Parameter { } }; -template +template class PoolingOp : public Operator { public: explicit PoolingOp(PoolingParam p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { using namespace mshadow; - using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); - if (param_.kernel.ndim() == 3) { - LOG(FATAL) << "3D kernel not implemented"; - } - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); - if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { - Assign(out, - req[pool_enum::kOut], - pool(pad(data, param_.pad[0], param_.pad[1]), - out_shape, - param_.global_pool ? data.shape_[2] : param_.kernel[0], - param_.global_pool ? data.shape_[3] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 
1 : param_.stride[1])); - } else if (param_.pool_type == pool_enum::kAvgPooling) { - Assign(out, - req[pool_enum::kOut], - scalar(1.0f / (param_.global_pool ? - data.shape_[2] * data.shape_[3] : - param_.kernel[0] * param_.kernel[1])) * \ - pool(pad(data, param_.pad[0], param_.pad[1]), - out_shape, - param_.global_pool ? data.shape_[2] : param_.kernel[0], - param_.global_pool ? data.shape_[3] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - } + pool(s, in_data[pool_enum::kData].dptr(), + in_data[pool_enum::kData].shape_, + out_data[pool_enum::kOut].shape_, + param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kOut], + out_data[pool_enum::kOut].dptr(), + pool_enum::kMaxPooling == param_.pool_type? out_data[pool_enum::kMask].dptr() + : nullptr); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { using namespace mshadow; - using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); CHECK_EQ(req.size(), 1U); CHECK_EQ(in_grad.size(), 1U); - // TODO(bing): remove pad (0,0) - if (param_.kernel.ndim() == 3) { - LOG(FATAL) << "3D kernel not implemented"; - } Stream *s = ctx.get_stream(); - Tensor grad = out_grad[pool_enum::kOut].get(s); - Tensor data = in_data[pool_enum::kData].get(s); - Tensor output_data = out_data[pool_enum::kOut].get(s); - Tensor input_grad = in_grad[pool_enum::kData].get(s); - - 
mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - - if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { - Assign(input_grad, req[pool_enum::kData], - crop(unpool(pad(data, param_.pad[0], param_.pad[1]), - pad(output_data, 0, 0), - pad(grad, 0, 0), - param_.global_pool ? in_shape[0] : param_.kernel[0], - param_.global_pool ? in_shape[1] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - in_shape, - param_.pad[0], - param_.pad[1])); - } else if (param_.pool_type == pool_enum::kAvgPooling) { - Assign(input_grad, req[pool_enum::kData], - scalar(1.0f / (param_.global_pool ? - data.shape_[2] * data.shape_[3] : - param_.kernel[0] * param_.kernel[1])) * \ - crop(unpool(pad(data, param_.pad[0], param_.pad[1]), - pad(output_data, 0, 0), - pad(grad, 0, 0), - param_.global_pool ? in_shape[0] : param_.kernel[0], - param_.global_pool ? in_shape[1] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - in_shape, - param_.pad[0], - param_.pad[1])); - } + unpool(s, out_data[pool_enum::kOut].dptr(), + in_data[pool_enum::kData].shape_, + out_data[pool_enum::kOut].shape_, + param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kData], + in_data[pool_enum::kData].dptr(), + pool_enum::kMaxPooling == param_.pool_type? 
out_data[pool_enum::kMask].dptr() + : nullptr); } private: @@ -234,7 +190,10 @@ class PoolingProp : public OperatorProperty { } } out_shape->clear(); - out_shape->push_back(oshape); + out_shape->push_back(oshape); // save output shape + if (pool_enum::kMaxPooling == param_.pool_type) { + out_shape->push_back(oshape); // save mask shape + } } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; @@ -267,7 +226,10 @@ class PoolingProp : public OperatorProperty { } out_shape->clear(); - out_shape->push_back(oshape); + out_shape->push_back(oshape); // save output shape + if (pool_enum::kMaxPooling == param_.pool_type) { + out_shape->push_back(oshape); // save mask shape + } } return true; } @@ -285,6 +247,9 @@ class PoolingProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); + if (pool_enum::kMaxPooling == param_.pool_type) { + out_type->push_back(mshadow::kInt32); + } return true; } @@ -302,6 +267,9 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { + if (pool_enum::kMaxPooling == param_.pool_type) { + return {out_grad[pool_enum::kOut], out_data[pool_enum::kMask]}; + } return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; } @@ -317,6 +285,16 @@ class PoolingProp : public OperatorProperty { #endif } + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + // For max pooling, need to return mask as well + if (pool_enum::kMaxPooling == param_.pool_type) return 2; + return 1; + } + Operator* CreateOperator(Context ctx) const override { LOG(FATAL) << "Not Implemented."; return NULL; diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index a4eed2633232..e4722acaf3e8 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -1,8 +1,8 @@ /*! 
- * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling.cc * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #include "./pooling-inl.h" #if MXNET_USE_MKL2017 == 1 @@ -51,21 +51,15 @@ Operator *CreateOp(PoolingParam param, int dtype) { } #endif MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new PoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new PoolingOp(param); - break; - case pool_enum::kSumPooling: - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); + } else { + LOG(FATAL) << "unknown pooling type"; + return NULL; } - }) + }); return op; } diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index be2464e3c0ef..d8d5c535fce9 100644 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -1,8 +1,8 @@ /*! 
- * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling.cu * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #include #include "./pooling-inl.h" @@ -12,6 +12,7 @@ namespace mxnet { namespace op { + template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; @@ -25,8 +26,8 @@ Operator *CreateOp(PoolingParam param, int dtype) { op = new CuDNNPoolingOp(param); break; case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MxNet sum pooling is applied."; - op = new PoolingOp(param); + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + op = new PoolingOp(param); break; default: LOG(FATAL) << "unknown pooling type"; @@ -37,13 +38,13 @@ Operator *CreateOp(PoolingParam param, int dtype) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { switch (param.pool_type) { case pool_enum::kMaxPooling: - op = new PoolingOp(param); + op = new PoolingOp(param); break; case pool_enum::kAvgPooling: - op = new PoolingOp(param); + op = new PoolingOp(param); break; case pool_enum::kSumPooling: - op = new PoolingOp(param); + op = new PoolingOp(param); break; default: LOG(FATAL) << "unknown pooling type"; diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h new file mode 100644 index 000000000000..d354fd7322b8 --- /dev/null +++ b/src/operator/pooling_v1-inl.h @@ -0,0 +1,335 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file pooling_v1-inl.h + * \brief + * \author Bing Xu +*/ + +#ifndef MXNET_OPERATOR_POOLING_V1_INL_H_ +#define MXNET_OPERATOR_POOLING_V1_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace pool_v1_enum { +enum PoolingV1OpInputs {kData}; +enum PoolingV1OpOutputs {kOut}; +enum PoolingV1OpType {kMaxPooling, kAvgPooling, kSumPooling}; +enum PoolingV1OpPadConventionType {kValid, kFull}; +} // namespace pool_v1_enum + +struct PoolingV1Param : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape pad; + int pool_type; + int pooling_convention; + bool global_pool; + DMLC_DECLARE_PARAMETER(PoolingV1Param) { + DMLC_DECLARE_FIELD(global_pool).set_default(false) + .describe("Ignore kernel size, do global pooling based on current input feature map. "); + + DMLC_DECLARE_FIELD(kernel) + .enforce_nonzero() + .describe("pooling kernel size: (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pool_type) + .add_enum("max", pool_v1_enum::kMaxPooling) + .add_enum("avg", pool_v1_enum::kAvgPooling) + .add_enum("sum", pool_v1_enum::kSumPooling) + .describe("Pooling type to be applied."); + + DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_v1_enum::kValid) + .add_enum("full", pool_v1_enum::kFull) + .add_enum("valid", pool_v1_enum::kValid) + .describe("Pooling convention to be applied."); + + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .enforce_nonzero() + .describe("stride: for pooling (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for pooling: (y, x) or (d, y, x)"); + } +}; + +template +class PoolingV1Op : public Operator { + public: + explicit PoolingV1Op(PoolingV1Param p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + 
using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + if (param_.kernel.ndim() == 3) { + LOG(FATAL) << "3D kernel not implemented"; + } + Tensor data = in_data[pool_v1_enum::kData].get(s); + Tensor out = out_data[pool_v1_enum::kOut].get(s); + mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); + if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + Assign(out, + req[pool_v1_enum::kOut], + pool(pad(data, param_.pad[0], param_.pad[1]), + out_shape, + param_.global_pool ? data.shape_[2] : param_.kernel[0], + param_.global_pool ? data.shape_[3] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1])); + } else if (param_.pool_type == pool_v1_enum::kAvgPooling) { + Assign(out, + req[pool_v1_enum::kOut], + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ + pool(pad(data, param_.pad[0], param_.pad[1]), + out_shape, + param_.global_pool ? data.shape_[2] : param_.kernel[0], + param_.global_pool ? data.shape_[3] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1])); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(in_grad.size(), 1); + // TODO(bing): remove pad (0,0) + if (param_.kernel.ndim() == 3) { + LOG(FATAL) << "3D kernel not implemented"; + } + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[pool_v1_enum::kOut].get(s); + Tensor data = in_data[pool_v1_enum::kData].get(s); + Tensor output_data = out_data[pool_v1_enum::kOut].get(s); + Tensor input_grad = in_grad[pool_v1_enum::kData].get(s); + + mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); + + if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + Assign(input_grad, req[pool_v1_enum::kData], + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.global_pool ? in_shape[0] : param_.kernel[0], + param_.global_pool ? in_shape[1] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1]), + in_shape, + param_.pad[0], + param_.pad[1])); + } else if (param_.pool_type == pool_v1_enum::kAvgPooling) { + Assign(input_grad, req[pool_v1_enum::kData], + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.global_pool ? in_shape[0] : param_.kernel[0], + param_.global_pool ? in_shape[1] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1]), + in_shape, + param_.pad[0], + param_.pad[1])); + } + } + + private: + PoolingV1Param param_; +}; // class PoolingV1Op + +template +Operator* CreateOp(PoolingV1Param param, int dtype); + + +#if DMLC_USE_CXX11 +class PoolingV1Prop : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3) << param_.kernel.ndim() << "D pooling not supported"; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) + << "pad and kernel should have the same length"; + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + CHECK_EQ(in_shape->size(), 1); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) " + << "Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) + << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] + << " 
padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + if (param_.pooling_convention == pool_v1_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); + } else if (param_.kernel.ndim() == 3) { + CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + oshape[4] = 1; + } else { + if (param_.pool_type == pool_v1_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / + param_.stride[2]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + oshape[4] = 1 + static_cast(ceil(static_cast( + dshape[4] + 2 * param_.pad[2] - + param_.kernel[2]) / param_.stride[2])); + } + } + + out_shape->clear(); + out_shape->push_back(oshape); + } + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector 
*aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = (*in_type)[0]; + + if (dtype == -1) { + LOG(FATAL) << "Input type to pooling is not specified."; + return false; + } + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PoolingV1Prop *prop_sym = new PoolingV1Prop(); + prop_sym->param_ = this->param_; + return prop_sym; + } + + std::string TypeString() const override { + return "PoolingV1"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[pool_v1_enum::kOut], in_data[pool_v1_enum::kData], + out_data[pool_v1_enum::kOut]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { +#if MXNET_USE_CUDNN == 1 + return {}; +#else + return {{in_data[pool_v1_enum::kData], in_grad[pool_v1_enum::kData]}}; +#endif + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + PoolingV1Param param_; +}; // class PoolingV1Prop +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_POOLING_V1_INL_H_ diff --git a/src/operator/pooling_v1.cc b/src/operator/pooling_v1.cc new file mode 100644 index 000000000000..cdf4f1af5939 --- /dev/null +++ b/src/operator/pooling_v1.cc @@ -0,0 +1,89 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file pooling_v1.cc + * \brief + * \author Bing Xu +*/ +#include "./pooling_v1-inl.h" + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(PoolingV1Param param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_v1_enum::kMaxPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kAvgPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kSumPooling: + op = new PoolingV1Op(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }) + + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator* PoolingV1Prop::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(PoolingV1Param); + +MXNET_REGISTER_OP_PROPERTY(Pooling_v1, PoolingV1Prop) +.describe(R"code(Perform pooling on the input. + +The shapes for 2-D pooling is + +- **data**: *(batch_size, channel, height, width)* +- **out**: *(batch_size, num_filter, out_height, out_width)*, with:: + + out_height = f(height, kernel[0], pad[0], stride[0]) + out_width = f(width, kernel[1], pad[1], stride[1]) + +The defintion of *f* depends on ``pooling_convention``, which has two options: + +- **valid** (default):: + + f(x, k, p, s) = floor(x+2*p-k)/s+1 + +- **full**, which is compatible with Caffe:: + + f(x, k, p, s) = ceil(x+2*p-k)/s+1 + +But ``global_pool`` is set to be true, then do a global pooling, namely reset +``kernel=(height, width)``. 
+ +Three pooling options are supported by ``pool_type``: + +- **avg**: average pooling +- **max**: max pooling +- **sum**: sum pooling + +1-D pooling is a special case of 2-D pooling with *width=1* and +*kernel[1]=1*. + +For 3-D pooling, an additional *depth* dimension is added before +*height*. Namely the input data will have shape *(batch_size, channel, depth, +height, width)*. + +)code" ADD_FILELINE) +.add_argument("data", "ndarray-or-symbol", "Input data to the pooling operator.") +.add_arguments(PoolingV1Param::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/pooling_v1.cu b/src/operator/pooling_v1.cu new file mode 100644 index 000000000000..99aebbc6446c --- /dev/null +++ b/src/operator/pooling_v1.cu @@ -0,0 +1,36 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file pooling_v1.cu + * \brief + * \author Bing Xu +*/ +#include +#include "./pooling_v1-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(PoolingV1Param param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_v1_enum::kMaxPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kAvgPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kSumPooling: + op = new PoolingV1Op(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }); + return op; +} + +} // namespace op +} // namespace mxnet + From 966780c2b576e2379d8528151925701bc3d284cf Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 16 Mar 2017 15:22:10 -0700 Subject: [PATCH 02/16] Implemented 2d pooling cpu and gpu --- src/operator/nn/pool.cuh | 92 +++++++++++++++++++---------------- src/operator/nn/pool.h | 65 +++++++++++++++++-------- src/operator/pooling-inl.h | 66 +++++++++---------------- src/operator/pooling.cu | 51 ++++++++----------- src/operator/pooling_v1-inl.h | 2 +- 5 files changed, 138 insertions(+), 138 deletions(-) diff --git 
a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index e8f53a5d4ada..bc0f56c8a96f 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -4,6 +4,7 @@ #include #include #include "../mxnet_op.h" +#include "../../common/cuda_utils.h" namespace mxnet { namespace op { @@ -27,7 +28,8 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* const out_data, int32_t* mask) { + OpReqType req_type, DType* const out_data) { + // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int ph = (index / pooled_width) % pooled_height; @@ -41,60 +43,63 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in wstart = max(wstart, 0); const DType* const in_slice = in_data + (n * channels + c) * height * width; - int in_index = hstart * width + wstart; - DType max_val = in_slice[in_index]; - int max_idx = in_index; + DType max_val = in_slice[hstart * width + wstart]; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - in_index = h * width + w; + const int in_index = h * width + w; const DType in_val = in_slice[in_index]; if (in_val > max_val) { max_val = in_val; - max_idx = in_index; } } } - mask[index] = max_idx; KERNEL_ASSIGN(out_data[index], req_type, max_val); } } template -__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* const out_data, - const int32_t* const mask, const int channels, - const int height, const int width, +__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, 
const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* const in_data) { + DType* in_grad) { + // index is the output image's pixel index in NCHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window CUDA_KERNEL_LOOP(index, nthreads) { - // find out the local index - // find out the local offset - const int w = index % width; - const int h = (index / width) % height; - const int c = (index / width / height) % channels; - const int n = index / width / height / channels; - const int phstart = - (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; - const int phend = min((h + pad_h) / stride_h + 1, pooled_height); - const int pwstart = - (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1; - const int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - DType gradient = 0; - const int offset = (n * channels + c) * pooled_height * pooled_width; - const DType* const out_data_slice = out_data + offset; - const int* const mask_slice = mask + offset; - int in_index = h * width + w; - for (int ph = phstart; ph < phend; ++ph) { - for (int pw = pwstart; pw < pwend; ++pw) { - int pooled_index = ph * pooled_width + pw; - if (mask_slice[pooled_index] == in_index) { - gradient += out_data_slice[pooled_index]; + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * height * width; + const 
DType* in_data_slice = in_data + in_offset; + int max_idx = hstart * width + wstart; + DType max_val = out_data[index]; + bool found = false; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + max_idx = h * width + w; + if (in_data_slice[max_idx] == max_val) { + found = true; + break; } } + if (found) break; } - KERNEL_ASSIGN(in_data[index], req_type, gradient); + + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); } } @@ -102,7 +107,7 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, int32_t* mask = nullptr) { + DType* out_data) { if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -110,29 +115,30 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, mask); + stride[0], stride[1], pad[0], pad[1], req_type, out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); } } } template -inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, - const TShape& oshape, const TShape& kernel, const TShape& pad, - const TShape& stride, const int pool_type, OpReqType req_type, DType* in_data, - const int32_t* mask = nullptr) { +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { - mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + 
mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( - ishape.Size(), out_data, mask, ishape[1], ishape[2], ishape[3], + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, in_data); + stride[0], stride[1], pad[0], pad[1], in_grad); MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 15b0f7132844..19d2877d7062 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -18,7 +18,7 @@ enum PoolingOpPadConventionType {kValid, kFull}; template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, int32_t* mask) { + OpReqType req_type, DType* out_data) { const index_t height = ishape[2], width = ishape[3]; const index_t pooled_height = oshape[2], pooled_width = oshape[3]; const index_t kernel_h = kernel[0], kernel_w = kernel[1]; @@ -39,13 +39,11 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS const index_t pool_index = ph * pooled_width + pw; index_t in_index = hstart * width + wstart; DType max_val = in_data[in_index]; - mask[pool_index] = in_index; for (index_t h = hstart; h < hend; ++h) { for (index_t w = wstart; w < wend; ++w) { in_index = h * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; - mask[pool_index] = in_index; } } } @@ -54,28 +52,53 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } in_data += in_data_offset; out_data += out_data_offset; - mask += out_data_offset; } } } template -inline void unpool_max_2d_cpu(mshadow::Stream* s, const DType* out_data, const 
int32_t* mask, - const TShape& ishape, const TShape& oshape, DType* in_data) { +inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const index_t height = ishape[2], width = ishape[3]; const index_t pooled_height = oshape[2], pooled_width = oshape[3]; - const index_t in_data_offset = ishape[2] * ishape[3]; - const index_t out_data_offset = oshape[2] * oshape[3]; + const index_t kernel_h = kernel[0], kernel_w = kernel[1]; + const index_t pad_h = pad[0], pad_w = pad[1]; + const index_t stride_h = stride[0], stride_w = stride[1]; + const index_t in_offset = ishape[2] * ishape[3]; + const index_t out_offset = oshape[2] * oshape[3]; for (index_t n = 0; n < oshape[0]; ++n) { for (index_t c = 0; c < oshape[1]; ++c) { for (index_t ph = 0; ph < pooled_height; ++ph) { for (index_t pw = 0; pw < pooled_width; ++pw) { - const index_t out_index = ph * pooled_width + pw; - in_data[mask[out_index]] += out_data[out_index]; + index_t tmp_h = ph * stride_h; + index_t tmp_w = pw * stride_w; + index_t hend = std::min(tmp_h + kernel_h - pad_h, height); + index_t wend = std::min(tmp_w + kernel_w - pad_w, width); + index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); + index_t wstart = (tmp_w > pad_w? 
tmp_w - pad_w : 0); + const index_t pool_index = ph * pooled_width + pw; + index_t max_idx = hstart * width + wstart; + bool found = false; + for (index_t h = hstart; h < hend; ++h) { + for (index_t w = wstart; w < wend; ++w) { + max_idx = h * width + w; + if (in_data[max_idx] == out_data[pool_index]) { + found = true; + break; + } + } + if (found) break; + } + in_grad[max_idx] += out_grad[pool_index]; } } - in_data += in_data_offset; - out_data += out_data_offset; - mask += out_data_offset; + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; } } } @@ -84,26 +107,26 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, int32_t* mask = nullptr) { + DType* out_data) { if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, mask); + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } } } template -inline void unpool(mshadow::Stream* s, const DType* out_data, const TShape& ishape, - const TShape& oshape, const TShape& kernel, const TShape& pad, - const TShape& stride, const int pool_type, OpReqType req_type, DType* in_data, - const int32_t* mask = nullptr) { +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { - mxnet_op::Kernel::Launch(s, ishape.Size(), in_data); + mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { - unpool_max_2d_cpu(s, out_data, 
mask, ishape, oshape, in_data); + unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } } } diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 5258844341c4..b1dcad8d3d7e 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -22,15 +22,6 @@ namespace mxnet { namespace op { -#if 0 -namespace pool_enum { -enum PoolingOpInputs {kData}; -enum PoolingOpOutputs {kOut}; -enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; -enum PoolingOpPadConventionType {kValid, kFull}; -} // namespace pool_enum -#endif - struct PoolingParam : public dmlc::Parameter { TShape kernel; TShape stride; @@ -38,10 +29,14 @@ struct PoolingParam : public dmlc::Parameter { int pool_type; int pooling_convention; bool global_pool; + bool cudnn_off; DMLC_DECLARE_PARAMETER(PoolingParam) { DMLC_DECLARE_FIELD(global_pool).set_default(false) .describe("Ignore kernel size, do global pooling based on current input feature map. "); + DMLC_DECLARE_FIELD(cudnn_off).set_default(false) + .describe("Turn off cudnn pooling and use MXNet pooling operator. "); + DMLC_DECLARE_FIELD(kernel) .enforce_nonzero() .describe("pooling kernel size: (y, x) or (d, y, x)"); @@ -82,17 +77,19 @@ class PoolingOp : public Operator { CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[pool_enum::kData].shape_; + pool(s, in_data[pool_enum::kData].dptr(), in_data[pool_enum::kData].shape_, out_data[pool_enum::kOut].shape_, - param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, param_.pad, param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, param_.pool_type, req[pool_enum::kOut], - out_data[pool_enum::kOut].dptr(), - pool_enum::kMaxPooling == param_.pool_type? 
out_data[pool_enum::kMask].dptr() - : nullptr); + out_data[pool_enum::kOut].dptr()); } virtual void Backward(const OpContext& ctx, @@ -109,17 +106,21 @@ class PoolingOp : public Operator { CHECK_EQ(req.size(), 1U); CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); - unpool(s, out_data[pool_enum::kOut].dptr(), - in_data[pool_enum::kData].shape_, - out_data[pool_enum::kOut].shape_, - param_.global_pool? in_data[pool_enum::kData].shape_ : param_.kernel, + const TShape& ishape = in_data[pool_enum::kData].shape_; + + unpool(s, out_grad[pool_enum::kOut].dptr(), + in_data[pool_enum::kData].dptr(), + out_data[pool_enum::kOut].dptr(), + in_grad[pool_enum::kData].shape_, + out_grad[pool_enum::kOut].shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, param_.pad, param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, param_.pool_type, req[pool_enum::kData], - in_data[pool_enum::kData].dptr(), - pool_enum::kMaxPooling == param_.pool_type? 
out_data[pool_enum::kMask].dptr() - : nullptr); + in_grad[pool_enum::kData].dptr()); } private: @@ -191,9 +192,6 @@ class PoolingProp : public OperatorProperty { } out_shape->clear(); out_shape->push_back(oshape); // save output shape - if (pool_enum::kMaxPooling == param_.pool_type) { - out_shape->push_back(oshape); // save mask shape - } } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; @@ -227,9 +225,6 @@ class PoolingProp : public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); // save output shape - if (pool_enum::kMaxPooling == param_.pool_type) { - out_shape->push_back(oshape); // save mask shape - } } return true; } @@ -247,9 +242,6 @@ class PoolingProp : public OperatorProperty { out_type->clear(); out_type->push_back(dtype); - if (pool_enum::kMaxPooling == param_.pool_type) { - out_type->push_back(mshadow::kInt32); - } return true; } @@ -267,10 +259,8 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - if (pool_enum::kMaxPooling == param_.pool_type) { - return {out_grad[pool_enum::kOut], out_data[pool_enum::kMask]}; - } - return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; + return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], + out_data[pool_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -285,16 +275,6 @@ class PoolingProp : public OperatorProperty { #endif } - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - // For max pooling, need to return mask as well - if (pool_enum::kMaxPooling == param_.pool_type) return 2; - return 1; - } - Operator* CreateOperator(Context ctx) const override { LOG(FATAL) << "Not Implemented."; return NULL; diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index d8d5c535fce9..c420852b1c8d 100644 --- 
a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -15,43 +15,34 @@ namespace op { template<> Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; #if MXNET_USE_CUDNN == 1 MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new CuDNNPoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new CuDNNPoolingOp(param); - break; - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (!param.cudnn_off) { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + return new CuDNNPoolingOp(param); + case pool_enum::kAvgPooling: + return new CuDNNPoolingOp(param); + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + return new PoolingOp(param); + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } } }); -#else +#endif // MXNET_USE_CUDNN + Operator *op = NULL; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new PoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new PoolingOp(param); - break; - case pool_enum::kSumPooling: - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); + } else { + LOG(FATAL) << "unknown pooling type"; } }); -#endif // MXNET_USE_CUDNN return op; } diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index d354fd7322b8..ff71d2e7b8a5 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -294,7 +294,7 @@ class PoolingV1Prop : public OperatorProperty { } 
std::string TypeString() const override { - return "PoolingV1"; + return "Pooling_v1"; } std::vector DeclareBackwardDependency( From 8776bec177358a3a032aee2b3da3a19ee9f1336c Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 16 Mar 2017 21:55:31 -0700 Subject: [PATCH 03/16] Add 2d avg and sum pooling cpu and gpu --- src/operator/nn/pool.cuh | 120 +++++++++++++++++++++++++++++++++++++-- src/operator/nn/pool.h | 94 ++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+), 4 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index bc0f56c8a96f..3816417ec311 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -23,12 +23,12 @@ inline int cuda_get_num_blocks(const int N) { } template -__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in_data, +__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* const out_data) { + OpReqType req_type, DType* out_data) { // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -41,7 +41,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in const int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - const DType* const in_slice = + const DType* in_slice = in_data + (n * channels + c) * height * width; DType max_val = in_slice[hstart * width + wstart]; for (int h = hstart; h < hend; ++h) { @@ -57,6 +57,40 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* const in } } +template +__global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int height, const int width, + const int 
pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, OpReqType req_type, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += out_slice[h * width + w]; + } + } + KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + } +} + template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -103,6 +137,46 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + DType* in_grad, bool isAvg = false) { + // index is the input image index in NCHW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int c = (index / 
width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1); + gradient += out_grad_slice[ph * pooled_width + pw] / pool_size; + } + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -117,6 +191,24 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape[2], oshape[3], kernel[0], kernel[1], stride[0], stride[1], pad[0], pad[1], req_type, out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + 
pool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } @@ -133,13 +225,33 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], stride[0], stride[1], pad[0], pad[1], in_grad); MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 19d2877d7062..3de8e99d0d86 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -56,6 +56,45 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const 
TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, bool getAvg = false) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + DType sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += in_data[h*width+w]; + } + } + KERNEL_ASSIGN(out_data[ph*pooled_width+pw], req_type, getAvg? 
sum/pool_size : sum); + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -103,6 +142,49 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, } } +template +inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_grad_offset = ishape[2] * ishape[3]; + const index_t out_grad_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = (hend - hstart) * (wend - wstart); + } + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int pool_index = ph * pooled_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_grad[h*width+w] += out_grad[pool_index] / pool_size; + } + } + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -111,6 +193,12 @@ inline void 
pool(mshadow::Stream* s, const DType* in_data, const TShape& is if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } @@ -127,6 +215,12 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; } } } From 4678368847ea45fd995d8dd21c0b630a6c85fc7f Mon Sep 17 00:00:00 2001 From: reminisce Date: Thu, 16 Mar 2017 22:08:21 -0700 Subject: [PATCH 04/16] Fix lint --- src/operator/nn/pool.cuh | 10 +++++----- src/operator/nn/pool.h | 17 ++++++++--------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 3816417ec311..5e62a67e344f 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -75,8 +75,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, int hend = min(hstart + kernel_h, height + pad_h); int wend = min(wstart + kernel_w, width + pad_w); const int pool_size = (getAvg? 
(hend - hstart) * (wend - wstart) : 1); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); hend = min(hend, height); wend = min(wend, width); DType sum = 0; @@ -84,8 +84,8 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, in_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += out_slice[h * width + w]; - } + sum += out_slice[h * width + w]; + } } KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); } @@ -176,7 +176,7 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr in_grad[index] += gradient; } } - + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 3de8e99d0d86..d6a42de1330f 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -76,16 +76,16 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS int hend = std::min(hstart + kernel_h, height + pad_h); int wend = std::min(wstart + kernel_w, width + pad_w); int pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); DType sum = 0; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += in_data[h*width+w]; + sum += in_data[h*width+w]; } - } + } KERNEL_ASSIGN(out_data[ph*pooled_width+pw], req_type, getAvg? 
sum/pool_size : sum); } } @@ -166,16 +166,16 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, if (isAvg) { pool_size = (hend - hstart) * (wend - wstart); } - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); const int pool_index = ph * pooled_width + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - in_grad[h*width+w] += out_grad[pool_index] / pool_size; + in_grad[h*width+w] += out_grad[pool_index] / pool_size; } - } + } } } in_grad += in_grad_offset; @@ -184,7 +184,6 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, } } - template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, From 01749a7eb0f5ce4237b019d0fa0d191427980c2d Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 13:59:34 -0700 Subject: [PATCH 05/16] Add pooling 3d cpu and bug fix --- src/operator/nn/pool.cuh | 11 +- src/operator/nn/pool.h | 212 ++++++++++++++++++++++++++++++++------- 2 files changed, 181 insertions(+), 42 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 5e62a67e344f..5160dec8c879 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -29,6 +29,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -43,7 +44,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, wstart = max(wstart, 0); const DType* in_slice = in_data + (n * channels 
+ c) * height * width; - DType max_val = in_slice[hstart * width + wstart]; + DType max_val = MinValue(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int in_index = h * width + w; @@ -119,7 +120,7 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr // in data/grad offset batch and channel dims int in_offset = (n * channels + c) * height * width; const DType* in_data_slice = in_data + in_offset; - int max_idx = hstart * width + wstart; + int max_idx = -1; DType max_val = out_data[index]; bool found = false; for (int h = hstart; h < hend; ++h) { @@ -133,7 +134,11 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr if (found) break; } - atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index d6a42de1330f..538041621198 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -19,29 +19,29 @@ template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, OpReqType req_type, DType* out_data) { - const index_t height = ishape[2], width = ishape[3]; - const index_t pooled_height = oshape[2], pooled_width = oshape[3]; - const index_t kernel_h = kernel[0], kernel_w = kernel[1]; - const index_t pad_h = pad[0], pad_w = pad[1]; - const index_t stride_h = stride[0], stride_w = stride[1]; + using mshadow::red::limits::MinValue; + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; const index_t 
in_data_offset = ishape[2] * ishape[3]; const index_t out_data_offset = oshape[2] * oshape[3]; for (index_t n = 0; n < oshape[0]; ++n) { for (index_t c = 0; c < oshape[1]; ++c) { - for (index_t ph = 0; ph < pooled_height; ++ph) { - for (index_t pw = 0; pw < pooled_width; ++pw) { - index_t tmp_h = ph * stride_h; - index_t tmp_w = pw * stride_w; - index_t hend = std::min(tmp_h + kernel_h - pad_h, height); - index_t wend = std::min(tmp_w + kernel_w - pad_w, width); - index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); - index_t wstart = (tmp_w > pad_w? tmp_w - pad_w : 0); - const index_t pool_index = ph * pooled_width + pw; - index_t in_index = hstart * width + wstart; - DType max_val = in_data[in_index]; - for (index_t h = hstart; h < hend; ++h) { - for (index_t w = wstart; w < wend; ++w) { - in_index = h * width + w; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + DType max_val = MinValue(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index= h * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; } @@ -56,6 +56,54 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], 
kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + DType max_val = MinValue(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index= (d * height + h) * width + w; + if (in_data[in_index] > max_val) { + max_val = in_data[in_index]; + } + } + } + } + KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -101,37 +149,102 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* in_grad) { - const index_t height = ishape[2], width = ishape[3]; - const index_t pooled_height = oshape[2], pooled_width = oshape[3]; - const index_t kernel_h = kernel[0], kernel_w = kernel[1]; - const index_t pad_h = pad[0], pad_w = pad[1]; - const index_t 
stride_h = stride[0], stride_w = stride[1]; + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; const index_t in_offset = ishape[2] * ishape[3]; const index_t out_offset = oshape[2] * oshape[3]; for (index_t n = 0; n < oshape[0]; ++n) { for (index_t c = 0; c < oshape[1]; ++c) { - for (index_t ph = 0; ph < pooled_height; ++ph) { - for (index_t pw = 0; pw < pooled_width; ++pw) { - index_t tmp_h = ph * stride_h; - index_t tmp_w = pw * stride_w; - index_t hend = std::min(tmp_h + kernel_h - pad_h, height); - index_t wend = std::min(tmp_w + kernel_w - pad_w, width); - index_t hstart = (tmp_h > pad_h? tmp_h - pad_h : 0); - index_t wstart = (tmp_w > pad_w? tmp_w - pad_w : 0); - const index_t pool_index = ph * pooled_width + pw; - index_t max_idx = hstart * width + wstart; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + int max_idx = -1; bool found = false; - for (index_t h = hstart; h < hend; ++h) { - for (index_t w = wstart; w < wend; ++w) { - max_idx = h * width + w; - if (in_data[max_idx] == out_data[pool_index]) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = h * width + w; + if (in_data[idx] == out_data[pool_index]) { + max_idx = idx; found = true; break; } } if (found) break; } - in_grad[max_idx] += out_grad[pool_index]; + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. 
+ if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pool_index]; + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + +template +inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + int max_idx = -1; + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = (d * height + h) * width + w; + if (in_data[idx] == out_data[pool_index]) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + // In the case where pad > 0 and kernel = 1, for 
example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pool_index]; + } + } } } in_data += in_offset; @@ -199,7 +312,18 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } + } template @@ -221,6 +345,16 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } } From 3578b2f44c65d157a9fa13a6c1f0f8eb5f6fcdeb Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 15:06:48 -0700 Subject: [PATCH 06/16] Added 3d max pooling gpu --- src/operator/mxnet_op.h | 17 ++++ src/operator/nn/im2col.cuh | 18 +--- src/operator/nn/pool.cuh | 190 ++++++++++++++++++++++++++++++++++--- 3 files changed, 198 insertions(+), 27 deletions(-) diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index d46b978e0218..9b5dcfe3d3b1 100644 --- 
a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -23,6 +23,23 @@ using std::isnan; #endif +#ifdef __CUDACC__ +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + + +/*! + * \brief Get the number of blocks for cuda kernel given N + */ +inline int cuda_get_num_blocks(const int N) { + using namespace mshadow::cuda; + return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); +} +#endif // __CUDACC__ + + /*! \brief operator request type switch */ #define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \ switch (req) { \ diff --git a/src/operator/nn/im2col.cuh b/src/operator/nn/im2col.cuh index 8555319f5f68..786fd22f8c9b 100644 --- a/src/operator/nn/im2col.cuh +++ b/src/operator/nn/im2col.cuh @@ -71,20 +71,6 @@ namespace mxnet { namespace op { -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -/*! - * \brief Get the number of blocks for cuda kernel given N - */ -inline int cuda_get_num_blocks(const int N) { - using namespace mshadow::cuda; - return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); -} - /*! * \brief im2col gpu kernel. * DO NOT call this directly. 
Use wrapper function im2col() instead; @@ -141,6 +127,7 @@ inline void im2col_gpu(mshadow::Stream* s, int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height_col * width_col; + using namespace mxnet_op; // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<::GetStream(s)>>>( @@ -303,6 +290,7 @@ inline void im2col(mshadow::Stream* s, index_t num_spatial_axes = kernel_shape.ndim(); CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim()); + using namespace mxnet_op; switch (num_spatial_axes) { case 1: im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) @@ -347,6 +335,7 @@ inline void col2im_gpu(mshadow::Stream* s, const DType* data_col, const int int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height * width; + using namespace mxnet_op; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. // NOLINT_NEXT_LINE(whitespace/operators) @@ -487,6 +476,7 @@ inline void col2im(mshadow::Stream* s, index_t im_size = im_shape.ProdShape(1, im_shape.ndim()); // num_axes should be smaller than block size CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; switch (num_spatial_axes) { case 1: col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 5160dec8c879..c4806d5cde58 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -9,19 +9,6 @@ namespace mxnet { namespace op { -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -/*! 
- * \brief Get the number of blocks for cuda kernel given N - */ -inline int cuda_get_num_blocks(const int N) { - using namespace mshadow::cuda; - return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); -} - template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -58,6 +45,50 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int depth, const int height, const int width, + const int pooled_depth, const int pooled_height, + const int pooled_width, const int kernel_d, + const int kernel_h, const int kernel_w, const int stride_d, + const int stride_h, const int stride_w, const int pad_d, + const int pad_h, const int pad_w, OpReqType req_type, + DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCDHW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int dend = min(dstart + kernel_d, depth); + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * channels + c) * depth * height * width; + DType max_val = MinValue(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = 
(d * height + h) * width + w; + const DType in_val = in_slice[in_index]; + if (in_val > max_val) { + max_val = in_val; + } + } + } + } + KERNEL_ASSIGN(out_data[index], req_type, max_val); + } +} + template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -142,6 +173,66 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int depth, const int height, + const int width, const int pooled_depth, + const int pooled_height, const int pooled_width, + const int kernel_d, const int kernel_h, + const int kernel_w, const int stride_d, + const int stride_h, const int stride_w, const int pad_d, + const int pad_h, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCDHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int dend = min(dstart + kernel_d, depth); + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * depth * height 
* width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int idx = (d * height + h) * width + w; + if (in_data_slice[idx] == max_val) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int height, const int width, @@ -187,6 +278,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { + using namespace mxnet_op; if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -215,6 +307,40 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + ishape[4], oshape[2], oshape[3], oshape[4], + kernel[0], kernel[1], kernel[2], stride[0], + stride[1], stride[2], pad[0], pad[1], pad[2], req_type, + out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], 
oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } } @@ -227,6 +353,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } + using namespace mxnet_op; if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -258,7 +385,44 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + 
// NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } + } } // namespace op From 9302a0ec8efab2ec070a4ef1c4ce801b1d1a1115 Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 17:36:49 -0700 Subject: [PATCH 07/16] Added 3d avg/sum pooling gpu --- src/operator/nn/pool.cuh | 134 ++++++++++++++++++++++++++++++++------- src/operator/nn/pool.h | 106 +++++++++++++++++++++++++++++-- 2 files changed, 212 insertions(+), 28 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index c4806d5cde58..52cf579a36ea 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -123,6 +123,48 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int depth, const int height, const int width, + const int pooled_depth, const int pooled_height, + const int pooled_width, const int kernel_d, + const int kernel_h, const int kernel_w, + const int stride_d, const int stride_h, const int stride_w, + const int pad_d, const int pad_h, const int pad_w, + OpReqType req_type, DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * 
stride_w - pad_w; + int dend = min(dstart + kernel_d, depth + pad_d); + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, depth); + hend = min(hend, height); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * depth * height * width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += out_slice[(d * height + h) * width + w]; + } + } + } + KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + } +} + template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -273,6 +315,54 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int depth, const int height, + const int width, const int pooled_depth, + const int pooled_height, const int pooled_width, + const int kernel_d, const int kernel_h, + const int kernel_w, const int stride_d, const int stride_h, + const int stride_w, const int pad_d, const int pad_h, + const int pad_w, DType* in_grad, bool isAvg = false) { + // index is the input image index in NCDHW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int d = (index / width / height) % depth + pad_d; + const int c = (index / width / height / depth) % channels; + const int n = index / width / height / depth / channels; + const int pdstart = (d < kernel_d) ? 
0 : (d - kernel_d) / stride_d + 1; + const int pdend = min(d / stride_d + 1, pooled_depth); + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_depth * pooled_height * pooled_width; + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = min(dstart + kernel_d, depth + pad_d); + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + gradient += out_grad_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size; + } + } + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -317,27 +407,25 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], pad[2], req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], 
kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], + kernel[1], kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], + kernel[1], kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -395,29 +483,27 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], pad[2], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], + pad[2], in_grad, true); + 
MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], + pad[2], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 538041621198..a204e235b419 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -143,6 +143,54 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, bool getAvg = false) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * 
stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + DType sum = 0; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += in_data[(d*height+h)*width+w]; + } + } + } + KERNEL_ASSIGN(out_data[(pd*pooled_height+ph)*pooled_width+pw], + req_type, getAvg? sum/pool_size : sum); + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -297,6 +345,56 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, } } +template +inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_grad_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_grad_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; 
++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_grad[(d*height+h)*width+w] += out_grad[pool_index] / pool_size; + } + } + } + } + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -316,9 +414,9 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is if (pool_enum::kMaxPooling == pool_type) { pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - //pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -349,9 +447,9 @@ inline void unpool(mshadow::Stream* s, 
const DType* out_grad, const DType* if (pool_enum::kMaxPooling == pool_type) { unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { - //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); } else if (pool_enum::kSumPooling == pool_type) { - //unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } From aaac667c38567555a5b06f9c2d8941a6e23a6b50 Mon Sep 17 00:00:00 2001 From: reminisce Date: Sun, 19 Mar 2017 22:25:53 -0700 Subject: [PATCH 08/16] Added max pooling 1d cpu and gpu --- src/operator/nn/pool.cuh | 152 +++++++++++++++++++++++++++++++--- src/operator/nn/pool.h | 101 +++++++++++++++++++++- src/operator/pooling-inl.h | 31 ++++++- src/operator/pooling_v1-inl.h | 2 +- 4 files changed, 268 insertions(+), 18 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 52cf579a36ea..de541616d8ce 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -9,6 +9,34 @@ namespace mxnet { namespace op { +template +__global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCHW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + const int wend = min(wstart + kernel_w, width); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * 
channels + c) * width; + DType max_val = MinValue(); + for (int w = wstart; w < wend; ++w) { + const DType in_val = in_slice[w]; + if (in_val > max_val) { + max_val = in_val; + } + } + KERNEL_ASSIGN(out_data[index], req_type, max_val); + } +} + template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -34,8 +62,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, DType max_val = MinValue(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index = h * width + w; - const DType in_val = in_slice[in_index]; + const DType in_val = in_slice[h * width + w]; if (in_val > max_val) { max_val = in_val; } @@ -77,8 +104,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index = (d * height + h) * width + w; - const DType in_val = in_slice[in_index]; + const DType in_val = in_slice[(d * height + h) * width + w]; if (in_val > max_val) { max_val = in_val; } @@ -165,6 +191,45 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int 
wstart = pw * stride_w - pad_w; + const int wend = min(wstart + kernel_w, width); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + for (int w = wstart; w < wend; ++w) { + if (in_data_slice[w] == max_val) { + max_idx = w; + break; + } + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -198,8 +263,9 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr bool found = false; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - max_idx = h * width + w; - if (in_data_slice[max_idx] == max_val) { + const int idx = h * width + w; + if (in_data_slice[idx] == max_val) { + max_idx = idx; found = true; break; } @@ -255,7 +321,7 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - int idx = (d * height + h) * width + w; + const int idx = (d * height + h) * width + w; if (in_data_slice[idx] == max_val) { max_idx = idx; found = true; @@ -369,7 +435,38 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { using namespace mxnet_op; - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], + oshape[2], kernel[0], stride[0], pad[0], 
req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) pool_max_2d_gpu_kernel<<* s, const DType* out_grad, const DType* mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } using namespace mxnet_op; - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], + in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else if (pool_enum::kSumPooling == pool_type) { + // 
NOLINT_NEXT_LINE(whitespace/operators) +#if 0 + unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); +#endif + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) unpool_max_2d_gpu_kernel<<* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; } - } } // namespace op diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index a204e235b419..bc373b49d996 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -15,6 +15,38 @@ enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; enum PoolingOpPadConventionType {kValid, kFull}; } // namespace pool_enum +template +inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data) { + using mshadow::red::limits::MinValue; + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_data_offset = ishape[2]; + const index_t out_data_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + DType max_val = MinValue(); + for (int w = wstart; w < wend; ++w) { + if (in_data[w] > max_val) { + max_val = in_data[w]; + } + } + KERNEL_ASSIGN(out_data[pw], req_type, max_val); + } + 
in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -191,6 +223,46 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } +template +inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_offset = ishape[2]; + const index_t out_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + int max_idx = -1; + for (int w = wstart; w < wend; ++w) { + if (in_data[w] == out_data[pw]) { + max_idx = w; + break; + } + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. 
+ if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pw]; + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -400,7 +472,17 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -420,8 +502,9 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; } - } template @@ -433,7 +516,17 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } - if (kernel.ndim() == 2) { + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + 
//unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + //unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { @@ -453,6 +546,8 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; } } diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index b1dcad8d3d7e..8156c3796539 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -137,7 +137,10 @@ class PoolingProp : public OperatorProperty { void Init(const std::vector >& kwargs) override { using namespace mshadow; param_.Init(kwargs); - if (param_.kernel.ndim() == 2) { + if (param_.kernel.ndim() == 1) { + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); } else { @@ -160,11 +163,31 @@ class PoolingProp : public OperatorProperty { std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1U); const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x) " - << "Or 5D in (batch, channel, d, y, x)"; + CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; TShape oshape = dshape; if (dshape.ndim() == 0) return false; - if 
(param_.kernel.ndim() == 2) { + if (param_.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param_.global_pool) { + oshape[2] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; if (param_.global_pool) { oshape[2] = 1; diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index ff71d2e7b8a5..c6bc01e52f5a 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -245,7 +245,7 @@ class PoolingV1Prop : public OperatorProperty { oshape[3] = 1; oshape[4] = 1; } else { - if (param_.pool_type == pool_v1_enum::kValid) { + if (param_.pooling_convention == pool_v1_enum::kValid) { oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0]; oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / From 076b60f6982c7d37994f422ac27e503fa41d0152 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 20 Mar 2017 10:01:48 -0700 Subject: [PATCH 09/16] Added 1d avg/sum pooling cpu and gpu --- src/operator/nn/pool.cuh | 95 +++++++++++++++++++++++++++++----------- src/operator/nn/pool.h | 72 ++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 30 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index de541616d8ce..026b9e88d104 100644 --- 
a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -115,6 +115,30 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +template +__global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int width, const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, OpReqType req_type, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (wend - wstart) : 1); + wstart = max(wstart, 0); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * width; + for (int w = wstart; w < wend; ++w) { + sum += out_slice[w]; + } + KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + } +} + template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -341,6 +365,37 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr } } +template +__global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* in_grad, bool isAvg = false) { + // index is the input image index in NCW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int c = (index / width) % channels; + const int n = index / width / channels; + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_width; + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int wstart = pw * stride_w - pad_w; + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (wend - wstart) : 1); + gradient += out_grad_slice[pw] / pool_size; + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int height, const int width, @@ -442,27 +497,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( - oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], + kernel[0], stride[0], pad[0], req_type, out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( - oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], 
req_type, out_data); - MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); -#endif + oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], + kernel[0], stride[0], pad[0], req_type, out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -547,29 +596,23 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad, true); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], oshape[2], kernel[0], + stride[0], pad[0], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) -#if 0 unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( ishape.Size(), out_grad, - ishape[1], ishape[2], ishape[3], - oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], in_grad); - MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); -#endif + ishape[1], ishape[2], oshape[2], kernel[0], + stride[0], pad[0], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index bc373b49d996..debb7eae404d 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -136,6 +136,37 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS 
} } +template +inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + OpReqType req_type, DType* out_data, bool getAvg = false) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_data_offset = ishape[2]; + const index_t out_data_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (wend - wstart); + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + DType sum = 0; + for (int w = wstart; w < wend; ++w) { + sum += in_data[w]; + } + KERNEL_ASSIGN(out_data[pw], req_type, getAvg? sum/pool_size : sum); + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -375,6 +406,39 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, } } +template +inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_grad_offset = ishape[2]; + const index_t out_grad_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, 
width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = wend - wstart; + } + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + for (int w = wstart; w < wend; ++w) { + in_grad[w] += out_grad[pw] / pool_size; + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + template inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -476,9 +540,9 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is if (pool_enum::kMaxPooling == pool_type) { pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - //pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } @@ -520,9 +584,9 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* if (pool_enum::kMaxPooling == pool_type) { unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { - //unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); } else if (pool_enum::kSumPooling == pool_type) { - //unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } From 1e162a1cc4ff843ac9bf1166781e5c6d0eff70f2 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 
20 Mar 2017 10:11:24 -0700 Subject: [PATCH 10/16] Added pooling test --- tests/python/gpu/test_operator_gpu.py | 162 +++++++++++++++++++++++++- 1 file changed, 161 insertions(+), 1 deletion(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index a82530928bbc..6aa3912216a3 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -182,6 +182,9 @@ def test_grid_generator_with_type(): check_consistency(sym, ctx_list, grad_req="add") +# Checking max pooling consistency over the data sets of different float types is problematic +# as one max value in a float32 data set may not be the max value in a float16 data set. +# This function will not be called. def test_pooling_with_type(): np.random.seed(1234) ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, @@ -204,6 +207,163 @@ def test_pooling_with_type(): check_consistency(sym, ctx_list) +def test_pooling_versions(): + def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, + pooling_convention='valid', global_pool=False): + ctx_list = [] + sym_list = [] + # PoolingV1 cpu + if 'pool_v1_cpu' in pool_op_list: + ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # PoolingV1 gpu + if 'pool_v1_gpu' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, 
pool_type=pool_type, global_pool=True, name='pool')) + # Pooling cpu + if 'pool_cpu' in pool_op_list: + ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # Pooling gpu + if 'pool_gpu' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, cudnn_off=True, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True, + name='pool')) + # CuDNNPooling + if 'pool_cudnn' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, cudnn_off=False, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=False, + name='pool')) + check_consistency(sym_list, ctx_list) + + def test_1d_pooling(pool_type): + data = (2, 3, 20) + kernel = (4,) + pad = (0,) + stride = (1,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (2,) + stride = (2,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0,) + stride = (1,) + 
test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + pad = (2,) + stride = (2,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + def test_2d_pooling(pool_type): + data = (2, 3, 20, 20) + kernel = (4, 5) + pad = (0, 0) + stride = (1, 1) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here + pad = (2, 3) + stride = (2, 3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0, 0) + stride = (1, 1) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here + pad = (2, 3) + stride = (2, 3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + 
global_pool=True) + + def test_3d_pooling(pool_type): + data = (2, 3, 20, 20, 20) + kernel = (4, 5, 3) + pad = (0, 0, 0) + stride = (1, 1, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (2, 3, 3) + stride = (2, 3, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0, 0, 0) + stride = (1, 1, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + pad = (2, 3, 3) + stride = (2, 3, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + test_1d_pooling('max') + test_1d_pooling('avg') + test_1d_pooling('sum') + + test_2d_pooling('max') + test_2d_pooling('avg') + test_2d_pooling('sum') + + test_3d_pooling('max') + test_3d_pooling('avg') + test_3d_pooling('sum') + + def test_upsampling_with_type(): sym = mx.sym.UpSampling(scale=2, num_filter=2, name='up', sample_type='nearest', num_args=1) ctx_list = [{'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}}, @@ -490,7 +650,7 @@ def test_unfuse(): test_convolution_options() test_convolution_versions() test_convolution_with_type() - test_pooling_with_type() + test_pooling_versions() test_batchnorm_with_type() test_batchnorm_with_type() test_deconvolution_with_type() From 
f7b8600ef662234b0be55a741dc884af141a7f3f Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 20 Mar 2017 12:09:54 -0700 Subject: [PATCH 11/16] Added description and Caffe copyright notice --- src/operator/nn/pool.cuh | 60 +++++++++++++++++++++++++++++++++++++++- src/operator/nn/pool.h | 58 ++++++++++++++++++++++++++++++++++++++ src/operator/pooling.cc | 6 +++- 3 files changed, 122 insertions(+), 2 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 026b9e88d104..7859ea776626 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -1,3 +1,61 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 by Contributors + * \file pool.h + * \brief Function definitions of pooling 1/2/3-D images. + * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. 
+ * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu + * \author Jun Wu + */ + #ifndef MXNET_OPERATOR_NN_POOL_CUH_ #define MXNET_OPERATOR_NN_POOL_CUH_ @@ -16,7 +74,7 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int stride_w, const int pad_w, OpReqType req_type, DType* out_data) { using mshadow::red::limits::MinValue; - // index is the output image's pixel index in NCHW + // index is the output image's pixel index in NCW CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int c = (index / pooled_width) % channels; diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index debb7eae404d..bc83b5c32307 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -1,3 +1,61 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 by Contributors + * \file pool.h + * \brief Function definitions of pooling 1/2/3-D images. + * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. 
+ * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp + * \author Jun Wu + */ + #ifndef MXNET_OPERATOR_NN_POOL_H_ #define MXNET_OPERATOR_NN_POOL_H_ diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index e4722acaf3e8..b706c5d9a252 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -79,6 +79,10 @@ DMLC_REGISTER_PARAMETER(PoolingParam); MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) .describe(R"code(Perform pooling on the input. +The shapes for 1-D pooling are +- **data**: *(batch_size, channel, width)*, +- **out**: *(batch_size, num_filter, out_width)*. + The shapes for 2-D pooling is - **data**: *(batch_size, channel, height, width)* @@ -106,7 +110,7 @@ Three pooling options are supported by ``pool_type``: - **max**: max pooling - **sum**: sum pooling -1-D pooling is special case of 2-D pooling with *weight=1* and +1-D pooling is special case of 2-D pooling with *width=1* and *kernel[1]=1*. For 3-D pooling, an additional *depth* dimension is added before From cefe72d3105e6e76d4220e2a878bebfb6e29ecc0 Mon Sep 17 00:00:00 2001 From: reminisce Date: Mon, 20 Mar 2017 13:44:19 -0700 Subject: [PATCH 12/16] Added comments for pool and unpool functions --- src/operator/nn/pool.cuh | 102 ++++++++++++++++++++++++++++++++++----- src/operator/nn/pool.h | 100 +++++++++++++++++++++++++++++++++----- 2 files changed, 177 insertions(+), 25 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 7859ea776626..22d0973a2dd8 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -2,33 +2,33 @@ ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT - * + * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. - * + * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. 
- * + * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. - * + * * LICENSE - * + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * + * modification, are permitted provided that the following conditions are met: + * * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * + * and/or other materials provided with the distribution. + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -39,9 +39,9 @@ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * * CONTRIBUTION AGREEMENT - * + * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. 
@@ -49,7 +49,7 @@ ***************** END Caffe Copyright Notice and Disclaimer ******************** * * Copyright (c) 2017 by Contributors - * \file pool.h + * \file pool.cuh * \brief Function definitions of pooling 1/2/3-D images. * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu @@ -67,6 +67,10 @@ namespace mxnet { namespace op { +/*! + * \brief max pooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, @@ -95,6 +99,10 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief max pooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -130,6 +138,10 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief max pooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int depth, const int height, const int width, @@ -173,6 +185,10 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief avg/sum pooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, @@ -197,6 +213,10 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! 
+ * \brief avg/sum pooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, @@ -231,6 +251,10 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief avg/sum pooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int depth, const int height, const int width, @@ -273,6 +297,10 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } } +/*! + * \brief max unpooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -312,6 +340,10 @@ __global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief max unpooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -363,6 +395,10 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief max unpooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, @@ -423,6 +459,10 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief avg/sum unpooling gpu kernel for 1-D images. + * Do not call this kernel directly. 
Use the interface unpool(). + */ template __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int width, @@ -454,6 +494,10 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief avg/sum unpooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int height, const int width, @@ -494,6 +538,10 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief avg/sum unpooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad, const int channels, const int depth, const int height, @@ -542,6 +590,19 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr } } +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is gpu + * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW + */ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -636,6 +697,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } } +/*! 
+ * \brief This function serves as an interface for 1/2/3-D unpooling operations. + * \param s context stream defining the device in use is gpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + */ template inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index bc83b5c32307..c32d12a48236 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -2,33 +2,33 @@ ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** * * COPYRIGHT - * + * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. - * + * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. - * + * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. 
- * + * * LICENSE - * + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * + * modification, are permitted provided that the following conditions are met: + * * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * + * and/or other materials provided with the distribution. + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -39,9 +39,9 @@ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * + * * CONTRIBUTION AGREEMENT - * + * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. @@ -73,6 +73,10 @@ enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; enum PoolingOpPadConventionType {kValid, kFull}; } // namespace pool_enum +/*! + * \brief max pooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -105,6 +109,10 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! 
+ * \brief max pooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -146,6 +154,10 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief max pooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -194,6 +206,10 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief avg/sum pooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -225,6 +241,10 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief avg/sum pooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -264,6 +284,10 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief avg/sum pooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface pool(). 
+ */ template inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, @@ -312,6 +336,10 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } +/*! + * \brief max unpooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -352,6 +380,10 @@ inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, } } +/*! + * \brief max unpooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -404,6 +436,10 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, } } +/*! + * \brief max unpooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, @@ -464,6 +500,10 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, } } +/*! + * \brief avg/sum unpooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -497,6 +537,10 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, } } +/*! + * \brief avg/sum unpooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). 
+ */ template inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -539,6 +583,10 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, } } +/*! + * \brief avg/sum unpooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ template inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, const TShape& oshape, const TShape& kernel, @@ -589,6 +637,19 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, } } +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is cpu + * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW + */ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, @@ -629,6 +690,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } } +/*! + * \brief This function serves as an interface for 1/2/3-D unpooling operations. 
+ * \param s context stream defining the device in use is cpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + */ template inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, From 7e041a1af9cebcea327e498492a2082b422288a4 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 12:55:31 -0700 Subject: [PATCH 13/16] Fixed lint --- src/operator/nn/pool.cuh | 46 +++++++++++++++---------------- src/operator/nn/pool.h | 51 ++++++++++++++++++----------------- src/operator/pooling_v1-inl.h | 6 +++-- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 22d0973a2dd8..54fd3461d80f 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -76,7 +76,7 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, const int stride_w, const int pad_w, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCW CUDA_KERNEL_LOOP(index, nthreads) { @@ -95,7 +95,7 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, max_val = in_val; } } - KERNEL_ASSIGN(out_data[index], req_type, max_val); + 
out_data[index] = max_val; } } @@ -109,7 +109,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCHW CUDA_KERNEL_LOOP(index, nthreads) { @@ -134,7 +134,7 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } } } - KERNEL_ASSIGN(out_data[index], req_type, max_val); + out_data[index] = max_val; } } @@ -149,7 +149,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int pooled_width, const int kernel_d, const int kernel_h, const int kernel_w, const int stride_d, const int stride_h, const int stride_w, const int pad_d, - const int pad_h, const int pad_w, OpReqType req_type, + const int pad_h, const int pad_w, DType* out_data) { using mshadow::red::limits::MinValue; // index is the output image's pixel index in NCDHW @@ -181,7 +181,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } } } - KERNEL_ASSIGN(out_data[index], req_type, max_val); + out_data[index] = max_val; } } @@ -192,7 +192,7 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, template __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, - const int stride_w, const int pad_w, OpReqType req_type, + const int stride_w, const int pad_w, DType* out_data, bool getAvg = false) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -209,7 +209,7 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, for (int w = wstart; w < wend; ++w) { sum += out_slice[w]; } - 
KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + out_data[index] = sum / pool_size; } } @@ -223,7 +223,7 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, - const int pad_h, const int pad_w, OpReqType req_type, + const int pad_h, const int pad_w, DType* out_data, bool getAvg = false) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; @@ -247,7 +247,7 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, sum += out_slice[h * width + w]; } } - KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + out_data[index] = sum / pool_size; } } @@ -263,7 +263,7 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int kernel_h, const int kernel_w, const int stride_d, const int stride_h, const int stride_w, const int pad_d, const int pad_h, const int pad_w, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { CUDA_KERNEL_LOOP(index, nthreads) { const int pw = index % pooled_width; const int ph = (index / pooled_width) % pooled_height; @@ -293,7 +293,7 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } } } - KERNEL_ASSIGN(out_data[index], req_type, sum / pool_size); + out_data[index] = sum / pool_size; } } @@ -600,7 +600,7 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr * \param pad pad shape * \param stride stride shape * \param pool_type supported pooling type: max, avg, sum - * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param req_type operator request type, only support kWriteTo for now * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW */ template @@ -608,6 +608,7 @@ inline void 
pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { + CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; using namespace mxnet_op; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { @@ -615,21 +616,21 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is pool_max_1d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], - oshape[2], kernel[0], stride[0], pad[0], req_type, out_data); + oshape[2], kernel[0], stride[0], pad[0], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], - kernel[0], stride[0], pad[0], req_type, out_data, true); + kernel[0], stride[0], pad[0], out_data, true); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], - kernel[0], stride[0], pad[0], req_type, out_data); + kernel[0], stride[0], pad[0], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; @@ -641,7 +642,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data); + stride[0], stride[1], pad[0], pad[1], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // 
NOLINT_NEXT_LINE(whitespace/operators) @@ -649,7 +650,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data, true); + stride[0], stride[1], pad[0], pad[1], out_data, true); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -657,7 +658,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is 0, mshadow::Stream::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], - stride[0], stride[1], pad[0], pad[1], req_type, out_data); + stride[0], stride[1], pad[0], pad[1], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; @@ -670,8 +671,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], - stride[1], stride[2], pad[0], pad[1], pad[2], req_type, - out_data); + stride[1], stride[2], pad[0], pad[1], pad[2], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) @@ -680,7 +680,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], - pad[0], pad[1], pad[2], req_type, out_data, true); + pad[0], pad[1], pad[2], out_data, true); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // 
NOLINT_NEXT_LINE(whitespace/operators) @@ -689,7 +689,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], kernel[2], stride[0], stride[1], stride[2], - pad[0], pad[1], pad[2], req_type, out_data); + pad[0], pad[1], pad[2], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index c32d12a48236..79accb5d521f 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -61,6 +61,7 @@ #include #include +#include #include "../mxnet_op.h" namespace mxnet { @@ -80,7 +81,7 @@ enum PoolingOpPadConventionType {kValid, kFull}; template inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; const int width = ishape[2]; const int pooled_width = oshape[2]; @@ -101,7 +102,7 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS max_val = in_data[w]; } } - KERNEL_ASSIGN(out_data[pw], req_type, max_val); + out_data[pw] = max_val; } in_data += in_data_offset; out_data += out_data_offset; @@ -116,7 +117,7 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; @@ -139,13 +140,13 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS DType 
max_val = MinValue(); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index= h * width + w; + const int in_index = h * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; } } } - KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + out_data[pool_index] = max_val; } } in_data += in_data_offset; @@ -161,7 +162,7 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data) { + DType* out_data) { using mshadow::red::limits::MinValue; const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; @@ -189,14 +190,14 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const int in_index= (d * height + h) * width + w; + const int in_index = (d * height + h) * width + w; if (in_data[in_index] > max_val) { max_val = in_data[in_index]; } } } } - KERNEL_ASSIGN(out_data[pool_index], req_type, max_val); + out_data[pool_index] = max_val; } } } @@ -213,7 +214,7 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { const int width = ishape[2]; const int pooled_width = oshape[2]; const int kernel_w = kernel[0]; @@ -233,7 +234,7 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS for (int w = wstart; w < wend; ++w) { sum += 
in_data[w]; } - KERNEL_ASSIGN(out_data[pw], req_type, getAvg? sum/pool_size : sum); + out_data[pw] = (getAvg? sum/pool_size : sum); } in_data += in_data_offset; out_data += out_data_offset; @@ -248,7 +249,7 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -275,7 +276,7 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS sum += in_data[h*width+w]; } } - KERNEL_ASSIGN(out_data[ph*pooled_width+pw], req_type, getAvg? sum/pool_size : sum); + out_data[ph*pooled_width+pw] = (getAvg? sum/pool_size : sum); } } in_data += in_data_offset; @@ -291,7 +292,7 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS template inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - OpReqType req_type, DType* out_data, bool getAvg = false) { + DType* out_data, bool getAvg = false) { const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -325,8 +326,7 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } } } - KERNEL_ASSIGN(out_data[(pd*pooled_height+ph)*pooled_width+pw], - req_type, getAvg? sum/pool_size : sum); + out_data[(pd*pooled_height+ph)*pooled_width+pw] = (getAvg? 
sum/pool_size : sum); } } } @@ -647,7 +647,7 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, * \param pad pad shape * \param stride stride shape * \param pool_type supported pooling type: max, avg, sum - * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param req_type operator request type, only support kWriteTo for now * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW */ template @@ -655,33 +655,34 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* out_data) { + CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, 
out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } } else if (kernel.ndim() == 3) { if (pool_enum::kMaxPooling == pool_type) { - pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data, true); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, req_type, out_data); + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index c6bc01e52f5a..7fef1258093f 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -86,7 +86,8 @@ class PoolingV1Op : public Operator { Tensor data = in_data[pool_v1_enum::kData].get(s); Tensor out = out_data[pool_v1_enum::kOut].get(s); mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); - if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + if (param_.pool_type == pool_v1_enum::kMaxPooling + || param_.pool_type == pool_v1_enum::kSumPooling) { Assign(out, req[pool_v1_enum::kOut], pool(pad(data, param_.pad[0], param_.pad[1]), @@ -136,7 +137,8 @@ class PoolingV1Op : public Operator { mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - if (param_.pool_type == pool_v1_enum::kMaxPooling || param_.pool_type == pool_v1_enum::kSumPooling) { + if (param_.pool_type == 
pool_v1_enum::kMaxPooling + || param_.pool_type == pool_v1_enum::kSumPooling) { Assign(input_grad, req[pool_v1_enum::kData], crop(unpool(pad(data, param_.pad[0], param_.pad[1]), pad(output_data, 0, 0), From 32a888be781ebfa175481a76ff0bf3d55bc6d956 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 17:49:12 -0700 Subject: [PATCH 14/16] Fix MKL test --- src/operator/pooling.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index b706c5d9a252..c0fabe11e41c 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -21,8 +21,9 @@ template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; #if MXNET_USE_MKL2017 == 1 - if ((param.pool_type == pool_enum::kMaxPooling) - || (param.pool_type == pool_enum::kAvgPooling)) { + if (param.kernel.ndim() != 1 + && ((param.pool_type == pool_enum::kMaxPooling) + || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) { case mshadow::kFloat32: return new MKLPoolingOp(param); From e311deebb8df5aeb7fc184cbc343243994d5c147 Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 20:21:13 -0700 Subject: [PATCH 15/16] Fix mkl --- src/operator/pooling.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index c0fabe11e41c..870dbf13d08a 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -21,7 +21,7 @@ template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; #if MXNET_USE_MKL2017 == 1 - if (param.kernel.ndim() != 1 + if (param.kernel.ndim() == 2 && ((param.pool_type == pool_enum::kMaxPooling) || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) { From c65cbdf3b1e6337452a81bf959e4797eb59b316e Mon Sep 17 00:00:00 2001 From: reminisce Date: Tue, 21 Mar 2017 21:07:55 -0700 Subject: [PATCH 16/16] Disable MKL for pad and stride > 0 --- src/operator/pooling.cc | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index 870dbf13d08a..85f15668f19a 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -20,8 +20,13 @@ namespace op { template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; + // TODO(junwu): Since MKL has a bug when pad and stride > 0, + // we disable MKL in those cases and will re-enable it after + // it is fixed by deleting lines 28 and 29. #if MXNET_USE_MKL2017 == 1 if (param.kernel.ndim() == 2 + && 0 == param.pad[0] && 0 == param.pad[1] + && 0 == param.stride[0] && 0 == param.stride[1] && ((param.pool_type == pool_enum::kMaxPooling) || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) {