diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index 6d8b45fc1188..9b5dcfe3d3b1 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -23,6 +23,23 @@ using std::isnan; #endif +#ifdef __CUDACC__ +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + + +/*! + * \brief Get the number of blocks for cuda kernel given N + */ +inline int cuda_get_num_blocks(const int N) { + using namespace mshadow::cuda; + return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); +} +#endif // __CUDACC__ + + /*! \brief operator request type switch */ #define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \ switch (req) { \ @@ -139,6 +156,22 @@ MSHADOW_XINLINE Shape calc_stride(const Shape& shape) { } +struct fill { + template + MSHADOW_XINLINE static void Map(int i, DType* out, const DType val) { + out[i] = val; + } +}; + + +struct set_zero { + template + MSHADOW_XINLINE static void Map(int i, DType* out) { + out[i] = static_cast(0); + } +}; + + template struct Kernel; diff --git a/src/operator/nn/im2col.cuh b/src/operator/nn/im2col.cuh index 8555319f5f68..786fd22f8c9b 100644 --- a/src/operator/nn/im2col.cuh +++ b/src/operator/nn/im2col.cuh @@ -71,20 +71,6 @@ namespace mxnet { namespace op { -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -/*! - * \brief Get the number of blocks for cuda kernel given N - */ -inline int cuda_get_num_blocks(const int N) { - using namespace mshadow::cuda; - return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); -} - /*! * \brief im2col gpu kernel. * DO NOT call this directly. Use wrapper function im2col() instead; @@ -141,6 +127,7 @@ inline void im2col_gpu(mshadow::Stream* s, int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height_col * width_col; + using namespace mxnet_op; // NOLINT_NEXT_LINE(whitespace/operators) im2col_gpu_kernel<<::GetStream(s)>>>( @@ -303,6 +290,7 @@ inline void im2col(mshadow::Stream* s, index_t num_spatial_axes = kernel_shape.ndim(); CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim()); + using namespace mxnet_op; switch (num_spatial_axes) { case 1: im2col_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) @@ -347,6 +335,7 @@ inline void col2im_gpu(mshadow::Stream* s, const DType* data_col, const int int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; int num_kernels = channels * height * width; + using namespace mxnet_op; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. // NOLINT_NEXT_LINE(whitespace/operators) @@ -487,6 +476,7 @@ inline void col2im(mshadow::Stream* s, index_t im_size = im_shape.ProdShape(1, im_shape.ndim()); // num_axes should be smaller than block size CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; switch (num_spatial_axes) { case 1: col2im_nd_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh new file mode 100644 index 000000000000..54fd3461d80f --- /dev/null +++ b/src/operator/nn/pool.cuh @@ -0,0 +1,826 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 by Contributors + * \file pool.cuh + * \brief Function definitions of pooling 1/2/3-D images. + * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. + * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu + * \author Jun Wu + */ + +#ifndef MXNET_OPERATOR_NN_POOL_CUH_ +#define MXNET_OPERATOR_NN_POOL_CUH_ + +#include +#include +#include "../mxnet_op.h" +#include "../../common/cuda_utils.h" + +namespace mxnet { +namespace op { + +/*! + * \brief max pooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +__global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + const int wend = min(wstart + kernel_w, width); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * channels + c) * width; + DType max_val = MinValue(); + for (int w = wstart; w < wend; ++w) { + const DType in_val = in_slice[w]; + if (in_val > max_val) { + max_val = in_val; + } + } + out_data[index] = max_val; + } +} + +/*! + * \brief max pooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCHW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * channels + c) * height * width; + DType max_val = MinValue(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const DType in_val = in_slice[h * width + w]; + if (in_val > max_val) { + max_val = in_val; + } + } + } + out_data[index] = max_val; + } +} + +/*! + * \brief max pooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +__global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int depth, const int height, const int width, + const int pooled_depth, const int pooled_height, + const int pooled_width, const int kernel_d, + const int kernel_h, const int kernel_w, const int stride_d, + const int stride_h, const int stride_w, const int pad_d, + const int pad_h, const int pad_w, + DType* out_data) { + using mshadow::red::limits::MinValue; + // index is the output image's pixel index in NCDHW + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int dend = min(dstart + kernel_d, depth); + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const DType* in_slice = + in_data + (n * channels + c) * depth * height * width; + DType max_val = MinValue(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const DType in_val = in_slice[(d * height + h) * width + w]; + if (in_val > max_val) { + max_val = in_val; + } + } + } + } + out_data[index] = max_val; + } +} + +/*! + * \brief avg/sum pooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +__global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int width, const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (wend - wstart) : 1); + wstart = max(wstart, 0); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * width; + for (int w = wstart; w < wend; ++w) { + sum += out_slice[w]; + } + out_data[index] = sum / pool_size; + } +} + +/*! + * \brief avg/sum pooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +__global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, height); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += out_slice[h * width + w]; + } + } + out_data[index] = sum / pool_size; + } +} + +/*! + * \brief avg/sum pooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +__global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, + const int depth, const int height, const int width, + const int pooled_depth, const int pooled_height, + const int pooled_width, const int kernel_d, + const int kernel_h, const int kernel_w, + const int stride_d, const int stride_h, const int stride_w, + const int pad_d, const int pad_h, const int pad_w, + DType* out_data, bool getAvg = false) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = min(dstart + kernel_d, depth + pad_d); + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, depth); + hend = min(hend, height); + wend = min(wend, width); + DType sum = 0; + const DType* out_slice = + in_data + (n * channels + c) * depth * height * width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += out_slice[(d * height + h) * width + w]; + } + } + } + out_data[index] = sum / pool_size; + } +} + +/*! + * \brief max unpooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +__global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int c = (index / pooled_width) % channels; + const int n = index / pooled_width / channels; + int wstart = pw * stride_w - pad_w; + const int wend = min(wstart + kernel_w, width); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + for (int w = wstart; w < wend; ++w) { + if (in_data_slice[w] == max_val) { + max_idx = w; + break; + } + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + +/*! + * \brief max unpooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * height * width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + bool found = false; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = h * width + w; + if (in_data_slice[idx] == max_val) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + +/*! + * \brief max unpooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +__global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad, + const DType* in_data, const DType* out_data, + const int channels, const int depth, const int height, + const int width, const int pooled_depth, + const int pooled_height, const int pooled_width, + const int kernel_d, const int kernel_h, + const int kernel_w, const int stride_d, + const int stride_h, const int stride_w, const int pad_d, + const int pad_h, const int pad_w, + DType* in_grad) { + // index is the output image's pixel index in NCDHW + // the order has to be consistent with pooling max + // to avoid adding out_grad to the wrong in_grad + // in the case where there are multiple max pixels + // covered by a kernel window + CUDA_KERNEL_LOOP(index, nthreads) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int pd = (index / pooled_width / pooled_height) % pooled_depth; + const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const int n = index / pooled_width / pooled_height / pooled_depth / channels; + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int dend = min(dstart + kernel_d, depth); + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + // in data/grad offset batch and channel dims + int in_offset = (n * channels + c) * depth * height * width; + const DType* in_data_slice = in_data + in_offset; + int max_idx = -1; + DType max_val = out_data[index]; + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = (d * height + h) * width + w; + if (in_data_slice[idx] == max_val) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + } + } +} + +/*! + * \brief avg/sum unpooling gpu kernel for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +__global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int width, + const int pooled_width, const int kernel_w, + const int stride_w, const int pad_w, + DType* in_grad, bool isAvg = false) { + // index is the input image index in NCW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int c = (index / width) % channels; + const int n = index / width / channels; + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_width; + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int wstart = pw * stride_w - pad_w; + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (wend - wstart) : 1); + gradient += out_grad_slice[pw] / pool_size; + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + +/*! + * \brief avg/sum unpooling gpu kernel for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +__global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + DType* in_grad, bool isAvg = false) { + // index is the input image index in NCHW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int c = (index / width / height) % channels; + const int n = index / width / height / channels; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_height * pooled_width; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1); + gradient += out_grad_slice[ph * pooled_width + pw] / pool_size; + } + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + +/*! + * \brief avg/sum unpooling gpu kernel for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +__global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad, + const int channels, const int depth, const int height, + const int width, const int pooled_depth, + const int pooled_height, const int pooled_width, + const int kernel_d, const int kernel_h, + const int kernel_w, const int stride_d, const int stride_h, + const int stride_w, const int pad_d, const int pad_h, + const int pad_w, DType* in_grad, bool isAvg = false) { + // index is the input image index in NCDHW + CUDA_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int w = index % width + pad_w; + const int h = (index / width) % height + pad_h; + const int d = (index / width / height) % depth + pad_d; + const int c = (index / width / height / depth) % channels; + const int n = index / width / height / depth / channels; + const int pdstart = (d < kernel_d) ? 0 : (d - kernel_d) / stride_d + 1; + const int pdend = min(d / stride_d + 1, pooled_depth); + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_width); + DType gradient = 0; + const DType* out_grad_slice = + out_grad + (n * channels + c) * pooled_depth * pooled_height * pooled_width; + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = min(dstart + kernel_d, depth + pad_d); + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + gradient += out_grad_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size; + } + } + } + // if req=kWriteTo, in_grad has already been assigned zero values in unpool() + // use "+=" here instead of "=" to accommodate when req=kAddTo + in_grad[index] += gradient; + } +} + +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is gpu + * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type, only support kWriteTo for now + * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW + */ +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data) { + CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; + using namespace mxnet_op; + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], + oshape[2], kernel[0], stride[0], pad[0], out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], + kernel[0], stride[0], pad[0], out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], + kernel[0], stride[0], pad[0], out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_max_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + ishape[4], oshape[2], oshape[3], oshape[4], + kernel[0], kernel[1], kernel[2], stride[0], + stride[1], stride[2], pad[0], pad[1], pad[2], out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], + kernel[1], kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], out_data, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + pool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], + ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], + kernel[1], kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], out_data); + MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } +} + +/*! + * \brief This function serves as an interface for 1/2/3-D unpooling operations. + * \param s context stream defining the device in use is gpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + */ +template +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad) { + if (mxnet::kNullOp == req_type) return; + if (mxnet::kAddTo != req_type) { + mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); + } + using namespace mxnet_op; + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_1d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], + in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_1d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], oshape[2], kernel[0], + stride[0], pad[0], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_1d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], oshape[2], kernel[0], + stride[0], pad[0], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_2d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_2d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], + oshape[2], oshape[3], kernel[0], kernel[1], + stride[0], stride[1], pad[0], pad[1], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_max_3d_gpu_kernel<<::GetStream(s)>>>( + oshape.Size(), out_grad, in_data, out_data, + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], + pad[0], pad[1], pad[2], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_3d_gpu_kernel); + } else if (pool_enum::kAvgPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], + pad[2], in_grad, true); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); + } else if (pool_enum::kSumPooling == pool_type) { + // NOLINT_NEXT_LINE(whitespace/operators) + unpool_sum_3d_gpu_kernel<<::GetStream(s)>>>( + ishape.Size(), out_grad, + ishape[1], ishape[2], ishape[3], ishape[4], + oshape[2], oshape[3], oshape[4], kernel[0], kernel[1], + kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1], + pad[2], in_grad); + MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_POOL_CUH_ diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h new file mode 100644 index 000000000000..79accb5d521f --- /dev/null +++ b/src/operator/nn/pool.h @@ -0,0 +1,759 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 by Contributors + * \file pool.h + * \brief Function definitions of pooling 1/2/3-D images. + * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases. + * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp + * \author Jun Wu + */ + +#ifndef MXNET_OPERATOR_NN_POOL_H_ +#define MXNET_OPERATOR_NN_POOL_H_ + +#include +#include +#include +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +namespace pool_enum { +enum PoolingOpInputs {kData}; +enum PoolingOpOutputs {kOut, kMask}; +enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; +enum PoolingOpPadConventionType {kValid, kFull}; +} // namespace pool_enum + +/*! + * \brief max pooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data) { + using mshadow::red::limits::MinValue; + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_data_offset = ishape[2]; + const index_t out_data_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + DType max_val = MinValue(); + for (int w = wstart; w < wend; ++w) { + if (in_data[w] > max_val) { + max_val = in_data[w]; + } + } + out_data[pw] = max_val; + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + +/*! + * \brief max pooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data) { + using mshadow::red::limits::MinValue; + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + DType max_val = MinValue(); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = h * width + w; + if (in_data[in_index] > max_val) { + max_val = in_data[in_index]; + } + } + } + out_data[pool_index] = max_val; + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + +/*! + * \brief max pooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data) { + using mshadow::red::limits::MinValue; + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + DType max_val = MinValue(); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = (d * height + h) * width + w; + if (in_data[in_index] > max_val) { + max_val = in_data[in_index]; + } + } + } + } + out_data[pool_index] = max_val; + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + +/*! + * \brief avg/sum pooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data, bool getAvg = false) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_data_offset = ishape[2]; + const index_t out_data_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (wend - wstart); + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + DType sum = 0; + for (int w = wstart; w < wend; ++w) { + sum += in_data[w]; + } + out_data[pw] = (getAvg? sum/pool_size : sum); + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + +/*! + * \brief avg/sum pooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data, bool getAvg = false) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_data_offset = ishape[2] * ishape[3]; + const index_t out_data_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + DType sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += in_data[h*width+w]; + } + } + out_data[ph*pooled_width+pw] = (getAvg? sum/pool_size : sum); + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + +/*! + * \brief avg/sum pooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data, bool getAvg = false) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + DType sum = 0; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + sum += in_data[(d*height+h)*width+w]; + } + } + } + out_data[(pd*pooled_height+ph)*pooled_width+pw] = (getAvg? sum/pool_size : sum); + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } + } +} + +/*! + * \brief max unpooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_offset = ishape[2]; + const index_t out_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + int max_idx = -1; + for (int w = wstart; w < wend; ++w) { + if (in_data[w] == out_data[pw]) { + max_idx = w; + break; + } + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pw]; + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + +/*! + * \brief max unpooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_offset = ishape[2] * ishape[3]; + const index_t out_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + int max_idx = -1; + bool found = false; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = h * width + w; + if (in_data[idx] == out_data[pool_index]) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pool_index]; + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + +/*! + * \brief max unpooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + int max_idx = -1; + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = (d * height + h) * width + w; + if (in_data[idx] == out_data[pool_index]) { + max_idx = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + if (max_idx >= 0) { + in_grad[max_idx] += out_grad[pool_index]; + } + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } + } +} + +/*! + * \brief avg/sum unpooling cpu function for 1-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int width = ishape[2]; + const int pooled_width = oshape[2]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const index_t in_grad_offset = ishape[2]; + const index_t out_grad_offset = oshape[2]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = wend - wstart; + } + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + for (int w = wstart; w < wend; ++w) { + in_grad[w] += out_grad[pw] / pool_size; + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + +/*! + * \brief avg/sum unpooling cpu function for 2-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int height = ishape[2], width = ishape[3]; + const int pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_grad_offset = ishape[2] * ishape[3]; + const index_t out_grad_offset = oshape[2] * oshape[3]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = (hend - hstart) * (wend - wstart); + } + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int pool_index = ph * pooled_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_grad[h*width+w] += out_grad[pool_index] / pool_size; + } + } + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + +/*! + * \brief avg/sum unpooling cpu function for 3-D images. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad, bool isAvg = false) { + const int depth = ishape[2], height = ishape[3], width = ishape[4]; + const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const index_t in_grad_offset = ishape[2] * ishape[3] * ishape[4]; + const index_t out_grad_offset = oshape[2] * oshape[3] * oshape[4]; + for (index_t n = 0; n < oshape[0]; ++n) { + for (index_t c = 0; c < oshape[1]; ++c) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = 1; + if (isAvg) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + in_grad[(d*height+h)*width+w] += out_grad[pool_index] / pool_size; + } + } + } + } + } + } + in_grad += in_grad_offset; + out_grad += out_grad_offset; + } + } +} + +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is cpu + * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type, only support kWriteTo for now + * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW + */ +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data) { + CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; + } +} + +/*! + * \brief This function serves as an interface for 1/2/3-D unpooling operations. + * \param s context stream defining the device in use is cpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + */ +template +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad) { + if (mxnet::kNullOp == req_type) return; + if (mxnet::kAddTo != req_type) { + mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); + } + if (kernel.ndim() == 1) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 2) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (kernel.ndim() == 3) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; + } +} + +} // namespace op +} // namespace mxnet +#ifdef __CUDACC__ +#include "./pool.cuh" +#endif + +#endif // MXNET_OPERATOR_NN_POOL_H_ diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 9ea0155b21be..8156c3796539 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -1,8 +1,8 @@ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling-inl.h * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #ifndef MXNET_OPERATOR_POOLING_INL_H_ @@ -17,17 +17,11 @@ #include #include #include "./operator_common.h" +#include "./nn/pool.h" namespace mxnet { namespace op { -namespace pool_enum { -enum PoolingOpInputs {kData}; -enum PoolingOpOutputs {kOut}; -enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling}; -enum PoolingOpPadConventionType {kValid, kFull}; -} // namespace pool_enum - struct PoolingParam : public dmlc::Parameter { TShape kernel; TShape stride; @@ -35,10 +29,14 @@ struct PoolingParam : public dmlc::Parameter { int pool_type; int pooling_convention; bool global_pool; + bool cudnn_off; DMLC_DECLARE_PARAMETER(PoolingParam) { DMLC_DECLARE_FIELD(global_pool).set_default(false) .describe("Ignore kernel size, do global pooling based on current input feature map. "); + DMLC_DECLARE_FIELD(cudnn_off).set_default(false) + .describe("Turn off cudnn pooling and use MXNet pooling operator. "); + DMLC_DECLARE_FIELD(kernel) .enforce_nonzero() .describe("pooling kernel size: (y, x) or (d, y, x)"); @@ -63,107 +61,66 @@ struct PoolingParam : public dmlc::Parameter { } }; -template +template class PoolingOp : public Operator { public: explicit PoolingOp(PoolingParam p) { this->param_ = p; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { using namespace mshadow; - using namespace mshadow::expr; CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); - if (param_.kernel.ndim() == 3) { - LOG(FATAL) << "3D kernel not implemented"; - } - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); - if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { - Assign(out, - req[pool_enum::kOut], - pool(pad(data, param_.pad[0], param_.pad[1]), - out_shape, - param_.global_pool ? data.shape_[2] : param_.kernel[0], - param_.global_pool ? data.shape_[3] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - } else if (param_.pool_type == pool_enum::kAvgPooling) { - Assign(out, - req[pool_enum::kOut], - scalar(1.0f / (param_.global_pool ? - data.shape_[2] * data.shape_[3] : - param_.kernel[0] * param_.kernel[1])) * \ - pool(pad(data, param_.pad[0], param_.pad[1]), - out_shape, - param_.global_pool ? data.shape_[2] : param_.kernel[0], - param_.global_pool ? data.shape_[3] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - } + const TShape& ishape = in_data[pool_enum::kData].shape_; + + pool(s, in_data[pool_enum::kData].dptr(), + in_data[pool_enum::kData].shape_, + out_data[pool_enum::kOut].shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kOut], + out_data[pool_enum::kOut].dptr()); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { using namespace mshadow; - using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); CHECK_EQ(in_data.size(), 1U); CHECK_EQ(out_data.size(), 1U); CHECK_EQ(req.size(), 1U); CHECK_EQ(in_grad.size(), 1U); - // TODO(bing): remove pad (0,0) - if (param_.kernel.ndim() == 3) { - LOG(FATAL) << "3D kernel not implemented"; - } Stream *s = ctx.get_stream(); - Tensor grad = out_grad[pool_enum::kOut].get(s); - Tensor data = in_data[pool_enum::kData].get(s); - Tensor output_data = out_data[pool_enum::kOut].get(s); - Tensor input_grad = in_grad[pool_enum::kData].get(s); - - mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); - - if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) { - Assign(input_grad, req[pool_enum::kData], - crop(unpool(pad(data, param_.pad[0], param_.pad[1]), - pad(output_data, 0, 0), - pad(grad, 0, 0), - param_.global_pool ? in_shape[0] : param_.kernel[0], - param_.global_pool ? in_shape[1] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - in_shape, - param_.pad[0], - param_.pad[1])); - } else if (param_.pool_type == pool_enum::kAvgPooling) { - Assign(input_grad, req[pool_enum::kData], - scalar(1.0f / (param_.global_pool ? - data.shape_[2] * data.shape_[3] : - param_.kernel[0] * param_.kernel[1])) * \ - crop(unpool(pad(data, param_.pad[0], param_.pad[1]), - pad(output_data, 0, 0), - pad(grad, 0, 0), - param_.global_pool ? in_shape[0] : param_.kernel[0], - param_.global_pool ? in_shape[1] : param_.kernel[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - in_shape, - param_.pad[0], - param_.pad[1])); - } + const TShape& ishape = in_data[pool_enum::kData].shape_; + + unpool(s, out_grad[pool_enum::kOut].dptr(), + in_data[pool_enum::kData].dptr(), + out_data[pool_enum::kOut].dptr(), + in_grad[pool_enum::kData].shape_, + out_grad[pool_enum::kOut].shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kData], + in_grad[pool_enum::kData].dptr()); } private: @@ -180,7 +137,10 @@ class PoolingProp : public OperatorProperty { void Init(const std::vector >& kwargs) override { using namespace mshadow; param_.Init(kwargs); - if (param_.kernel.ndim() == 2) { + if (param_.kernel.ndim() == 1) { + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); } else { @@ -203,11 +163,31 @@ class PoolingProp : public OperatorProperty { std::vector *aux_shape) const override { CHECK_EQ(in_shape->size(), 1U); const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x) " - << "Or 5D in (batch, channel, d, y, x)"; + CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; TShape oshape = dshape; if (dshape.ndim() == 0) return false; - if (param_.kernel.ndim() == 2) { + if (param_.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param_.global_pool) { + oshape[2] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; if (param_.global_pool) { oshape[2] = 1; @@ -234,7 +214,7 @@ class PoolingProp : public OperatorProperty { } } out_shape->clear(); - out_shape->push_back(oshape); + out_shape->push_back(oshape); // save output shape } else if (param_.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; @@ -267,7 +247,7 @@ class PoolingProp : public OperatorProperty { } out_shape->clear(); - out_shape->push_back(oshape); + out_shape->push_back(oshape); // save output shape } return true; } @@ -302,7 +282,8 @@ class PoolingProp : public OperatorProperty { const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]}; + return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], + out_data[pool_enum::kOut]}; } std::vector > BackwardInplaceOption( diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc index a4eed2633232..85f15668f19a 100644 --- a/src/operator/pooling.cc +++ b/src/operator/pooling.cc @@ -1,8 +1,8 @@ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling.cc * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #include "./pooling-inl.h" #if MXNET_USE_MKL2017 == 1 @@ -20,9 +20,15 @@ namespace op { template<> Operator *CreateOp(PoolingParam param, int dtype) { Operator *op = NULL; + // TODO(junwu): Since MKL has a bug when pad and stride > 0, + // we disable MKL in those cases and will re-enable it after + // it is fixed by deleting lines 28 and 29. #if MXNET_USE_MKL2017 == 1 - if ((param.pool_type == pool_enum::kMaxPooling) - || (param.pool_type == pool_enum::kAvgPooling)) { + if (param.kernel.ndim() == 2 + && 0 == param.pad[0] && 0 == param.pad[1] + && 0 == param.stride[0] && 0 == param.stride[1] + && ((param.pool_type == pool_enum::kMaxPooling) + || (param.pool_type == pool_enum::kAvgPooling))) { switch (dtype) { case mshadow::kFloat32: return new MKLPoolingOp(param); @@ -51,21 +57,15 @@ Operator *CreateOp(PoolingParam param, int dtype) { } #endif MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new PoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new PoolingOp(param); - break; - case pool_enum::kSumPooling: - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); + } else { + LOG(FATAL) << "unknown pooling type"; + return NULL; } - }) + }); return op; } @@ -85,6 +85,10 @@ DMLC_REGISTER_PARAMETER(PoolingParam); MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) .describe(R"code(Perform pooling on the input. +The shapes for 1-D pooling are +- **data**: *(batch_size, channel, width)*, +- **out**: *(batch_size, num_filter, out_width)*. + The shapes for 2-D pooling is - **data**: *(batch_size, channel, height, width)* @@ -112,7 +116,7 @@ Three pooling options are supported by ``pool_type``: - **max**: max pooling - **sum**: sum pooling -1-D pooling is special case of 2-D pooling with *weight=1* and +1-D pooling is special case of 2-D pooling with *width=1* and *kernel[1]=1*. For 3-D pooling, an additional *depth* dimension is added before diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu index be2464e3c0ef..c420852b1c8d 100644 --- a/src/operator/pooling.cu +++ b/src/operator/pooling.cu @@ -1,8 +1,8 @@ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 by Contributors * \file pooling.cu * \brief - * \author Bing Xu + * \author Bing Xu, Jun Wu */ #include #include "./pooling-inl.h" @@ -12,45 +12,37 @@ namespace mxnet { namespace op { + template<> Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; #if MXNET_USE_CUDNN == 1 MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new CuDNNPoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new CuDNNPoolingOp(param); - break; - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MxNet sum pooling is applied."; - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (!param.cudnn_off) { + switch (param.pool_type) { + case pool_enum::kMaxPooling: + return new CuDNNPoolingOp(param); + case pool_enum::kAvgPooling: + return new CuDNNPoolingOp(param); + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + return new PoolingOp(param); + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } } }); -#else +#endif // MXNET_USE_CUDNN + Operator *op = NULL; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - op = new PoolingOp(param); - break; - case pool_enum::kAvgPooling: - op = new PoolingOp(param); - break; - case pool_enum::kSumPooling: - op = new PoolingOp(param); - break; - default: - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); + } else { + LOG(FATAL) << "unknown pooling type"; } }); -#endif // MXNET_USE_CUDNN return op; } diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h new file mode 100644 index 000000000000..7fef1258093f --- /dev/null +++ b/src/operator/pooling_v1-inl.h @@ -0,0 +1,337 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file pooling_v1-inl.h + * \brief + * \author Bing Xu +*/ + +#ifndef MXNET_OPERATOR_POOLING_V1_INL_H_ +#define MXNET_OPERATOR_POOLING_V1_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace pool_v1_enum { +enum PoolingV1OpInputs {kData}; +enum PoolingV1OpOutputs {kOut}; +enum PoolingV1OpType {kMaxPooling, kAvgPooling, kSumPooling}; +enum PoolingV1OpPadConventionType {kValid, kFull}; +} // namespace pool_v1_enum + +struct PoolingV1Param : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape pad; + int pool_type; + int pooling_convention; + bool global_pool; + DMLC_DECLARE_PARAMETER(PoolingV1Param) { + DMLC_DECLARE_FIELD(global_pool).set_default(false) + .describe("Ignore kernel size, do global pooling based on current input feature map. "); + + DMLC_DECLARE_FIELD(kernel) + .enforce_nonzero() + .describe("pooling kernel size: (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pool_type) + .add_enum("max", pool_v1_enum::kMaxPooling) + .add_enum("avg", pool_v1_enum::kAvgPooling) + .add_enum("sum", pool_v1_enum::kSumPooling) + .describe("Pooling type to be applied."); + + DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_v1_enum::kValid) + .add_enum("full", pool_v1_enum::kFull) + .add_enum("valid", pool_v1_enum::kValid) + .describe("Pooling convention to be applied."); + + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .enforce_nonzero() + .describe("stride: for pooling (y, x) or (d, y, x)"); + + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for pooling: (y, x) or (d, y, x)"); + } +}; + +template +class PoolingV1Op : public Operator { + public: + explicit PoolingV1Op(PoolingV1Param p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + if (param_.kernel.ndim() == 3) { + LOG(FATAL) << "3D kernel not implemented"; + } + Tensor data = in_data[pool_v1_enum::kData].get(s); + Tensor out = out_data[pool_v1_enum::kOut].get(s); + mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]); + if (param_.pool_type == pool_v1_enum::kMaxPooling + || param_.pool_type == pool_v1_enum::kSumPooling) { + Assign(out, + req[pool_v1_enum::kOut], + pool(pad(data, param_.pad[0], param_.pad[1]), + out_shape, + param_.global_pool ? data.shape_[2] : param_.kernel[0], + param_.global_pool ? data.shape_[3] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1])); + } else if (param_.pool_type == pool_v1_enum::kAvgPooling) { + Assign(out, + req[pool_v1_enum::kOut], + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ + pool(pad(data, param_.pad[0], param_.pad[1]), + out_shape, + param_.global_pool ? data.shape_[2] : param_.kernel[0], + param_.global_pool ? data.shape_[3] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1])); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(in_grad.size(), 1); + // TODO(bing): remove pad (0,0) + if (param_.kernel.ndim() == 3) { + LOG(FATAL) << "3D kernel not implemented"; + } + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[pool_v1_enum::kOut].get(s); + Tensor data = in_data[pool_v1_enum::kData].get(s); + Tensor output_data = out_data[pool_v1_enum::kOut].get(s); + Tensor input_grad = in_grad[pool_v1_enum::kData].get(s); + + mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]); + + if (param_.pool_type == pool_v1_enum::kMaxPooling + || param_.pool_type == pool_v1_enum::kSumPooling) { + Assign(input_grad, req[pool_v1_enum::kData], + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.global_pool ? in_shape[0] : param_.kernel[0], + param_.global_pool ? in_shape[1] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1]), + in_shape, + param_.pad[0], + param_.pad[1])); + } else if (param_.pool_type == pool_v1_enum::kAvgPooling) { + Assign(input_grad, req[pool_v1_enum::kData], + scalar(1.0f / (param_.global_pool ? + data.shape_[2] * data.shape_[3] : + param_.kernel[0] * param_.kernel[1])) * \ + crop(unpool(pad(data, param_.pad[0], param_.pad[1]), + pad(output_data, 0, 0), + pad(grad, 0, 0), + param_.global_pool ? in_shape[0] : param_.kernel[0], + param_.global_pool ? in_shape[1] : param_.kernel[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1]), + in_shape, + param_.pad[0], + param_.pad[1])); + } + } + + private: + PoolingV1Param param_; +}; // class PoolingV1Op + +template +Operator* CreateOp(PoolingV1Param param, int dtype); + + +#if DMLC_USE_CXX11 +class PoolingV1Prop : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3) << param_.kernel.ndim() << "D pooling not supported"; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) + << "pad and kernel should have the same length"; + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + CHECK_EQ(in_shape->size(), 1); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) " + << "Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) + << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] + << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + if (param_.pooling_convention == pool_v1_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); + } else if (param_.kernel.ndim() == 3) { + CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + oshape[4] = 1; + } else { + if (param_.pooling_convention == pool_v1_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / + param_.stride[2]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + oshape[4] = 1 + static_cast(ceil(static_cast( + dshape[4] + 2 * param_.pad[2] - + param_.kernel[2]) / param_.stride[2])); + } + } + + out_shape->clear(); + out_shape->push_back(oshape); + } + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = (*in_type)[0]; + + if (dtype == -1) { + LOG(FATAL) << "Input type to pooling is not specified."; + return false; + } + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PoolingV1Prop *prop_sym = new PoolingV1Prop(); + prop_sym->param_ = this->param_; + return prop_sym; + } + + std::string TypeString() const override { + return "Pooling_v1"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[pool_v1_enum::kOut], in_data[pool_v1_enum::kData], + out_data[pool_v1_enum::kOut]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { +#if MXNET_USE_CUDNN == 1 + return {}; +#else + return {{in_data[pool_v1_enum::kData], in_grad[pool_v1_enum::kData]}}; +#endif + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + PoolingV1Param param_; +}; // class PoolingV1Prop +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_POOLING_V1_INL_H_ diff --git a/src/operator/pooling_v1.cc b/src/operator/pooling_v1.cc new file mode 100644 index 000000000000..cdf4f1af5939 --- /dev/null +++ b/src/operator/pooling_v1.cc @@ -0,0 +1,89 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file pooling_v1.cc + * \brief + * \author Bing Xu +*/ +#include "./pooling_v1-inl.h" + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(PoolingV1Param param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_v1_enum::kMaxPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kAvgPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kSumPooling: + op = new PoolingV1Op(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }) + + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator* PoolingV1Prop::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(PoolingV1Param); + +MXNET_REGISTER_OP_PROPERTY(Pooling_v1, PoolingV1Prop) +.describe(R"code(Perform pooling on the input. + +The shapes for 2-D pooling is + +- **data**: *(batch_size, channel, height, width)* +- **out**: *(batch_size, num_filter, out_height, out_width)*, with:: + + out_height = f(height, kernel[0], pad[0], stride[0]) + out_width = f(width, kernel[1], pad[1], stride[1]) + +The defintion of *f* depends on ``pooling_convention``, which has two options: + +- **valid** (default):: + + f(x, k, p, s) = floor(x+2*p-k)/s+1 + +- **full**, which is compatible with Caffe:: + + f(x, k, p, s) = ceil(x+2*p-k)/s+1 + +But ``global_pool`` is set to be true, then do a global pooling, namely reset +``kernel=(height, width)``. + +Three pooling options are supported by ``pool_type``: + +- **avg**: average pooling +- **max**: max pooling +- **sum**: sum pooling + +1-D pooling is special case of 2-D pooling with *weight=1* and +*kernel[1]=1*. + +For 3-D pooling, an additional *depth* dimension is added before +*height*. Namely the input data will have shape *(batch_size, channel, depth, +height, width)*. + +)code" ADD_FILELINE) +.add_argument("data", "ndarray-or-symbol", "Input data to the pooling operator.") +.add_arguments(PoolingV1Param::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/pooling_v1.cu b/src/operator/pooling_v1.cu new file mode 100644 index 000000000000..99aebbc6446c --- /dev/null +++ b/src/operator/pooling_v1.cu @@ -0,0 +1,36 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file pooling_v1.cu + * \brief + * \author Bing Xu +*/ +#include +#include "./pooling_v1-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp(PoolingV1Param param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.pool_type) { + case pool_v1_enum::kMaxPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kAvgPooling: + op = new PoolingV1Op(param); + break; + case pool_v1_enum::kSumPooling: + op = new PoolingV1Op(param); + break; + default: + LOG(FATAL) << "unknown pooling type"; + return NULL; + } + }); + return op; +} + +} // namespace op +} // namespace mxnet + diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index a82530928bbc..6aa3912216a3 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -182,6 +182,9 @@ def test_grid_generator_with_type(): check_consistency(sym, ctx_list, grad_req="add") +# Checking max pooling consistency over the data sets of different float types is problematic +# as one max value in a float32 data set may not be the max value in a float16 data set. +# This function will not be called. def test_pooling_with_type(): np.random.seed(1234) ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, @@ -204,6 +207,163 @@ def test_pooling_with_type(): check_consistency(sym, ctx_list) +def test_pooling_versions(): + def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, + pooling_convention='valid', global_pool=False): + ctx_list = [] + sym_list = [] + # PoolingV1 cpu + if 'pool_v1_cpu' in pool_op_list: + ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # PoolingV1 gpu + if 'pool_v1_gpu' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # Pooling cpu + if 'pool_cpu' in pool_op_list: + ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) + # Pooling gpu + if 'pool_gpu' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, cudnn_off=True, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True, + name='pool')) + # CuDNNPooling + if 'pool_cudnn' in pool_op_list: + ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) + if not global_pool: + sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention=pooling_convention, cudnn_off=False, name='pool')) + else: + sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=False, + name='pool')) + check_consistency(sym_list, ctx_list) + + def test_1d_pooling(pool_type): + data = (2, 3, 20) + kernel = (4,) + pad = (0,) + stride = (1,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (2,) + stride = (2,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0,) + stride = (1,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + pad = (2,) + stride = (2,) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + def test_2d_pooling(pool_type): + data = (2, 3, 20, 20) + kernel = (4, 5) + pad = (0, 0) + stride = (1, 1) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here + pad = (2, 3) + stride = (2, 3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0, 0) + stride = (1, 1) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here + pad = (2, 3) + stride = (2, 3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + def test_3d_pooling(pool_type): + data = (2, 3, 20, 20, 20) + kernel = (4, 5, 3) + pad = (0, 0, 0) + stride = (1, 1, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (2, 3, 3) + stride = (2, 3, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='valid', global_pool=False) + + pad = (0, 0, 0) + stride = (1, 1, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + pad = (2, 3, 3) + stride = (2, 3, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + pooling_convention='full', global_pool=False) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, + global_pool=True) + + test_1d_pooling('max') + test_1d_pooling('avg') + test_1d_pooling('sum') + + test_2d_pooling('max') + test_2d_pooling('avg') + test_2d_pooling('sum') + + test_3d_pooling('max') + test_3d_pooling('avg') + test_3d_pooling('sum') + + def test_upsampling_with_type(): sym = mx.sym.UpSampling(scale=2, num_filter=2, name='up', sample_type='nearest', num_args=1) ctx_list = [{'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}}, @@ -490,7 +650,7 @@ def test_unfuse(): test_convolution_options() test_convolution_versions() test_convolution_with_type() - test_pooling_with_type() + test_pooling_versions() test_batchnorm_with_type() test_batchnorm_with_type() test_deconvolution_with_type()