diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 6d8b45fc1188..9b5dcfe3d3b1 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -23,6 +23,23 @@ using std::isnan;
 #endif
 
 
+#ifdef __CUDACC__
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+      i < (n); \
+      i += blockDim.x * gridDim.x)
+
+
+/*!
+ * \brief Get the number of blocks for cuda kernel given N
+ */
+inline int cuda_get_num_blocks(const int N) {
+  using namespace mshadow::cuda;
+  return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum);
+}
+#endif  // __CUDACC__
+
+
 /*! \brief operator request type switch */
 #define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...)  \
   switch (req) {                                    \
@@ -139,6 +156,22 @@ MSHADOW_XINLINE Shape<ndim> calc_stride(const Shape<ndim>& shape) {
 }
 
 
+struct fill {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out, const DType val) {
+    out[i] = val;
+  }
+};
+
+
+struct set_zero {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out) {
+    out[i] = static_cast<DType>(0);
+  }
+};
+
+
 template<typename OP, typename xpu>
 struct Kernel;
 
diff --git a/src/operator/nn/im2col.cuh b/src/operator/nn/im2col.cuh
index 8555319f5f68..786fd22f8c9b 100644
--- a/src/operator/nn/im2col.cuh
+++ b/src/operator/nn/im2col.cuh
@@ -71,20 +71,6 @@
 namespace mxnet {
 namespace op {
 
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-      i < (n); \
-      i += blockDim.x * gridDim.x)
-
-/*!
- * \brief Get the number of blocks for cuda kernel given N
- */
-inline int cuda_get_num_blocks(const int N) {
-  using namespace mshadow::cuda;
-  return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum);
-}
-
 /*!
  * \brief im2col gpu kernel.
  * DO NOT call this directly. Use wrapper function im2col() instead;
@@ -141,6 +127,7 @@ inline void im2col_gpu(mshadow::Stream<gpu>* s,
   int width_col = (width + 2 * pad_w -
       (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
   int num_kernels = channels * height_col * width_col;
+  using namespace mxnet_op;
   // NOLINT_NEXT_LINE(whitespace/operators)
   im2col_gpu_kernel<DType><<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
                              0, mshadow::Stream<gpu>::GetStream(s)>>>(
@@ -303,6 +290,7 @@ inline void im2col(mshadow::Stream<gpu>* s,
   index_t num_spatial_axes = kernel_shape.ndim();
   CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum);
   index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim());
+  using namespace mxnet_op;
   switch (num_spatial_axes) {
   case 1:
     im2col_nd_gpu_kernel<DType, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
@@ -347,6 +335,7 @@ inline void col2im_gpu(mshadow::Stream<gpu>* s, const DType* data_col, const int
   int width_col = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) /
       stride_w + 1;
   int num_kernels = channels * height * width;
+  using namespace mxnet_op;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
   // NOLINT_NEXT_LINE(whitespace/operators)
@@ -487,6 +476,7 @@ inline void col2im(mshadow::Stream<gpu>* s,
   index_t im_size = im_shape.ProdShape(1, im_shape.ndim());
   // num_axes should be smaller than block size
   CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum);
+  using namespace mxnet_op;
   switch (num_spatial_axes) {
   case 1:
     col2im_nd_gpu_kernel<DType, 1>  // NOLINT_NEXT_LINE(whitespace/operators)
diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh
new file mode 100644
index 000000000000..54fd3461d80f
--- /dev/null
+++ b/src/operator/nn/pool.cuh
@@ -0,0 +1,826 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
+ *
+ * Copyright (c) 2017 by Contributors
+ * \file pool.cuh
+ * \brief Function definitions of pooling 1/2/3-D images.
+ * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases.
+ * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cu
+ * \author Jun Wu
+ */
+
+#ifndef MXNET_OPERATOR_NN_POOL_CUH_
+#define MXNET_OPERATOR_NN_POOL_CUH_
+
+#include <mxnet/base.h>
+#include <mxnet/operator.h>
+#include "../mxnet_op.h"
+#include "../../common/cuda_utils.h"
+
+namespace mxnet {
+namespace op {
+
+/*!
+ * \brief max pooling gpu kernel for 1-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template <typename DType>
+__global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data,
+                                       const int channels, const int width,
+                                       const int pooled_width, const int kernel_w,
+                                       const int stride_w, const int pad_w,
+                                       DType* out_data) {
+  using mshadow::red::limits::MinValue;
+  // index is the output image's pixel index in NCW
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int pw = index % pooled_width;
+    const int c = (index / pooled_width) % channels;
+    const int n = index / pooled_width / channels;
+    int wstart = pw * stride_w - pad_w;
+    const int wend = min(wstart + kernel_w, width);
+    wstart = max(wstart, 0);
+    const DType* in_slice =
+        in_data + (n * channels + c) * width;
+    DType max_val = MinValue<DType>();
+    for (int w = wstart; w < wend; ++w) {
+      const DType in_val = in_slice[w];
+      if (in_val > max_val) {
+        max_val = in_val;
+      }
+    }
+    out_data[index] = max_val;
+  }
+}
+
+/*!
+ * \brief max pooling gpu kernel for 2-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template <typename DType>
+__global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data,
+                                       const int channels, const int height, const int width,
+                                       const int pooled_height, const int pooled_width,
+                                       const int kernel_h, const int kernel_w, const int stride_h,
+                                       const int stride_w, const int pad_h, const int pad_w,
+                                       DType* out_data) {
+  using mshadow::red::limits::MinValue;
+  // index is the output image's pixel index in NCHW
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    const DType* in_slice =
+        in_data + (n * channels + c) * height * width;
+    DType max_val = MinValue<DType>();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        const DType in_val = in_slice[h * width + w];
+        if (in_val > max_val) {
+          max_val = in_val;
+        }
+      }
+    }
+    out_data[index] = max_val;
+  }
+}
+
+/*!
+ * \brief max pooling gpu kernel for 3-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template <typename DType>
+__global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
+                                       const int depth, const int height, const int width,
+                                       const int pooled_depth, const int pooled_height,
+                                       const int pooled_width, const int kernel_d,
+                                       const int kernel_h, const int kernel_w, const int stride_d,
+                                       const int stride_h, const int stride_w, const int pad_d,
+                                       const int pad_h, const int pad_w,
+                                       DType* out_data) {
+  using mshadow::red::limits::MinValue;
+  // index is the output image's pixel index in NCDHW
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int pd = (index / pooled_width / pooled_height) % pooled_depth;
+    const int c = (index / pooled_width / pooled_height / pooled_depth) % channels;
+    const int n = index / pooled_width / pooled_height / pooled_depth / channels;
+    int dstart = pd * stride_d - pad_d;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    const int dend = min(dstart + kernel_d, depth);
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    const DType* in_slice =
+        in_data + (n * channels + c) * depth * height * width;
+    DType max_val = MinValue<DType>();
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          const DType in_val = in_slice[(d * height + h) * width + w];
+          if (in_val > max_val) {
+            max_val = in_val;
+          }
+        }
+      }
+    }
+    out_data[index] = max_val;
+  }
+}
+
+/*!
+ * \brief avg/sum pooling gpu kernel for 1-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template <typename DType>
+__global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
+                                       const int width, const int pooled_width, const int kernel_w,
+                                       const int stride_w, const int pad_w,
+                                       DType* out_data, bool getAvg = false) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+	  const int pw = index % pooled_width;
+	  const int c = (index / pooled_width) % channels;
+	  const int n = index / pooled_width / channels;
+	  int wstart = pw * stride_w - pad_w;
+	  int wend = min(wstart + kernel_w, width + pad_w);
+	  const int pool_size = (getAvg? (wend - wstart) : 1);
+	  wstart = max(wstart, 0);
+	  wend = min(wend, width);
+	  DType sum = 0;
+	  const DType* out_slice =
+	 		in_data + (n * channels + c) * width;
+    for (int w = wstart; w < wend; ++w) {
+      sum += out_slice[w];
+    }
+    out_data[index] = sum / pool_size;
+  }
+}
+
+/*!
+ * \brief avg/sum pooling gpu kernel for 2-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template <typename DType>
+__global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
+                                       const int height, const int width,
+                                       const int pooled_height, const int pooled_width,
+                                       const int kernel_h, const int kernel_w,
+                                       const int stride_h, const int stride_w,
+                                       const int pad_h, const int pad_w,
+                                       DType* out_data, bool getAvg = false) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+	  const int pw = index % pooled_width;
+	  const int ph = (index / pooled_width) % pooled_height;
+	  const int c = (index / pooled_width / pooled_height) % channels;
+	  const int n = index / pooled_width / pooled_height / channels;
+	  int hstart = ph * stride_h - pad_h;
+	  int wstart = pw * stride_w - pad_w;
+	  int hend = min(hstart + kernel_h, height + pad_h);
+	  int wend = min(wstart + kernel_w, width + pad_w);
+	  const int pool_size = (getAvg? (hend - hstart) * (wend - wstart) : 1);
+	  hstart = max(hstart, 0);
+	  wstart = max(wstart, 0);
+	  hend = min(hend, height);
+	  wend = min(wend, width);
+	  DType sum = 0;
+	  const DType* out_slice =
+	 		in_data + (n * channels + c) * height * width;
+	  for (int h = hstart; h < hend; ++h) {
+		  for (int w = wstart; w < wend; ++w) {
+		    sum += out_slice[h * width + w];
+		  }
+	  }
+    out_data[index] = sum / pool_size;
+  }
+}
+
+/*!
+ * \brief avg/sum pooling gpu kernel for 3-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template <typename DType>
+__global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels,
+                                       const int depth, const int height, const int width,
+                                       const int pooled_depth, const int pooled_height,
+                                       const int pooled_width, const int kernel_d,
+                                       const int kernel_h, const int kernel_w,
+                                       const int stride_d, const int stride_h, const int stride_w,
+                                       const int pad_d, const int pad_h, const int pad_w,
+                                       DType* out_data, bool getAvg = false) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+	  const int pw = index % pooled_width;
+	  const int ph = (index / pooled_width) % pooled_height;
+    const int pd = (index / pooled_width / pooled_height) % pooled_depth;
+	  const int c = (index / pooled_width / pooled_height / pooled_depth) % channels;
+	  const int n = index / pooled_width / pooled_height / pooled_depth / channels;
+    int dstart = pd * stride_d - pad_d;
+	  int hstart = ph * stride_h - pad_h;
+	  int wstart = pw * stride_w - pad_w;
+    int dend = min(dstart + kernel_d, depth + pad_d);
+	  int hend = min(hstart + kernel_h, height + pad_h);
+	  int wend = min(wstart + kernel_w, width + pad_w);
+	  const int pool_size = (getAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+    dstart = max(dstart, 0);
+	  hstart = max(hstart, 0);
+	  wstart = max(wstart, 0);
+    dend = min(dend, depth);
+	  hend = min(hend, height);
+	  wend = min(wend, width);
+	  DType sum = 0;
+	  const DType* out_slice =
+	 		in_data + (n * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          sum += out_slice[(d * height + h) * width + w];
+        }
+      }
+    }
+    out_data[index] = sum / pool_size;
+  }
+}
+
+/*!
+ * \brief max unpooling gpu kernel for 1-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template <typename DType>
+__global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const DType* in_data, const DType* out_data,
+                                         const int channels, const int width,
+                                         const int pooled_width, const int kernel_w,
+                                         const int stride_w, const int pad_w,
+                                         DType* in_grad) {
+  // index is the output image's pixel index in NCHW
+  // the order has to be consistent with pooling max
+  // to avoid adding out_grad to the wrong in_grad
+  // in the case where there are multiple max pixels
+  // covered by a kernel window
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int pw = index % pooled_width;
+    const int c = (index / pooled_width) % channels;
+    const int n = index / pooled_width / channels;
+    int wstart = pw * stride_w - pad_w;
+    const int wend = min(wstart + kernel_w, width);
+    wstart = max(wstart, 0);
+    // in data/grad offset batch and channel dims
+    int in_offset = (n * channels + c) * width;
+    const DType* in_data_slice = in_data + in_offset;
+    int max_idx = -1;
+    DType max_val = out_data[index];
+    for (int w = wstart; w < wend; ++w) {
+      if (in_data_slice[w] == max_val) {
+        max_idx = w;
+        break;
+      }
+    }
+
+    // In the case where pad > 0 and kernel = 1, for example,
+    // max_idx can be -1 reaching this step.
+    if (max_idx >= 0) {
+      atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]);
+    }
+  }
+}
+
+/*!
+ * \brief max unpooling gpu kernel for 2-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template <typename DType>
+__global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const DType* in_data, const DType* out_data,
+                                         const int channels, const int height, const int width,
+                                         const int pooled_height, const int pooled_width,
+                                         const int kernel_h, const int kernel_w,
+                                         const int stride_h, const int stride_w,
+                                         const int pad_h, const int pad_w,
+                                         DType* in_grad) {
+  // index is the output image's pixel index in NCHW
+  // the order has to be consistent with pooling max
+  // to avoid adding out_grad to the wrong in_grad
+  // in the case where there are multiple max pixels
+  // covered by a kernel window
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    // in data/grad offset batch and channel dims
+    int in_offset = (n * channels + c) * height * width;
+    const DType* in_data_slice = in_data + in_offset;
+    int max_idx = -1;
+    DType max_val = out_data[index];
+    bool found = false;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        const int idx = h * width + w;
+        if (in_data_slice[idx] == max_val) {
+          max_idx = idx;
+          found = true;
+          break;
+        }
+      }
+      if (found) break;
+    }
+
+    // In the case where pad > 0 and kernel = 1, for example,
+    // max_idx can be -1 reaching this step.
+    if (max_idx >= 0) {
+      atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]);
+    }
+  }
+}
+
+/*!
+ * \brief max unpooling gpu kernel for 3-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template <typename DType>
+__global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const DType* in_data, const DType* out_data,
+                                         const int channels, const int depth, const int height,
+                                         const int width, const int pooled_depth,
+                                         const int pooled_height, const int pooled_width,
+                                         const int kernel_d, const int kernel_h,
+                                         const int kernel_w, const int stride_d,
+                                         const int stride_h, const int stride_w, const int pad_d,
+                                         const int pad_h, const int pad_w,
+                                         DType* in_grad) {
+  // index is the output image's pixel index in NCDHW
+  // the order has to be consistent with pooling max
+  // to avoid adding out_grad to the wrong in_grad
+  // in the case where there are multiple max pixels
+  // covered by a kernel window
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int pd = (index / pooled_width / pooled_height) % pooled_depth;
+    const int c = (index / pooled_width / pooled_height / pooled_depth) % channels;
+    const int n = index / pooled_width / pooled_height / pooled_depth / channels;
+    int dstart = pd * stride_d - pad_d;
+    int hstart = ph * stride_h - pad_h;
+    int wstart = pw * stride_w - pad_w;
+    const int dend = min(dstart + kernel_d, depth);
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    // in data/grad offset batch and channel dims
+    int in_offset = (n * channels + c) * depth * height * width;
+    const DType* in_data_slice = in_data + in_offset;
+    int max_idx = -1;
+    DType max_val = out_data[index];
+    bool found = false;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          const int idx = (d * height + h) * width + w;
+          if (in_data_slice[idx] == max_val) {
+            max_idx = idx;
+            found = true;
+            break;
+          }
+        }
+        if (found) break;
+      }
+      if (found) break;
+    }
+
+    // In the case where pad > 0 and kernel = 1, for example,
+    // max_idx can be -1 reaching this step.
+    if (max_idx >= 0) {
+      atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]);
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum unpooling gpu kernel for 1-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+__global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const int channels, const int width,
+                                         const int pooled_width, const int kernel_w,
+                                         const int stride_w, const int pad_w,
+                                         DType* in_grad, bool isAvg = false) {
+  // index is the input image index in NCW
+  CUDA_KERNEL_LOOP(index, nthreads) {
+	  // find out the local index
+	  // find out the local offset
+	  const int w = index % width + pad_w;
+	  const int c = (index / width) % channels;
+	  const int n = index / width / channels;
+	  const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+	  const int pwend = min(w / stride_w + 1, pooled_width);
+	  DType gradient = 0;
+	  const DType* out_grad_slice =
+      out_grad + (n * channels + c) * pooled_width;
+    for (int pw = pwstart; pw < pwend; ++pw) {
+      // figure out the pooling size
+      int wstart = pw * stride_w - pad_w;
+      int wend = min(wstart + kernel_w, width + pad_w);
+      int pool_size = (isAvg? (wend - wstart) : 1);
+      gradient += out_grad_slice[pw] / pool_size;
+    }
+    // if req=kWriteTo, in_grad has already been assigned zero values in unpool()
+    // use "+=" here instead of "=" to accommodate when req=kAddTo
+	  in_grad[index] += gradient;
+  }
+}
+
+/*!
+ * \brief avg/sum unpooling gpu kernel for 2-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+__global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const int channels, const int height, const int width,
+                                         const int pooled_height, const int pooled_width,
+                                         const int kernel_h, const int kernel_w,
+                                         const int stride_h, const int stride_w,
+                                         const int pad_h, const int pad_w,
+                                         DType* in_grad, bool isAvg = false) {
+  // index is the input image index in NCHW
+  CUDA_KERNEL_LOOP(index, nthreads) {
+	  // find out the local index
+	  // find out the local offset
+	  const int w = index % width + pad_w;
+	  const int h = (index / width) % height + pad_h;
+	  const int c = (index / width / height) % channels;
+	  const int n = index / width / height / channels;
+	  const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+	  const int phend = min(h / stride_h + 1, pooled_height);
+	  const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+	  const int pwend = min(w / stride_w + 1, pooled_width);
+	  DType gradient = 0;
+	  const DType* out_grad_slice =
+      out_grad + (n * channels + c) * pooled_height * pooled_width;
+	  for (int ph = phstart; ph < phend; ++ph) {
+	 	  for (int pw = pwstart; pw < pwend; ++pw) {
+		    // figure out the pooling size
+			  int hstart = ph * stride_h - pad_h;
+			  int wstart = pw * stride_w - pad_w;
+			  int hend = min(hstart + kernel_h, height + pad_h);
+			  int wend = min(wstart + kernel_w, width + pad_w);
+			  int pool_size = (isAvg? (hend - hstart) * (wend - wstart) : 1);
+			  gradient += out_grad_slice[ph * pooled_width + pw] / pool_size;
+		  }
+	  }
+    // if req=kWriteTo, in_grad has already been assigned zero values in unpool()
+    // use "+=" here instead of "=" to accommodate when req=kAddTo
+	  in_grad[index] += gradient;
+  }
+}
+
+/*!
+ * \brief avg/sum unpooling gpu kernel for 3-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+__global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad,
+                                         const int channels, const int depth, const int height,
+                                         const int width, const int pooled_depth,
+                                         const int pooled_height, const int pooled_width,
+                                         const int kernel_d, const int kernel_h,
+                                         const int kernel_w, const int stride_d, const int stride_h,
+                                         const int stride_w, const int pad_d, const int pad_h,
+                                         const int pad_w, DType* in_grad, bool isAvg = false) {
+  // index is the input image index in NCDHW
+  CUDA_KERNEL_LOOP(index, nthreads) {
+	  // find out the local index
+	  // find out the local offset
+	  const int w = index % width + pad_w;
+	  const int h = (index / width) % height + pad_h;
+    const int d = (index / width / height) % depth + pad_d;
+	  const int c = (index / width / height / depth) % channels;
+	  const int n = index / width / height / depth / channels;
+    const int pdstart = (d < kernel_d) ? 0 : (d - kernel_d) / stride_d + 1;
+    const int pdend = min(d / stride_d + 1, pooled_depth);
+	  const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
+	  const int phend = min(h / stride_h + 1, pooled_height);
+	  const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
+	  const int pwend = min(w / stride_w + 1, pooled_width);
+	  DType gradient = 0;
+	  const DType* out_grad_slice =
+      out_grad + (n * channels + c) * pooled_depth * pooled_height * pooled_width;
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * stride_d - pad_d;
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int dend = min(dstart + kernel_d, depth + pad_d);
+          int hend = min(hstart + kernel_h, height + pad_h);
+          int wend = min(wstart + kernel_w, width + pad_w);
+          int pool_size = (isAvg? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1);
+          gradient += out_grad_slice[(pd * pooled_height + ph) * pooled_width + pw] / pool_size;
+        }
+      }
+    }
+    // if req=kWriteTo, in_grad has already been assigned zero values in unpool()
+    // use "+=" here instead of "=" to accommodate when req=kAddTo
+	  in_grad[index] += gradient;
+  }
+}
+
+/*!
+ * \brief This function serves as an interface for 1/2/3-D pooling operations.
+ * \param s context stream defining the device in use is gpu
+ * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW
+ * \param ishape input tensor shape
+ * \param oshape output tensor shape
+ * \param kernel kernel shape
+ * \param pad pad shape
+ * \param stride stride shape
+ * \param pool_type supported pooling type: max, avg, sum
+ * \param req_type operator request type, only support kWriteTo for now
+ * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW
+ */
+template<typename DType>
+inline void pool(mshadow::Stream<gpu>* s, const DType* in_data, const TShape& ishape,
+                 const TShape& oshape, const TShape& kernel, const TShape& pad,
+                 const TShape& stride, const int pool_type, OpReqType req_type,
+                 DType* out_data) {
+  CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
+  using namespace mxnet_op;
+  if (kernel.ndim() == 1) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_max_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2],
+                                   oshape[2], kernel[0], stride[0], pad[0], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
+                                   kernel[0], stride[0], pad[0], out_data, true);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], oshape[2],
+                                   kernel[0], stride[0], pad[0], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 2) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_max_2d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   oshape[2], oshape[3], kernel[0], kernel[1],
+                                   stride[0], stride[1], pad[0], pad[1], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_2d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   oshape[2], oshape[3], kernel[0], kernel[1],
+                                   stride[0], stride[1], pad[0], pad[1], out_data, true);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_2d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   oshape[2], oshape[3], kernel[0], kernel[1],
+                                   stride[0], stride[1], pad[0], pad[1], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 3) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_max_3d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   ishape[4], oshape[2], oshape[3], oshape[4],
+                                   kernel[0], kernel[1], kernel[2], stride[0],
+                                   stride[1], stride[2], pad[0], pad[1], pad[2], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_3d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
+                                   kernel[1], kernel[2], stride[0], stride[1], stride[2],
+                                   pad[0], pad[1], pad[2], out_data, true);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      pool_sum_3d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                               0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                   oshape.Size(), in_data, ishape[1], ishape[2], ishape[3],
+                                   ishape[4], oshape[2], oshape[3], oshape[4], kernel[0],
+                                   kernel[1], kernel[2], stride[0], stride[1], stride[2],
+                                   pad[0], pad[1], pad[2], out_data);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  }
+}
+
+/*!
+ * \brief This function serves as an interface for 1/2/3-D unpooling operations.
+ * \param s context stream defining the device in use is gpu
+ * \param out_grad pointer of the gradient of operator's output tensor
+ * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW
+ * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW
+ * \param ishape input tensor shape
+ * \param oshape output tensor shape
+ * \param kernel kernel shape
+ * \param pad pad shape
+ * \param stride stride shape
+ * \param pool_type supported pooling type: max, avg, sum
+ * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo
+ * \param in_grad pointer of the gradient of the operator's input tensor
+ */
+template<typename DType>
+inline void unpool(mshadow::Stream<gpu>* s, const DType* out_grad, const DType* in_data,
+                   const DType* out_data, const TShape& ishape, const TShape& oshape,
+                   const TShape& kernel, const TShape& pad, const TShape& stride,
+                   const int pool_type, OpReqType req_type, DType* in_grad) {
+  if (mxnet::kNullOp == req_type) return;
+  if (mxnet::kAddTo != req_type) {
+    mxnet_op::Kernel<mxnet_op::set_zero, gpu>::Launch(s, ishape.Size(), in_grad);
+  }
+  using namespace mxnet_op;
+  if (kernel.ndim() == 1) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_max_1d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     oshape.Size(), out_grad, in_data, out_data,
+                                     ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0],
+                                     in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_1d_gpu_kernel);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad,
+                                     ishape[1], ishape[2], oshape[2], kernel[0],
+                                     stride[0], pad[0], in_grad, true);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_1d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad,
+                                     ishape[1], ishape[2], oshape[2], kernel[0],
+                                     stride[0], pad[0], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else  if (kernel.ndim() == 2) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_max_2d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     oshape.Size(), out_grad, in_data, out_data,
+                                     ishape[1], ishape[2], ishape[3],
+                                     oshape[2], oshape[3], kernel[0], kernel[1],
+                                     stride[0], stride[1], pad[0], pad[1], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_2d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad,
+                                     ishape[1], ishape[2], ishape[3],
+                                     oshape[2], oshape[3], kernel[0], kernel[1],
+                                     stride[0], stride[1], pad[0], pad[1], in_grad, true);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_2d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad,
+                                     ishape[1], ishape[2], ishape[3],
+                                     oshape[2], oshape[3], kernel[0], kernel[1],
+                                     stride[0], stride[1], pad[0], pad[1], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 3) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_max_3d_gpu_kernel<<<cuda_get_num_blocks(oshape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     oshape.Size(), out_grad, in_data, out_data,
+                                     ishape[1], ishape[2], ishape[3], ishape[4],
+                                     oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
+                                     kernel[2], stride[0], stride[1], stride[2],
+                                     pad[0], pad[1], pad[2], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_3d_gpu_kernel);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_3d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad,
+                                     ishape[1], ishape[2], ishape[3], ishape[4],
+                                     oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
+                                     kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
+                                     pad[2], in_grad, true);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      unpool_sum_3d_gpu_kernel<<<cuda_get_num_blocks(ishape.Size()), mshadow::cuda::kBaseThreadNum,
+                                 0, mshadow::Stream<gpu>::GetStream(s)>>>(
+                                     ishape.Size(), out_grad,
+                                     ishape[1], ishape[2], ishape[3], ishape[4],
+                                     oshape[2], oshape[3], oshape[4], kernel[0], kernel[1],
+                                     kernel[2], stride[0], stride[1], stride[2], pad[0], pad[1],
+                                     pad[2], in_grad);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NN_POOL_CUH_
diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h
new file mode 100644
index 000000000000..79accb5d521f
--- /dev/null
+++ b/src/operator/nn/pool.h
@@ -0,0 +1,759 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
+ *
+ * Copyright (c) 2017 by Contributors
+ * \file pool.h
+ * \brief Function definitions of pooling 1/2/3-D images.
+ * We adopted looping 2-D image pixels from Caffe and extended it to 1-D and 3-D cases.
+ * \ref https://github.com/BVLC/caffe/blob/master/src/caffe/layers/pooling_layer.cpp
+ * \author Jun Wu
+ */
+
+#ifndef MXNET_OPERATOR_NN_POOL_H_
+#define MXNET_OPERATOR_NN_POOL_H_
+
+#include <mxnet/base.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include "../mxnet_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace pool_enum {
+enum PoolingOpInputs {kData};
+enum PoolingOpOutputs {kOut, kMask};
+enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling};
+enum PoolingOpPadConventionType {kValid, kFull};
+}  // namespace pool_enum
+
+/*!
+ * \brief max pooling cpu function for 1-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template<typename DType>
+inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
+                            const TShape& kernel, const TShape& pad, const TShape& stride,
+                            DType* out_data) {
+  using mshadow::red::limits::MinValue;
+  const int width = ishape[2];
+  const int pooled_width = oshape[2];
+  const int kernel_w = kernel[0];
+  const int pad_w = pad[0];
+  const int stride_w = stride[0];
+  const index_t in_data_offset = ishape[2];
+  const index_t out_data_offset = oshape[2];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pw = 0; pw < pooled_width; ++pw) {
+        int wstart = pw * stride_w - pad_w;
+        int wend = std::min(wstart + kernel_w, width);
+        wstart = std::max(wstart, 0);
+        DType max_val = MinValue<DType>();
+        for (int w = wstart; w < wend; ++w) {
+          if (in_data[w] > max_val) {
+            max_val = in_data[w];
+          }
+        }
+        out_data[pw] = max_val;
+      }
+      in_data += in_data_offset;
+      out_data += out_data_offset;
+    }
+  }
+}
+
+/*!
+ * \brief max pooling cpu function for 2-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template<typename DType>
+inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
+                            const TShape& kernel, const TShape& pad, const TShape& stride,
+                            DType* out_data) {
+  using mshadow::red::limits::MinValue;
+  const int height = ishape[2], width = ishape[3];
+  const int pooled_height = oshape[2], pooled_width = oshape[3];
+  const int kernel_h = kernel[0], kernel_w = kernel[1];
+  const int pad_h = pad[0], pad_w = pad[1];
+  const int stride_h = stride[0], stride_w = stride[1];
+  const index_t in_data_offset = ishape[2] * ishape[3];
+  const index_t out_data_offset = oshape[2] * oshape[3];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height);
+          int wend = std::min(wstart + kernel_w, width);
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          const int pool_index = ph * pooled_width + pw;
+          DType max_val = MinValue<DType>();
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int in_index = h * width + w;
+              if (in_data[in_index] > max_val) {
+                max_val = in_data[in_index];
+              }
+            }
+          }
+          out_data[pool_index] = max_val;
+        }
+      }
+      in_data += in_data_offset;
+      out_data += out_data_offset;
+    }
+  }
+}
+
+/*!
+ * \brief max pooling cpu function for 3-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template<typename DType>
+inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
+                            const TShape& kernel, const TShape& pad, const TShape& stride,
+                            DType* out_data) {
+  using mshadow::red::limits::MinValue;
+  const int depth = ishape[2], height = ishape[3], width = ishape[4];
+  const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
+  const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
+  const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
+  const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
+  const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4];
+  const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pd = 0; pd < pooled_depth; ++pd) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int dstart = pd * stride_d - pad_d;
+            int hstart = ph * stride_h - pad_h;
+            int wstart = pw * stride_w - pad_w;
+            int dend = std::min(dstart + kernel_d, depth);
+            int hend = std::min(hstart + kernel_h, height);
+            int wend = std::min(wstart + kernel_w, width);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
+            DType max_val = MinValue<DType>();
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  const int in_index = (d * height + h) * width + w;
+                  if (in_data[in_index] > max_val) {
+                    max_val = in_data[in_index];
+                  }
+                }
+              }
+            }
+            out_data[pool_index] = max_val;
+          }
+        }
+      }
+      in_data += in_data_offset;
+      out_data += out_data_offset;
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum pooling cpu function for 1-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template<typename DType>
+inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
+                            const TShape& kernel, const TShape& pad, const TShape& stride,
+                            DType* out_data, bool getAvg = false) {
+  const int width = ishape[2];
+  const int pooled_width = oshape[2];
+  const int kernel_w = kernel[0];
+  const int pad_w = pad[0];
+  const int stride_w = stride[0];
+  const index_t in_data_offset = ishape[2];
+  const index_t out_data_offset = oshape[2];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pw = 0; pw < pooled_width; ++pw) {
+        int wstart = pw * stride_w - pad_w;
+        int wend = std::min(wstart + kernel_w, width + pad_w);
+        int pool_size = (wend - wstart);
+        wstart = std::max(wstart, 0);
+        wend = std::min(wend, width);
+        DType sum = 0;
+        for (int w = wstart; w < wend; ++w) {
+          sum += in_data[w];
+        }
+        out_data[pw] = (getAvg? sum/pool_size : sum);
+      }
+      in_data += in_data_offset;
+      out_data += out_data_offset;
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum pooling cpu function for 2-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template<typename DType>
+inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
+                            const TShape& kernel, const TShape& pad, const TShape& stride,
+                            DType* out_data, bool getAvg = false) {
+  const int height = ishape[2], width = ishape[3];
+  const int pooled_height = oshape[2], pooled_width = oshape[3];
+  const int kernel_h = kernel[0], kernel_w = kernel[1];
+  const int pad_h = pad[0], pad_w = pad[1];
+  const int stride_h = stride[0], stride_w = stride[1];
+  const index_t in_data_offset = ishape[2] * ishape[3];
+  const index_t out_data_offset = oshape[2] * oshape[3];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height + pad_h);
+          int wend = std::min(wstart + kernel_w, width + pad_w);
+          int pool_size = (hend - hstart) * (wend - wstart);
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          hend = std::min(hend, height);
+          wend = std::min(wend, width);
+          DType sum = 0;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              sum += in_data[h*width+w];
+            }
+          }
+          out_data[ph*pooled_width+pw] = (getAvg? sum/pool_size : sum);
+        }
+      }
+      in_data += in_data_offset;
+      out_data += out_data_offset;
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum pooling cpu function for 3-D images.
+ * Do not call this kernel directly. Use the interface pool().
+ */
+template<typename DType>
+inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape,
+                            const TShape& kernel, const TShape& pad, const TShape& stride,
+                            DType* out_data, bool getAvg = false) {
+  const int depth = ishape[2], height = ishape[3], width = ishape[4];
+  const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
+  const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
+  const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
+  const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
+  const index_t in_data_offset = ishape[2] * ishape[3] * ishape[4];
+  const index_t out_data_offset = oshape[2] * oshape[3] * oshape[4];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pd = 0; pd < pooled_depth; ++pd) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int dstart = pd * stride_d - pad_d;
+            int hstart = ph * stride_h - pad_h;
+            int wstart = pw * stride_w - pad_w;
+            int dend = std::min(dstart + kernel_d, depth + pad_d);
+            int hend = std::min(hstart + kernel_h, height + pad_h);
+            int wend = std::min(wstart + kernel_w, width + pad_w);
+            int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            dend = std::min(dend, depth);
+            hend = std::min(hend, height);
+            wend = std::min(wend, width);
+            DType sum = 0;
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  sum += in_data[(d*height+h)*width+w];
+                }
+              }
+            }
+            out_data[(pd*pooled_height+ph)*pooled_width+pw] = (getAvg? sum/pool_size : sum);
+          }
+        }
+      }
+      in_data += in_data_offset;
+      out_data += out_data_offset;
+    }
+  }
+}
+
+/*!
+ * \brief max unpooling cpu function for 1-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data,
+                              const DType* out_data, const TShape& ishape,
+                              const TShape& oshape, const TShape& kernel,
+                              const TShape& pad, const TShape& stride,
+                              DType* in_grad) {
+  const int width = ishape[2];
+  const int pooled_width = oshape[2];
+  const int kernel_w = kernel[0];
+  const int pad_w = pad[0];
+  const int stride_w = stride[0];
+  const index_t in_offset = ishape[2];
+  const index_t out_offset = oshape[2];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pw = 0; pw < pooled_width; ++pw) {
+        int wstart = pw * stride_w - pad_w;
+        int wend = std::min(wstart + kernel_w, width);
+        wstart = std::max(wstart, 0);
+        int max_idx = -1;
+        for (int w = wstart; w < wend; ++w) {
+          if (in_data[w] == out_data[pw]) {
+            max_idx = w;
+            break;
+          }
+        }
+        // In the case where pad > 0 and kernel = 1, for example,
+        // max_idx can be -1 reaching this step.
+        if (max_idx >= 0) {
+          in_grad[max_idx] += out_grad[pw];
+        }
+      }
+      in_data += in_offset;
+      in_grad += in_offset;
+      out_data += out_offset;
+      out_grad += out_offset;
+    }
+  }
+}
+
+/*!
+ * \brief max unpooling cpu function for 2-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data,
+                              const DType* out_data, const TShape& ishape,
+                              const TShape& oshape, const TShape& kernel,
+                              const TShape& pad, const TShape& stride,
+                              DType* in_grad) {
+  const int height = ishape[2], width = ishape[3];
+  const int pooled_height = oshape[2], pooled_width = oshape[3];
+  const int kernel_h = kernel[0], kernel_w = kernel[1];
+  const int pad_h = pad[0], pad_w = pad[1];
+  const int stride_h = stride[0], stride_w = stride[1];
+  const index_t in_offset = ishape[2] * ishape[3];
+  const index_t out_offset = oshape[2] * oshape[3];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height);
+          int wend = std::min(wstart + kernel_w, width);
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          const int pool_index = ph * pooled_width + pw;
+          int max_idx = -1;
+          bool found = false;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int idx = h * width + w;
+              if (in_data[idx] == out_data[pool_index]) {
+                max_idx = idx;
+                found = true;
+                break;
+              }
+            }
+            if (found) break;
+          }
+          // In the case where pad > 0 and kernel = 1, for example,
+          // max_idx can be -1 reaching this step.
+          if (max_idx >= 0) {
+            in_grad[max_idx] += out_grad[pool_index];
+          }
+        }
+      }
+      in_data += in_offset;
+      in_grad += in_offset;
+      out_data += out_offset;
+      out_grad += out_offset;
+    }
+  }
+}
+
+/*!
+ * \brief max unpooling cpu function for 3-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data,
+                              const DType* out_data, const TShape& ishape,
+                              const TShape& oshape, const TShape& kernel,
+                              const TShape& pad, const TShape& stride,
+                              DType* in_grad) {
+  const int depth = ishape[2], height = ishape[3], width = ishape[4];
+  const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
+  const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
+  const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
+  const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
+  const index_t in_offset = ishape[2] * ishape[3] * ishape[4];
+  const index_t out_offset = oshape[2] * oshape[3] * oshape[4];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pd = 0; pd < pooled_depth; ++pd) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int dstart = pd * stride_d - pad_d;
+            int hstart = ph * stride_h - pad_h;
+            int wstart = pw * stride_w - pad_w;
+            int dend = std::min(dstart + kernel_d, depth);
+            int hend = std::min(hstart + kernel_h, height);
+            int wend = std::min(wstart + kernel_w, width);
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
+            int max_idx = -1;
+            bool found = false;
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  const int idx = (d * height + h) * width + w;
+                  if (in_data[idx] == out_data[pool_index]) {
+                    max_idx = idx;
+                    found = true;
+                    break;
+                  }
+                }
+                if (found) break;
+              }
+              if (found) break;
+            }
+            // In the case where pad > 0 and kernel = 1, for example,
+            // max_idx can be -1 reaching this step.
+            if (max_idx >= 0) {
+              in_grad[max_idx] += out_grad[pool_index];
+            }
+          }
+        }
+      }
+      in_data += in_offset;
+      in_grad += in_offset;
+      out_data += out_offset;
+      out_grad += out_offset;
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum unpooling cpu function for 1-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+inline void unpool_sum_1d_cpu(const DType* out_grad, const TShape& ishape,
+                              const TShape& oshape, const TShape& kernel,
+                              const TShape& pad, const TShape& stride,
+                              DType* in_grad, bool isAvg = false) {
+  const int width = ishape[2];
+  const int pooled_width = oshape[2];
+  const int kernel_w = kernel[0];
+  const int pad_w = pad[0];
+  const int stride_w = stride[0];
+  const index_t in_grad_offset = ishape[2];
+  const index_t out_grad_offset = oshape[2];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pw = 0; pw < pooled_width; ++pw) {
+        int wstart = pw * stride_w - pad_w;
+        int wend = std::min(wstart + kernel_w, width + pad_w);
+        int pool_size = 1;
+        if (isAvg) {
+          pool_size = wend - wstart;
+        }
+        wstart = std::max(wstart, 0);
+        wend = std::min(wend, width);
+        for (int w = wstart; w < wend; ++w) {
+          in_grad[w] += out_grad[pw] / pool_size;
+        }
+      }
+      in_grad += in_grad_offset;
+      out_grad += out_grad_offset;
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum unpooling cpu function for 2-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+inline void unpool_sum_2d_cpu(const DType* out_grad, const TShape& ishape,
+                              const TShape& oshape, const TShape& kernel,
+                              const TShape& pad, const TShape& stride,
+                              DType* in_grad, bool isAvg = false) {
+  const int height = ishape[2], width = ishape[3];
+  const int pooled_height = oshape[2], pooled_width = oshape[3];
+  const int kernel_h = kernel[0], kernel_w = kernel[1];
+  const int pad_h = pad[0], pad_w = pad[1];
+  const int stride_h = stride[0], stride_w = stride[1];
+  const index_t in_grad_offset = ishape[2] * ishape[3];
+  const index_t out_grad_offset = oshape[2] * oshape[3];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height + pad_h);
+          int wend = std::min(wstart + kernel_w, width + pad_w);
+          int pool_size = 1;
+          if (isAvg) {
+            pool_size = (hend - hstart) * (wend - wstart);
+          }
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          hend = std::min(hend, height);
+          wend = std::min(wend, width);
+          const int pool_index = ph * pooled_width + pw;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              in_grad[h*width+w] += out_grad[pool_index] / pool_size;
+            }
+          }
+        }
+      }
+      in_grad += in_grad_offset;
+      out_grad += out_grad_offset;
+    }
+  }
+}
+
+/*!
+ * \brief avg/sum unpooling cpu function for 3-D images.
+ * Do not call this kernel directly. Use the interface unpool().
+ */
+template<typename DType>
+inline void unpool_sum_3d_cpu(const DType* out_grad, const TShape& ishape,
+                              const TShape& oshape, const TShape& kernel,
+                              const TShape& pad, const TShape& stride,
+                              DType* in_grad, bool isAvg = false) {
+  const int depth = ishape[2], height = ishape[3], width = ishape[4];
+  const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4];
+  const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2];
+  const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2];
+  const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2];
+  const index_t in_grad_offset = ishape[2] * ishape[3] * ishape[4];
+  const index_t out_grad_offset = oshape[2] * oshape[3] * oshape[4];
+  for (index_t n = 0; n < oshape[0]; ++n) {
+    for (index_t c = 0; c < oshape[1]; ++c) {
+      for (int pd = 0; pd < pooled_depth; ++pd) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int dstart = pd * stride_d - pad_d;
+            int hstart = ph * stride_h - pad_h;
+            int wstart = pw * stride_w - pad_w;
+            int dend = std::min(dstart + kernel_d, depth + pad_d);
+            int hend = std::min(hstart + kernel_h, height + pad_h);
+            int wend = std::min(wstart + kernel_w, width + pad_w);
+            int pool_size = 1;
+            if (isAvg) {
+              pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            }
+            dstart = std::max(dstart, 0);
+            hstart = std::max(hstart, 0);
+            wstart = std::max(wstart, 0);
+            dend = std::min(dend, depth);
+            hend = std::min(hend, height);
+            wend = std::min(wend, width);
+            const int pool_index = (pd * pooled_height + ph) * pooled_width + pw;
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  in_grad[(d*height+h)*width+w] += out_grad[pool_index] / pool_size;
+                }
+              }
+            }
+          }
+        }
+      }
+      in_grad += in_grad_offset;
+      out_grad += out_grad_offset;
+    }
+  }
+}
+
+/*!
+ * \brief This function serves as an interface for 1/2/3-D pooling operations.
+ * \param s context stream defining the device in use is cpu
+ * \param in_data pointer of the input tensor data in the format of NCW, NCHW, or NCDHW
+ * \param ishape input tensor shape
+ * \param oshape output tensor shape
+ * \param kernel kernel shape
+ * \param pad pad shape
+ * \param stride stride shape
+ * \param pool_type supported pooling type: max, avg, sum
+ * \param req_type operator request type, only support kWriteTo for now
+ * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW
+ */
+template<typename DType>
+inline void pool(mshadow::Stream<cpu>* s, const DType* in_data, const TShape& ishape,
+                 const TShape& oshape, const TShape& kernel, const TShape& pad,
+                 const TShape& stride, const int pool_type, OpReqType req_type,
+                 DType* out_data) {
+  CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations";
+  if (kernel.ndim() == 1) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 2) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 3) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, true);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling";
+  }
+}
+
+/*!
+ * \brief This function serves as an interface for 1/2/3-D unpooling operations.
+ * \param s context stream defining the device in use is cpu
+ * \param out_grad pointer of the gradient of operator's output tensor
+ * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW
+ * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW
+ * \param ishape input tensor shape
+ * \param oshape output tensor shape
+ * \param kernel kernel shape
+ * \param pad pad shape
+ * \param stride stride shape
+ * \param pool_type supported pooling type: max, avg, sum
+ * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo
+ * \param in_grad pointer of the gradient of the operator's input tensor
+ */
+template<typename DType>
+inline void unpool(mshadow::Stream<cpu>* s, const DType* out_grad, const DType* in_data,
+                   const DType* out_data, const TShape& ishape, const TShape& oshape,
+                   const TShape& kernel, const TShape& pad, const TShape& stride,
+                   const int pool_type, OpReqType req_type, DType* in_grad) {
+  if (mxnet::kNullOp == req_type) return;
+  if (mxnet::kAddTo != req_type) {
+    mxnet_op::Kernel<mxnet_op::set_zero, cpu>::Launch(s, ishape.Size(), in_grad);
+  }
+  if (kernel.ndim() == 1) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      unpool_sum_1d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 2) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      unpool_sum_2d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else if (kernel.ndim() == 3) {
+    if (pool_enum::kMaxPooling == pool_type) {
+      unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad);
+    } else if (pool_enum::kAvgPooling == pool_type) {
+      unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad, true);
+    } else if (pool_enum::kSumPooling == pool_type) {
+      unpool_sum_3d_cpu(out_grad, ishape, oshape, kernel, pad, stride, in_grad);
+    } else {
+      LOG(FATAL) << "Unknown pooling type " << pool_type;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+#ifdef __CUDACC__
+#include "./pool.cuh"
+#endif
+
+#endif  // MXNET_OPERATOR_NN_POOL_H_
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index 9ea0155b21be..8156c3796539 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -1,8 +1,8 @@
 /*!
- * Copyright (c) 2015 by Contributors
+ * Copyright (c) 2017 by Contributors
  * \file pooling-inl.h
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Jun Wu
 */
 
 #ifndef MXNET_OPERATOR_POOLING_INL_H_
@@ -17,17 +17,11 @@
 #include <string>
 #include <utility>
 #include "./operator_common.h"
+#include "./nn/pool.h"
 
 namespace mxnet {
 namespace op {
 
-namespace pool_enum {
-enum PoolingOpInputs {kData};
-enum PoolingOpOutputs {kOut};
-enum PoolingOpType {kMaxPooling, kAvgPooling, kSumPooling};
-enum PoolingOpPadConventionType {kValid, kFull};
-}  // namespace pool_enum
-
 struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   TShape kernel;
   TShape stride;
@@ -35,10 +29,14 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   int pool_type;
   int pooling_convention;
   bool global_pool;
+  bool cudnn_off;
   DMLC_DECLARE_PARAMETER(PoolingParam) {
     DMLC_DECLARE_FIELD(global_pool).set_default(false)
     .describe("Ignore kernel size, do global pooling based on current input feature map. ");
 
+    DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
+    .describe("Turn off cudnn pooling and use MXNet pooling operator. ");
+
     DMLC_DECLARE_FIELD(kernel)
     .enforce_nonzero()
     .describe("pooling kernel size: (y, x) or (d, y, x)");
@@ -63,107 +61,66 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   }
 };
 
-template<typename xpu, typename Reducer, typename DType>
+template<typename xpu, typename DType>
 class PoolingOp : public Operator {
  public:
   explicit PoolingOp(PoolingParam p) {
     this->param_ = p;
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  virtual void Forward(const OpContext& ctx,
+                       const std::vector<TBlob>& in_data,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& out_data,
+                       const std::vector<TBlob>& aux_args) {
     using namespace mshadow;
-    using namespace mshadow::expr;
     CHECK_EQ(in_data.size(), 1U);
     CHECK_EQ(out_data.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.kernel.ndim() == 3) {
-      LOG(FATAL) << "3D kernel not implemented";
-    }
-    Tensor<xpu, 4, DType> data = in_data[pool_enum::kData].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> out = out_data[pool_enum::kOut].get<xpu, 4, DType>(s);
-    mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]);
-    if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) {
-      Assign(out,
-             req[pool_enum::kOut],
-             pool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
-                           out_shape,
-                           param_.global_pool ? data.shape_[2] : param_.kernel[0],
-                           param_.global_pool ? data.shape_[3] : param_.kernel[1],
-                           param_.global_pool ? 1 : param_.stride[0],
-                           param_.global_pool ? 1 : param_.stride[1]));
-    } else if (param_.pool_type == pool_enum::kAvgPooling) {
-      Assign(out,
-             req[pool_enum::kOut],
-             scalar<DType>(1.0f / (param_.global_pool ?
-                      data.shape_[2] * data.shape_[3] :
-                      param_.kernel[0] * param_.kernel[1])) * \
-             pool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
-                           out_shape,
-                           param_.global_pool ? data.shape_[2] : param_.kernel[0],
-                           param_.global_pool ? data.shape_[3] : param_.kernel[1],
-                           param_.global_pool ? 1 : param_.stride[0],
-                           param_.global_pool ? 1 : param_.stride[1]));
-    }
+    const TShape& ishape = in_data[pool_enum::kData].shape_;
+
+    pool(s, in_data[pool_enum::kData].dptr<DType>(),
+         in_data[pool_enum::kData].shape_,
+         out_data[pool_enum::kOut].shape_,
+         param_.global_pool?
+           TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
+           : param_.kernel,
+         param_.pad,
+         param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
+         param_.pool_type,
+         req[pool_enum::kOut],
+         out_data[pool_enum::kOut].dptr<DType>());
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  virtual void Backward(const OpContext& ctx,
+                        const std::vector<TBlob>& out_grad,
+                        const std::vector<TBlob>& in_data,
+                        const std::vector<TBlob>& out_data,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& in_grad,
+                        const std::vector<TBlob>& aux_args) {
     using namespace mshadow;
-    using namespace mshadow::expr;
     CHECK_EQ(out_grad.size(), 1U);
     CHECK_EQ(in_data.size(), 1U);
     CHECK_EQ(out_data.size(), 1U);
     CHECK_EQ(req.size(), 1U);
     CHECK_EQ(in_grad.size(), 1U);
-    // TODO(bing): remove pad (0,0)
-    if (param_.kernel.ndim() == 3) {
-      LOG(FATAL) << "3D kernel not implemented";
-    }
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = out_grad[pool_enum::kOut].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> data = in_data[pool_enum::kData].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> output_data = out_data[pool_enum::kOut].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> input_grad = in_grad[pool_enum::kData].get<xpu, 4, DType>(s);
-
-    mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]);
-
-    if (param_.pool_type == pool_enum::kMaxPooling || param_.pool_type == pool_enum::kSumPooling) {
-      Assign(input_grad, req[pool_enum::kData],
-             crop(unpool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
-                                  pad(output_data, 0, 0),
-                                  pad(grad, 0, 0),
-                                  param_.global_pool ? in_shape[0] : param_.kernel[0],
-                                  param_.global_pool ? in_shape[1] : param_.kernel[1],
-                                  param_.global_pool ? 1 : param_.stride[0],
-                                  param_.global_pool ? 1 : param_.stride[1]),
-                  in_shape,
-                  param_.pad[0],
-                  param_.pad[1]));
-    } else if (param_.pool_type == pool_enum::kAvgPooling) {
-      Assign(input_grad, req[pool_enum::kData],
-             scalar<DType>(1.0f / (param_.global_pool ?
-                      data.shape_[2] * data.shape_[3] :
-                      param_.kernel[0] * param_.kernel[1])) * \
-             crop(unpool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
-                                  pad(output_data, 0, 0),
-                                  pad(grad, 0, 0),
-                                  param_.global_pool ? in_shape[0] : param_.kernel[0],
-                                  param_.global_pool ? in_shape[1] : param_.kernel[1],
-                                  param_.global_pool ? 1 : param_.stride[0],
-                                  param_.global_pool ? 1 : param_.stride[1]),
-                  in_shape,
-                  param_.pad[0],
-                  param_.pad[1]));
-    }
+    const TShape& ishape = in_data[pool_enum::kData].shape_;
+
+    unpool(s, out_grad[pool_enum::kOut].dptr<DType>(),
+           in_data[pool_enum::kData].dptr<DType>(),
+           out_data[pool_enum::kOut].dptr<DType>(),
+           in_grad[pool_enum::kData].shape_,
+           out_grad[pool_enum::kOut].shape_,
+           param_.global_pool?
+             TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
+             : param_.kernel,
+           param_.pad,
+           param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
+           param_.pool_type,
+           req[pool_enum::kData],
+           in_grad[pool_enum::kData].dptr<DType>());
   }
 
  private:
@@ -180,7 +137,10 @@ class PoolingProp : public OperatorProperty {
   void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
     using namespace mshadow;
     param_.Init(kwargs);
-    if (param_.kernel.ndim() == 2) {
+    if (param_.kernel.ndim() == 1) {
+      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+    } else if (param_.kernel.ndim() == 2) {
       if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
       if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
     } else {
@@ -203,11 +163,31 @@ class PoolingProp : public OperatorProperty {
                   std::vector<TShape> *aux_shape) const override {
     CHECK_EQ(in_shape->size(), 1U);
     const TShape &dshape = (*in_shape)[0];
-    CHECK_GE(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
-                               << "Or 5D in (batch, channel, d, y, x)";
+    CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be  3D in (batch, channel, x)"
+                                << " Or 4D in (batch, channel, y, x) "
+                                << " Or 5D in (batch, channel, d, y, x)";
     TShape oshape = dshape;
     if (dshape.ndim() ==  0) return false;
-    if (param_.kernel.ndim() == 2) {
+    if (param_.kernel.ndim() == 1) {
+      CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+      } else {
+        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
+            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
+        if (param_.pooling_convention == pool_enum::kValid) {
+          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                              param_.stride[0];
+        } else {
+          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[2] + 2 * param_.pad[0] -
+                              param_.kernel[0]) / param_.stride[0]));
+        }
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);  // save output shape
+    } else if (param_.kernel.ndim() == 2) {
       CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
       if (param_.global_pool) {
         oshape[2] = 1;
@@ -234,7 +214,7 @@ class PoolingProp : public OperatorProperty {
         }
       }
       out_shape->clear();
-      out_shape->push_back(oshape);
+      out_shape->push_back(oshape);  // save output shape
     } else if (param_.kernel.ndim() == 3) {
       CHECK_EQ(dshape.ndim(), 5U)
         << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
@@ -267,7 +247,7 @@ class PoolingProp : public OperatorProperty {
       }
 
       out_shape->clear();
-      out_shape->push_back(oshape);
+      out_shape->push_back(oshape);  // save output shape
     }
     return true;
   }
@@ -302,7 +282,8 @@ class PoolingProp : public OperatorProperty {
     const std::vector<int> &out_grad,
     const std::vector<int> &in_data,
     const std::vector<int> &out_data) const override {
-    return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], out_data[pool_enum::kOut]};
+    return {out_grad[pool_enum::kOut], in_data[pool_enum::kData],
+            out_data[pool_enum::kOut]};
   }
 
   std::vector<std::pair<int, void*> > BackwardInplaceOption(
diff --git a/src/operator/pooling.cc b/src/operator/pooling.cc
index a4eed2633232..85f15668f19a 100644
--- a/src/operator/pooling.cc
+++ b/src/operator/pooling.cc
@@ -1,8 +1,8 @@
 /*!
- * Copyright (c) 2015 by Contributors
+ * Copyright (c) 2017 by Contributors
  * \file pooling.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Jun Wu
 */
 #include "./pooling-inl.h"
 #if MXNET_USE_MKL2017 == 1
@@ -20,9 +20,15 @@ namespace op {
 template<>
 Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
   Operator *op = NULL;
+  // TODO(junwu): Since MKL has a bug when pad and stride > 0,
+  // we disable MKL in those cases and will re-enable it after
+  // it is fixed by deleting lines 28 and 29.
 #if MXNET_USE_MKL2017 == 1
-    if ((param.pool_type == pool_enum::kMaxPooling)
-      || (param.pool_type == pool_enum::kAvgPooling)) {
+    if (param.kernel.ndim() == 2
+      && 0 == param.pad[0] && 0 == param.pad[1]
+      && 0 == param.stride[0] && 0 == param.stride[1]
+      && ((param.pool_type == pool_enum::kMaxPooling)
+      || (param.pool_type == pool_enum::kAvgPooling))) {
       switch (dtype) {
       case mshadow::kFloat32:
         return new MKLPoolingOp<cpu, float>(param);
@@ -51,21 +57,15 @@ Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
   }
 #endif
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.pool_type) {
-      case pool_enum::kMaxPooling:
-        op = new PoolingOp<cpu, mshadow::red::maximum, DType>(param);
-        break;
-      case pool_enum::kAvgPooling:
-        op = new PoolingOp<cpu, mshadow::red::sum, DType>(param);
-        break;
-      case pool_enum::kSumPooling:
-        op = new PoolingOp<cpu, mshadow::red::sum, DType>(param);
-        break;
-      default:
-        LOG(FATAL) << "unknown pooling type";
-        return NULL;
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      op = new PoolingOp<cpu, DType>(param);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
+      return NULL;
     }
-  })
+  });
 
   return op;
 }
@@ -85,6 +85,10 @@ DMLC_REGISTER_PARAMETER(PoolingParam);
 MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp)
 .describe(R"code(Perform pooling on the input.
 
+The shapes for 1-D pooling are
+- **data**: *(batch_size, channel, width)*,
+- **out**: *(batch_size, num_filter, out_width)*.
+
 The shapes for 2-D pooling is
 
 - **data**: *(batch_size, channel, height, width)*
@@ -112,7 +116,7 @@ Three pooling options are supported by ``pool_type``:
 - **max**: max pooling
 - **sum**: sum pooling
 
-1-D pooling is special case of 2-D pooling with *weight=1* and
+1-D pooling is special case of 2-D pooling with *width=1* and
 *kernel[1]=1*.
 
 For 3-D pooling, an additional *depth* dimension is added before
diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu
index be2464e3c0ef..c420852b1c8d 100644
--- a/src/operator/pooling.cu
+++ b/src/operator/pooling.cu
@@ -1,8 +1,8 @@
 /*!
- * Copyright (c) 2015 by Contributors
+ * Copyright (c) 2017 by Contributors
  * \file pooling.cu
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Jun Wu
 */
 #include <vector>
 #include "./pooling-inl.h"
@@ -12,45 +12,37 @@
 
 namespace mxnet {
 namespace op {
+
 template<>
 Operator *CreateOp<gpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.pool_type) {
-      case pool_enum::kMaxPooling:
-        op = new CuDNNPoolingOp<DType>(param);
-        break;
-      case pool_enum::kAvgPooling:
-        op = new CuDNNPoolingOp<DType>(param);
-        break;
-      case pool_enum::kSumPooling:
-        LOG(WARNING) << "Sum pooling is not supported by cudnn, MxNet sum pooling is applied.";
-        op = new PoolingOp<gpu, mshadow::red::sum, DType>(param);
-        break;
-      default:
-        LOG(FATAL) << "unknown pooling type";
-        return NULL;
+    if (!param.cudnn_off) {
+      switch (param.pool_type) {
+        case pool_enum::kMaxPooling:
+          return new CuDNNPoolingOp<DType>(param);
+        case pool_enum::kAvgPooling:
+          return new CuDNNPoolingOp<DType>(param);
+        case pool_enum::kSumPooling:
+          LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
+          return new PoolingOp<gpu, DType>(param);
+        default:
+          LOG(FATAL) << "unknown pooling type";
+          return NULL;
+      }
     }
   });
-#else
+#endif  // MXNET_USE_CUDNN
+  Operator *op = NULL;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.pool_type) {
-      case pool_enum::kMaxPooling:
-        op = new PoolingOp<gpu, mshadow::red::maximum, DType>(param);
-        break;
-      case pool_enum::kAvgPooling:
-        op = new PoolingOp<gpu, mshadow::red::sum, DType>(param);
-        break;
-      case pool_enum::kSumPooling:
-        op = new PoolingOp<gpu, mshadow::red::sum, DType>(param);
-        break;
-      default:
-        LOG(FATAL) << "unknown pooling type";
-        return NULL;
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      op = new PoolingOp<gpu, DType>(param);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
     }
   });
-#endif  // MXNET_USE_CUDNN
   return op;
 }
 
diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h
new file mode 100644
index 000000000000..7fef1258093f
--- /dev/null
+++ b/src/operator/pooling_v1-inl.h
@@ -0,0 +1,337 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file pooling_v1-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+
+#ifndef MXNET_OPERATOR_POOLING_V1_INL_H_
+#define MXNET_OPERATOR_POOLING_V1_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace pool_v1_enum {
+enum PoolingV1OpInputs {kData};
+enum PoolingV1OpOutputs {kOut};
+enum PoolingV1OpType {kMaxPooling, kAvgPooling, kSumPooling};
+enum PoolingV1OpPadConventionType {kValid, kFull};
+}  // namespace pool_v1_enum
+
+struct PoolingV1Param : public dmlc::Parameter<PoolingV1Param> {
+  TShape kernel;
+  TShape stride;
+  TShape pad;
+  int pool_type;
+  int pooling_convention;
+  bool global_pool;
+  DMLC_DECLARE_PARAMETER(PoolingV1Param) {
+    DMLC_DECLARE_FIELD(global_pool).set_default(false)
+    .describe("Ignore kernel size, do global pooling based on current input feature map. ");
+
+    DMLC_DECLARE_FIELD(kernel)
+    .enforce_nonzero()
+    .describe("pooling kernel size: (y, x) or (d, y, x)");
+
+    DMLC_DECLARE_FIELD(pool_type)
+    .add_enum("max", pool_v1_enum::kMaxPooling)
+    .add_enum("avg", pool_v1_enum::kAvgPooling)
+    .add_enum("sum", pool_v1_enum::kSumPooling)
+    .describe("Pooling type to be applied.");
+
+    DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_v1_enum::kValid)
+    .add_enum("full", pool_v1_enum::kFull)
+    .add_enum("valid", pool_v1_enum::kValid)
+    .describe("Pooling convention to be applied.");
+
+    DMLC_DECLARE_FIELD(stride).set_default(TShape())
+    .enforce_nonzero()
+    .describe("stride: for pooling (y, x) or (d, y, x)");
+
+    DMLC_DECLARE_FIELD(pad).set_default(TShape())
+    .describe("pad for pooling: (y, x) or (d, y, x)");
+  }
+};
+
+template<typename xpu, typename Reducer, typename DType>
+class PoolingV1Op : public Operator {
+ public:
+  explicit PoolingV1Op(PoolingV1Param p) {
+    this->param_ = p;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (param_.kernel.ndim() == 3) {
+      LOG(FATAL) << "3D kernel not implemented";
+    }
+    Tensor<xpu, 4, DType> data = in_data[pool_v1_enum::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> out = out_data[pool_v1_enum::kOut].get<xpu, 4, DType>(s);
+    mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]);
+    if (param_.pool_type == pool_v1_enum::kMaxPooling
+        || param_.pool_type == pool_v1_enum::kSumPooling) {
+      Assign(out,
+             req[pool_v1_enum::kOut],
+             pool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
+                           out_shape,
+                           param_.global_pool ? data.shape_[2] : param_.kernel[0],
+                           param_.global_pool ? data.shape_[3] : param_.kernel[1],
+                           param_.global_pool ? 1 : param_.stride[0],
+                           param_.global_pool ? 1 : param_.stride[1]));
+    } else if (param_.pool_type == pool_v1_enum::kAvgPooling) {
+      Assign(out,
+             req[pool_v1_enum::kOut],
+             scalar<DType>(1.0f / (param_.global_pool ?
+                      data.shape_[2] * data.shape_[3] :
+                      param_.kernel[0] * param_.kernel[1])) * \
+             pool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
+                           out_shape,
+                           param_.global_pool ? data.shape_[2] : param_.kernel[0],
+                           param_.global_pool ? data.shape_[3] : param_.kernel[1],
+                           param_.global_pool ? 1 : param_.stride[0],
+                           param_.global_pool ? 1 : param_.stride[1]));
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 1);
+    CHECK_EQ(out_data.size(), 1);
+    CHECK_EQ(req.size(), 1);
+    CHECK_EQ(in_grad.size(), 1);
+    // TODO(bing): remove pad (0,0)
+    if (param_.kernel.ndim() == 3) {
+      LOG(FATAL) << "3D kernel not implemented";
+    }
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> grad = out_grad[pool_v1_enum::kOut].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> data = in_data[pool_v1_enum::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> output_data = out_data[pool_v1_enum::kOut].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> input_grad = in_grad[pool_v1_enum::kData].get<xpu, 4, DType>(s);
+
+    mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]);
+
+    if (param_.pool_type == pool_v1_enum::kMaxPooling
+        || param_.pool_type == pool_v1_enum::kSumPooling) {
+      Assign(input_grad, req[pool_v1_enum::kData],
+             crop(unpool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
+                                  pad(output_data, 0, 0),
+                                  pad(grad, 0, 0),
+                                  param_.global_pool ? in_shape[0] : param_.kernel[0],
+                                  param_.global_pool ? in_shape[1] : param_.kernel[1],
+                                  param_.global_pool ? 1 : param_.stride[0],
+                                  param_.global_pool ? 1 : param_.stride[1]),
+                  in_shape,
+                  param_.pad[0],
+                  param_.pad[1]));
+    } else if (param_.pool_type == pool_v1_enum::kAvgPooling) {
+      Assign(input_grad, req[pool_v1_enum::kData],
+             scalar<DType>(1.0f / (param_.global_pool ?
+                      data.shape_[2] * data.shape_[3] :
+                      param_.kernel[0] * param_.kernel[1])) * \
+             crop(unpool<Reducer>(pad(data, param_.pad[0], param_.pad[1]),
+                                  pad(output_data, 0, 0),
+                                  pad(grad, 0, 0),
+                                  param_.global_pool ? in_shape[0] : param_.kernel[0],
+                                  param_.global_pool ? in_shape[1] : param_.kernel[1],
+                                  param_.global_pool ? 1 : param_.stride[0],
+                                  param_.global_pool ? 1 : param_.stride[1]),
+                  in_shape,
+                  param_.pad[0],
+                  param_.pad[1]));
+    }
+  }
+
+ private:
+  PoolingV1Param param_;
+};  // class PoolingV1Op
+
+template<typename xpu>
+Operator* CreateOp(PoolingV1Param param, int dtype);
+
+
+#if DMLC_USE_CXX11
+class PoolingV1Prop : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    using namespace mshadow;
+    param_.Init(kwargs);
+    if (param_.kernel.ndim() == 2) {
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    } else {
+      CHECK_EQ(param_.kernel.ndim(), 3) << param_.kernel.ndim() << "D pooling not supported";
+      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+    }
+    CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
+      << "stride and kernel should have the same length";
+    CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
+      << "pad and kernel should have the same length";
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    CHECK_EQ(in_shape->size(), 1);
+    const TShape &dshape = (*in_shape)[0];
+    CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
+                               << "Or 5D in (batch, channel, d, y, x)";
+    TShape oshape = dshape;
+    if (dshape.ndim() ==  0) return false;
+    if (param_.kernel.ndim() == 2) {
+      CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+      } else {
+        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
+            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
+        CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
+            << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
+            << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
+        if (param_.pooling_convention == pool_v1_enum::kValid) {
+          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                              param_.stride[0];
+          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+                              param_.stride[1];
+        } else {
+          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[2] + 2 * param_.pad[0] -
+                              param_.kernel[0]) / param_.stride[0]));
+          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[3] + 2 * param_.pad[1] -
+                              param_.kernel[1]) / param_.stride[1]));
+        }
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);
+    } else if (param_.kernel.ndim() == 3) {
+      CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+      CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
+      CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
+      CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+        oshape[4] = 1;
+      } else {
+        if (param_.pooling_convention == pool_v1_enum::kValid) {
+          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                              param_.stride[0];
+          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+                              param_.stride[1];
+          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
+                              param_.stride[2];
+        } else {
+          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[2] + 2 * param_.pad[0] -
+                              param_.kernel[0]) / param_.stride[0]));
+          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[3] + 2 * param_.pad[1] -
+                              param_.kernel[1]) / param_.stride[1]));
+          oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
+                              dshape[4] + 2 * param_.pad[2] -
+                              param_.kernel[2]) / param_.stride[2]));
+        }
+      }
+
+      out_shape->clear();
+      out_shape->push_back(oshape);
+    }
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1);
+    int dtype = (*in_type)[0];
+
+    if (dtype == -1) {
+      LOG(FATAL) << "Input type to pooling is not specified.";
+      return false;
+    }
+
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    PoolingV1Prop *prop_sym = new PoolingV1Prop();
+    prop_sym->param_ = this->param_;
+    return prop_sym;
+  }
+
+  std::string TypeString() const override {
+    return "Pooling_v1";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {out_grad[pool_v1_enum::kOut], in_data[pool_v1_enum::kData],
+            out_data[pool_v1_enum::kOut]};
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+#if MXNET_USE_CUDNN == 1
+    return {};
+#else
+    return {{in_data[pool_v1_enum::kData], in_grad[pool_v1_enum::kData]}};
+#endif
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  PoolingV1Param param_;
+};  // class PoolingV1Prop
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_POOLING_V1_INL_H_
diff --git a/src/operator/pooling_v1.cc b/src/operator/pooling_v1.cc
new file mode 100644
index 000000000000..cdf4f1af5939
--- /dev/null
+++ b/src/operator/pooling_v1.cc
@@ -0,0 +1,89 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file pooling_v1.cc
+ * \brief
+ * \author Bing Xu
+*/
+#include "./pooling_v1-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator *CreateOp<cpu>(PoolingV1Param param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    switch (param.pool_type) {
+      case pool_v1_enum::kMaxPooling:
+        op = new PoolingV1Op<cpu, mshadow::red::maximum, DType>(param);
+        break;
+      case pool_v1_enum::kAvgPooling:
+        op = new PoolingV1Op<cpu, mshadow::red::sum, DType>(param);
+        break;
+      case pool_v1_enum::kSumPooling:
+        op = new PoolingV1Op<cpu, mshadow::red::sum, DType>(param);
+        break;
+      default:
+        LOG(FATAL) << "unknown pooling type";
+        return NULL;
+    }
+  })
+
+  return op;
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator* PoolingV1Prop::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                     std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(PoolingV1Param);
+
+MXNET_REGISTER_OP_PROPERTY(Pooling_v1, PoolingV1Prop)
+.describe(R"code(Perform pooling on the input.
+
+The shapes for 2-D pooling is
+
+- **data**: *(batch_size, channel, height, width)*
+- **out**: *(batch_size, num_filter, out_height, out_width)*, with::
+
+    out_height = f(height, kernel[0], pad[0], stride[0])
+    out_width = f(width, kernel[1], pad[1], stride[1])
+
+The defintion of *f* depends on ``pooling_convention``, which has two options:
+
+- **valid** (default)::
+
+    f(x, k, p, s) = floor(x+2*p-k)/s+1
+
+- **full**, which is compatible with Caffe::
+
+    f(x, k, p, s) = ceil(x+2*p-k)/s+1
+
+But ``global_pool`` is set to be true, then do a global pooling, namely reset
+``kernel=(height, width)``.
+
+Three pooling options are supported by ``pool_type``:
+
+- **avg**: average pooling
+- **max**: max pooling
+- **sum**: sum pooling
+
+1-D pooling is special case of 2-D pooling with *weight=1* and
+*kernel[1]=1*.
+
+For 3-D pooling, an additional *depth* dimension is added before
+*height*. Namely the input data will have shape *(batch_size, channel, depth,
+height, width)*.
+
+)code" ADD_FILELINE)
+.add_argument("data", "ndarray-or-symbol", "Input data to the pooling operator.")
+.add_arguments(PoolingV1Param::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/pooling_v1.cu b/src/operator/pooling_v1.cu
new file mode 100644
index 000000000000..99aebbc6446c
--- /dev/null
+++ b/src/operator/pooling_v1.cu
@@ -0,0 +1,36 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file pooling_v1.cu
+ * \brief
+ * \author Bing Xu
+*/
+#include <vector>
+#include "./pooling_v1-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(PoolingV1Param param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    switch (param.pool_type) {
+      case pool_v1_enum::kMaxPooling:
+        op = new PoolingV1Op<gpu, mshadow::red::maximum, DType>(param);
+        break;
+      case pool_v1_enum::kAvgPooling:
+        op = new PoolingV1Op<gpu, mshadow::red::sum, DType>(param);
+        break;
+      case pool_v1_enum::kSumPooling:
+        op = new PoolingV1Op<gpu, mshadow::red::sum, DType>(param);
+        break;
+      default:
+        LOG(FATAL) << "unknown pooling type";
+        return NULL;
+    }
+  });
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index a82530928bbc..6aa3912216a3 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -182,6 +182,9 @@ def test_grid_generator_with_type():
     check_consistency(sym, ctx_list, grad_req="add")
 
 
+# Checking max pooling consistency over the data sets of different float types is problematic
+# as one max value in a float32 data set may not be the max value in a float16 data set.
+# This function will not be called.
 def test_pooling_with_type():
     np.random.seed(1234)
     ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}},
@@ -204,6 +207,163 @@ def test_pooling_with_type():
     check_consistency(sym, ctx_list)
 
 
+def test_pooling_versions():
+    def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride,
+                                     pooling_convention='valid', global_pool=False):
+        ctx_list = []
+        sym_list = []
+        # PoolingV1 cpu
+        if 'pool_v1_cpu' in pool_op_list:
+            ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+            if not global_pool:
+                sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                                  pooling_convention=pooling_convention, name='pool'))
+            else:
+                sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool'))
+        # PoolingV1 gpu
+        if 'pool_v1_gpu' in pool_op_list:
+            ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+            if not global_pool:
+                sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                                  pooling_convention=pooling_convention, name='pool'))
+            else:
+                sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool'))
+        # Pooling cpu
+        if 'pool_cpu' in pool_op_list:
+            ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+            if not global_pool:
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                               pooling_convention=pooling_convention, name='pool'))
+            else:
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool'))
+        # Pooling gpu
+        if 'pool_gpu' in pool_op_list:
+            ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+            if not global_pool:
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                               pooling_convention=pooling_convention, cudnn_off=True, name='pool'))
+            else:
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True,
+                                               name='pool'))
+        # CuDNNPooling
+        if 'pool_cudnn' in pool_op_list:
+            ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+            if not global_pool:
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                               pooling_convention=pooling_convention, cudnn_off=False, name='pool'))
+            else:
+                sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=False,
+                                               name='pool'))
+        check_consistency(sym_list, ctx_list)
+
+    def test_1d_pooling(pool_type):
+        data = (2, 3, 20)
+        kernel = (4,)
+        pad = (0,)
+        stride = (1,)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='valid', global_pool=False)
+
+        pad = (2,)
+        stride = (2,)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='valid', global_pool=False)
+
+        pad = (0,)
+        stride = (1,)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='full', global_pool=False)
+
+        pad = (2,)
+        stride = (2,)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='full', global_pool=False)
+
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     global_pool=True)
+
+    def test_2d_pooling(pool_type):
+        data = (2, 3, 20, 20)
+        kernel = (4, 5)
+        pad = (0, 0)
+        stride = (1, 1)
+        test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='valid', global_pool=False)
+
+        # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here
+        pad = (2, 3)
+        stride = (2, 3)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='valid', global_pool=False)
+
+        pad = (0, 0)
+        stride = (1, 1)
+        test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='full', global_pool=False)
+
+        # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here
+        pad = (2, 3)
+        stride = (2, 3)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='full', global_pool=False)
+
+        test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     global_pool=True)
+
+    def test_3d_pooling(pool_type):
+        data = (2, 3, 20, 20, 20)
+        kernel = (4, 5, 3)
+        pad = (0, 0, 0)
+        stride = (1, 1, 1)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='valid', global_pool=False)
+
+        pad = (2, 3, 3)
+        stride = (2, 3, 1)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='valid', global_pool=False)
+
+        pad = (0, 0, 0)
+        stride = (1, 1, 1)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='full', global_pool=False)
+
+        pad = (2, 3, 3)
+        stride = (2, 3, 1)
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     pooling_convention='full', global_pool=False)
+
+        test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'],
+                                     data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                     global_pool=True)
+
+    test_1d_pooling('max')
+    test_1d_pooling('avg')
+    test_1d_pooling('sum')
+
+    test_2d_pooling('max')
+    test_2d_pooling('avg')
+    test_2d_pooling('sum')
+
+    test_3d_pooling('max')
+    test_3d_pooling('avg')
+    test_3d_pooling('sum')
+
+
 def test_upsampling_with_type():
     sym = mx.sym.UpSampling(scale=2, num_filter=2, name='up', sample_type='nearest', num_args=1)
     ctx_list = [{'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}},
@@ -490,7 +650,7 @@ def test_unfuse():
     test_convolution_options()
     test_convolution_versions()
     test_convolution_with_type()
-    test_pooling_with_type()
+    test_pooling_versions()
     test_batchnorm_with_type()
     test_batchnorm_with_type()
     test_deconvolution_with_type()